In [None]:
%pip install -r requirements.txt

In [None]:
import re
import pandas as pd
import numpy as np
import emoji
import plotly.express as px
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
def date_time(s):
    """Return True when ``s`` starts with a chat-export timestamp header.

    The recognised prefix is ``<digits>/<digits>/<digits>, <digits>:<digits>``
    optionally followed by a space and an AM/PM marker (upper or lower case),
    then `` -``; for example ``"12/05/2021, 10:30 PM - "``.

    Args:
        s: One line of text from the exported chat.

    Returns:
        bool: True if the line begins with such a header, otherwise False.
    """
    # Raw string: the previous plain literal relied on "\/", which is not a
    # valid Python escape and triggers a warning on recent interpreters.
    pattern = r'^([0-9]+)/([0-9]+)/([0-9]+), ([0-9]+):([0-9]+)[ ]?(AM|PM|am|pm)? -'
    return re.match(pattern, s) is not None

def find_author(s):
    """Return True when ``s`` has the form ``"<author>: <text>"``.

    Only the first ``": "`` is treated as the author separator, so colons
    that appear later in the text (times such as ``10:30``, URLs, ...) no
    longer make the author undetectable.

    Args:
        s: The part of a chat line that follows the timestamp header.

    Returns:
        bool: True if a non-empty author prefix followed by ``": "`` exists.
    """
    author, separator, _ = s.partition(": ")
    return bool(separator) and bool(author)

def getDatapoint(line):
    """Split one timestamped chat line into ``(date, time, author, message)``.

    Args:
        line: A line for which ``date_time`` returned True, shaped like
            ``"<date>, <time> - <author>: <message>"`` or, when no author
            prefix is present, ``"<date>, <time> - <message>"``.

    Returns:
        tuple: ``(date, time, author, message)`` as strings; ``author`` is
        ``None`` when the text has no ``"<author>: "`` prefix.

    Raises:
        ValueError: if the header does not contain exactly one ``", "``.
    """
    # Split only on the FIRST " - " so dashes inside the message text are
    # preserved instead of being collapsed into single spaces.
    dateTime, _, message = line.partition(' - ')
    date, time = dateTime.split(", ")
    author = None
    if find_author(message):
        # Likewise split only on the first ": " to keep later colons intact.
        author, _, message = message.partition(": ")
    return date, time, author, message

In [None]:
# Parse the exported chat into rows of [date, time, author, message].
# A line that starts with a timestamp header opens a new message; any other
# line is a continuation of the message currently being buffered.
data = []
# NOTE(review): '.txt' looks like a placeholder — point this at the export file.
conversation = '.txt'
with open(conversation, encoding="utf-8") as fp:
    fp.readline()  # the first line of the file is discarded without parsing
    messageBuffer = []
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()
        if date_time(line):
            # A new header closes the message collected so far.
            if messageBuffer:
                data.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDatapoint(line)
            messageBuffer.append(message)
        else:
            messageBuffer.append(line)
    # Flush the final message: no further header arrives to trigger the
    # append above, so without this the last message of the chat is lost.
    if messageBuffer:
        data.append([date, time, author, ' '.join(messageBuffer)])

chat = pd.DataFrame(data, columns=['Date', 'Time', 'Author', 'Message'])

In [None]:
# Convert the raw text columns to datetimes and derive calendar features.
# `dayfirst=True` makes ambiguous dates such as 03/04/2021 parse as 3 April.
chat["Date"] = pd.to_datetime(chat["Date"], dayfirst=True)
# `dayfirst` only affects day/month ordering, so it is not passed for the
# time-only strings.
chat["Time"] = pd.to_datetime(chat["Time"])

# Vectorised `.dt` accessors instead of per-row apply / list comprehensions.
chat['weekday'] = chat['Date'].dt.day_name()
chat['month_sent'] = chat['Date'].dt.month_name()
chat['date'] = chat['Date'].dt.date
chat['hour'] = chat['Time'].dt.hour

In [None]:
# Per-message text metrics: number of URLs, characters and words.
URLPATTERN = r'(https?://\S+)'
chat['urlcount'] = chat['Message'].str.count(URLPATTERN)
chat['Letter_Count'] = chat['Message'].str.len()
# Split on any run of whitespace: splitting on a single ' ' counted empty
# strings, so an empty message scored 1 word and repeated spaces inflated
# the count.
chat['Word_Count'] = chat['Message'].str.split().str.len()

In [None]:
def _known_emoji():
    """Return the ``emoji`` package's container of known emoji strings."""
    table = getattr(emoji, "EMOJI_DATA", None)
    if table is None:
        # Fallback for older emoji releases that still expose UNICODE_EMOJI.
        table = emoji.UNICODE_EMOJI
        # NOTE(review): some releases appear to nest the table per language
        # code — confirm against the version pinned in requirements.txt.
        if isinstance(table.get("en"), dict):
            table = table["en"]
    return table

def split_count(text, emoji_table=None):
    """List the emoji code points found in ``text`` as hex strings.

    The previous implementation searched for the literal two-character
    sequence backslash + ``x`` and therefore always returned an empty list;
    a bare ``except`` additionally hid any lookup failure.  The text is now
    scanned character by character.

    Args:
        text: Message text to scan.
        emoji_table: Optional container supporting ``in`` that holds the
            characters to treat as emoji.  Defaults to the table provided
            by the ``emoji`` package.

    Returns:
        list[str]: ``hex(ord(char))`` for every matching character, in
        order of appearance.  The modifier ``U+1F3FC`` is not reported.
    """
    if emoji_table is None:
        emoji_table = _known_emoji()
    return [
        hex(ord(char))
        for char in text
        # U+1F3FC was singled out for exclusion by the original check.
        if char != "\U0001f3fc" and char in emoji_table
    ]

# Store, for every message, the list returned by split_count.
chat["emoji"] = chat["Message"].map(split_count)

In [None]:
# Preview the first rows of the enriched chat frame.
chat.head(n=5)

In [None]:
# Line chart of the number of messages sent on each calendar date.
date_grouped = (
    chat.groupby('date')['Message']
    .count()
    .plot.line(figsize=(20, 10))
)

In [None]:
# Messages per weekday, drawn as a closed polar (radar) chart.
# groupby sorts its keys alphabetically (Friday, Monday, Saturday, ...);
# reindexing restores calendar order and reports days without messages as 0.
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday",
                 "Friday", "Saturday", "Sunday"]
weekday_grouped_msg = (chat.groupby('weekday')['Message']
                      .count()
                      .reindex(weekday_order, fill_value=0)
                      .rename_axis('weekday')
                      .reset_index(name='count'))

fig = px.line_polar(weekday_grouped_msg, r='count', theta='weekday', line_close=True)
fig.update_traces(fill='toself')
fig.update_layout(polar=dict(radialaxis=dict(visible=False)), showlegend=False)
fig.update_layout(title_text='Total Messages by Day of the Week')

fig.show()

In [None]:
# Messages per hour of the day.
# A plain per-hour count gives the same totals as counting every distinct
# message text and summing those counts back up, without the large
# intermediate result.
hour_grouped_msg = (chat.groupby('hour')['Message']
                    .count()
                    .reset_index(name='count'))

fig = px.bar(hour_grouped_msg, x='hour', y='count',
             labels={'hour': '24 Hour Period'})
fig.update_traces(marker_line_width=1.5, opacity=0.6)
fig.update_layout(title_text='Total Messages by Hour of the Day')
fig.show()

In [None]:
# Heatmap of message totals by month (rows) and weekday (columns).
# value_counts yields one row per (month, weekday, distinct message text)
# whose "count" is how often that exact text occurred.
grouped_by_month_and_day = (
    chat.groupby(["month_sent", "weekday"])["Message"]
    .value_counts()
    .reset_index(name="count"))

months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# aggfunc="sum" adds the per-text counts up into a message total for each
# cell.  pivot_table's default aggregation is the mean, which produced the
# average repetitions per distinct text rather than a count.
pt = grouped_by_month_and_day.pivot_table(
    index="month_sent", columns="weekday", values="count", aggfunc="sum"
    ).reindex(index=months, columns=days)
fig = px.imshow(pt, labels=dict(x="Day of Week", y="Months", color="Count"), x=days, y=months)
fig.update_layout(width=700, height=700)
fig.show()

In [None]:
# Headline statistics for the whole conversation.
total_messages = chat.shape[0]
# The previous literal '<Multimedia omtted>' was misspelled, so the media
# count was always 0.  Compare case-insensitively against the placeholders.
# NOTE(review): the exact placeholder depends on the export language —
# confirm which of these actually occurs in the data.
media_placeholders = ['<media omitted>', '<multimedia omitted>', '<multimedia omitido>']
is_media = chat['Message'].str.strip().str.casefold().isin(media_placeholders)
media_messages = int(is_media.sum())
average_message_words = chat['Word_Count'].mean()
average_message_letters = chat['Letter_Count'].mean()
average_message_day = chat.groupby('date')['Message'].count().mean()
print('Total Messages ',total_messages)
print('Media Messages', media_messages)
print('Average Words by Messages', round(average_message_words, 2))
print('Average Letters by Messages', round(average_message_letters, 2))
print('Average Message Per Day', round(average_message_day, 2))

In [None]:
# Number of messages per author: horizontal bar chart, then the raw counts
# as the cell's displayed value.
qty_message_author = chat['Author'].value_counts()
bar_colors = ['#D4A29C', '#E8B298', '#EDCC8B', '#BDD1C5', '#9DAAA2']
qty_message_author.plot.barh(figsize=(20, 10), color=bar_colors)
qty_message_author

In [None]:
# Most frequent non-stopword tokens across the whole chat.
common_words = chat[['Author','Message']].copy()
# NOTE(review): Series.replace matches whole cell values, so this only blanks
# a message that is exactly a lone apostrophe; apostrophes inside words are
# kept, which the stopword entries such as "don't" / "i'll" rely on.
common_words["Message"] = common_words["Message"].replace("'", "")

# Aliased import so the cell stays re-runnable even though the name
# `stopwords` is rebound to a plain list below (a later cell reads that list).
from nltk.corpus import stopwords as nltk_stopwords
STOPWORDS = nltk_stopwords.words('english')

stopwords = list(STOPWORDS)

extra = ["the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "i", "it", "for",
        "not", "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by",
        "from", "they", "we", "say", "her", "she", "or", "an", "will", "my", "one",
        "all", "would", "there", "their", "what", "so", "up", "out", "if", "about",
        "who", "get", "which", "go", "me", "when", "make", "can", "like", "time", "no",
        "just", "him", "know", "take", "people", "into", "year", "your", "good", "some",
        "could", "them", "see", "other", "then", "than", "now", "look", "only", "come",
        "its", "over", "think", "also", "back", "after", "use", "two", "how", "our",
        "work", "first", "well", "way", "even", "new", "want", "because", "any", "these",
        "give", "day", "most", "us", "media", "omitted>", "<media", "omitted", "multimedia",
        "<<", "idk", "lol", "ok", "right", "much", "got", "i'll", "i'm", "it's", "its",
        "don't", "dont", "thought", "cus", "cos", "oh", "u", "im", "okay", "yeah", "sure",
        "haha", "yes", "cuz", "==", "ill", "gonna", "though", "thing", "-", "ya", "yah",
        "that's", "going", ]

stopwords += extra
stopword_set = set(stopwords)  # constant-time membership tests

tokens = (common_words["Message"]
          .str.lower()
          .str.split()
          .apply(lambda words: [w for w in words if w not in stopword_set]))
# Explode the whole frame so every token keeps the author of its message.
# Assigning an exploded, re-indexed Series back onto the shorter frame
# aligned on the index, which truncated the tokens and paired them with the
# wrong authors.
common_words = (common_words
                .assign(Message=tokens)
                .explode("Message")
                .reset_index(drop=True))

# Treat the literal tokens 'nan' and '' as missing values.
common_words["Message"] = common_words["Message"].mask(
    common_words["Message"].isin(["nan", ""]))
# Collapse laughter variants (e.g. "hahaha", "ahahah", "hahhahha") to "haha".
# The pattern is anchored to the whole token and needs two "ha" groups, so
# ordinary words that merely contain "ha" (e.g. "thanks") are left alone.
# regex=True is explicit because the default differs between pandas versions.
laugh_pattern = r"^[ah]*ha[ah]*ha[ah]*$"
common_words["Message"] = common_words["Message"].str.replace(
    laugh_pattern, "haha", regex=True)

# Frequency table sorted by descending count; missing tokens are dropped
# before counting so they cannot take one of the top-10 slots.
words_dict = (common_words["Message"]
              .dropna()
              .value_counts()
              .rename_axis("words")
              .reset_index(name="count"))

fig = px.bar(words_dict.head(10), x='words', y='count',
                 labels={'words':'Common Words'}, 
                 height=400)
fig.update_traces(marker_color='#EDCC8B', marker_line_color='#D4A29C',
                  marker_line_width=1.5, opacity=0.6)
fig.update_layout(title_text='Common Words Chart')
fig.show()

In [None]:
# Top words per author: count each author's tokens, keep the first TopTen
# rows of every author's ranking, then draw one bar chart per author.
TopTen = 10
words_by_author = common_words.set_index("Author")["Message"].dropna()
per_author_counts = words_by_author.groupby(level=0).value_counts()
author_common_words = (
    per_author_counts.groupby(level=0)
    .head(TopTen)
    .rename_axis(("Author", "words"))
    .reset_index(name="count")
)

for author_name in author_common_words.Author.unique():
    author_rows = author_common_words[author_common_words["Author"] == author_name]
    print(author_rows)
    print("Most Common Words by", author_name)
    fig = px.bar(
        author_rows,
        x="words",
        y="count",
        labels={"words": "Author Common Words"},
        height=380,
    )
    fig.update_traces(
        marker_color="#EDCC8B",
        marker_line_color="#D4A29C",
        marker_line_width=1.5,
        opacity=0.6,
    )
    fig.update_layout(title_text="Author Common Words Chart")
    fig.show()

In [None]:
def plot_cloud(wordcloud):
    """Draw ``wordcloud`` on a new 40x30 inch figure with the axes hidden.

    Args:
        wordcloud: Object that matplotlib's ``imshow`` can render.
    """
    _, ax = plt.subplots(figsize=(40, 30))
    ax.imshow(wordcloud)
    ax.axis("off")

def remove_urls(text):
    """Return ``text`` with every URL deleted.

    A URL is any run of non-whitespace characters that begins with
    ``http://``, ``https://`` or ``www.``.  Surrounding whitespace is kept.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

In [None]:
# Build one text blob from all messages (URLs removed) and render it as a
# word cloud.
chat_word_cloud = chat[["Message"]].copy()
chat_word_cloud["Message"] = chat_word_cloud["Message"].apply(remove_urls)
# Treat the literal strings 'nan' and '' as missing.  `mask` fills with NaN
# without referring to `np.NaN`, an alias that NumPy 2 no longer provides.
chat_word_cloud["Message"] = chat_word_cloud["Message"].mask(
    chat_word_cloud["Message"].isin(["nan", ""]))
text = " ".join(chat_word_cloud["Message"].dropna())
# `stopwords` is the list assembled in the common-words cell, which must
# have been run before this one.
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color="black",
    colormap="Set2",
    collocations=False,
    stopwords=stopwords,).generate(text)

plot_cloud(wordcloud)