<a href="https://colab.research.google.com/github/techmalik/whatsapp-chats-analysis/blob/master/Whatsapp%20Chat%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# The whatsApp chat data is exported as a .txt file
#Red txt data file (note: the text file should be in the same directory as this notebook)

with open("EXPORTED CHAT.txt", "r", encoding = "utf-8") as file:
    chats = file.readlines()
print(len(chats))


35061


In [3]:
#Remove new lines
chats = [line.strip() for line in chats]
chats[:10]


#Getting join notification (when new members join the group) lines
join_notif = [line for line in chats if  "joined using this" in line]
print(join_notif[:3]) # you and the next two people who joined after you
print()
print('number of people that joined after you = ' +str(len(join_notif)-1))
print()

#Clean out the join notification lines and admin added notification
rifesh = [line for line in chats if not "joined using this" in line]
rifesh = [line for line in rifesh if not 'added +234' in line]

#Further cleaning
#Remove empty lines
rifesh = [line for line in rifesh if len(line) > 1]
print("number of chats = " + str(len(rifesh)))

[]

number of people that joined after you = -1

number of chats = 34866


In [None]:
#Remove even more auto lines
rifesh = [line for line in rifesh if not "You revoked" in line]
print(len(rifesh))

In [None]:
#Drop notification for people who left the group'left-ers'
left1 = [line for line in rifesh if line.endswith("left")]
print("number of people who left since you joined = " + str(len(left1)))
print()
rifesh = [line for line in rifesh if not line.endswith("left")]
print("number of chats remaining = " + str(len(rifesh)))

In [None]:
#Merge messages that belong together
msgs = [] #message container
pos = 0 #counter for position of msgs in the container
"""
Flow:
For every line, see if it matches the expression which is starting with the format "number(s)+slash" eg "12/"
If it does, it is a new line of conversion as they begin with dates, add it to msgs container
Else, it is a continuation of the previous line, add it to the previous line and append to msgs, then pop previous line.
"""
for line in rifesh:
    if re.findall("\A\d+[/]", line):
        msgs.append(line)
        pos += 1
    else:
        take = msgs[pos-1] + ". " + line
        msgs.append(take)
        msgs.pop(pos-1)
        
        

print(len(msgs))

In [None]:
msgs[1:9]

In [None]:
#Drop first two lines: auto msg (the admin created and the you joined messages)
msgs = msgs[2:]
len(msgs)

In [None]:
#peep message format so far
msgs[50:55]

In [None]:

#make arrays of different parts of the messages
Date = []
Time = []
Number_Author = []
msg = []
counter = 0
for line in msgs:
    x = line.split()
    #Get the date
    dating = x[0]
    Date.append(dating.replace(',',''))
    
    #Get the time
    timing = x[1] + " " +x[2]
    Time.append(timing)
    
    #Get number
    num = " ".join(x[3:9]) #the parts numbers/names fall in
    try:
        match = re.search(r"- ([^']*):", num).group(1) #match between '- ' and ':' where they are per se
        Number_Author.append(match)
    except:
        Number_Author.append("-")
        pass
    #Get message
    
    part = line.partition(match)[2] #partition according to the match and extract the last part, the msg
    msging = part[2:] #strip off the first two characters as they are ": "
    msg.append(msging)
    


print(len(Date), len(Time), len(Number_Author), len(msg))

In [None]:
#Put the different parts in a dataframe
rifesh_df = pd.DataFrame()
rifesh_df["Date"] = Date
rifesh_df["Time"] = Time
rifesh_df["Number_Author"] = Number_Author
rifesh_df["msg"] = msg

rifesh_df.head()

In [None]:
rifesh_df['Date'].head(50)

In [None]:
# top most engaged time on the group
toptimeengaged = rifesh_df.Time.value_counts(ascending=False).head(10)
ax1 = toptimeengaged.plot(kind="bar", color='Darkblue')
ax1.set_xlabel ('Time')
ax1.set_ylabel ('Frequency')
ax1.set_title("Top 10 Time of Engagement")

plt.show()
%matplotlib inline

In [None]:
# top 20 posters/noisemakers on the group
top20messengers = rifesh_df.Number_Author.value_counts(ascending=False).head(20)

ax = top20messengers.plot.barh(color='Darkblue')
ax.set_xlabel ('Number of sent message')
ax.set_ylabel("Users")
ax.set_title("Top 20 Users that sent more messages")
plt.show()

In [None]:
# add the letters count and the word count of each message to the dataframe
rifesh_df['Letter_Count'] = rifesh_df['msg'].apply(lambda s : len(s))
rifesh_df['Word_Count'] = rifesh_df['msg'].apply(lambda s : len(s.split(' ')))
rifesh_df.head()

In [None]:
rifesh_df.describe(include='object')

In [None]:
# The number of letters, words used in the group till date
rifesh_df['Letter_Count'].sum(), rifesh_df['Word_Count'].sum()

In [None]:
# Top 10 most engaged dates

top10date = rifesh_df['Date'].value_counts().head(10)# Top 10 Dates on which the most number of messages were sent
ax = top10date.plot.barh(color= 'Darkblue')
ax.set_xlabel('Number of Messages')
ax.set_ylabel('Date')
ax.set_title('Top 10 date of engagement')

In [None]:
# adding the engagement hour  to the dataframe 
rifesh_df['Hour'] = rifesh_df['Time'].apply(lambda x : x.split(':')[0]) # The first token of a value in the Time Column contains the hour (Eg., "20" in "20:15")
rifesh_df.head()

In [None]:
# top 10 most engaged hour of the day

top10hourtochat = rifesh_df['Hour'].value_counts().head(10).sort_index(ascending=False) # Top 10 Hours of the day during which the most number of messages were sent
ax = top10hourtochat.plot.line(color='Darkblue')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Number of messages')
ax.set_title('Top Hours of engagement by Users')

In [None]:
# Generating the words used and their frequencies of use.
def gen_freq(text):
    #Will store the list of words
    word_list = []

    #Loop over all the messages and extract words into word_list
    for tw_words in text.split():
        word_list.extend(tw_words)

    #Create word frequencies using word_list
    word_freq = pd.Series(word_list).value_counts()

    #Print top 20 words
    word_freq[:20]
    
    return word_freq

gen_freq(rifesh_df.msg.str)[:10]

In [None]:
# Determine the frequency of a word in the group chat
def word_frequency(word, text):
    word_list = []

    #Loop over all the chats and extract words into word_list
    for tw_words in text.split():
        word_list.extend(tw_words)
    if word in word_list:
        
        return (str(word.upper())+", appeared in the chat "+str(word_list.count(word.lower())) + " times.")
    else:
        return str(word) + " hasn't been used yet."

#you can replace the below words with any word you may want to target in the chat
print(word_frequency('google', rifesh_df.msg.str))
print(word_frequency('git', rifesh_df.msg.str))
print(word_frequency('stack', rifesh_df.msg.str))
print(word_frequency('money', rifesh_df.msg.str))
print(word_frequency('community', rifesh_df.msg.str))
print(word_frequency('Dev', rifesh_df.msg.str))

In [None]:
# cleaning the texts of the chats further

def clean_text(text):
    #Remove RT
    text = re.sub(r'RT', '', text)
    
    #Fix &
    text = re.sub(r'&amp;', '&', text)
    
    #Remove punctuations
    text = re.sub(r'[?!.;:,#@-]', '', text)
    
    #Remove unneccessary word
    text = re.sub(r'omitted>', '', text)
    text = re.sub(r'<Media', '<', text)
    text = re.sub(r'will', '', text)
    text = re.sub(r'thanks', '', text)
    
    #Convert to lowercase to maintain consistency
    text = text.lower()
    return text

In [None]:
#Import list of stopwords (frequently used words)

from wordcloud import STOPWORDS
from wordcloud import WordCloud


print(STOPWORDS)

In [None]:
# Remove stopwords from the messages and visualise the most used words after the stop words.

text = rifesh_df.msg.apply(lambda x: clean_text(x))
word_freq = gen_freq(text.str)*100
word_freq = word_freq.drop(labels=STOPWORDS, errors='ignore')

#Generate word cloud
wc = WordCloud(width=450, height=400, max_words=300, background_color='black').generate_from_frequencies(word_freq)

plt.figure(figsize=(12, 14))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
