In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import dateparser
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

Next we need to load our Chat.txt into Python and read it. We will do this using the function below:

In [2]:
def read_file(file):
    '''Reads Whatsapp text file into a list of strings''' 
    x = open(file,'r', encoding = 'utf-8') #Opens the text file into variable x but the variable cannot be explored yet
    y = x.read() #By now it becomes a huge chunk of string that we need to separate line by line
    content = y.splitlines() #The splitline method converts the chunk of string into a list of strings
    return content

In [23]:
chat = read_file('HND2-NYSC OFFICIAL GROUP.txt')
len(chat)

113

In [24]:
join = [line for line in chat if  "joined using this" in line]
len(join)

23

In [25]:
#Remove new lines
chat = [line.strip() for line in chat]
print("length of chat is:")
print(len(chat))
#Clean out the join notification lines
clean_chat = [line for line in chat if not "joined using this" in line]
#Further cleaning
#Remove empty lines
clean_chat = [line for line in clean_chat if len(line) > 1]
print("length of clean_chat is:")
print(len(clean_chat))

length of chat is:
113
length of clean_chat is:
65


In [26]:
#Drop 'left-ers'
left = [line for line in clean_chat if line.endswith("left")]
len(left)

1

In [27]:
#Clean out the left notification lines
clean_chat = [line for line in clean_chat if not line.endswith("left")]
print(len(clean_chat))

64


In [28]:
#Merge messages that belong together
msgs = [] #message container
pos = 0 #counter for position of msgs in the container
"""
Flow:
For every line, see if it matches the expression which is starting with the format "number(s)+slash" eg "12/"
If it does, it is a new line of conversion as they begin with dates, add it to msgs container
Else, it is a continuation of the previous line, add it to the previous line and append to msgs, then pop previous line.
"""
for line in clean_chat:
    if re.findall("\A\d+[/]", line):
        msgs.append(line)
        pos += 1
    else:
        take = msgs[pos-1] + ". " + line
        msgs.append(take)
        msgs.pop(pos-1)
len(msgs)

31

In [29]:
msgs[0:10]

['7/24/22, 8:57 AM - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.',
 '7/16/22, 8:36 PM - +234 810 004 9756 created group "HGC ‘22 Mega Jam Choir"',
 '7/24/22, 4:08 PM - +234 810 004 9756: Happy Sunday beautiful people of God!!',
 '7/24/22, 4:10 PM - +234 810 004 9756: This message was deleted',
 "7/24/22, 4:10 PM - +234 810 004 9756: _Before the Lord our God we've come to Bow_. And. _Roar_ both by *Dunsin Oyekan*.. Above are part of the songs to listen to towards MegaJam",
 '7/24/22, 4:11 PM - +234 810 004 9756: Kindly acknowledge the message once you read it, thank you!',
 '7/24/22, 4:11 PM - +234 708 954 5383: Acknowledged',
 '7/24/22, 4:11 PM - +234 705 030 4772: Acknowledge.',
 '7/24/22, 4:14 PM - Iyin🐼🦋: Acknowledged',
 '7/24/22, 4:47 PM - +234 904 890 5812: Acknowledge']

In [30]:
time = [msgs[i].split(',')[1].split('-')[0] for i in range(len(msgs))]
time = [s.strip(' ') for s in time] # Remove spacing
print("length of time is:")
print(len(time))

length of time is:
31


In [31]:
date = [msgs[i].split(',')[0] for i in range(len(msgs))]
len(date)
name = [msgs[i].split('-')[1].split(':')[0] for i in range(len(msgs))]
len(name)
content = []
for i in range(len(msgs)):
  try:
    content.append(msgs[i].split(':')[2])
  except IndexError:
    content.append('Missing Text')
len(content)

31

In [32]:
df = pd.DataFrame(list(zip(date, time, name, content)), columns = ['Date', 'Time', 'Name', 'Content'])
df

Unnamed: 0,Date,Time,Name,Content
0,7/24/22,8:57 AM,Messages and calls are end,Missing Text
1,7/16/22,8:36 PM,"+234 810 004 9756 created group ""HGC ‘22 Mega...",Missing Text
2,7/24/22,4:08 PM,+234 810 004 9756,Happy Sunday beautiful people of God!!
3,7/24/22,4:10 PM,+234 810 004 9756,This message was deleted
4,7/24/22,4:10 PM,+234 810 004 9756,_Before the Lord our God we've come to Bow_. ...
5,7/24/22,4:11 PM,+234 810 004 9756,Kindly acknowledge the message once you read ...
6,7/24/22,4:11 PM,+234 708 954 5383,Acknowledged
7,7/24/22,4:11 PM,+234 705 030 4772,Acknowledge.
8,7/24/22,4:14 PM,Iyin🐼🦋,Acknowledged
9,7/24/22,4:47 PM,+234 904 890 5812,Acknowledge


In [16]:
df = df[df["Content"]!='Missing Text']
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,Date,Time,Name,Content
0,5/7/22,7:01 PM,S+N Ayoade,Waiting for this message
1,5/7/22,7:03 PM,+234 816 484 7452,Finally we have people that can answer any qu...
2,5/7/22,7:03 PM,+234 816 484 7452,I am moving over to the side of this that bri...
3,5/7/22,7:04 PM,+234 812 091 9501,No cap! 🔥
4,5/7/22,7:04 PM,S+N Ayoade,Na to ask for tips before going for interview...
...,...,...,...,...
19578,8/3/22,11:14 PM,Joe,"I'm not sure if you could pass ""skip this num..."
19579,8/3/22,11:15 PM,Joe,"I am assuming, date time author and message i..."
19580,8/3/22,11:17 PM,S+N Ayoade,Yea
19581,8/3/22,11:18 PM,S+N Ayoade,Should I share with you ?. This your message ...


In [17]:
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
df['DateTime']

0       2022-05-07 19:01:00
1       2022-05-07 19:03:00
2       2022-05-07 19:03:00
3       2022-05-07 19:04:00
4       2022-05-07 19:04:00
                ...        
19578   2022-08-03 23:14:00
19579   2022-08-03 23:15:00
19580   2022-08-03 23:17:00
19581   2022-08-03 23:18:00
19582   2022-08-03 23:28:00
Name: DateTime, Length: 19583, dtype: datetime64[ns]

In [18]:
df['weekday'] = df['DateTime'].apply(lambda x: x.day_name())

In [19]:
df['Letter_Count'] = df['Content'].apply(lambda s : len(s))
df['Word_Count'] = df['Content'].apply(lambda s : len(s.split(' ')))

In [20]:
df['Hour'] = df['Time'].apply(lambda x : x.split(':')[0]) 
# The first token of a value in the Time Column contains the hour (Eg., "12" in "12:15")

In [21]:
#print first five rows of our dataframe
df.head()

Unnamed: 0,Date,Time,Name,Content,DateTime,weekday,Letter_Count,Word_Count,Hour
0,5/7/22,7:01 PM,S+N Ayoade,Waiting for this message,2022-05-07 19:01:00,Saturday,25,5,7
1,5/7/22,7:03 PM,+234 816 484 7452,Finally we have people that can answer any qu...,2022-05-07 19:03:00,Saturday,69,14,7
2,5/7/22,7:03 PM,+234 816 484 7452,I am moving over to the side of this that bri...,2022-05-07 19:03:00,Saturday,65,15,7
3,5/7/22,7:04 PM,+234 812 091 9501,No cap! 🔥,2022-05-07 19:04:00,Saturday,10,4,7
4,5/7/22,7:04 PM,S+N Ayoade,Na to ask for tips before going for interview...,2022-05-07 19:04:00,Saturday,50,11,7


In [22]:
#saving to csv format
df.to_csv("ydp_cleaned_data.csv")