In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import dateparser
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

Next we need to load our Chat.txt into Python and read it. We will do this using the function below:

In [2]:
def read_file(file):
    '''Reads Whatsapp text file into a list of strings''' 
    x = open(file,'r', encoding = 'utf-8') #Opens the text file into variable x but the variable cannot be explored yet
    y = x.read() #By now it becomes a huge chunk of string that we need to separate line by line
    content = y.splitlines() #The splitline method converts the chunk of string into a list of strings
    return content

In [3]:
chat = read_file('HND2-NYSC OFFICIAL GROUP.txt')
len(chat)

55067

In [4]:
join = [line for line in chat if  "joined using this" in line]
len(join)

1

In [5]:
#Remove new lines
chat = [line.strip() for line in chat]
print("length of chat is:")
print(len(chat))
#Clean out the join notification lines
clean_chat = [line for line in chat if not "joined using this" in line]
#Further cleaning
#Remove empty lines
clean_chat = [line for line in clean_chat if len(line) > 1]
print("length of clean_chat is:")
print(len(clean_chat))

length of chat is:
55067
length of clean_chat is:
49131


In [6]:
#Drop 'left-ers'
left = [line for line in clean_chat if line.endswith("left")]
len(left)

26

In [7]:
#Clean out the left notification lines
clean_chat = [line for line in clean_chat if not line.endswith("left")]
print(len(clean_chat))

49105


In [8]:
#Merge messages that belong together
msgs = [] #message container
pos = 0 #counter for position of msgs in the container
"""
Flow:
For every line, see if it matches the expression which is starting with the format "number(s)+slash" eg "12/"
If it does, it is a new line of conversion as they begin with dates, add it to msgs container
Else, it is a continuation of the previous line, add it to the previous line and append to msgs, then pop previous line.
"""
for line in clean_chat:
    if re.findall("\A\d+[/]", line):
        msgs.append(line)
        pos += 1
    else:
        take = msgs[pos-1] + ". " + line
        msgs.append(take)
        msgs.pop(pos-1)
len(msgs)

39975

In [9]:
msgs[0:10]

['6/10/21, 3:15 PM - Juliana: Orisirisi',
 '6/10/21, 3:54 PM - Adewole Paul: I dey here.. I think I like that department.  Mr. President you are in a safe hand.',
 '6/10/21, 3:59 PM - Oluwatayo Raheem: This is Adebayo,',
 '6/10/21, 4:00 PM - Tobi Mechatronics: Abeg this calls for celebration',
 '6/10/21, 4:01 PM - Oluwatayo Raheem: Light na u oooo',
 '6/10/21, 4:01 PM - Tobi Mechatronics: @2348132027307 how we Dey celebrate am',
 "6/10/21, 4:01 PM - Mr A'zeez Akanbi: Before nko. Are you not the organiser. As inlaw concern.",
 '6/10/21, 4:02 PM - Tobi Mechatronics: Dem never book me as their event planner ooo',
 '6/10/21, 4:07 PM - Abiodun Ridwan: Kai check for your name and post',
 '6/10/21, 7:26 PM - +234 812 225 1535: <Media omitted>']

In [10]:
time = [msgs[i].split(',')[1].split('-')[0] for i in range(len(msgs))]
time = [s.strip(' ') for s in time] # Remove spacing
print("length of time is:")
print(len(time))

IndexError: list index out of range

In [None]:
date = [msgs[i].split(',')[0] for i in range(len(msgs))]
len(date)
name = [msgs[i].split('-')[1].split(':')[0] for i in range(len(msgs))]
len(name)
content = []
for i in range(len(msgs)):
  try:
    content.append(msgs[i].split(':')[2])
  except IndexError:
    content.append('Missing Text')
len(content)

In [None]:
df = pd.DataFrame(list(zip(date, time, name, content)), columns = ['Date', 'Time', 'Name', 'Content'])
df

In [None]:
df = df[df["Content"]!='Missing Text']
df.reset_index(inplace=True, drop=True)
df

In [None]:
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
df['DateTime']

In [None]:
df['weekday'] = df['DateTime'].apply(lambda x: x.day_name())

In [None]:
df['Letter_Count'] = df['Content'].apply(lambda s : len(s))
df['Word_Count'] = df['Content'].apply(lambda s : len(s.split(' ')))

In [None]:
df['Hour'] = df['Time'].apply(lambda x : x.split(':')[0]) 
# The first token of a value in the Time Column contains the hour (Eg., "12" in "12:15")

In [None]:
#print first five rows of our dataframe
df.head()

In [None]:
#saving to csv format
df.to_csv("mtr_cleaned_data.csv")