In [20]:
import re
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

Next, we load the exported WhatsApp chat (`gbwhatsapp.txt`) and parse it line by line into timestamps, sender names and message text. The cell below does this:

In [21]:
# Parse the exported chat into three parallel lists, one entry per message.
# A message line looks like "M/D/YY, HH:MM - Name: text". The "Name: " part is
# optional; lines without it are stored under the placeholder "{undefined}".
message_pattern = re.compile(
    r'^(\d{1,2})\/(\d{1,2})\/(\d\d), (24:00|2[0-3]:[0-5][0-9]|[0-1][0-9]:[0-5][0-9]) - ((\S[^:]*?): )?(.*)$'
)
output_Data = { 'DateTime': [], 'Name': [], 'Content': [] }

with open('gbwhatsapp.txt', "r", encoding='utf-8') as infile:
    for line in infile:
        matches = message_pattern.match(line)
        if matches:
            # Groups 1-3 are handed to datetime() as month, day and two-digit year.
            month, day, year = (int(matches.group(i)) for i in (1, 2, 3))
            hour, minute = (int(part) for part in matches.group(4).split(':'))
            # The pattern accepts "24:00", but datetime() only allows hours 0-23
            # and would raise ValueError. Read it as midnight of the next day.
            extra_days, hour = divmod(hour, 24)
            timestamp = datetime(year + 2000, month, day, hour=hour, minute=minute)
            timestamp += timedelta(days=extra_days)

            output_Data['DateTime'].append(timestamp)
            output_Data['Name'].append(matches.group(6) or "{undefined}")
            output_Data['Content'].append(matches.group(7))
        elif output_Data['Content']:
            # Continuation of a multi-line message: attach it to the previous entry.
            # rstrip("\n") removes only a trailing newline, so the final line of a
            # file that does not end with a newline keeps its last character.
            output_Data['Content'][-1] += "\n" + line.rstrip("\n")


In [22]:
# Build the message table: one column per key of output_Data, one row per
# parsed entry, then preview the first rows.
df = pd.DataFrame(data=output_Data)
df.head(n=5)

Unnamed: 0,DateTime,Name,Content
0,2021-07-07 17:50:00,{undefined},Messages and calls are end-to-end encrypted. N...
1,2019-12-03 18:30:00,{undefined},"Group creator created group ""GBfoods Nigeria"""
2,2021-07-07 17:49:00,{undefined},GB IT Deji added you
3,2021-07-07 17:51:00,{undefined},GB IT Deji added GB Soji
4,2021-07-07 17:56:00,+234 812 990 1693,Stephanie


Remove the rows whose Name is `{undefined}`: these are system messages (for example "group created" or "member added") rather than messages sent by a participant.

In [23]:
print("length of df before:{}".format(len(df)))
# Rows without a sender were stored under the exact placeholder "{undefined}"
# when the chat was parsed. Compare for equality: str.contains("undefined") is
# a regex substring search and would also drop any sender whose name merely
# contains that word.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the unfiltered frame.
df = df[df["Name"] != "{undefined}"].copy()
print("length of df after:{}".format(len(df)))

length of df before:11042
length of df after:10911


In [24]:
# Preview the first ten remaining rows.
df.head(n=10)

Unnamed: 0,DateTime,Name,Content
4,2021-07-07 17:56:00,+234 812 990 1693,Stephanie
5,2021-07-07 18:02:00,+234 814 849 1766,🤣🤣
6,2021-07-07 18:03:00,+234 806 207 1591,So closely far-away
7,2021-07-07 18:03:00,+234 806 207 1591,It was supposed to be you but....
8,2021-07-07 18:08:00,+234 806 207 1591,Reminder... The much anticipated Hot Seat Q&A ...
9,2021-07-07 18:09:00,+234 806 207 1591,Lolzzxx
10,2021-07-07 18:10:00,+234 812 990 1693,Which mail o
11,2021-07-07 18:11:00,+234 809 507 3153,https://teams.microsoft.com/l/meetup-join/19%3...
12,2021-07-07 18:11:00,+234 806 207 1591,https://teams.microsoft.com/l/meetup-join/19%3...
13,2021-07-07 18:11:00,+234 812 990 1693,👌🏾


In [25]:
# Preview the messages whose text spans several lines (contains a newline).
multiline_mask = df["Content"].str.contains('\n')
df[multiline_mask].head()

Unnamed: 0,DateTime,Name,Content
31,2021-07-07 18:33:00,GB Lynda,Guys another 10mins to try...\nOur sales Team ...
38,2021-07-07 18:36:00,GB Lynda,My people.. You won't want to miss this...\n\n...
49,2021-07-07 18:54:00,+234 812 912 2073,"Bayo, you are confused.\nIs Dr. Teddy not just..."
60,2021-07-07 19:13:00,GB Lynda,People join the Hot seat session\nOur own Dr T...
76,2021-07-08 14:48:00,+234 812 912 2011,"Good afternoon Team, please remember to connec..."


In [26]:
# Flatten multi-line messages: every newline in the text becomes one space.
df["Content"] = df["Content"].replace(to_replace='\n', value=' ', regex=True)

In [27]:
# Sanity check: list any message whose text still contains a newline.
# An empty result means the replacement covered every row.
still_multiline = df["Content"].str.contains('\n')
df[still_multiline]

Unnamed: 0,DateTime,Name,Content


Create columns for Date, Time, Hour, weekday, Word Count and Letter Count.

In [28]:
# Calendar date of each message. Uses the vectorised .dt accessor (as the
# Hour column does) instead of a Python-level loop over the column.
df['Date'] = df['DateTime'].dt.date
df["Date"].head()

4    2021-07-07
5    2021-07-07
6    2021-07-07
7    2021-07-07
8    2021-07-07
Name: Date, dtype: object

In [29]:
# Time of day of each message. Uses the vectorised .dt accessor (as the
# Hour column does) instead of a Python-level loop over the column.
df['Time'] = df['DateTime'].dt.time
df["Time"].shape

(10911,)

In [30]:
# Hour component of each message's timestamp.
df['Hour'] = df['DateTime'].dt.hour
df['Hour'].head(n=5)

4    17
5    18
6    18
7    18
8    18
Name: Hour, dtype: int64

In [31]:
# Day-of-week name (e.g. "Wednesday") of each message. The vectorised
# .dt.day_name() replaces a per-row .apply(lambda ...).
df['weekday'] = df['DateTime'].dt.day_name()

In [32]:
# Word_Count: number of whitespace-separated tokens in the message.
# Splitting on runs of whitespace (instead of counting ' ' and adding 1) means
# repeated, leading or trailing spaces are not counted as extra words, and an
# empty message counts as 0 words rather than 1.
df['Word_Count'] = df['Content'].str.split().str.len()
# Letter_Count: total number of characters in the message, spaces included.
df['Letter_Count'] = df['Content'].str.len()

In [33]:
# Renumber the rows 0..n-1, discarding the old index labels.
# The result is rebound rather than mutated with inplace=True, so re-running
# the cell is harmless and the operation can be chained.
df = df.reset_index(drop=True)

In [34]:
# Preview the table with the derived columns added.
df.head(n=5)

Unnamed: 0,DateTime,Name,Content,Date,Time,Hour,weekday,Word_Count,Letter_Count
0,2021-07-07 17:56:00,+234 812 990 1693,Stephanie,2021-07-07,17:56:00,17,Wednesday,1,9
1,2021-07-07 18:02:00,+234 814 849 1766,🤣🤣,2021-07-07,18:02:00,18,Wednesday,1,2
2,2021-07-07 18:03:00,+234 806 207 1591,So closely far-away,2021-07-07,18:03:00,18,Wednesday,3,19
3,2021-07-07 18:03:00,+234 806 207 1591,It was supposed to be you but....,2021-07-07,18:03:00,18,Wednesday,7,33
4,2021-07-07 18:08:00,+234 806 207 1591,Reminder... The much anticipated Hot Seat Q&A ...,2021-07-07,18:08:00,18,Wednesday,20,117


In [35]:
# Write the prepared table to a CSV file in the working directory.
# With to_csv's defaults the row index is written as the first column.
df.to_csv(path_or_buf="Gb_whatsapp.csv")