In [1]:
import re as regex, pandas, numpy
pandas.set_option('max_colwidth', 700)
from datetime import date as get_date, datetime
import plotly.graph_objects as graph
from collections import Counter

## Cleaning stage

In [2]:
with open(file = "./data/WhatsApp Chat with Hluteko.txt", mode = "r") as file:
    data = file.read()

In [3]:
def assign(hash, name, chat):
    hash[name]['full'] = hash[name]['full'] + [chat]
    if (".mp4" in chat):
        hash[name]['mp4'] = hash[name]['mp4'] + [chat]

    elif (".opus" in chat):
        hash[name]['opus'] = hash[name]['opus'] + [chat]

    elif (".jpg" in chat):
        hash[name]['jpg'] = hash[name]['jpg'] + [chat]

    elif (".webp" in chat):
        hash[name]['webp'] = hash[name]['webp'] + [chat]

    else:
        hash[name]['text'] = hash[name]['text'] + [chat]

    return hash

def date_time_to_seconds(date_time, format = "%Y%m%d%H%M"):
    return int(datetime.strptime(date_time, format).timestamp()/60)

Split chats

In [4]:
splits = regex.split(pattern = r"\n+", string = data)
full_splits = []
for chat in splits:
    if (" - Hluteko: " in  chat) or ("MS563" in chat):
        full_splits.append(chat)

    else:
        full_splits[-1] = f"{full_splits[-1]}\n{chat}"

Total messages

In [5]:
split_chats = {'Hluteko': {'full': [], 'text': [], 'mp4': [], 'opus': [], 'jpg': [], 'webp': []},
               'MS563': {'full': [], 'text': [], 'mp4': [], 'opus': [], 'jpg': [], 'webp': []}}

for chat in full_splits:
    if ("Hluteko" in chat):
        split_chats = assign(split_chats, "Hluteko", chat)

    else:
        split_chats = assign(split_chats, "MS563", chat)

for name, data in split_chats.items():
    print(name, "\n")
    for type_, hdata in data.items():
        print(f"{type_}: {len(hdata)}")

    print()

Hluteko 

full: 18418
text: 16044
mp4: 657
opus: 114
jpg: 592
webp: 1011

MS563 

full: 17091
text: 14158
mp4: 2032
opus: 35
jpg: 423
webp: 443



In [6]:
frame = pandas.DataFrame()
for name in list(split_chats.keys()):
    for data in split_chats[name]['full']:
        temp = pandas.DataFrame(data = [regex.split(pattern = r"[,]*\s|[:]\s", string = data, maxsplit = 4)],
                                columns = ['Date', 'Time', 'Dash', 'Name', 'Message'])
        
        frame = pandas.concat(objs = [frame, temp])
draft_frame = frame.copy(deep = True)

In [7]:
frame = draft_frame.copy(deep = True)
date_times = []
dates = []
times = []
hours = []
days = []
months = []
for i, row in frame.iterrows():
    date_splits = row['Date'].split("/")
    date = "".join(date_splits)
    time = "".join(row['Time'].split(":"))
    constructor = get_date(*[int(item) for item in date_splits])
    date_times.append(date + time)
    dates.append(date)
    months.append(constructor.strftime(format = "%B"))
    times.append(time)
    hours.append(time[:2] + "00")
    days.append(constructor.strftime(format = "%A"))

frame['Date'] = dates
frame['Month'] = months
frame['Time'] = times
frame['DateTime'] = date_times
frame['Hour'] = hours
frame['Day'] = days
frame = frame.reset_index(drop = True)
frame = frame.loc[:, ['DateTime', 'Date', 'Month', 'Day', 'Time', 'Hour', 'Message', 'Name']]
frame['Timestamp(minutes)'] = frame['DateTime'].apply(date_time_to_seconds)
frame = frame.sort_values(by = ['Timestamp(minutes)']).reset_index(drop = True)
frame.to_excel(excel_writer = "Structured Chats.xlsx", sheet_name = 'data', index = False)
frame.to_csv(path_or_buf = "Structured Chats.csv", index = False)

## Topic Modelling

To be completed**

In [8]:
captions = [item.split("\n")[1] for name in split_chats.keys() for item in split_chats[name]['mp4'] if len(item.split("\n")) > 1]
texts = [item.split(": ", 1)[1] for name in split_chats.keys() for item in split_chats[name]['text']]
texts = texts + captions

words = []

for text in texts:
    line = regex.sub(pattern = r"\n+", string = text, repl = " ")
    words = words + regex.split(pattern = r"\s+", string = line)

counts = {k: v for k, v in sorted(dict(Counter(words)).items(), key = lambda item: item[1], reverse = True)}

## Delay between responses

In [9]:
d0 = [0]

for index, row in frame.iterrows():
    if index > 0:
        d0.append(row['Timestamp(minutes)']-lag)

    lag = row['Timestamp(minutes)']

frame['Delay'] = d0

In [13]:
response_distribution = pandas.DataFrame(frame['Delay'].value_counts()).reset_index(drop = False)
response_distribution = dict(sorted(dict(Counter(d0)).items(), key = lambda item: item[1], reverse = True))

Average response time for all messages

In [14]:
print(numpy.mean(list(response_distribution.values())), "minutes")

101.16524216524216 minutes


Average response time without big breaks

In [15]:
print(numpy.mean([value for value in response_distribution.values() if value < 30]), "minutes")

4.440993788819876 minutes


Average break time

In [16]:
print(numpy.mean([value for value in response_distribution.values() if value > 30]), "minutes")

1216.0357142857142 minutes


In [17]:
figure1 = graph.Figure(data = graph.Scatter(y = frame['Delay'],
                                            line = dict(color = 'royalblue', width = 2)))
figure1.update_layout(title = 'Delay Between Any 2 Consecutive Replies',
                   xaxis_title = 'Message Index',
                   yaxis_title = 'Delay in minutes',)
figure1.show()

In [18]:
d1 = []

for index, row in frame.iterrows():
    if index > 0:
        if responder != row['Name']:
            d1.append([f"{responder} -> {row['Name']}", row['Timestamp(minutes)']-lag])
            lag = row['Timestamp(minutes)']

    else:
        lag = row['Timestamp(minutes)']
    
    responder = row['Name']

frame1 = pandas.DataFrame(data = d1, columns = ['Sequence', 'Delay'])

figure1 = graph.Figure(data = graph.Scatter(y = frame1['Delay'],
                                            line = dict(color = 'green', width = 2)))
figure1.update_layout(title = 'Delay Between Our Replies (Your first message and my first message [and visa versa])',
                   xaxis_title = 'Message Index',
                   yaxis_title = 'Delay in minutes')
figure1.show()

In [19]:
d2 = []

for index, row in frame.iterrows():
    if index > 0:
        if responder != row['Name']:
            d2.append([f"{responder} -> {row['Name']}", row['Timestamp(minutes)']-lag])

    lag = row['Timestamp(minutes)']
    responder = row['Name']

frame2 = pandas.DataFrame(data = d2, columns = ['Sequence', 'Delay'])

figure1 = graph.Figure(data = graph.Scatter(y = frame2['Delay'],
                                            line = dict(color = 'red', width = 2)))
figure1.update_layout(title = 'Delay Between Our Replies (Your last message and my first message [and visa versa])',
                   xaxis_title = 'Message Index',
                   yaxis_title = 'Delay in minutes')
figure1.show()

### Behaviour

In [20]:
dates_data = lambda n: pandas.DataFrame(frame['Date'].value_counts()).reset_index().iloc[:n, :]
figure2 = graph.Figure(data = graph.Bar(y = dates_data(10)['count'], x = dates_data(10)['Date'], text = dates_data(10)['count'], textposition = 'auto',
                                        marker_color = ['red'] * 10))
figure2.update_layout(title = 'Top 10 Dates',
                   xaxis_title = 'Date',
                   yaxis_title = 'Count',)
figure2.show()

In [21]:
months_data = lambda n: pandas.DataFrame(frame['Month'].value_counts()).reset_index().iloc[:n, :]
figure2 = graph.Figure(data = graph.Bar(y = months_data(10)['count'], x = months_data(10)['Month'], text = months_data(10)['count'], textposition = 'auto',
                                        marker_color = ['yellow'] * 10))
figure2.update_layout(title = 'Monthly Chats',
                   xaxis_title = 'Month',
                   yaxis_title = 'Count',)
figure2.show()

In [22]:
days_data = lambda n: pandas.DataFrame(frame['Day'].value_counts()).reset_index().iloc[:n, :]
figure2 = graph.Figure(data = graph.Bar(y = days_data(10)['count'], x = days_data(10)['Day'], text = days_data(10)['count'], textposition = 'auto',
                                        marker_color = ['green'] * 10))
figure2.update_layout(title = 'Weekly Chats',
                   xaxis_title = 'Day',
                   yaxis_title = 'Count',)
figure2.show()

In [23]:
times_data = lambda n: pandas.DataFrame(frame['Time'].value_counts()).reset_index().iloc[:n, :]
figure2 = graph.Figure(data = graph.Bar(y = times_data(20)['count'], x = times_data(20)['Time'], text = times_data(20)['count'], textposition = 'auto',
                                        marker_color = ['blue'] * 20))
figure2.update_layout(title = 'Top 20 Favourite Times',
                   xaxis_title = 'Time',
                   yaxis_title = 'Count',)
figure2.show()

In [24]:
hours_data = lambda n: pandas.DataFrame(frame['Hour'].value_counts()).reset_index().iloc[:n, :]
figure2 = graph.Figure(data = graph.Bar(y = hours_data(30)['count'], x = hours_data(30)['Hour'], text = hours_data(30)['count'], textposition = 'auto',
                                        marker_color = ['pink'] * 30))
figure2.update_layout(title = 'Favourite Periods',
                   xaxis_title = 'Hour',
                   yaxis_title = 'Count',)
figure2.show()