# WhatsApp group Chat data preprocessing

In [None]:
import re, json
from datetime import datetime, timedelta
import pandas as pd
from duckdb import sql as sqldf

from plotly import express as px
from plotly import graph_objs as go

## Step 0: Export the WhatsApp chat data
Export the group chat from WhatsApp (step-by-step instructions are easy to find online).  
Save the exported file as `_chat.txt` in the same directory as this notebook.


## Step 1: Load WhatsApp export

Each message in the export starts with a line of this form:

`[24/04/24, 7:54:07 PM] ~ Jigyasu: Hello, I am Jigyasu from NIT Agartala`

In [None]:
# Read the whole export into memory.
with open("_chat.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# U+202F (narrow no-break space) is replaced by a plain space so the
# timestamp regex and strptime see ordinary whitespace.
bad_space_patt = r'[\u202f]'
# Directional formatting marks are removed outright.
bad_empty_patt = r'[\u202a\u202b\u202c\u200f\u200e]'
bad_space_re = re.compile(bad_space_patt)
bad_empty_re = re.compile(bad_empty_patt)

# === Step 2: Parse WhatsApp lines ===
# A message header looks like: [dd/mm/yy, h:mm:ss AM] ~ Author: text
date_patt = r'([0-9]{1,2}\/[0-9]{1,2}\/[0-9]{2,4})'
time_patt = r'([0-9]{1,2}:[0-9]{2}:[0-9]{2}\s*[AaPp][Mm])'
auth_patt = r'(?:~)?\s*([^:]+)'
text_patt = r'(.*)'
# Raw f-string: `\[` and `\s` are regex escapes, not Python string escapes.
full_patt = rf'^\[{date_patt},\s*{time_patt}\]\s*{auth_patt}:\s*{text_patt}$'
line_re = re.compile(full_patt)

# %I (12-hour clock) is required for %p to take effect; with %H the AM/PM
# marker is parsed but ignored, so "7:54:07 PM" would become 07:54:07.
dt_format = "%d/%m/%y %I:%M:%S %p"
# date_patt also admits four-digit years, which %y cannot parse.
dt_format_long_year = "%d/%m/%Y %I:%M:%S %p"

# rows[0] is always an empty placeholder dict (the initial `row`); the
# DataFrame construction further down skips it with rows[1:].
rows = []
row = {}

for raw_line in raw_text.splitlines():
    raw_line = bad_space_re.sub(' ', raw_line)
    raw_line = bad_empty_re.sub('', raw_line)
    m = line_re.match(raw_line)
    if m:
        # A new header closes the message collected so far.
        rows.append(row)

        date_str, time_str, author, text = m.groups()
        year_part = date_str.rsplit("/", 1)[-1]
        fmt = dt_format_long_year if len(year_part) == 4 else dt_format
        dt = datetime.strptime(f"{date_str} {time_str}", fmt)
        row = {"timestamp": dt, "author": author.strip(), "text": text.strip()}
    elif "text" in row:
        # Continuation line of a multi-line message.
        row["text"] += '\n' + raw_line
    # Lines before the first header belong to no message and are skipped.

# Flush the final message (or the lone placeholder when nothing matched).
rows.append(row)

In [None]:
# Size of the `rows` list built by the parsing loop.
len(rows)

In [None]:
# rows[0] is the empty placeholder dict from the parsing loop, so it is skipped.
messages = pd.DataFrame(rows[1:])
# Chronological order with a fresh 0..n-1 index.
messages = messages.sort_values("timestamp")
df = messages.reset_index(drop=True)

In [None]:
# Peek at the 30 rows starting at position 30.
df[30:].head(30)

In [None]:
# Split the timestamp into a calendar date and a clock time.
df['date'] = df['timestamp'].dt.date
df['time'] = df['timestamp'].dt.time
# Clock time expressed as whole seconds since midnight.
df['daytime'] = df['time'].apply(
    lambda clock: clock.second + 60 * clock.minute + 3600 * clock.hour
)


In [None]:
# Number of distinct clock times among the messages.
df.time.nunique()


In [None]:
#ambiguous_chars_pattern = r'[\u202a\u202b\u202c\u200f]'

# Use re.sub() to replace all occurrences of the characters with an empty string
#cleaned_text = re.sub(ambiguous_chars_pattern, '', text)



In [None]:
#df['text'] = df['text'].str.replace(ambiguous_chars_pattern, '', regex=True)

In [None]:
# df['text'] = df['text'].str.replace("\\u200f","").str.replace("\\u202c","")
# df['text']


In [None]:
#print(r"\u200")

## Step 3: Detect staff 

In [None]:
# Message count per author, most active first. The query string names the
# pandas frame `df` directly as a table.
q = r"""SELECT author, count(text) as cnt

        FROM df
        
        group by author
        order by cnt desc
        ;""" # like '%_%';"  #

#scores = ps.sqldf(q)
# Run the query and materialise the result as a pandas DataFrame.
userstats = sqldf(q).df()

In [None]:
# Display the per-author message counts.
userstats

In [None]:
# Bar chart of message counts by rank (row index); the author name is shown on hover.
px.bar(userstats, x = userstats.index, y = "cnt", hover_data=["author"])

In [None]:
# Authors in the first four rows of userstats (sorted by message count, descending).
list(userstats[:4].author)

In [None]:
# Hand-maintained set of author names treated as staff.
staff_names = {'Amritaa Sethi Mam Gfg','Aman Sir Gfg Interview', 'Nikhil Sharma', '.', 'Aadil Latif'}

# Everyone not listed in staff_names is labelled a student.
df["role"] = ["staff" if name in staff_names else "student" for name in df["author"]]

# A message counts as a question when it contains a question mark anywhere.
df["is_question"] = ["?" in message for message in df["text"]]

In [None]:
#df = df.sort_values(["time","role","timestamp"])
#df.reset_index()

In [None]:
# Lookup table mapping each distinct author name to a numeric id
# (1000, 1001, ... in order of first appearance).
unique_names = df.author.unique()
authUnq = pd.DataFrame(
    {
        "Id": range(1000, 1000 + len(unique_names)),
        "name": unique_names,
    }
)
authUnq

In [None]:
# Display the message table with the role / is_question columns.
df

In [None]:
# Clock time of the row labelled 3824.
# NOTE(review): hard-coded label; raises KeyError when the chat has fewer rows — confirm this is only a debugging probe.
tm = df.time[3824]

In [None]:
# Display the sampled clock time.
tm

In [None]:
# Distribution of messages over the day (seconds since midnight), coloured by role.
px.histogram(df, x = 'daytime', color = 'role', nbins=600)

In [None]:
# Frequency of each distinct message text, most repeated first.
q = r"""SELECT text, count(text) as cnt

        FROM df
        -- where text like '%omitted%'
        group by text
        order by cnt desc
        ;""" # like '%_%';"  #

#scores = ps.sqldf(q)
# Run the query and materialise the result as a pandas DataFrame.
lal = sqldf(q).df()
print(lal.shape)
# Show the 30 most frequent texts.
lal.head(30)

In [None]:
# Texts picked from the frequency table by row label; they are later classed as 'deleted'.
# NOTE(review): the labels are hard-coded for one particular export — re-check them against lal.head(30) for any other chat.
dels = list(lal.loc[[0,1,5,6,7,8,11,13,15]].text)
dels

In [None]:
# Extra exact-match texts to classify as 'deleted'.
dels += ['Thanks', 'Ty', 'video omitted']

# Substrings marking a message as a link or file attachment.
urlfile = ["https://", ".pdf", ".docx", ".png"]

# Substrings marking a message as 'deleted'.
dels_like = ['document omitted','This message was deleted by admin']

# Substrings marking group-membership notices.
joined = ["added ~ ", "removed ~ ", "removed +"]


def _sql_str(value):
    """Return *value* escaped for use inside a single-quoted SQL literal."""
    # Doubling the quote is the standard SQL escape; without it any chat
    # text containing an apostrophe would terminate the literal early.
    return str(value).replace("'", "''")


# Build the WHEN branches of a CASE expression. Order matters: the first
# matching branch wins. LIKE wildcards (% and _) inside the substrings are
# not escaped and keep their wildcard meaning.
when =  [f"WHEN text = '{_sql_str(t)}' THEN 'deleted'" for t in dels]
when += [f"WHEN text LIKE '%{_sql_str(t)}%' THEN 'deleted'" for t in dels_like]
when += [f"WHEN text LIKE '%{_sql_str(t)}%' THEN 'urlfile'" for t in urlfile]
when += [f"WHEN text LIKE '%{_sql_str(t)}%' THEN 'joined'" for t in joined]


when = "\n\t".join(when)
print(when)

In [None]:
# Display the exact-match texts that will be classed as 'deleted'.
dels

In [None]:
# Comma-separated list of every df column except the last one, for use in a SELECT.
# NOTE(review): which column is dropped depends on the current column order of df, so re-running this cell after df is rebuilt changes the result — confirm intent.
basecols = ", ".join(df.columns[:-1])
basecols

In [None]:
# Classification query: keeps the base columns, adds an anonymised author
# label ('user' + numeric id from authUnq) and a `class` column computed by
# the CASE branches in `when`, falling back to the row's role.
q = f"""SELECT {basecols}, 'user' || cast(au.id as string) as authorAnon,
    CASE
        {when}
        ELSE role
    END as class      
    
    FROM df
    JOIN authUnq au on df.author = au.name
        
        ;""" 
print(q)

In [None]:
# Run the classification query and replace df with its result.
df = sqldf(q).df()
df

In [None]:
# Message count per class.
px.histogram(df, x = 'class')

In [None]:
# One point per message: date on x, time of day (seconds since midnight) on y, coloured by class.
fig = px.scatter(df, x = "timestamp", y = "daytime", color = 'class', 
                 height=800, hover_data=["author","text"]  )
# , category_orders=
fig.update_traces(marker_size = 4)
# NOTE(review): daytime is an integer number of seconds, not a datetime — confirm this time-style tick format renders as intended on a numeric axis.
fig.update_layout(yaxis_tickformat='%H:%M:%S')

In [None]:
# Keep only rows whose class is 'student', 'staff' or 'urlfile'.
q = f"""SELECT *

        FROM df
        where class in ('student','staff','urlfile')
        ;""" 
flt = sqldf(q).df()
flt.shape

In [None]:
#flt = df[df['class']=='sentence']
# Display the filtered messages.
flt

In [None]:
# Same date vs. time-of-day scatter, restricted to the filtered messages and coloured by role.
fig = px.scatter(flt, x = "timestamp", y = "daytime", color = 'role', 
                 height=800, hover_data=["author","text"] )
# , category_orders=
fig.update_traces(marker_size = 3)
# NOTE(review): daytime is an integer number of seconds, not a datetime — confirm this time-style tick format renders as intended on a numeric axis.
fig.update_layout(yaxis_tickformat='%H:%M:%S')

In [None]:
# Activity timeline: one row per author, one point per message, coloured by role.
fig = px.scatter(df, x = "timestamp", y = "author", color = 'role', 
                 height=800, hover_data=["text"],  )
fig.update_traces(marker_size = 3)

In [None]:
# Date vs. time-of-day scatter over all messages, coloured by class; hover shows only the text.
fig = px.scatter(df, x = "timestamp", y = "daytime", color = 'class', 
                 height=800, hover_data=["text"]  )
# , category_orders=
fig.update_traces(marker_size = 3)
# NOTE(review): daytime is an integer number of seconds, not a datetime — confirm this time-style tick format renders as intended on a numeric axis.
fig.update_layout(yaxis_tickformat='%H:%M:%S')

In [None]:
# Number of distinct calendar days with at least one message.
df.date.nunique()

In [None]:
# Message volume over calendar time.
px.histogram(df, x = "date", nbins=200)

In [None]:
# Number of distinct message texts.
df.text.nunique()

In [None]:
# Display the message text column.
df.text

In [None]:
# Distribution of how often each distinct text occurs.
px.histogram(lal, x = 'cnt')

In [None]:
# The 50 most frequent message texts.
lal.head(50)

In [None]:
# Collapse the filtered messages into one row per day:
#   html_anon - messages joined by <hr>, with anonymised author labels
#   html      - the same, with real author names
#   txt       - plain "authorAnon: text" lines joined by newlines
# Each aggregate carries ORDER BY timestamp: without it the order in which
# a grouped aggregate concatenates rows is not guaranteed, so a day's
# messages could come out shuffled.
q = """
with s1 as 
(SELECT date, group_concat(time || ' ' || authorAnon || ': ' || text, '<hr>' ORDER BY timestamp) as html_anon,
                group_concat(time || ' ' || author || ': ' || text, '<hr>' ORDER BY timestamp) as html,
group_concat( authorAnon || ': ' || text, '\n' ORDER BY timestamp) as txt

        FROM flt
        GROUP by date
)
select * 

from s1
order by date
        ;""" 

dialogues = sqldf(q).df()
dialogues

In [None]:
# Write the per-day anonymised transcripts (the `txt` column) to dialogues.json, keeping non-ASCII characters unescaped.
dialogues['txt'].to_json('dialogues.json', force_ascii=False, indent=2, orient='records')

In [None]:

def save_df_as_pretty_html(df, filename="output.html", index=True):
    """Write *df* to *filename* as a standalone, styled HTML table.

    Every cell is converted to a string and newline characters become
    ``<br>`` tags. Cell contents are NOT HTML-escaped, so markup already
    present in the data (e.g. ``<hr>`` separators) is rendered as HTML.

    Args:
        df: DataFrame to export; it is copied, not modified.
        filename: Path of the HTML file to create or overwrite.
        index: Whether to include the DataFrame index as a column.
    """
    # Convert newlines to <br> for HTML
    df_html_ready = df.copy()
    for col in df_html_ready.columns:
        df_html_ready[col] = df_html_ready[col].astype(str).str.replace('\n', '<br>', regex=False)

    # Lift the column-width limit only while rendering, so long cells are
    # not truncated and the caller's global pandas display options stay
    # untouched.
    with pd.option_context("display.max_colwidth", None):
        # SECURITY NOTE: escape=False emits cell text verbatim; do not open
        # the output in a trusted context if the data is untrusted.
        html = df_html_ready.to_html(
            escape=False,  # Needed to render <br>
            index=index,
            border=0,
            classes="styled-table"
        )

    # Add CSS styling
    style = """
    <style>
    .styled-table {
        border-collapse: collapse;
        margin: 25px 0;
        font-size: 16px;
        font-family: Arial, sans-serif;
        width: 100%;
        table-layout: auto; /* ✅ Let browser fit naturally */
    }
    .styled-table th, .styled-table td {
        border: 1px solid #dddddd;
        padding: 10px;
        vertical-align: top;
        text-align: left;
        overflow-wrap: break-word; /* ✅ Break inside words */
        white-space: pre-wrap; /* ✅ Honor \\n linebreaks */
    }
    .styled-table td {
        max-width: 600px; /* ✅ Avoid huge dream fields expanding table */
    }
    .styled-table th {
        background-color: #f2f2f2;
    }
    </style>
    """

    # Write full HTML document. The charset declaration matches the file
    # encoding so browsers do not mis-decode non-ASCII text.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(
            '<!DOCTYPE html><html><head><meta charset="utf-8">'
            f"{style}</head><body>{html}</body></html>"
        )

    print(f"✅ HTML table saved to: {filename}")


In [None]:
# Export the per-day transcripts to dialogues.html.
# NOTE: the 'html' column is built from real author names; 'html_anon' is the anonymised variant.
save_df_as_pretty_html(dialogues[['date','html']], 'dialogues.html')