In [1]:
import re
import pandas as pd
from datetime import datetime
import warnings

warnings.filterwarnings("ignore")

# 1) Read raw text
with open("whatsapp_messages.txt", "r", encoding="utf-8") as f:
    lines = f.read().splitlines()

# 2) Regex for lines like:
# [07/10/24, 10:03:03 AM] Mehdi Dhuka: Message here
# note: \s* before AM/PM handles the \u202f space you're seeing
pattern = re.compile(
    r'^\[(\d{1,2}/\d{1,2}/\d{2,4}),\s+(\d{1,2}:\d{2}:\d{2})\s*([APap][Mm])\]\s([^:]+):\s(.*)$'
)

datetimes_24 = []   # list 1: date+time 24h
user_messages = []  # list 2: "username: message"
users = []
messages = []

unmatched = 0

for line in lines:
    m = pattern.match(line)
    if not m:
        # uncomment this if you want to see which lines are skipped:
        # print("NO MATCH:", repr(line))
        unmatched += 1
        continue

    date_str, time_str, ampm, user, message = m.groups()

    # Build "dd/mm/yy hh:mm:ss AM" for parsing
    dt_input = f"{date_str} {time_str} {ampm}"

    # 12-hour → 24-hour
    dt_obj = datetime.strptime(dt_input, "%d/%m/%y %I:%M:%S %p")
    dt_24 = dt_obj.strftime("%d/%m/%y %H:%M:%S")

    datetimes_24.append(dt_24)
    user_messages.append(f"{user}: {message}")
    users.append(user)
    messages.append(message)

print("Matched messages:", len(datetimes_24))
print("Unmatched lines:", unmatched)

print("\nFirst 5 datetimes (24h):")
print(datetimes_24[:5])

print("\nFirst 5 user+messages:")
print(user_messages[:5])



Matched messages: 405
Unmatched lines: 525

First 5 datetimes (24h):
['07/10/24 10:03:03', '29/03/25 16:19:40', '27/04/25 12:57:54', '27/04/25 12:58:21', '27/04/25 12:58:41']

First 5 user+messages:
['Hello There!: \u200eYou were added', "Mehdi Dhuka: \u200eMehdi Dhuka changed this group's icon", 'Mehdi Dhuka: Chalo ghumne jaate Hain', 'Mehdi Dhuka: Bor Ho Gaye', 'Husain(mola): Shadi aur jaldi karlo']


In [2]:
df = pd.DataFrame({'user_message' : user_messages , 'message_date' : datetimes_24})
df['message_date'] = pd.to_datetime(df['message_date'],format='%d/%m/%y %H:%M:%S')
df.rename(columns={'message_date' : 'dates'} , inplace =True)

In [3]:
# Split user and message into separate columns
df[['user', 'message']] = df['user_message'].str.split(
    pat=':',
    n=1,
    expand=True
)

df['user'] = df['user'].str.strip()
df['message'] = df['message'].str.strip()
df.drop(columns=['user_message'] , inplace=True)
df.head(5)

Unnamed: 0,dates,user,message
0,2024-10-07 10:03:03,Hello There!,‎You were added
1,2025-03-29 16:19:40,Mehdi Dhuka,‎Mehdi Dhuka changed this group's icon
2,2025-04-27 12:57:54,Mehdi Dhuka,Chalo ghumne jaate Hain
3,2025-04-27 12:58:21,Mehdi Dhuka,Bor Ho Gaye
4,2025-04-27 12:58:41,Husain(mola),Shadi aur jaldi karlo


In [5]:
df['year'] = df['dates'].dt.year

In [7]:
df['month'] = df['dates'].dt.month_name()

In [11]:
df['day'] = df['dates'].dt.day
df['hour'] = df['dates'].dt.hour

In [13]:
df['minute'] = df['dates'].dt.minute

In [14]:
df

Unnamed: 0,dates,user,message,year,month,day,hour,minute
0,2024-10-07 10:03:03,Hello There!,‎You were added,2024,October,7,10,3
1,2025-03-29 16:19:40,Mehdi Dhuka,‎Mehdi Dhuka changed this group's icon,2025,March,29,16,19
2,2025-04-27 12:57:54,Mehdi Dhuka,Chalo ghumne jaate Hain,2025,April,27,12,57
3,2025-04-27 12:58:21,Mehdi Dhuka,Bor Ho Gaye,2025,April,27,12,58
4,2025-04-27 12:58:41,Husain(mola),Shadi aur jaldi karlo,2025,April,27,12,58
...,...,...,...,...,...,...,...,...
400,2025-10-17 22:10:56,Akbar Hussain Motla (Ilol),https://www.instagram.com/reel/DP6TSFCDHD1/?ig...,2025,October,17,22,10
401,2025-11-02 12:20:19,ZakirAli Dhuka,https://youtu.be/Uu2QK9Z9X5E?si=5UHS8eUg886haZ_H,2025,November,2,12,20
402,2025-11-02 12:20:20,ZakirAli Dhuka,https://youtu.be/Uu2QK9Z9X5E?si=5UHS8eUg886haZ_H,2025,November,2,12,20
403,2025-11-12 19:21:46,ZakirAli Dhuka,https://www.instagram.com/thenxrclub/,2025,November,12,19,21
