In [39]:
import re
import pandas as pd

def read_whatsapp_chat(file_path:str):
    encryption_message = "Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more."
    media_pattern = "<Media omitted>"
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    edited_message = "<This message was edited>"
    deleted_message = "You deleted this message"
    null_message = "null"
    created_group_message = "created group"
    added_you_to_group_message = "added you"
    tagging_pattern = r'@[\w]+'

    with open(file_path, 'r', encoding = 'utf-8' ) as f:
        lines = f.readlines()

    # Apply filters to remove unwanted lines
    filtered_lines = []
    for line in lines:
        if (
            encryption_message not in line and
            deleted_message not in line and
            null_message != line.split(" ")[-1] and
            media_pattern not in line and
            created_group_message not in line and
            added_you_to_group_message not in line and
            not re.search(email_pattern, line) and
            not re.search(url_pattern, line)
        ):
            line = line.replace(edited_message, "").strip()
            line = re.sub(tagging_pattern, "", line).strip()
            filtered_lines.append(line)

    # Normalize content:
    content = '\n'.join(filtered_lines)
    # Replace narrow no-break space (iOS specific)
    content = content.replace('\u202f', ' ')
    # Remove square brackets if they surround the timestamp (only for iOS)
    content = re.sub(
        r'\[(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?::\d{2})?\s?[APap][Mm])\]',
        r'\1',
        content
    )
    # Remove LRM and RLM characters (Left-to-Right Mark and Right-to-Left Mark)
    content = content.replace('\u200E', '').replace('\u200F', '')

    # Updated regex pattern to match both iOS and Android WhatsApp exports.
    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?::\d{2})?(?:\s?[APap][Mm])?)\s?(?:-|\~)?\s?(.*?): (.*?)(?=\n\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}|$)'
    messages = re.findall(pattern, content, re.DOTALL)

    df = pd.DataFrame(messages,columns=['timestamp','sender','message'])
    timestamps = []
    for timestamp in df['timestamp']:
        try:
            timestamp = pd.to_datetime(
                timestamp, format='mixed', errors='coerce')
        except Exception as e:
            print(f"Error parsing timestamp '{timestamp}': {e}")
            timestamp = pd.NaT
        timestamps.append(timestamp)

    df['timestamp'] = timestamps
    return df

In [41]:
from pathlib import Path

all_chats = {}
data_directory = Path("./data/")

for file in data_directory.glob('*.txt'):
    print(file)
    file_name = file.stem
    all_chats[file_name] = read_whatsapp_chat(file)
    print(all_chats[file_name])

data/DummyData.txt
               timestamp                                             sender  \
0    2023-01-11 00:14:00  Messages and calls are end-to-end encrypted. O...   
1    2023-03-03 16:00:00                                       sumit mandal   
2    2023-03-07 17:53:00                                      Aniket Parkar   
3    2023-03-11 11:10:00                                       sumit mandal   
4    2023-03-11 11:13:00                                      Aniket Parkar   
...                  ...                                                ...   
3220 2025-05-30 22:44:00                            sagar Bhangale Tata AIG   
3221 2025-05-30 22:45:00                                       sumit mandal   
3222 2025-05-30 22:45:00                            sagar Bhangale Tata AIG   
3223 2025-06-01 19:13:00                            sagar Bhangale Tata AIG   
3224 2025-06-01 19:15:00                                       sumit mandal   

                                

In [37]:
text_sequence = ""
for file_name in all_chats.keys():
    text_sequence += " ".join(all_chats[file_name]['message'].values)

len(text_sequence)

133612

In [38]:
with open("./output/combined_text.txt", "w", encoding="utf-8") as f:
    f.write(text_sequence)

In [47]:
!git init

Initialized empty Git repository in /Users/smandal14/work_aig/aig/Kendra-Bot/whatsapp_own_bot_training/.git/


In [52]:
!git add .

In [53]:
!git commit -m "Data preprocessing"

[main 1cddd2a] Data preprocessing
 Committer: Mandal <smandal14@TAGLDP23483.local>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly:

    git config --global user.name "Your Name"
    git config --global user.email you@example.com

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 2 files changed, 32 insertions(+), 573 deletions(-)
 delete mode 100644 combined_text.txt


In [54]:
!git branch -M main

In [58]:
!git remote add origin git@github.com-personal:sumit-mandal/llm-training-from-scratch.git

error: remote origin already exists.


In [56]:
!git push -u origin main


git@github.com: Permission denied (publickey).
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


In [57]:
ls -al ~/.ssh


total 112
drwx------  15 smandal14  1452157595   480 Nov  7  2024 [34m.[m[m/
drwxr-x---+ 76 smandal14  1452157595  2432 May 13 14:19 [34m..[m[m/
-rw-r--r--@  1 smandal14  1452157595  6148 Apr 29  2024 .DS_Store
-rw-------@  1 smandal14  1452157595  2622 Jan 19  2023 aig_id_rsa
-rw-r--r--@  1 smandal14  1452157595   581 Jan 19  2023 aig_id_rsa.pub
-rw-r--r--@  1 smandal14  1452157595   521 Nov  2  2024 config
-r--------   1 smandal14  1452157595  2610 Oct 31  2024 google_compute_engine
-rw-r--r--@  1 smandal14  1452157595   581 Oct 31  2024 google_compute_engine.pub
-rw-r--r--   1 smandal14  1452157595   327 Jan  4 00:15 google_compute_known_hosts
-rw-------@  1 smandal14  1452157595  2622 Jan 19  2023 id_rsa
-rw-r--r--@  1 smandal14  1452157595   581 Jan 19  2023 id_rsa.pub
-rw-------@  1 smandal14  1452157595   419 Apr 29  2024 id_rsa_personal
-rw-r--r--@  1 smandal14  1452157595   109 Apr 29  2024 id_rsa_personal.pub
-rw-------   1 smandal14  1452157595  2038 Nov  1  2024 known