# Dataset Extraction

Importing the required libraries

In [1]:
import re
import pandas as pd

Every message in the exported text file begins with a date-time stamp, so the raw text is split on that date-time pattern to identify each individual message.

In [2]:
def rawToDf(file, key):
    """Convert a raw WhatsApp chat export (.txt) into a DataFrame.

    The file is read as UTF-8, its lines are joined with single spaces (so a
    multi-line message becomes one string), and the text is split at every
    date-time prefix of the form ``dd/mm/yyyy, hh:mm[ am|pm] - ``. Any text
    before the first date-time prefix is discarded.

    Parameters
    ----------
    file : str or path-like
        Path of the exported chat text file.
    key : str
        Which timestamp layout the export uses: ``'12hr'`` or ``'24hr'``.
        ``'custom'`` is present only as an empty placeholder.

    Returns
    -------
    pandas.DataFrame
        One row per message with the columns ``date_time`` (datetime),
        ``user`` (author name, or ``"group_notification"`` when the entry has
        no ``"<name>: "`` prefix) and ``message`` (the text after the prefix).

    Raises
    ------
    KeyError
        If ``key`` is not one of the known format names.

    Notes
    -----
    The split patterns accept 2- to 4-digit years, while the parsing formats
    use ``%Y``; exports with 2-digit years would need ``%y`` instead.
    """
    # Regexes for the date-time prefix of every entry. Raw strings keep the
    # backslash escapes (\d, \s) from being treated as string escapes.
    split_formats = {
        '12hr' : r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24hr' : r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
        'custom' : ''
    }
    # strptime layouts for the exact text matched by the regexes above,
    # including the trailing " - " separator.
    datetime_formats = {
        '12hr' : '%d/%m/%Y, %I:%M %p - ',
        '24hr' : '%d/%m/%Y, %H:%M - ',
        'custom': ''
    }

    # Only the read needs the file handle; flatten newlines to spaces because
    # a single message can span several lines.
    with open(file, 'r', encoding='utf-8') as raw_data:
        raw_string = ' '.join(raw_data.read().split('\n'))

    # Splitting on the date-time pattern yields "<user>: <msg>" chunks; the
    # first element is whatever precedes the first timestamp, so it is dropped.
    user_msg = re.split(split_formats[key], raw_string)[1:]
    # The matching timestamps, in the same order as the chunks above.
    date_time = re.findall(split_formats[key], raw_string)

    df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

    # The format must describe the whole matched string, separator included.
    df['date_time'] = pd.to_datetime(df['date_time'], format=datetime_formats[key])

    # Lazy match up to the first "<name>: ". maxsplit=1 is essential: without
    # it the text is split at every ": ", and a message that itself contains
    # ": " ends up truncated or empty.
    author_pattern = re.compile(r'([\w\W]+?):\s')

    usernames = []
    msgs = []
    for entry in df['user_msg']:
        parts = author_pattern.split(entry, maxsplit=1)
        if len(parts) > 1:  # user typed message: ['', user, message]
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:  # other notifications in the group (someone was added, left, ...)
            usernames.append("group_notification")
            msgs.append(parts[0])

    # creating new columns
    df['user'] = usernames
    df['message'] = msgs

    # dropping the old combined column without mutating in place
    return df.drop(columns='user_msg')

In [3]:
# parse the exported chat file; '12hr' selects the 12-hour (am/pm) timestamp pattern in rawToDf
df = rawToDf('chat.txt', '12hr')

In [4]:
# previewing the first rows of the parsed chat
df.head()

Unnamed: 0,date_time,user,message
0,2020-01-26 16:19:00,group_notification,Messages and calls are end-to-end encrypted. N...
1,2020-01-24 20:25:00,group_notification,"Tanay Kamath (TSEC, CS) created group ""CODERS👨..."
2,2020-01-26 16:19:00,group_notification,You joined using this group's invite link
3,2020-01-26 16:20:00,group_notification,+91 99871 38558 joined using this group's invi...
4,2020-01-26 16:20:00,group_notification,+91 91680 38866 joined using this group's invi...


In [5]:
# listing the distinct authors of the messages (this includes the 'group_notification' placeholder)
df['user'].unique()

array(['group_notification', '+91 96536 93868',
       'Dheeraj Lalwani (TSEC, CS)', '+91 99201 75875', '+91 95949 08570',
       '+91 79778 76844', '+91 90499 38860', 'Tanay Kamath (TSEC, CS)',
       'Saket (TSEC, CS)', '+91 77568 95072', 'Rohit Pathak (TSEC, CS)',
       '+91 75078 05454', 'Darshan Rander (TSEC, IT)', '+91 79774 68083',
       '+91 70394 60876', '+91 96191 55044', '+91 90678 93300',
       'Mohit Varma (TSEC, CS)', '+91 79770 56210',
       'Chirag Sharma (TSEC, CS)', 'Vivek Iyer (TSEC, Biomed)',
       'Tushar Nankani', '+91 81696 22410', '+91 89764 07509',
       '+91 78758 66747', 'Ankit (TSEC, CS)', '+91 86556 33169',
       '+91 76663 28147', '+91 88284 70904', '+91 97698 67348',
       'Vivek (TSEC, CS)', 'Hardik Raheja (TSEC, CS)', '+91 91680 38866',
       'Pranay Thakur (TSEC, CS)', 'Mittul Dasani (TSEC, CS)',
       'Kartik Soneji (TSEC, CS)', '+91 77180 43697', '+91 99676 84479',
       'Shreya (TSEC, IT)', '+91 96190 16721', '+91 89833 85127',
       '+9

In [6]:
# checking out 10 random samples from the dataset
# (no random_state is passed, so a different set of rows is shown on every run)
df.sample(10)

Unnamed: 0,date_time,user,message
12261,2020-09-14 12:19:00,"Kartik Soneji (TSEC, CS)",See I am asking if all this is actually worth ...
2494,2020-02-29 22:52:00,"Kartik Soneji (TSEC, CS)",Waiting for this message
138,2020-01-27 21:42:00,"Tanay Kamath (TSEC, CS)",Then why will I even store it in a variable
7364,2020-06-12 23:22:00,"Darshan Rander (TSEC, IT)",Ik😂
11460,2020-09-11 12:35:00,"Tanay Kamath (TSEC, CS)",ha thode issues hai
8953,2020-07-16 16:15:00,"Dheeraj Lalwani (TSEC, CS)",😂
9474,2020-08-07 11:02:00,"Pratik K (TSEC CS, SE)",Yesss
11171,2020-09-03 13:21:00,"Tanay Kamath (TSEC, CS)","read this article,its great"
10874,2020-08-27 21:42:00,"Dheeraj Lalwani (TSEC, CS)","Yaar, what is this Robert B. Weide joke? Hv s..."
9814,2020-08-13 12:11:00,"Darshan Rander (TSEC, IT)",Clients are irritating😭


In [7]:
# checking for null data: count of missing values in each column
df.isna().sum()

date_time    0
user         0
message      0
dtype: int64

In [8]:
# saving the cleaned dataset to a csv file
# (to_csv writes the index by default, so it appears as an extra unnamed first column)
df.to_csv('Whatsapp_Chat_Table.csv')