In [42]:
import pandas as pd
import re

In [43]:
def preprocess(data):
    # Define the pattern to split the date and message parts
    pattern = r'\[\d{1,2}/\d{1,2}/\d{4},\s\d{1,2}:\d{2}:\d{2}\s[apAP][mM]\]'
    messages = re.split(pattern, data)[1:]
    dates = re.findall(pattern, data)

    # Create a DataFrame
    df = pd.DataFrame({'user_message': messages, 'message_date': dates})

    # Convert message_date to datetime and format it correctly
    df['message_date'] = df['message_date'].str.replace('[', '').str.replace(']', '').str.replace('‎', '')
    df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%Y, %I:%M:%S %p')

    # Extract user and message
    df[['user', 'message']] = df['user_message'].str.extract(r'([^:]+):\s?(.*)', expand=True)

    # Drop the original user_message column
    df.drop(columns=['user_message'], inplace=True)

    # Rename columns for clarity
    df.rename(columns={'message_date': 'date'}, inplace=True)

    # Extract date components
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    # Define the period of the day
    period = []
    for hour in df['hour']:
        if hour == 23:
            period.append(str(hour) + "-" + str('00'))
        elif hour == 0:
            period.append(str('00') + "-" + str(hour + 1))
        else:
            period.append(str(hour) + "-" + str(hour + 1))
    df['period'] = period

    return df

In [44]:
with open('_chat.txt', 'r') as file:
    # Read the contents of the file
    chat_contents = file.read()
df = preprocess(chat_contents)
df.head()

  df['message_date'] = df['message_date'].str.replace('[', '').str.replace(']', '').str.replace('‎', '')


Unnamed: 0,date,user,message,only_date,year,month_num,month,day,day_name,hour,minute,period
0,2020-01-29 19:04:19,Peaky THF Blinders ✨🖤❤️‍🔥,"‎Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.",2020-01-29,2020,1,January,29,Wednesday,19,4,19-20
1,2020-01-29 19:04:19,Salman Bhai,‎Salman Bhai created this group,2020-01-29,2020,1,January,29,Wednesday,19,4,19-20
2,2020-01-29 19:04:19,Peaky THF Blinders ✨🖤❤️‍🔥,‎You were added,2020-01-29,2020,1,January,29,Wednesday,19,4,19-20
3,2021-11-26 00:06:08,Fahad Kasha jahaz,insha'Allah❤️ best of luck bro !,2021-11-26,2021,11,November,26,Friday,0,6,00-1
4,2021-11-26 00:06:26,Umar Khayam,آئیں گے انشاللہ,2021-11-26,2021,11,November,26,Friday,0,6,00-1


In [45]:
pd.set_option('display.max_rows', False)  # Set it to None to display all rows
pd.set_option('display.max_colwidth', False)  # Set it to None to display full width of the column
df['message'].value_counts()

‎image omitted                                             1885
‎audio omitted                                             1325
‎video omitted                                             532 
Ok                                                         192 
ok                                                         165 
‎sticker omitted                                           138 
🤣                                                          113 
😂                                                          111 
?                                                          107 
han                                                        67  
Ameen                                                      63  
Insha Allah                                                61  
                                                           ..  
Khana lasani pe kha lety hein, chach interchange ke pas    1   
5:30 asar ki namaz ky bad                                  1   
or hazro se fruit le kar kain beth ke kh

In [46]:
pattern = r'image|video|audio|document omitted'

# Use the `str.contains` method to apply the regex pattern, setting `na=False` to treat NaNs as False
mask = df['message'].str.contains(pattern, na=False)
media_message = df[~mask]
media_message

Unnamed: 0,date,user,message,only_date,year,month_num,month,day,day_name,hour,minute,period
0,2020-01-29 19:04:19,Peaky THF Blinders ✨🖤❤️‍🔥,"‎Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.",2020-01-29,2020,1,January,29,Wednesday,19,4,19-20
1,2020-01-29 19:04:19,Salman Bhai,‎Salman Bhai created this group,2020-01-29,2020,1,January,29,Wednesday,19,4,19-20
2,2020-01-29 19:04:19,Peaky THF Blinders ✨🖤❤️‍🔥,‎You were added,2020-01-29,2020,1,January,29,Wednesday,19,4,19-20
3,2021-11-26 00:06:08,Fahad Kasha jahaz,insha'Allah❤️ best of luck bro !,2021-11-26,2021,11,November,26,Friday,0,6,00-1
4,2021-11-26 00:06:26,Umar Khayam,آئیں گے انشاللہ,2021-11-26,2021,11,November,26,Friday,0,6,00-1
5,2021-11-26 00:06:37,Fahad Kasha jahaz,insha'Allah❤️,2021-11-26,2021,11,November,26,Friday,0,6,00-1
7,2021-11-26 00:45:20,Umar Khayam,ھاھاھاھاھاھا,2021-11-26,2021,11,November,26,Friday,0,45,00-1
8,2021-11-26 06:51:44,Haseeb jahaz library,کل پروجیکٹر پر لگا لو 😂,2021-11-26,2021,11,November,26,Friday,6,51,6-7
9,2021-11-26 09:20:19,Fahad Kasha jahaz,Umar miss sayy nbr pouch mry bhi or fhdi ko b ?,2021-11-26,2021,11,November,26,Friday,9,20,9-10
...,...,...,...,...,...,...,...,...,...,...,...,...


In [47]:
def most_common_words(selected_user,df):

    f = open('stop_hinglish.txt','r')
    stop_words = f.read()

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    pattern = r'image|video|audio|document omitted'

    # Use the `str.contains` method to apply the regex pattern, setting `na=False` to treat NaNs as False
    mask = df['message'].str.contains(pattern, na=False)
    temp = df[~mask]

    words = []

    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)
    print(words)
    most_common_df = pd.DataFrame(Counter(words).most_common(20))
    return most_common_df

In [48]:
most_common_words('Overall', df)



NameError: name 'Counter' is not defined