In [1]:
import pandas as pd
import re

In [2]:
file_path = 'whatsapp-chat-data.txt'

In [3]:
with open(file_path, 'r', encoding='utf-8') as file:
    chat_data = file.readlines()

In [4]:
print(chat_data)



In [5]:
type(chat_data)

list

In [6]:
import re
import pandas as pd

def preprocess(data):
    # Regular expression to extract date, time, sender, and message for WhatsApp chat data
    pattern_v2 = r'^(\d{2}/\d{2}/\d{4}), (\d{1,2}:\d{2} [apAP][mM]) - ([^:]+): (.*)$'

    # Lists to store extracted data
    dates = []
    times = []
    senders = []
    messages = []

    # Initialize a variable to keep track of the previous message if needed
    current_message = ""

    # Parse each line in the chat data
    for line in data:
        line = line.strip()
        match = re.match(pattern_v2, line)
        if match:
            if current_message:  # If there's an ongoing message, append it
                messages.append(current_message)
                current_message = ""

            dates.append(match.group(1))
            times.append(match.group(2))
            senders.append(match.group(3))
            messages.append(match.group(4))
        else:
            # If the line doesn't match the pattern, it might be a continuation of the previous message
            if messages:
                current_message += ' ' + line

    # Append the last message if any
    if current_message:
        messages.append(current_message)

    # Print lengths for debugging
    print(f"Dates length: {len(dates)}")
    print(f"Times length: {len(times)}")
    print(f"Senders length: {len(senders)}")
    print(f"Messages length: {len(messages)}")

    # Ensure all lists are of the same length
    min_length = min(len(dates), len(times), len(senders), len(messages))
    dates = dates[:min_length]
    times = times[:min_length]
    senders = senders[:min_length]
    messages = messages[:min_length]

    # Create a DataFrame from the extracted data
    df = pd.DataFrame({
        'Date': dates,
        'Time': times,
        'Sender': senders,
        'Message': messages
    })

    # Convert 'Date' to datetime format
    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
    
    # Combine 'Date' and 'Time' into a single 'DateTime' column
    df['DateTime'] = pd.to_datetime(df['Date'].astype(str) + ' ' + df['Time'])

    # Extract additional time-based columns
    df['Day'] = df['DateTime'].dt.day
    df['Month'] = df['DateTime'].dt.month
    df['Year'] = df['DateTime'].dt.year
    df['Hour'] = df['DateTime'].dt.hour
    df['Minute'] = df['DateTime'].dt.minute
    
    
    df.drop(['Date', 'Time'], axis=1, inplace=True)

    return df


In [7]:
df = preprocess(chat_data)

Dates length: 13379
Times length: 13379
Senders length: 13379
Messages length: 14932


  df['DateTime'] = pd.to_datetime(df['Date'].astype(str) + ' ' + df['Time'])


In [8]:
type(df)

pandas.core.frame.DataFrame

In [9]:
df.shape

(13379, 8)

In [10]:
df.head()

Unnamed: 0,Sender,Message,DateTime,Day,Month,Year,Hour,Minute
0,+91 96536 93868,<Media omitted>,2020-01-27 19:31:00,27,1,2020,19,31
1,+91 96536 93868,Give it a try ....,2020-01-27 19:31:00,27,1,2020,19,31
2,"Dheeraj Lalwani (TSEC, CS)",Alright,2020-01-27 19:31:00,27,1,2020,19,31
3,"Dheeraj Lalwani (TSEC, CS)",We can make this a trend,2020-01-27 19:32:00,27,1,2020,19,32
4,+91 96536 93868,Sure,2020-01-27 19:32:00,27,1,2020,19,32


In [11]:
words = []
for word in df['Message']:
    words.extend(word.split())

In [12]:
len(words)

92857

In [13]:
df.iloc[0]

Sender          +91 96536 93868
Message         <Media omitted>
DateTime    2020-01-27 19:31:00
Day                          27
Month                         1
Year                       2020
Hour                         19
Minute                       31
Name: 0, dtype: object

In [14]:
df[df['Message']== '<Media omitted>'].shape[0]

634

In [15]:
print(df)

                           Sender  \
0                 +91 96536 93868   
1                 +91 96536 93868   
2      Dheeraj Lalwani (TSEC, CS)   
3      Dheeraj Lalwani (TSEC, CS)   
4                 +91 96536 93868   
...                           ...   
13374   Darshan Rander (TSEC, IT)   
13375   Darshan Rander (TSEC, IT)   
13376     Tanay Kamath (TSEC, CS)   
13377   Darshan Rander (TSEC, IT)   
13378  Dheeraj Lalwani (TSEC, CS)   

                                                 Message            DateTime  \
0                                        <Media omitted> 2020-01-27 19:31:00   
1                                     Give it a try .... 2020-01-27 19:31:00   
2                                                Alright 2020-01-27 19:31:00   
3                               We can make this a trend 2020-01-27 19:32:00   
4                                                   Sure 2020-01-27 19:32:00   
...                                                  ...                 ..

In [16]:
from urlextract import URLExtract
urlextract = URLExtract()
urls = urlextract.find_urls('hi google.com are you sure you want facebook.com')
urls

['google.com', 'facebook.com']

In [18]:
df.head()

Unnamed: 0,Sender,Message,DateTime,Day,Month,Year,Hour,Minute
0,+91 96536 93868,<Media omitted>,2020-01-27 19:31:00,27,1,2020,19,31
1,+91 96536 93868,Give it a try ....,2020-01-27 19:31:00,27,1,2020,19,31
2,"Dheeraj Lalwani (TSEC, CS)",Alright,2020-01-27 19:31:00,27,1,2020,19,31
3,"Dheeraj Lalwani (TSEC, CS)",We can make this a trend,2020-01-27 19:32:00,27,1,2020,19,32
4,+91 96536 93868,Sure,2020-01-27 19:32:00,27,1,2020,19,32


In [21]:
y = []
for url in df.Message:
    y.extend(urlextract.find_urls(url))
    
    

In [22]:
y

['https://youtu.be/AU7mADJMa9Y',
 'https://youtu.be/aZu084TPInE',
 'http://meetu.ps/e/HKD5Q/BGt8n/d',
 'https://www.youtube.com/watch?v=fUqpYvIYj-Y',
 'https://www.youtube.com/playlist?list=PLDN4rrl48XKpZkf03iYFl-O29szjTrs_O',
 'https://repl.it/repls/AmusingPungentComputation',
 'https://youtu.be/4eWKHLSRHPY',
 'https://research.hackerrank.com/developer-skills/2020?utm_medium=social&utm_source=instagram&utm_campaign=021420&utm_content=IGPoll',
 'https://forms.gle/27DSsnJnBBMRy6Bt8',
 'https://code.dcoder.tech/files/code/5e3d7fad3975f256bf9d10e8/practice',
 'https://code.dcoder.tech/files/code/5e4a61595e611d351827daec/hell',
 'https://code.dcoder.tech/files/code/5e4c15209e436e412f89d805/right-angled-triangle',
 'https://forms.gle/Ui2Fb4BF2FTW5fhZA',
 'https://pastebin.com/cm0Y3fYv',
 'https://www.codechef.com/icpc',
 'https://chat.whatsapp.com/EbekhvkeUPcJYvfUs4M4js',
 'https://forms.gle/FofR6NiHHgkfL4ow7',
 'https://m.facebook.com/djsce.codestars',
 'https://www.instagram.com/djsce.cod