In [37]:
import pandas as pd
import re
from typing import List
import plotly.graph_objects as go
import plotly
import plotly.express as px

# File only renamed, otherwise not modified
# Currently only set up to handled Android extract
FILE_NAME = 'AI-DSDM-Chat-2019_08_29-2020-12-16.txt'
# FILE_NAME = '_chat.txt'

In [40]:
def read_raw_data(file_name:str, 
                  encoding:str='utf-8'):
    """
    Open file and Read Raw Data

    """
    with open(FILE_NAME, encoding="utf-8") as f:
        content = f.read()
    f.close()
    return content
    
    
def clean_WhatsApp_Android_Extract(extract_content:str, 
                                   extract_type:str = 'Android', 
                                   max_username_length:int=25) -> pd.DataFrame:
    """
    Cleans WhatsApp Android extract and place into dataframe
    
    Parameters:
    
    extract_content: str
        A str containing all the WhatsApp Messages from an Android Extract
    max_username_length: int, Default = 25
        Limit the usernames in length, this removes messages regarding people being added or removed from
        the group chat. May remove too many lines if contact names are longer than this value.
    extract_type: str, Default 'Android'
        
    Returns: pd.DataFrame
        With date, user, message, and contains_media as columns 
    """
    if extract_type == 'iOS':
        print("Check")
        extract_content = extract_content.encode("ascii", "ignore")
        extract_content = extract_content.decode()
    
    extract_content = re.sub(r"\t", " ", extract_content)
    extract_content = re.sub(r"\n", " ", extract_content)
    
    if extract_type == 'iOS':
        extract_content = re.sub(r"\[(\d{4}\-\d{2}\-\d{2}, \d{1,2}:\d{2}:\d{2} [APM]+)\] ([^:]+):", r"\n\1\t\2\t", extract_content)
    elif extract_type == 'Android':
        print("Check")
        extract_content = re.sub(r"\[(\d{2}\/\d{2}\/\d{4}, \d{2}:\d{2}) \- ([^:]+):", r"\n\1\t\2\t", extract_content)
        
        
    extract_content = extract_content.split('\n')
    extract_content = [x.split('\t') for x in extract_content]
    
    
    df = pd.DataFrame(extract_content, columns = ['date', 'user', 'message'])
    df['date'] = pd.to_datetime(df['date'])
    df = df[df['message'].notnull()]
    df = df[df['user'].str.len()<max_username_length]
    
    
    df = df[df['date'].notnull()]
    df['contains_media'] = False
    df.loc[df['message'].str.contains('<Media omitted>'), 'contains_media'] = True
    return df


In [41]:
df = clean_WhatsApp_Android_Extract(read_raw_data(FILE_NAME), 'Android')

Check


ValueError: 3 columns passed, passed data had 1 columns

In [36]:
df.head()

Unnamed: 0,date,user,message,contains_media
1,2020-08-25 04:21:04,AI & DSDM Main,Messages and calls are end-to-end encrypted. ...,False
5,2020-08-17 20:56:14,+385976622896,"you guys also cant find ""DKE"" under My Organi...",False
6,2020-08-17 20:56:20,+385976622896,thanks!,False
10,2020-08-17 21:40:06,+393496742409,Or is another hacking attack,False
13,2020-08-17 21:41:36,+385976622896,Hahahah,False
