In [None]:
import pandas as pd
import re
from typing import List

# Path of the WhatsApp chat export to load.
# The file was only renamed; its contents were not otherwise modified.
# The notebook is currently only set up to handle Android extracts.
FILE_NAME = 'AI-DSDM-Chat-2019_08_29-2020-12-16.txt'

In [None]:
def read_raw_data(file_name:str, 
                  encoding:str='utf-8') -> str:
    """
    Open a text file and return its entire content as one string.

    Parameters:

    file_name: str
        Path of the file to read.
    encoding: str, Default = 'utf-8'
        Text encoding used to decode the file.

    Returns:

    str
        The full content of the file.
    """
    # Read the file that was asked for (not a module-level constant) and honour the
    # requested encoding; the context manager closes the handle on exit.
    with open(file_name, encoding=encoding) as f:
        return f.read()
    
    
def clean_WhatsApp_Android_Extract(extract_content:str, max_username_length:int=25,
                                   date_format:str='%d/%m/%Y, %H:%M') -> pd.DataFrame:
    """
    Cleans WhatsApp Android extract and place into dataframe
    
    Parameters:
    
    extract_content: str
        A str containing all the WhatsApp Messages from an Android Extract
    max_username_length: int, Default = 25
        Limit the usernames in length, this removes messages regarding people being added or removed from
        the group chat. May remove too many lines if contact names are longer than this value.
    date_format: str, Default = '%d/%m/%Y, %H:%M'
        strptime format of the timestamp that starts every message. It is given explicitly so that
        ambiguous dates such as 01/02/2020 are always read day-first; timestamps that do not match
        become NaT and their rows are dropped.

    Returns:

    pd.DataFrame
        One row per kept message with columns 'date' (datetime), 'user', 'message' (text after the
        "user:" prefix, with line breaks turned into spaces) and 'contains_media' (True when the
        message contains '<Media omitted>'). Index labels are the positions of the messages in the
        split extract, so they are not necessarily consecutive.
    """
    
    # Flatten the extract so multi-line messages stay attached to their header. Tabs are removed
    # first because a tab is used as the field separator below.
    flattened = re.sub(r"\t", " ", extract_content)
    flattened = re.sub(r"\n", " ", flattened)
    # Start a new line at every "<date>, <time> - <user>:" header and tab-separate its fields.
    flattened = re.sub(r"(\d{2}\/\d{2}\/\d{4}, \d{2}:\d{2}) \- ([^:]+):", r"\n\1\t\2\t", flattened)
    
    # Text before the first header has a single field; pad every row to three fields so the frame
    # can always be built, even when no header was found (e.g. empty input).
    rows = [(line.split('\t') + [None, None])[:3] for line in flattened.split('\n')]
    
    df = pd.DataFrame(rows, columns = ['date', 'user', 'message'])
    df['date'] = pd.to_datetime(df['date'], format=date_format, errors='coerce')
    
    # Keep real messages only: a message body, a parsed timestamp and a plausibly short user name.
    # Missing user names give NaN lengths, which compare as False and are dropped as well.
    keep = (
        df['message'].notnull()
        & df['date'].notnull()
        & (df['user'].str.len() < max_username_length)
    )
    # Copy so that adding a column below does not write into a slice of the unfiltered frame.
    df = df[keep].copy()
    
    # Literal (non-regex) match of the placeholder WhatsApp writes for omitted media.
    df['contains_media'] = df['message'].str.contains('<Media omitted>', regex=False).astype(bool)
    return df


In [None]:
# Read the raw export named by FILE_NAME and clean it into a DataFrame.
df = clean_WhatsApp_Android_Extract(read_raw_data(FILE_NAME))

In [None]:
# Preview the first rows of the cleaned frame.
df.head()