In [None]:
import os
import re
from os import path
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

# Chat export to analyse. The file was only renamed, its contents are otherwise unmodified.
# Currently only set up to handle an Android extract.
FILE_NAME = 'AI-DSDM-Chat-2019_08_29-2020-12-16.txt'
# FILE_NAME = '_chat.txt'

In [None]:
def read_raw_data(file_name:str, 
                  encoding:str='utf-8'):
    """
    Open a chat export and return its whole content as a single string

    Parameters:

    file_name: str
        Path of the text file to read
    encoding: str, Default = 'utf-8'
        Text encoding used to decode the file

    Returns: str
        The complete, unmodified text of the file
    """
    # Read the file the caller asked for, with the encoding the caller asked
    # for, instead of the notebook-level FILE_NAME constant and a fixed codec.
    # The with-block closes the handle, so no explicit close() is needed.
    with open(file_name, encoding=encoding) as f:
        return f.read()
    
    
def clean_WhatsApp_Android_Extract(extract_content:str, 
                                   extract_type:str = 'Android', 
                                   max_username_length:int=25) -> pd.DataFrame:
    """
    Cleans a WhatsApp extract and places it into a dataframe

    Despite the function name, two layouts are handled; the one to use is
    selected with extract_type.

    Parameters:

    extract_content: str
        A str containing all the WhatsApp Messages from an extract
    extract_type: str, Default 'Android'
        Layout of the extract, either 'Android' or 'iOS'
    max_username_length: int, Default = 25
        Limit the usernames in length, this removes messages regarding people being added or removed from
        the group chat. May remove too many lines if contact names are longer than this value.

    Returns: pd.DataFrame
        With date, user, message, and contains_media as columns. The frame is
        empty (same columns) when no message header is found in the text.

    Raises: ValueError
        If extract_type is neither 'Android' nor 'iOS'
    """
    # One "timestamp + user name" header pattern per supported layout.
    header_patterns = {
        'iOS': r"\[(\d{4}\-\d{2}\-\d{2}, \d{1,2}:\d{2}:\d{2} [APM]+)\] ([^:]+):",
        'Android': r"(\d{2}\/\d{2}\/\d{4}, \d{2}:\d{2}) \- ([^:]+):",
    }
    if extract_type not in header_patterns:
        raise ValueError(
            f"Unsupported extract_type {extract_type!r}; expected 'Android' or 'iOS'")

    if extract_type == 'iOS':
        # Drop every non-ASCII character before the header pattern is applied.
        extract_content = extract_content.encode("ascii", "ignore").decode()

    # Flatten the text so that tab and newline can be used as the field and
    # record separators inserted around each message header below.
    extract_content = re.sub(r"[\t\n]", " ", extract_content)
    extract_content = re.sub(header_patterns[extract_type], r"\n\1\t\2\t", extract_content)

    rows = [line.split('\t') for line in extract_content.split('\n')]
    if not any(len(row) == 3 for row in rows):
        # No header matched at all: build an empty frame instead of letting
        # pandas fail on a column-count mismatch.
        rows = []

    df = pd.DataFrame(rows, columns = ['date', 'user', 'message'])

    # Discard the text found before the first header (it has no message field)
    # *before* parsing dates, so arbitrary preamble text cannot break
    # pd.to_datetime. Copy so the assignments below act on an independent frame.
    df = df[df['message'].notnull()].copy()

    # NOTE(review): the timestamps are parsed with pandas' default day/month
    # inference; for the Android layout a day <= 12 is ambiguous between
    # dd/mm and mm/dd - confirm against the locale of the export.
    df['date'] = pd.to_datetime(df['date'])

    df = df[df['user'].str.len()<max_username_length]
    df = df[df['date'].notnull()].copy()

    # Literal substring test; messages are all non-null at this point.
    df['contains_media'] = df['message'].str.contains('<Media omitted>', regex=False).astype(bool)
    return df


In [None]:
def plot_cloud(wordcloud):
    """
    Show a word cloud on a new matplotlib figure with the axes hidden

    Parameters:

    wordcloud
        The image to draw; it is handed to plt.imshow as-is

    Returns:
        Nothing, the image is drawn on a newly created pyplot figure
    """
    canvas_size = (12, 9)
    plt.figure(figsize=canvas_size)
    plt.imshow(wordcloud)
    # Hide ticks and frame so that only the cloud itself is visible.
    plt.axis("off")

def get_all_messages(df:pd.DataFrame, user:str)->str:
    """
    Join every message written by one user into a single string

    Parameters:

    df:pd.DataFrame
        DataFrame with a 'user' and a 'message' column
    user:str
        Value of the 'user' column whose messages are collected

    Returns:str
        The user's messages in row order, separated by single spaces
        (an empty string when the user has no rows)
    """
    written_by_user = df['user'] == user
    user_messages = df.loc[written_by_user, 'message']
    return ' '.join(user_messages.tolist())
    

def user_word_cloud(df:pd.DataFrame, 
                    user:str, 
                    mask_image:str='silhouette.png', 
                    background_color:str='black', 
                    colormap:str='Greys'):
    """
    Generate a word cloud within the mask_image

    Parameters:
    
    df: pd.DataFrame
        DataFrame containing user and messages
    user: str
        The user name as a string, often a cell number
    mask_image: str
        File name of the mask image to use for the word cloud. A relative name
        is resolved against the directory of this file when __file__ is
        defined, otherwise against the current working directory.
    background_color: str
        Background color behind the silhouette
    colormap: str
        Name of the colormap used for the text of the word cloud
    
    Returns:
        Nothing, the word cloud image is plotted

    Raises: ValueError
        If no message text is found for the user
    """
    text = get_all_messages(df, user)
    if not text.strip():
        raise ValueError(f"No messages found for user {user!r}; cannot build a word cloud")

    # Inside a function locals() never holds __file__, so look it up in the
    # module namespace; a notebook has none and falls back to the working directory.
    d = path.dirname(globals()["__file__"]) if "__file__" in globals() else os.getcwd()
    cloud_mask = np.array(Image.open(path.join(d, mask_image)))

    # Generate word cloud
    # (per the wordcloud documentation, width/height are ignored when a mask is supplied)
    wordcloud = WordCloud(
        width = 400, height = 300, random_state=1, 
        background_color=background_color, colormap=colormap, 
        collocations=False, stopwords = STOPWORDS, 
        mask=cloud_mask).generate(text)
    # Plot
    plot_cloud(wordcloud)

In [None]:
# Read the export named by FILE_NAME and parse it with the 'Android' layout.
df = clean_WhatsApp_Android_Extract(read_raw_data(FILE_NAME), 'Android')

In [None]:
# Preview the first rows of the parsed messages.
df.head()

In [None]:
# NOTE(review): the user argument is an empty string here - presumably a
# redacted contact name/number; substitute a value that occurs in df['user'].
user_word_cloud(df, '')