# In [1]:
import os

# Working directory at the time this cell runs; the sample chat exports
# loaded at the bottom of the file are resolved relative to it.
cwd = os.getcwd()

# In [None]:
import pandas as pd
import re
import json
import emoji
import plotly.express as px
import plotly.graph_objs as go

from collections import Counter
from datetime import datetime, timedelta
from statistics import mean, median, stdev
from shiny import App, Inputs, Outputs, Session, reactive, render, ui
from shiny.types import FileInfo
from shinywidgets import output_widget, render_widget

from icons import gear_fill, question_circle_fill

# Weekday names indexed the same way as pandas' ``dt.weekday`` (0 = Monday).
days_of_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Every quarter-hour slot of a day, as ``datetime.time`` objects and as 'HH:MM' labels.
hhmm_range = pd.date_range('00:00', '23:59', freq='15min').time
hhmm_list = [t.strftime('%H:%M') for t in hhmm_range]

# Every full hour of a day, as ``datetime.time`` objects and as 'HH' labels.
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated in pandas 2.2+.
hh_range = pd.date_range('00:00', '23:59', freq='h').time
hh_list = [t.strftime('%H') for t in hh_range]

# First ten colors of Plotly's default qualitative palette.
color_theme = [px.colors.qualitative.Plotly[i] for i in range(10)]


##### ETL FUNCTIONS #####

def extract_emojis(s: str) -> str:
    """
    Keep only the elements of ``s`` that ``emoji.is_emoji`` accepts.

    The input is examined one element (character) at a time and the accepted
    ones are concatenated in their original order.

    :param s: The text to scan.
    :return: A string made of the accepted characters; empty if there are none.
    """
    kept = [char for char in s if emoji.is_emoji(str(char))]
    return ''.join(kept)


def extract_whatsapp_row(string: str) -> list:
    """
    Extracts a list of 4 elements from a given string representing a WhatsApp chat message.

    The elements are:
    - Date (str)
    - Time (str)
    - User (str)
    - Message (str)

    Any element that cannot be found is left as None, so callers can discard
    incomplete rows (e.g. lines without a "user: message" part).

    The message is everything after the first colon that follows the
    date/time header, with surrounding whitespace removed. Leading
    punctuation that belongs to the message itself (":)", "- item",
    "[link]") is preserved.

    :param string: The string to parse, representing a WhatsApp message.
    :return: A list of 4 elements containing the extracted information.
    """
    # Narrow no-break spaces (e.g. between the time and AM/PM) become plain spaces.
    string = string.replace("\u202f", " ")
    output = [None, None, None, None]

    # The date/time header is everything before the first ']' or '-'.
    header = re.search(r'.*?(?=[\]\-])', string)
    if header:
        date_time_string = header.group().strip("[]- ,")

        date = re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', date_time_string)
        if date:
            output[0] = date.group()

        # The time is whatever follows the ", " separator after the date.
        time = re.search(r',\s.*', date_time_string)
        if time:
            output[1] = time.group().strip("[]- ,")

    # Everything from the first ']' or '-' onwards holds "user: message".
    user_and_message = re.search(r'[-\]].*', string)
    if user_and_message:
        user_part, colon, message_part = user_and_message.group().partition(":")
        if colon:
            user = user_part.strip("[]-: ")
            # Slashes inside a user name are replaced by a single space.
            output[2] = re.sub(r" */ *", " ", user)
            # Only trim whitespace: stripping punctuation here would eat
            # characters that are part of the message.
            output[3] = message_part.strip()

    return output


def get_date_format(dates: list[str]) -> str:
    """
    Find the first known date format that parses every string in ``dates``.

    Candidates are tried in a fixed order, so an ambiguous collection
    (e.g. only days <= 12) resolves to the earliest matching candidate.
    The chosen format is also printed.

    :param dates: Date strings that are expected to share one format.
    :return: The matching ``strptime`` format, or None if no candidate fits.
    """
    candidates = (
        "%d/%m/%Y",  # 21/08/2023
        "%d/%m/%y",  # 21/08/23
        "%m/%d/%Y",  # 08/21/2023
        "%m/%d/%y",  # 08/21/23
        "%Y/%m/%d",  # 2023/08/21
        "%Y-%m-%d",  # 2023-08-21
        "%d-%m-%Y",  # 21-08-2023
        "%m-%d-%Y",  # 08-21-2023
        "%d.%m.%Y",  # 21.08.2023
        "%m.%d.%Y",  # 08.21.2023
        "%Y.%m.%d",  # 2023.08.21
    )

    for candidate in candidates:
        try:
            # A single unparsable value rules this candidate out.
            for value in dates:
                datetime.strptime(value, candidate)
        except ValueError:
            continue
        print(f"Date format determined to be {candidate}")
        return candidate

    return None


def get_time_format(times: list[str]) -> str:
    """
    Find the first known time format that parses every string in ``times``.

    Candidates are tried in a fixed order and the chosen format is printed.

    :param times: Time strings that are expected to share one format.
    :return: The matching ``strptime`` format, or None if no candidate fits.
    """
    candidates = (
        "%H:%M",        # 14:30
        "%H:%M:%S",     # 14:30:00
        "%I:%M %p",     # 02:30 PM
        "%I:%M:%S %p",  # 02:30:00 PM
        "%H%M",         # 1430
        "%I%M %p",      # 0230 PM
    )

    for candidate in candidates:
        try:
            # A single unparsable value rules this candidate out.
            for value in times:
                datetime.strptime(value, candidate)
        except ValueError:
            continue
        print(f"Time format determined to be {candidate}")
        return candidate

    return None


def get_day_of_week(i: int) -> str:
    """
    Look up the weekday name stored at position ``i`` of ``days_of_week``.

    :param i: Index into ``days_of_week`` (0 is "Monday").
    :return: The weekday name at that position.
    """
    weekday_name = days_of_week[i]
    return weekday_name


def parse_day_column(df_col):
    """
    Convert a column of weekday names into an ordered categorical.

    The categories follow the order of ``days_of_week``; names from that list
    that do not occur in the column are left out of the category list.

    :param df_col: A pandas Series of weekday names.
    :return: The Series as an ordered categorical.
    """
    categorical = df_col.astype('category')
    known = categorical.cat.categories
    ordered_days = [name for name in days_of_week if name in known]
    return categorical.cat.reorder_categories(ordered_days, ordered=True)


def parse_whatsapp_data(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn raw WhatsApp rows into typed, analysis-ready columns.

    The date and time formats are detected from the data. 'Date' becomes the
    full timestamp (date + time), 'Time' becomes the time of day floored to
    15 minutes, and the columns 'MMYYYY', 'Week', 'Hour', 'Day' and 'Emojis'
    are appended. 'MMYYYY' and 'Week' are derived from the date alone.

    Args:
        input_df (pd.DataFrame): Input DataFrame with string columns 'Date', 'Time', 'User', 'Message'

    Returns:
        pd.DataFrame: A copy of the input with columns 'Date', 'Time', 'User', 'Message', 'MMYYYY', 'Week', 'Hour', 'Day', 'Emojis'
    """
    df = input_df.copy()

    # Calendar-level columns come from the date part only.
    date_format = get_date_format(set(df.Date))
    date_only = pd.to_datetime(df.Date, format=date_format)
    df['Date'] = date_only
    month_labels = date_only.apply(lambda stamp: stamp.strftime("%m/%Y"))
    df['MMYYYY'] = pd.to_datetime(month_labels, format='%m/%Y')
    df['Week'] = date_only.dt.isocalendar().week

    # Parse the clock time on its own so hour and 15-minute slot can be derived.
    time_format = get_time_format(df.Time)
    clock = pd.to_datetime(df.Time, format=time_format)

    # The final 'Date' is the full timestamp, rebuilt from the original strings.
    stamp_text = input_df.Date + ' ' + input_df.Time
    df['Date'] = pd.to_datetime(stamp_text, format=f"{date_format} {time_format}")

    df['Hour'] = clock.dt.hour
    df['Time'] = clock.dt.floor('15min').dt.time

    weekday_names = df['Date'].dt.weekday.apply(get_day_of_week)
    df['Day'] = parse_day_column(weekday_names)

    df['Emojis'] = df['Message'].apply(extract_emojis)
    print('Dataframe created and WhatsApp data parsed.')
    return df


def parse_telegram_data(input_df: pd.DataFrame, utc_offset_hours: float = 8) -> pd.DataFrame:
    """
    Parse Telegram data

    Args:
        input_df (pd.DataFrame): Input DataFrame with columns 'date_unixtime', 'text', 'from'
        utc_offset_hours (float, optional): Hours added to the exported Unix
            timestamps before they are converted to datetimes. Defaults to 8
            (GMT+8), the offset that used to be hard-coded.

    Returns:
        pd.DataFrame: DataFrame with columns 'Date', 'Time', 'MMYYYY', 'Week', 'Hour', 'Day', 'User', 'Message', 'Emojis'
    """
    df = input_df.copy()

    # Shift the epoch seconds by the requested offset and convert once;
    # every time-derived column below is computed from this one Series.
    offset_seconds = int(utc_offset_hours * 60 * 60)
    shifted_seconds = df.date_unixtime.apply(lambda x: int(x) + offset_seconds)
    timestamps = pd.to_datetime(shifted_seconds, unit='s')

    df['Date'] = timestamps
    df['Time'] = timestamps.dt.floor('15min').dt.time
    df['MMYYYY'] = pd.to_datetime(timestamps.dt.strftime("%m/%Y"), format='%m/%Y')
    df['Week'] = timestamps.dt.isocalendar().week
    df['Hour'] = timestamps.dt.hour
    df['Day'] = parse_day_column(timestamps.dt.weekday.apply(get_day_of_week))

    # NOTE(review): non-string 'text' values are kept as their str() form,
    # as before - confirm whether structured text should be flattened instead.
    df['Message'] = df.text.apply(str)

    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice of the unfiltered frame.
    df = df[df['Message'].str.len() > 0].copy()
    df['Emojis'] = df.Message.apply(extract_emojis)
    df['User'] = df['from']

    df = df[['Date', 'Time', 'MMYYYY', 'Week', 'Hour', 'Day', 'User', 'Message', 'Emojis']]
    print('Dataframe created and Telegram data parsed.')
    return df


def parse_data(file_type: str, file_path: str) -> pd.DataFrame:
    """
    Parse a WhatsApp or Telegram chat file and return a DataFrame.

    User names in the result are replaced by their upper-cased initials.

    Args:
        file_type (str): The type of file to parse. Must contain either 'text'
            (WhatsApp export) or 'json' (Telegram export).
        file_path (str): The path to the file to parse.

    Returns:
        pd.DataFrame: The parsed DataFrame with columns 'Date', 'Time', 'MMYYYY', 'Week', 'Hour', 'Day', 'User', 'Message', 'Emojis'.
        None is returned (after printing the reason) when the file type is
        unsupported or the Telegram JSON is invalid, empty, or has no
        'messages' field.
    """
    df = None

    with open(file_path, encoding='utf-8') as file:
        decoded = file.readlines()

    if 'text' in file_type:
        print("WhatsApp chat detected.")

        # Split into list of messages and keep only relevant ones.
        parsed_lines = [extract_whatsapp_row(x) for x in decoded]
        parsed_lines = [row for row in parsed_lines if all(row)]  # Keep only rows with non-empty strings
        df = pd.DataFrame(parsed_lines, columns=['Date', 'Time', 'User', 'Message'])
        df = parse_whatsapp_data(df)

    if 'json' in file_type:
        print("Telegram chat detected.")

        try:
            json_data = json.loads(''.join(decoded))
            print("JSON is valid.")

        except json.JSONDecodeError as e:
            print("Error: Invalid JSON string.")
            print(e)
            return df

        if json_data:
            raw = pd.DataFrame(json_data)
            if 'messages' in raw:
                messages = pd.json_normalize(raw.messages)
                messages = messages[['date_unixtime', 'from', 'text']]
                messages = messages[  # Keep only rows with non-empty strings
                    # A missing sender compares unequal to '' and would slip
                    # through the checks below, then break the initials step.
                    messages['from'].notna() &
                    (messages['date_unixtime'].str.strip() != '') &
                    (messages['from'].str.strip() != '') &
                    (messages['text'].str.strip() != '')
                ].reset_index(drop=True)
                df = parse_telegram_data(messages)
            else:
                print("Error: 'messages' field not found in JSON data.")
        else:
            print("Error: Empty JSON file.")

    # Nothing was parsed: report an unknown type (the other failures have
    # already printed their reason) and stop instead of crashing below.
    if df is None:
        if 'text' not in file_type and 'json' not in file_type:
            print(f"Error: Unsupported file type '{file_type}'.")
        return None

    # Anonymize users with initials.
    df = df.copy()
    df['User'] = df.User.apply(lambda x: ''.join([name[0].upper() for name in x.split()]))

    return df


def add_message_group(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a 'Message_Group' column to a DataFrame.

    Consecutive rows by the same user share one group number. Numbering
    starts at 1 and increases by one each time the user changes from one row
    to the next. Rows are compared by position, so any index works, and an
    empty DataFrame yields an empty column.

    Args:
        df (pd.DataFrame): The DataFrame to add the 'Message_Group' column to.
            Must have a 'User' column.

    Returns:
        pd.DataFrame: A copy of the DataFrame with the added 'Message_Group' column.
    """
    df = df.copy()
    users = df['User']

    # A row opens a new group when its user differs from the previous row's.
    # The first row is compared with a missing value, so it always opens group 1.
    starts_new_group = users.ne(users.shift())

    # to_numpy() assigns by position rather than by index label.
    df['Message_Group'] = starts_new_group.cumsum().to_numpy()
    return df


def add_convo_id(df: pd.DataFrame, minutes: int = 10) -> pd.DataFrame:
    """
    Add a 'Convo_ID' column to a DataFrame.

    Messages are first chained into bursts: a message belongs to a burst when
    it is at most ``minutes`` after the previous message or at most
    ``minutes`` before the next one. A burst counts as a conversation only if
    more than one distinct user takes part in it. Conversations are numbered
    1, 2, ... in order of appearance; every other row gets 0.

    Args:
        df (pd.DataFrame): The DataFrame to add the 'Convo_ID' column to.
            Must have 'Date' and 'User' columns.
        minutes (int, optional): The number of minutes to consider a message as part of the same conversation. Defaults to 10.

    Returns:
        pd.DataFrame: A copy of the DataFrame, sorted by index, with the added 'Convo_ID' column.
    """
    time_delta = pd.Timedelta(minutes=minutes)
    df = df.copy()

    # With fewer than two messages there is nothing to chain together.
    if len(df) < 2:
        df['Convo_ID'] = 0
        return df

    # Gap checks in both directions (the first row has no previous message,
    # the last row has no next one).
    near_previous = df.Date.diff() <= time_delta
    near_next = near_previous.shift(-1, fill_value=False)
    in_burst = near_previous | near_next

    # A burst starts at a message that is close to the next one but not to
    # the previous one; the running count of such starts numbers the bursts.
    burst_id = (in_burst & ~near_previous).cumsum()
    df['Convo_ID'] = burst_id.where(in_burst, 0).to_numpy()

    # Keep only bursts with more than one distinct user, renumbered from 1.
    candidates = df[df.Convo_ID != 0]
    df_convo = candidates.groupby('Convo_ID').filter(lambda burst: burst.User.nunique() > 1)
    df_convo = df_convo.assign(Convo_ID=pd.factorize(df_convo.Convo_ID)[0] + 1)

    # Everything else (isolated messages and single-user bursts) gets 0.
    df_non_convo = df[~df.index.isin(df_convo.index)].assign(Convo_ID=0)

    return pd.concat([df_convo, df_non_convo]).sort_index()


def add_response_time(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds four columns to a copy of the input DataFrame:

    - 'Is_Response': True when the row's user differs from the previous
      row's user (always False for the first row).
    - 'Response_Time': set only on the first row of each 'Message_Group';
      it is the time between that row and the first row of the previous
      message group. NaT on every other row and on the very first group.
    - 'Is_Convo_Response': 'Is_Response' restricted to rows inside a
      conversation ('Convo_ID' != 0), excluding the first row of each
      conversation.
    - 'Time_Delta': the time between each row and the previous row.

    Rows are compared by position, so an empty DataFrame is handled and
    simply gains the four (empty) columns.

    Args:
        df (pd.DataFrame): The DataFrame to extend. Must have 'User', 'Date',
            'Message_Group' and 'Convo_ID' columns.

    Returns:
        pd.DataFrame: A copy of the DataFrame with the added columns.
    """
    df = df.copy()

    # Whether a message is a response by a different user
    users = df['User']
    is_response = users.ne(users.shift())
    is_response.iloc[:1] = False  # the first message responds to nothing
    df['Is_Response'] = is_response.to_numpy()

    # Response time: keep the date only on rows that open a message group,
    # then subtract the most recent earlier group-opening date.
    starts_group = ~df['Message_Group'].duplicated(keep='first')
    group_start = df['Date'].where(starts_group)
    df['Response_Time'] = group_start - group_start.ffill().shift()

    # Whether a response is within a conversation (excl. first message of each)
    in_convo = df['Convo_ID'].ne(0)
    opens_convo = in_convo & ~df['Convo_ID'].duplicated(keep='first')
    df['Is_Convo_Response'] = df['Is_Response'] & in_convo & ~opens_convo

    # Time delta between every message
    df['Time_Delta'] = df['Date'].diff()

    return df


def parse_timedelta(td: pd.Timedelta) -> str:
    """
    Render a timedelta as text such as "1 days 2 hrs 3 min 4 sec".

    Only the non-zero day, hour, minute and second components are included;
    a timedelta whose four components are all zero gives an empty string.

    Args:
        td: A pandas Timedelta object.

    Returns:
        str: The non-zero components, largest unit first, separated by spaces.
    """
    parts = td.components
    labelled = (
        (parts.days, "days"),
        (parts.hours, "hrs"),
        (parts.minutes, "min"),
        (parts.seconds, "sec"),
    )
    return " ".join(f"{amount} {unit}" for amount, unit in labelled if amount)


def transpose_stats_df(stats_df: pd.DataFrame) -> pd.DataFrame:
    """
    Flip a per-user statistics table so that users become columns.

    Args:
        stats_df: A Pandas DataFrame with a 'User' column and other columns
            representing message statistics.

    Returns:
        pd.DataFrame: One row per statistic and one column per user; the
            statistic names are held in a leading column named ' '.
    """
    flipped = stats_df.set_index('User').transpose().reset_index()
    # Drop the 'User' label that the transpose carried over to the column axis.
    flipped.columns.name = None
    return flipped.rename(columns={'index': ' '})


# In [None]:
# Inputs used on other machines, kept for reference:
# df = parse_data('json', '/home/zyf0717/Downloads/result.json')
# df = parse_data('text', '/home/zyf0717/Downloads/_chat.txt')
# df = parse_data('json', '/Users/yifei/Downloads/result.json')
# df = parse_data('text', '/Users/yifei/Downloads/_chat.txt')

# Load the Telegram export from the local chats/ folder.
# WhatsApp alternative: df = parse_data('text', f'{cwd}/chats/_chat.txt')
df = parse_data('json', f'{cwd}/chats/result.json')

# Enrichment order matters: add_response_time reads the 'Message_Group' and
# 'Convo_ID' columns produced by the two inner calls.
df = add_response_time(add_convo_id(add_message_group(df)))

df.tail(20)