# Edna One

In [None]:
import pandas as pd
import numpy as np
import re
from collections import OrderedDict
import math

In [None]:
# Load the chat export and parse its two date-like columns.
chat = pd.read_csv('/tmp/oEqsHU0WnNYe22r8j39UQixTfGP2/WhatsApp/_chat.csv')

for _column in ('timestamp', 'date'):
    chat[_column] = pd.to_datetime(chat[_column])

# Calendar fields derived from the parsed timestamp, used for grouping later.
chat['month'] = chat['timestamp'].dt.month
chat['year'] = chat['timestamp'].dt.year
chat['month-year'] = chat['timestamp'].dt.to_period('M')

### Masks

In [None]:
def getLoveMask(df):
    """Return a boolean mask of the rows whose ``content`` says "love you".

    A message matches (case-insensitively) when it contains "I " or "me ",
    an optional "still "/"just ", and then "love you"/"love u" (the "wove"
    spelling is accepted too), or when it starts with "love you"/"love u".

    Args:
        df: DataFrame with a text ``content`` column.

    Returns:
        A boolean Series aligned with ``df``. Rows whose content is missing
        are reported as False so the result can index ``df`` directly.
    """
    # Non-capturing groups: str.contains only needs a yes/no answer, and
    # capturing groups make pandas emit a "pattern has match groups" warning.
    pattern = r'(?:(?:I |me )(?:still |just )?[lw]ove (?:you|u))|(?:^[lw]ove (?:you|u))'
    # na=False keeps NaN out of the mask; a mask containing NaN cannot be
    # used for boolean indexing.
    return df['content'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

def getMissMask(df):
    """Return a boolean mask of the rows whose ``content`` says "miss you".

    A message matches (case-insensitively) when it contains "I " or "me ",
    an optional "still "/"just ", and then "miss you"/"miss u", or when it
    starts with "miss you"/"miss u".

    Args:
        df: DataFrame with a text ``content`` column.

    Returns:
        A boolean Series aligned with ``df``. Rows whose content is missing
        are reported as False so the result can index ``df`` directly.
    """
    # Non-capturing groups: str.contains only needs a yes/no answer, and
    # capturing groups make pandas emit a "pattern has match groups" warning.
    pattern = r'(?:(?:I |me )(?:still |just )?miss (?:you|u))|(?:^miss (?:you|u))'
    # na=False keeps NaN out of the mask; a mask containing NaN cannot be
    # used for boolean indexing.
    return df['content'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

def getGeneralStatistic(df):
    """Summarise message volume and time span for a chat DataFrame.

    Args:
        df: DataFrame with ``sender``, ``year``, ``month`` and a datetime
            ``timestamp`` column. May be empty.

    Returns:
        An OrderedDict with, in order:
            totalText: number of rows.
            participants: distinct ``sender`` values.
            means: OrderedDict with the mean number of rows per distinct
                ``year`` and per elapsed ``month`` / ``day`` of the span.
            totalMonth / totalDay: span between the earliest and latest
                timestamp, rounded up to whole months / days (0 when empty).
            years / months: distinct values; a bare scalar when there is
                exactly one, otherwise a list.
            first_record_timestamp / last_record_timestamp: the first and
                last row's timestamp formatted as ``YYYY-MM-DD``, returned as
                a Series with at most one element.
    """
    # A month is not a fixed duration, so dividing by np.timedelta64(1, 'M')
    # is ambiguous (recent pandas refuses it). Use the mean Gregorian month
    # (365.2425 days / 12) expressed in seconds instead.
    seconds_per_day = 86400
    seconds_per_month = 2629746

    data = OrderedDict()

    totalText = df.shape[0]
    participants = df['sender'].unique().tolist()
    years = df['year'].unique().tolist()
    months = np.sort(df['month'].unique()).tolist()

    data['totalText'] = totalText
    data['participants'] = participants

    # Mean
    data['means'] = OrderedDict()
    data['means']['year'] = totalText / len(years) if years else 0

    if totalText > 0:
        # max - min rather than last - first, so rows that are not in
        # chronological order cannot produce a negative span.
        span_seconds = (df['timestamp'].max() - df['timestamp'].min()).total_seconds()
        delta_month = math.ceil(span_seconds / seconds_per_month)
        delta_day = math.ceil(span_seconds / seconds_per_day)
        data['totalMonth'] = delta_month
        data['totalDay'] = delta_day
        # A zero-length span (e.g. a single row) falls back to the raw count.
        data['means']['month'] = (totalText / delta_month) if delta_month > 0 else totalText
        data['means']['day'] = (totalText / delta_day) if delta_day > 0 else totalText
    else:
        data['totalMonth'] = 0
        data['totalDay'] = 0
        data['means']['month'] = totalText
        data['means']['day'] = totalText

    data['years'] = years[0] if len(years) == 1 else years
    data['months'] = months[0] if len(months) == 1 else months

    # NOTE(review): these are one-element Series, not plain strings; kept
    # as-is so existing consumers of the result see the same shape.
    data['first_record_timestamp'] = df.head(1)['timestamp'].dt.strftime('%Y-%m-%d')
    data['last_record_timestamp'] = df.tail(1)['timestamp'].dt.strftime('%Y-%m-%d')

    return data

In [None]:
# Boolean row masks over the full chat.
LOVE_MASK = getLoveMask(chat)
MISS_MASK = getMissMask(chat)
# Matches "argue" / "argument" anywhere in the message, case-insensitively.
# Non-capturing group avoids pandas' match-group warning; na=False keeps rows
# with missing content out of the mask so it can be used for indexing.
ARGUE_MASK = chat['content'].str.contains(r'argu(?:ment|e)', flags=re.IGNORECASE, regex=True, na=False)

In [None]:
# Row subsets selected by the masks computed for the whole chat.
love_chat, miss_chat = chat.loc[LOVE_MASK], chat.loc[MISS_MASK]

In [None]:
# Root statistics for the whole chat, followed by an empty per-year container
# and the summaries of the love / miss subsets.
data = getGeneralStatistic(chat)
data.update(
    annual_statistic=OrderedDict(),
    love_statistic=getGeneralStatistic(love_chat),
    miss_statistic=getGeneralStatistic(miss_chat),
)
data

In [None]:
def _statisticWithLoveMiss(frame):
    """Return getGeneralStatistic(frame) plus its love / miss sub-summaries."""
    stats = getGeneralStatistic(frame)
    stats['love_statistic'] = getGeneralStatistic(frame[getLoveMask(frame)])
    stats['miss_statistic'] = getGeneralStatistic(frame[getMissMask(frame)])
    return stats


# getGeneralStatistic stores a bare scalar in 'years' when the chat covers a
# single year; wrap it so the loop below does not try to iterate a number.
_years = data['years'] if isinstance(data['years'], list) else [data['years']]

for year in _years:
    _chat = chat[chat['year'] == year]
    year_stats = _statisticWithLoveMiss(_chat)

    # Per-month breakdown within the year, keyed by the month number as text.
    year_stats['monthly_statistic'] = OrderedDict()
    for month in _chat['month'].unique():
        _month_chat = _chat[_chat['month'] == month]
        year_stats['monthly_statistic'][str(month)] = _statisticWithLoveMiss(_month_chat)

    data['annual_statistic'][str(year)] = year_stats

## Higher-order statistics (mean, median, and so on)

In [None]:
# Bare expression: presumably the last line of a notebook cell, so that the
# current contents of `data` are rendered as the cell output.
data