# Edna One

In [5]:
import pandas as pd
import numpy as np
import re
from collections import OrderedDict
import math

In [6]:
chat = pd.read_csv('/tmp/oEqsHU0WnNYe22r8j39UQixTfGP2/WhatsApp/_chat.csv')

chat['timestamp'] = pd.to_datetime(chat['timestamp'])
chat['date'] = pd.to_datetime(chat['date'])
chat['month'] = pd.DatetimeIndex(chat['timestamp']).month
chat['year'] = pd.DatetimeIndex(chat['timestamp']).year
chat['month-year'] = pd.to_datetime(chat['timestamp']).dt.to_period('M')

### Masks

In [79]:
def getLoveMask(df):
    return df['content'].str.contains('((I |me )(still |just )?(l|w)ove (you|u))|(^(l|w)ove (you|u))', flags=re.IGNORECASE, regex=True)

def getMissMask(df):
    return df['content'].str.contains('((I |me )(still |just )?miss (you|u))|(^miss (you|u))', flags=re.IGNORECASE, regex=True)

def getGeneralStatistic(df, parentDf=None):
    data = {}
    
    totalText = df.shape[0]
    participants = df['sender'].unique().tolist()
    years = df['year'].unique().tolist()
    months = np.sort(df['month'].unique()).tolist()
    
    data['totalText'] = totalText
    data['participants'] = participants
    
    #Mean
    data['means'] = {}
    data['means']['year'] = totalText / len(years) if len(years) > 0 else 0
    
    range_df = parentDf if parentDf is not None else df
    
    if(range_df.shape[0] > 0):
        delta = range_df['timestamp'].iloc[-1] - range_df['timestamp'].iloc[0] 
        delta_month = math.ceil(delta/np.timedelta64(1, 'M'))
        delta_day = math.ceil(delta/np.timedelta64(1, 'D'))
        data['totalMonth'] = delta_month
        data['totalDay'] = delta_day
        data['means']['month'] = (totalText / delta_month) if delta_month > 0 else totalText
        data['means']['day'] = (totalText / delta_day) if delta_day > 0 else totalText
    else:
        data['totalMonth'] = 0
        data['totalDay'] = 0
        data['means']['month'] = totalText
        data['means']['day'] = totalText

    data['years'] = years
    data['months'] = months
    
    data['first_record_timestamp'] = df.iloc[0]['timestamp']
    data['last_record_timestamp'] = df.iloc[-1]['timestamp']
    
    return data

def getMaskedData(df, parentDf=None):
    data = {}
    for mask in CUSTOM_MASKS:
        data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
    
    return data

In [80]:
LOVE_MASK = getLoveMask(chat)
MISS_MASK = getMissMask(chat)
CUSTOM_MASKS = [
    {
        'name': 'love_mask',
        'mask': LOVE_MASK
    },
    {
        'name': 'miss_mask',
        'mask': MISS_MASK
    }
]

  return func(self, *args, **kwargs)


In [55]:
# love_chat = chat[LOVE_MASK]
# miss_chat = chat[MISS_MASK]

In [81]:
data = getGeneralStatistic(chat)
data['annual_statistics'] = OrderedDict()
data['masks'] = getMaskedData(chat)
# data['masks']['love_statistic'] = getGeneralStatistic(love_chat)
# data['masks']['miss_statistic'] = getGeneralStatistic(miss_chat)
data

{'totalText': 101987,
 'participants': ['Mon Petit Chou', 'Thang Nguyen'],
 'means': {'year': 20397.4,
  'month': 2683.8684210526317,
  'day': 90.01500441306267},
 'totalMonth': 38,
 'totalDay': 1133,
 'years': [2017, 2018, 2019, 2020, 2021],
 'months': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 'first_record_timestamp': Timestamp('2017-12-19 10:22:42'),
 'last_record_timestamp': Timestamp('2021-01-24 13:18:36'),
 'annual_statistics': OrderedDict(),
 'masks': {'love_mask': {'totalText': 6338,
   'participants': ['Mon Petit Chou', 'Thang Nguyen'],
   'means': {'year': 1267.6,
    'month': 166.78947368421052,
    'day': 5.593998234774934},
   'totalMonth': 38,
   'totalDay': 1133,
   'years': [2017, 2018, 2019, 2020, 2021],
   'months': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
   'first_record_timestamp': Timestamp('2017-12-26 19:31:24'),
   'last_record_timestamp': Timestamp('2021-01-24 11:59:22')},
  'miss_mask': {'totalText': 1165,
   'participants': ['Thang Nguyen', 'Mon Petit Chou'],

In [77]:
for year in data['years']:
    _chat = chat[chat['year'] == year]
    year = str(year)
    data['annual_statistics'][year] = getGeneralStatistic(_chat) 
    data['annual_statistics'][year]['masks'] = getMaskedData(_chat)
    
    data['annual_statistics'][year]['monthly_statistics'] = OrderedDict()
    for month in _chat['month'].unique():
        _month_chat = _chat[_chat['month'] == month]
        month = str(month)
        data['annual_statistics'][year]['monthly_statistics'][month] = getGeneralStatistic(_month_chat)
        data['annual_statistics'][year]['monthly_statistics'][month]['masks'] = getMaskedData(_month_chat)

  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = getGeneralStatistic(df[mask['mask']], df)
  data[mask['name']] = ge

## Higher statistic (Mean, Median and such)

In [78]:
data

{'totalText': 101987,
 'participants': ['Mon Petit Chou', 'Thang Nguyen'],
 'means': {'year': 20397.4,
  'month': 2683.8684210526317,
  'day': 90.01500441306267},
 'totalMonth': 38,
 'totalDay': 1133,
 'years': [2017, 2018, 2019, 2020, 2021],
 'months': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 'first_record_timestamp': 0    2017-12-19
 Name: timestamp, dtype: object,
 'last_record_timestamp': 101986    2021-01-24
 Name: timestamp, dtype: object,
 'annual_statistics': OrderedDict([('2017',
               {'totalText': 1008,
                'participants': ['Mon Petit Chou', 'Thang Nguyen'],
                'means': {'year': 1008.0,
                 'month': 1008.0,
                 'day': 77.53846153846153},
                'totalMonth': 1,
                'totalDay': 13,
                'years': [2017],
                'months': [12],
                'first_record_timestamp': 0    2017-12-19
                Name: timestamp, dtype: object,
                'last_record_timestamp': 1007 