In [1]:
%reset -f
import json
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Cut-off date for the `isLDR` flag derived during cleaning: messages dated on
# or before it are flagged 0, later ones 1.
ldr_date = dt.date(2018, 9, 16)

# Chat export in JSON form; the individual records sit under the 'messages' key.
text_file = 'ChatExport/result.json'
with open(text_file, encoding="utf8") as f:
    d = json.load(f)

# Columns the rest of the notebook works with.
cols = ['type','date','from','from_id','text','sticker_emoji','file','media_type','photo','action']

# reindex() instead of plain [cols] selection: json_normalize only creates a
# column for keys that occur in at least one record, so an export without, say,
# any sticker or service message would make [cols] raise KeyError. reindex keeps
# the column set stable and fills absent columns with NaN.
text_df = pd.json_normalize(d['messages']).reindex(columns=cols)

### Clean data and create new columns

In [3]:
# Drop phone-call records. The explicit .copy() makes text_df an independent
# frame, so the column assignments below write to it directly instead of to a
# filtered slice (which raises SettingWithCopyWarning on pandas versions
# without copy-on-write).
text_df = text_df[text_df['action']!='phone_call'].copy()
text_df['from_id'] = text_df['from_id'].astype('str')

# Parse the exported 'date' values into timestamps.
text_df['datetime'] = pd.to_datetime(text_df['date'])
# tz_localize only attaches the zone to the naive timestamps, it does not shift them.
# NOTE(review): this assumes the export was written in Vietnam local time - confirm.
text_df['datetime_vn_lc'] = text_df['datetime'].dt.tz_localize(tz='Asia/Ho_Chi_Minh')
# From here on 'date' holds datetime.date objects rather than the raw export value.
text_df['date'] = text_df['datetime'].dt.date

text_df['hour'] = text_df['datetime'].dt.hour

# Replace one sender's display name with an emoji alias (chick, yellow heart, sunflower).
text_df['from'] = np.where(text_df['from']=='Trường Hoàng',
                           u'\U0001F425'+u'\U0001F49B'+u'\U0001F33B', text_df['from'])

# text2 = message text with punctuation removed and whitespace runs collapsed.
# Inside the character class `,-.` is a range from ',' to '.', which also covers '-'.
text_df['text2'] = text_df['text'].replace('[\'!#$%&()*+,-./:;<=>?@^_`{|}~]', '', regex=True)
text_df['text2'] = text_df['text2'].replace('\\s+', ' ', regex=True)

# Number of whitespace-separated tokens in the cleaned text.
text_df['word_count'] = text_df['text2'].str.split().str.len()

# pandas numbers weekdays Monday=0 ... Sunday=6.
dayOfWeek={0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
text_df['weekday_id'] = text_df['datetime'].dt.dayofweek
text_df['weekday'] = text_df['weekday_id'].map(dayOfWeek)

# 0 for messages dated on or before ldr_date, 1 for everything after it.
text_df['isLDR'] = np.where(text_df['date'] <= ldr_date,
                            0, 1)

def _hour_intervals(hours):
    """Return one boolean mask per day-part bucket for a Series of hours (0-23).

    The masks are ordered to line up with the labels in `day_parts`;
    `between` is inclusive on both ends.
    """
    return [hours.between(6, 9),
            hours.between(10, 13),
            hours.between(14, 17),
            hours.between(18, 21),
            hours.between(22, 23),
            hours.between(0, 5)]


day_parts = ['6-9','10-13','14-17','18-21','22-24','24-5']
# Day-part labels treated as night when deriving `isNight`.
# NOTE(review): "night" = 22:00-05:59 is inferred from the bucket labels - confirm.
night_parts = ['22-24', '24-5']

day_intervals = _hour_intervals(text_df['hour'])
# The fallback is the string '0' (not the integer 0): newer NumPy releases refuse
# to find a common dtype for string choices and an integer default and raise
# TypeError. Older releases silently produced the same '0' string.
text_df['daypart'] = np.select(day_intervals, day_parts, '0')

# Same instants expressed in Europe/Berlin time: once as naive wall-clock
# timestamps, once tz-aware.
text_df['datetime_eu'] = text_df['datetime_vn_lc'].dt.tz_convert('Europe/Berlin').dt.tz_localize(None)
text_df['datetime_eu_lc'] = text_df['datetime_vn_lc'].dt.tz_convert('Europe/Berlin')

text_df['hour_eu'] = text_df['datetime_eu'].dt.hour
day_intervals_eu = _hour_intervals(text_df['hour_eu'])
text_df['daypart_eu'] = np.select(day_intervals_eu, day_parts, '0')

# 1 when the message falls into a night bucket in BOTH time zones, else 0.
# The previous comparison against the literal 'night' could never match, because
# the day-part columns only ever hold the labels from `day_parts`.
text_df['isNight'] = np.where(text_df['daypart'].isin(night_parts)
                              & text_df['daypart_eu'].isin(night_parts),
                              1, 0)

# Timestamp of the following row. Rows carrying the latest timestamp point at
# themselves, so the final message gets a zero gap instead of NaT.
# Series.max() replaces the builtin max(), which iterates element by element.
is_latest = text_df['datetime'] == text_df['datetime'].max()
text_df['datetime_next'] = np.where(is_latest,
                                    text_df['datetime'],
                                    text_df['datetime'].shift(-1))

# Gap to the next message, in days. total_seconds() is required here:
# `.dt.seconds` only returns the seconds *component* (0-86399) of each
# timedelta, so any gap of a day or more lost its whole days.
text_df['buffer'] = (text_df['datetime_next'] - text_df['datetime']).dt.total_seconds()/(60*60*24)

# --- Sticker-only columns: rows that are not stickers get NaN in all three ---
sticker_path = 'https://raw.githubusercontent.com/truonghm/telegram-text-analysis/master/'
is_sticker = text_df['media_type']=='sticker'
relative_file = 'ChatExport/' + text_df['file']

# Path of the sticker file relative to the notebook.
text_df['sticker_local_path'] = np.where(is_sticker, relative_file, np.nan)

# Same file addressed through the raw GitHub URL prefix.
text_df['sticker_url'] = np.where(is_sticker, sticker_path + relative_file, np.nan)

# 30x30 <img> tag so the sticker can be rendered inside an HTML table.
img_tag = '<img src="'+ text_df['sticker_url'] + '" width="30" height="30"/>'
text_df['sticker_html'] = np.where(is_sticker, img_tag, np.nan)

# --- Integer date keys (YYYYMMDD, YYYYMM, YYYY) ---
for key_column, date_format in (('date_int', '%Y%m%d'), ('month_int', '%Y%m')):
    formatted = text_df['datetime'].dt.strftime(date_format)
    text_df[key_column] = pd.to_numeric(formatted, errors='coerce')
text_df['year_int'] = text_df['datetime'].dt.year

In [4]:
# Columns written to the CSV snapshot, in output order.
cols = [
    # fields taken from the export
    'type', 'date', 'from', 'from_id', 'media_type', 'action',
    # timestamps, cleaned text and calendar features
    'datetime', 'datetime_vn_lc', 'hour', 'text2', 'word_count', 'weekday_id',
    'weekday', 'isLDR', 'daypart', 'datetime_eu', 'datetime_eu_lc', 'hour_eu',
    'daypart_eu', 'isNight', 'datetime_next', 'buffer',
    # sticker helpers and integer date keys
    'sticker_local_path', 'sticker_url', 'sticker_html', 'date_int', 'month_int', 'year_int',
]

# Let to_csv do the column selection; the row index is not written.
text_df.to_csv('text_df.csv', columns=cols, index=False)

# Visualizations

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from IPython.display import Image, HTML

# Make "plotly_white" the default template, so figures created after this cell
# use it without passing `template=` explicitly.
pio.templates.default = "plotly_white"

### Resources

- https://plotly.com/python/templates/
- https://plotly.com/python/bar-charts/
- https://plotly.com/python/discrete-color/
- https://plotly.com/python/horizontal-bar-charts/
- https://awesome-streamlit.org/

## 1. 10 most used stickers

In [None]:
# Column labels of the two senders as they appear in text_df['from'].
sender_cols = ['🐑💛🌙', '🐥💛🌻']

# Count sticker messages per (sticker, sender), then spread senders into columns.
stickers_df = (
    text_df[text_df['media_type']=='sticker']
    .groupby(['sticker_url','from'])
    .agg({'from_id':'count'})
    .reset_index()
    .pivot(index='sticker_url', columns='from', values='from_id')
    # A sticker used by only one sender leaves NaN in the other sender's cell.
    # Without this fill its total became NaN, so it was ranked last and the
    # integer casts below could fail on NaN.
    .fillna(0)
    .reset_index()
)

stickers_df['total'] = stickers_df[sender_cols[0]] + stickers_df[sender_cols[1]]
stickers_df = stickers_df.sort_values(['total'], ascending=False).head(10)
# Counts are whole numbers; cast back from the float dtype the pivot produced.
stickers_df = stickers_df.astype({column: int for column in sender_cols + ['total']})

# 30x30 <img> tag per sticker so the table shows the image rather than the URL.
stickers_df['sticker_html'] = '<img src="'+ stickers_df['sticker_url'] + '" width="30" height="30"/>'

# Lift the column-width limit so long cell values are not truncated.
pd.set_option('display.max_colwidth', None)

# escape=False keeps the <img> markup intact so it renders as HTML.
HTML(stickers_df[['sticker_html'] + sender_cols + ['total']].to_html(escape=False))
# st.write(stickers_df[['sticker_html','🐑💛🌙','🐥💛🌻','total']].to_html(escape=False), unsafe_allow_html=True)

## 2. Overview

In [None]:
# Per-sender summary: number of non-null cleaned texts, mean words per message
# and median hour of day. Built as one chain (no inplace=True) so re-running
# the cell always starts from text_df.
overview_df = (
    text_df.groupby('from')
    .agg({'text2':'count','word_count':'mean','hour':'median'})
    .reset_index()
    .rename(columns={'text2':'Message_count',
                     'word_count':'Average_message_length',
                     'hour':'Median_hour'})
)

# Format by the renamed column labels. The previous dict used the pre-rename key
# 'hour', which matches no column of overview_df, so the median hour was never
# formatted.
overview_df.style.format({'Message_count': '{:.0f}',
                          'Average_message_length': '{:.2f}',
                          'Median_hour': '{:.0f}'})

In [None]:
# Show the entries of Plotly Express' qualitative "Pastel" colour sequence,
# as a reference when choosing explicit colours for charts.
print(px.colors.qualitative.Pastel)

In [None]:
# Fixed colour per sender; a sender missing from this map falls back to the
# Pastel sequence passed alongside it.
sender_colors = {
    "🐑💛🌙": 'rgb(246, 207, 113)',
    "🐥💛🌻": "rgb(102, 197, 204)",
}

# Horizontal bars: one bar per sender, bar length and label = message count.
bar_options = dict(
    x='Message_count',
    y='from',
    color='from',
    orientation='h',
    color_discrete_map=sender_colors,
    text='Message_count',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig = px.bar(overview_df, **bar_options)
fig.show()