In [26]:
import re
import regex

import locale
import calendar
import datetime

import pandas as pd
from collections import Counter

import emoji
import enchant
import probablepeople as pp
from wordcloud import WordCloud
from stop_words import get_stop_words

import plotly.io as pio
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, download_plotlyjs, plot, iplot
import matplotlib.pyplot as plt

import sys
import itertools 
import numpy as np
from math import log
from itertools import tee
from operator import itemgetter
from collections import defaultdict

In [27]:
# Global settings shared by the plotting and text-processing cells below.
# Per-person colours: the lighter value fills markers/bars, the darker one outlines bars.
color_tere = "#33b2cb"
color_tere_dark = "#237C8E"

color_vivi = "#db70c9"
color_vivi_dark = '#994e8c'

# Fill and outline colours of the laugh-length bar chart.
yellow = 'rgb(255,191,0)'
dark_yellow = 'rgb(179,134,0)'

# Use Plotly's "none" template as the default for every figure.
pio.templates.default = "none"
# Switch every locale category to Spanish.
# NOTE(review): the locale name 'es_ES' must be installed on the host, otherwise
# setlocale raises locale.Error — confirm on the target machine.
locale.setlocale(locale.LC_ALL, 'es_ES')
# Spell-check dictionaries (Spanish and US English); the "Nombres" snippet calls
# d.check / d2.check to discard tokens that are ordinary dictionary words.
d = enchant.Dict("es_ES")
d2 = enchant.Dict("en_US")

# Load data

In [28]:
# Read the exported chat, one list entry per physical line.
# The encoding is stated explicitly: without it open() falls back to a
# platform/locale-dependent codec, which can garble accents and emojis.
with open("ChatTere.txt", encoding="utf-8") as file:
    data = file.readlines()

In [29]:
# Parse every exported line into a dict with the timestamp, the sender and the text.
df = []
# Raw string so that "\[" is a regex escape and not an (invalid) Python string escape.
date_pattern = r"\[(.*?)\]"

for message in data:
    try:
        message_dict = {
            # Text between the first pair of square brackets.
            'datetime': re.search(date_pattern, message).group(1),
            # Fourth whitespace-separated token of the line.
            'user': message.split()[3],
            # Everything after the first five tokens.
            'text': message.split(maxsplit=5)[-1]
        }
    except (AttributeError, IndexError):
        # No bracketed timestamp (re.search returned None) or fewer than four
        # tokens: the line is the continuation of a multi-line message, so it is
        # appended to the previous one. Blank lines are ignored, and so is a
        # continuation that shows up before any message has been parsed.
        if message.strip() != "" and df:
            df[-1]['text'] = f"{df[-1]['text']}{message}"
        continue

    # Deleted messages are dropped from the analysis.
    if "This message was deleted." not in message_dict['text']:
        df.append(message_dict)

# Transform data

In [30]:
# Turn the list of parsed message dicts into a DataFrame (one row per message).
df = pd.DataFrame(df)

# Convert the bracketed timestamp strings into datetimes.
# NOTE(review): the format is inferred, not stated, so a day-first export could be
# read as month-first — confirm against the raw file.
df['datetime'] = pd.to_datetime(df['datetime'], infer_datetime_format=True)

def quick_classification(text):
    """Return the coarse category of a message based on its text.

    Media placeholders are looked up in priority order (the first match wins);
    a message made up exclusively of the letters "a" and "j" (case-insensitive,
    surrounding whitespace ignored) is a laugh ("RISA"); anything else is "Text".
    """
    placeholder_categories = (
        (("audio omitted",), "Audio"),
        (("image omitted", "video omitted"), "Image/Video"),
        (("GIF omitted", "sticker omitted"), "Sticker/GIF"),
        (("document omitted",), "Document"),
        (("Contact card omitted",), "Contact"),
    )
    for placeholders, category in placeholder_categories:
        for placeholder in placeholders:
            if placeholder in text:
                return category

    # We laugh so much that laughs earned a category of their own.
    distinct_chars = set(text.strip().lower())
    return "RISA" if distinct_chars == {'a', 'j'} else "Text"

# Label every message with its category, then trim surrounding whitespace.
# The classification runs first, on the untrimmed text.
df['type'] = df['text'].apply(quick_classification)

df['text'] = df['text'].map(str.strip)

In [31]:
df

Unnamed: 0,datetime,user,text,type
0,2019-02-20 21:51:37,Teresa,‎Messages to this chat and calls are now secur...,Text
1,2019-02-20 21:51:37,Viviana,Pariente de Bartolomé Calvo,Text
2,2019-02-20 21:51:50,Teresa,Lo logré!,Text
3,2019-02-20 21:52:27,Teresa,"Soy una crack, caiste primero",Text
4,2019-02-20 21:52:33,Viviana,(Paréntesis: Si es Bartolomé? Espero no haberm...,Text
...,...,...,...,...
15980,2020-04-25 12:03:46,Viviana,Jajajajaj,RISA
15981,2020-04-27 21:58:16,Teresa,El miedo al contagio se apoderó de las calles ...,Text
15982,2020-04-27 22:15:44,Viviana,Amiga hoy te estaba pensando,Text
15983,2020-04-27 22:16:01,Viviana,Como están tú y Marcelle?,Text


# Exploración Inicial

In [32]:
# Compute each aggregate once and reuse it in the report lines.
first_message = df['datetime'].min()
last_message = df['datetime'].max()
elapsed_days = (last_message - first_message).days
active_days = df['datetime'].apply(lambda date: date.date()).nunique()

print(f"Primer fecha: {first_message}")
print(f"Última fecha: {last_message}\n")

print(f"Tiempo transcurrido: {elapsed_days}")
print(f"Número de días en que hablamos: {active_days}")

print(f"Hablamos {round(active_days/elapsed_days*100,2)}% de los días.")

Primer fecha: 2019-02-20 21:51:37
Última fecha: 2020-04-27 22:17:17

Tiempo transcurrido: 432
Número de días en que hablamos: 274
Hablamos 63.43% de los días.


In [33]:
# Total number of messages and the average per day on which at least one was sent.
total_messages = df.shape[0]
days_with_messages = df['datetime'].apply(lambda date: date.date()).nunique()
print(f"Número de mensajes: {total_messages:,}")
print(f"Promedio de mensajes por día cuando hablamos: {total_messages/days_with_messages:.1f}")

Número de mensajes: 15,985
Promedio de mensajes por día cuando hablamos: 58.3


In [34]:
print("Tipo de mensajes:")
df['type'].value_counts()

Tipo de mensajes:


Text           12287
Audio           1516
RISA             989
Image/Video      809
Sticker/GIF      377
Document           4
Contact            3
Name: type, dtype: int64

# Risas 


In [35]:
df[df['type']=='RISA'].text.value_counts()[:20]

Jajajaja                  128
Jajajajaja                109
Jajaja                     82
Jajajajajaja               82
Jajajajajajaja             43
Jajajajajajajaja           37
JAJAJAJAJAJA               30
JAJAJAJAJAJAJA             30
Jajajajajajajajaja         23
JAJAJAJA                   21
JAJAJAJAJAJAJAJA           21
JAJAJAJAJA                 20
Jajajajajajajajajaja       18
Jajajaj                    15
Jajajajajajajajajajaja     14
JAJAJAJAJAJAJAJAJA         13
Jajajajaj                  12
JAJAJAJAJAJAJAJAJAJAJA     12
jajaja                     11
JAJAJAJAJAJAJAJAJAJA       10
Name: text, dtype: int64

In [36]:
print("Tamaño de la risa más corta:")
df[df['type']=='RISA'].text.str.len().min()

Tamaño de la risa más corta:


4

In [37]:
print("Cómo es la risa más corta:")
# Filter the laughs once and look up the first message with the minimum length.
# The index is named after what it holds (the shortest laugh); it was previously
# stored under a "longest laugh" name.
risas = df[df['type']=='RISA']
index_risa_corta = risas.text.str.len().idxmin()

risas.loc[index_risa_corta]

Cómo es la risa más corta:


datetime    2019-04-08 14:51:58
user                     Teresa
text                       Jajj
type                       RISA
Name: 2216, dtype: object

In [38]:
print("Tamaño de la risa más larga:")
df[df['type']=='RISA'].text.str.len().max()

Tamaño de la risa más larga:


1950

In [39]:
print("Cómo es la risa más larga:")
# First laugh message whose length equals the maximum.
laugh_messages = df[df['type']=='RISA']
index_risa_larga = laugh_messages.text.str.len().idxmax()
longest_laugh = laugh_messages.loc[index_risa_larga]

# Summary row first, then the full (untruncated) text.
print(longest_laugh)

print(longest_laugh['text'])

Cómo es la risa más larga:
datetime                                  2019-02-26 15:56:19
user                                                   Teresa
text        JAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJA...
type                                                     RISA
Name: 140, dtype: object
JAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJAJA

In [40]:
# Length, in characters, of every laugh message.
laugh_lengths = df[df['type']=='RISA'].text.str.len()
# Right-closed bins: (0, 6], (6, 25], ... The last bin is open-ended so that a
# laugh longer than any fixed upper bound is still counted instead of being
# silently turned into NaN and dropped.
buckets = [0, 6, 25, 50, 100, 200, np.inf]
# The first bin includes 6 characters ("Jajaja"), hence "Hasta 6" and not "Menos de 6".
buckets_labels = ['"Jajaja"<br>Hasta 6 car.', '(6,25]', '(25,50]', '(50,100]', '(100,200]', 'Más de 200<br>caracteres']
# Number of laughs per bucket, as a two-column frame for the bar chart.
temp = pd.cut(laugh_lengths, bins=buckets, labels=buckets_labels)
temp = temp.value_counts().reset_index()
temp.columns = ['laugh_length', 'count_laughs']

In [41]:
# Bar chart with one bar per laugh-length bucket.
bar_marker = dict(color=yellow, line=dict(color=dark_yellow, width=1.5))
hover_labels = [f"{count} mensajes" for count in temp.count_laughs.values]

trace = go.Bar(
    x=temp.laugh_length.values,
    y=temp.count_laughs.values,
    text=hover_labels,
    hoverinfo='text',
    opacity=0.6,
    marker=bar_marker,
    width=.8,
)

# Keep the buckets in their natural order rather than ordered by count.
bucket_axis = dict(title='Número de caracteres',
                   categoryorder='array',
                   categoryarray=buckets_labels)
count_axis = dict(title='Número de mensajes')
layout = go.Layout(title='¿Qué tan largas son nuestras risas?',
                   xaxis=bucket_axis,
                   yaxis=count_axis)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [42]:
pio.write_html(fig, file='Plot1.html', auto_open=True)

In [43]:
# Work on an explicit copy of the laugh rows: assigning columns to a filtered
# slice of df triggers pandas' SettingWithCopyWarning.
temp = df.loc[df['type']=='RISA', ['datetime', 'user', 'text']].copy()
temp['datetime'] = temp['datetime'].dt.date
temp['laugh_length'] = temp['text'].str.len()
temp['color'] = [color_tere if user=='Teresa' else color_vivi for user in temp['user'].values]
# Drop the longest laugh (1950 characters) because it distorts every calculation.
temp = temp[temp['laugh_length']<1950]
temp

Unnamed: 0,datetime,user,text,laugh_length,color
20,2019-02-26,Teresa,Jajajajajaja,12,#33b2cb
34,2019-02-26,Viviana,Jajajaja,8,#db70c9
71,2019-02-26,Viviana,Jajaja,6,#db70c9
76,2019-02-26,Teresa,jajaja,6,#33b2cb
78,2019-02-26,Viviana,Jajajajajaja,12,#db70c9
...,...,...,...,...,...
15883,2020-04-12,Teresa,JAJAJAJAJAJAJAJAJAJA,20,#33b2cb
15884,2020-04-12,Viviana,Jajajajajaj,11,#db70c9
15890,2020-04-12,Viviana,Jajajajaja,10,#db70c9
15899,2020-04-12,Teresa,Jajajajajaja,12,#33b2cb


In [44]:
def _laugh_box_trace(user, color):
    """Return the box trace (drawn as a jittered point cloud) with one user's laugh lengths."""
    laughs = temp[temp['user']==user]
    hover_labels = [f"{laugh}<br>Tamaño: {length}<br>{date}" for laugh, length, date
                    in zip(laughs.text, laughs.laugh_length, laughs.datetime)]
    # The box outline and fill are fully transparent, so only the points show.
    transparent = 'rgba(0,0,0,0)'
    return go.Box(
        y=laughs.laugh_length.values,
        boxpoints='all',
        pointpos=0,
        jitter=1,
        text=hover_labels,
        hoverinfo='text',
        marker=dict(color=color),
        line=dict(color=transparent),
        fillcolor=transparent,
        name=user,
    )


trace_tere = _laugh_box_trace('Teresa', color_tere)
trace_vivi = _laugh_box_trace('Viviana', color_vivi)

layout = go.Layout(title='¿Qué tan largas son nuestras risas?',
                   yaxis=dict(title='Tamaño de la risa'))

fig = go.Figure(data=[trace_tere, trace_vivi], layout=layout)
iplot(fig)

In [45]:
pio.write_html(fig, file='Plot2.html', auto_open=True)

In [46]:
print(f"Tamaño promedio de la risa de Teresa: {df[(df['type']=='RISA') & (df['user']=='Teresa')].text.str.len().mean():.1f} caracteres.")
print(f"Tamaño promedio de la risa de Viviana: {df[(df['type']=='RISA') & (df['user']=='Viviana')].text.str.len().mean():.1f} caracteres.")

Tamaño promedio de la risa de Teresa: 18.3 caracteres.
Tamaño promedio de la risa de Viviana: 12.2 caracteres.


# Continuación Exploración Inicial

In [47]:
print("Número de mensajes por usuario: ")
df.user.value_counts()

Número de mensajes por usuario: 


Teresa     9025
Viviana    6960
Name: user, dtype: int64

In [48]:
print("Tipo de mensajes, por usuario:")
df[~df['type'].isin(['Document', 'Contact'])].groupby(["type", "user"]).size().reset_index(name="count")

Tipo de mensajes, por usuario:


Unnamed: 0,type,user,count
0,Audio,Teresa,775
1,Audio,Viviana,741
2,Image/Video,Teresa,375
3,Image/Video,Viviana,434
4,RISA,Teresa,570
5,RISA,Viviana,419
6,Sticker/GIF,Teresa,172
7,Sticker/GIF,Viviana,205
8,Text,Teresa,7132
9,Text,Viviana,5155


In [49]:
print("Mensajes más comunes de Teresa:")
df[(df['type']=='Text') & (df['user']=='Teresa')]['text'].value_counts().reset_index().head(16)

Mensajes más comunes de Teresa:


Unnamed: 0,index,text
0,Amiga,85
1,Vivi,61
2,O sea,39
3,?,28
4,Si,16
5,Marica,16
6,Epic fail,14
7,Parce,14
8,No,14
9,Listo,12


In [50]:
print("Mensajes más comunes de Viviana:")
df[(df['type']=='Text') & (df['user']=='Viviana')]['text'].value_counts().reset_index().head(16)

Mensajes más comunes de Viviana:


Unnamed: 0,index,text
0,Si,30
1,Amiga,28
2,No,16
3,Epic fail,15
4,Oye,12
5,Demasiado,9
6,No se,9
7,😭,8
8,Y tu cómo estás?,7
9,Hola amiga,7


# Volumen de interacciones por fecha

In [51]:
# Calendar date (no time of day) of every message.
df['date'] = df['datetime'].dt.date

In [52]:
def _daily_count_trace(messages, user, label, color, visible=True):
    """Return the line trace with the number of messages per day sent by one user.

    messages: frame with at least the 'user', 'date' and 'text' columns.
    user:     sender whose rows are counted; also used as the trace name.
    label:    category shown in the hover text ("Total" or a message type).
    color:    line colour.
    visible:  whether the trace is drawn initially.
    """
    # One groupby per trace; x, y and the hover text all come from this series.
    daily = messages[messages['user']==user].groupby('date')['text'].count()
    return go.Scatter(
        x=daily.index.values,
        y=daily.values,
        text=[f"{user} - {label}<br>{val} mensajes<br>{date}" for date, val in daily.items()],
        mode='lines',
        hoverinfo='text',
        line={'color': color, 'width': 1.2},
        name=user,
        visible=visible,
    )


user_colors = [('Teresa', color_tere), ('Viviana', color_vivi)]
# Message types that get their own pair of traces, in the same order as the
# dropdown buttons (Texto, RISA, Image/Video, Audio, Sticker/GIF). The order is
# spelled out because it used to depend on which type happened to appear first
# in the chat, with the last two types assumed to be the rare ones to leave out.
plotted_types = ['Text', 'RISA', 'Image/Video', 'Audio', 'Sticker/GIF']

# Total: the only pair of traces visible when the figure is first shown.
traces = [_daily_count_trace(df, user, 'Total', color) for user, color in user_colors]

# By category: hidden until the matching dropdown entry is selected.
for typ in plotted_types:
    messages_of_type = df[df['type']==typ]
    for user, color in user_colors:
        traces.append(_daily_count_trace(messages_of_type, user, typ, color, visible=False))

In [53]:
# Dropdown that switches between the "Total" view and one view per message type.
# Every entry shows exactly one pair of traces (Teresa, Viviana) and hides the
# rest; traces are expected in the same order as the labels, two per label.
_menu_labels = ['Total', 'Texto', 'RISA', 'Image/Video', 'Audio', 'Sticker/GIF']
_traces_per_label = 2
_n_traces = len(_menu_labels) * _traces_per_label


def _menu_button(position, label):
    """Return the button that shows only the trace pair at `position` and retitles the plot."""
    visible = [trace_index // _traces_per_label == position for trace_index in range(_n_traces)]
    return dict(label=label,
                # 'update' applies both the data change (visibility) and the
                # layout change (title); 'restyle' only handles data attributes,
                # so the title was never refreshed.
                method='update',
                args=[{'visible': visible},
                      {'title': f'¿Cuántos mensajes al día?<br>{label}'}])


updatemenus = [
    dict(
        active=0,
        buttons=[_menu_button(position, label) for position, label in enumerate(_menu_labels)],
    )
]

In [54]:
# Assemble the messages-per-day figure; the x axis spans the whole conversation.
messages_axis = dict(title='Número de mensajes')
dates_axis = dict(range=[df.date.min(), df.date.max()])

layout = go.Layout(title="¿Cuántos mensajes al día?",
                   yaxis=messages_axis,
                   xaxis=dates_axis,
                   updatemenus=updatemenus)

fig = go.Figure(data=traces, layout=layout)
iplot(fig)

In [55]:
pio.write_html(fig, file='Plot3.html', auto_open=True)

In [56]:
# Day-of-week name of every message; calendar.day_name follows the active locale.
weekday_names = list(calendar.day_name)
df['dow'] = df['date'].apply(lambda date: weekday_names[date.weekday()])

In [57]:
def _weekday_share_trace(user, fill_color, outline_color):
    """Return the bar trace with the share of one user's messages sent on each weekday."""
    # Read days and shares straight from the value_counts Series: the column
    # names produced by reset_index() differ between pandas versions.
    share = df[df['user']==user].dow.value_counts(normalize=True)
    return go.Bar(
        x=share.index.values,
        y=share.values,
        text=[f"{user}<br>{count*100:.0f}% mensajes son enviados los {day}"
              for day, count in share.items()],
        hoverinfo='text',
        opacity=0.6,
        marker=dict(color=fill_color,
                    line=dict(color=outline_color, width=1.5)),
        name=user,
    )


trace_tere = _weekday_share_trace('Teresa', color_tere, color_tere_dark)
trace_vivi = _weekday_share_trace('Viviana', color_vivi, color_vivi_dark)

# Order the bars by weekday instead of by share.
layout = go.Layout(title='Mensajes',
                   xaxis=dict(categoryorder='array',
                              categoryarray=list(calendar.day_name)),
                   yaxis=dict(tickformat=',.0%'),
                  )

fig = go.Figure(data=[trace_tere, trace_vivi], layout=layout)
iplot(fig)

In [58]:
pio.write_html(fig, file='Plot4.html', auto_open=True)

In [25]:
# Minute of the day ("HH:MM") at which every message was sent.
df['time'] = df['datetime'].apply(lambda date: date.strftime('%H:%M'))

In [320]:
temp = df[['user', 'time']]

# Every minute of the day as "HH:MM", in chronological order.
minutes = [str(minute).zfill(2) for minute in range(0,60)]
hours = [str(hour).zfill(2) for hour in range(0,24)]
mins_hrs = [f"{hour}:{minute}" for hour in hours for minute in minutes]


def _minute_shares(user):
    """Return a frame with one row per minute of the day and the user's share of messages.

    Columns: 'index' (the "HH:MM" label) and 'time' (fraction of the user's
    messages sent in that minute, 0 when there are none).
    """
    share = temp[temp['user']==user]['time'].value_counts(normalize=True)
    # A single reindex fills the missing minutes with 0 and orders the rows,
    # replacing the previous row-by-row pd.concat inside a 1,440-iteration loop.
    share = share.reindex(mins_hrs, fill_value=0)
    return pd.DataFrame({'index': mins_hrs, 'time': share.values})


temp_tere = _minute_shares('Teresa')
temp_vivi = _minute_shares('Viviana')

In [321]:
def _minute_share_trace(person, shares, color):
    """Return the line trace with a person's share of messages per minute of the day."""
    slots = shares['index'].values
    fractions = shares['time'].values
    return go.Scatter(
        x=slots,
        y=fractions,
        mode='lines',
        hoverinfo='text',
        text=[f"{person}<br>Hora: {time}<br>{count*100:.2f}% de los mensajes"
              for time, count in zip(slots, fractions)],
        line={'color': color, 'width': 0.8},
        name=person,
    )


trace_tere = _minute_share_trace('Teresa', temp_tere, color_tere)
trace_vivi = _minute_share_trace('Viviana', temp_vivi, color_vivi)

# Minutes are categories in chronological order; one tick every 60 slots.
share_axis = dict(tickformat=',.2%', range=[0,0.0045])
slot_axis = dict(categoryorder='array',
                 categoryarray=mins_hrs,
                 tickmode='array',
                 tickvals=mins_hrs[::60])
layout = go.Layout(title="¿A qué horas del día?", yaxis=share_axis, xaxis=slot_axis)

fig = go.Figure(data=[trace_tere, trace_vivi], layout=layout)
iplot(fig)

In [322]:
# Save the time-of-day figure under its own name: 'Plot4.html' is already used by
# the day-of-week chart and would be overwritten.
pio.write_html(fig, file='Plot5.html', auto_open=True)

# NLP

In [323]:
stop_words = get_stop_words('spanish')
regex_remove_punct = "([^\p{L}0-9']+)"

In [324]:
text_tere = df[(df['user']=='Teresa') & (df['type']=='Text')].text.str.cat(sep=' ')
text_vivi = df[(df['user']=='Viviana') & (df['type']=='Text')].text.str.cat(sep=' ')

In [325]:
print(f"Caracteres Teresa: {len(text_tere):,.0f}")
print(f"Caracteres Viviana: {len(text_vivi):,.0f}")

Caracteres Teresa: 164,350
Caracteres Viviana: 198,810


In [326]:
# Lower-cased tokens of Teresa's text without punctuation or Spanish stop words.
tokens_tere = regex.sub(regex_remove_punct, ' ', text_tere).lower().split()
text_tere_clean = list(filter(lambda word: word not in stop_words, tokens_tere))
distinct_words_tere = set(text_tere_clean)
print(f"Número de diferentes palabras Teresa: {len(distinct_words_tere):,.0f}")
Counter(text_tere_clean).most_common(20)

Número de diferentes palabras Teresa: 4,796


[('amiga', 193),
 ('vivi', 121),
 ('voy', 89),
 ('cómo', 86),
 ('quiero', 76),
 ('creo', 76),
 ('serio', 75),
 ('ay', 72),
 ('vida', 68),
 ('risa', 65),
 ('re', 65),
 ('dios', 64),
 ('tan', 63),
 ('así', 62),
 ('foto', 61),
 ('ahora', 60),
 ('bueno', 59),
 ('puedo', 59),
 ('gusta', 57),
 ('bien', 57)]

In [327]:
# Lower-cased tokens of Viviana's text without punctuation or Spanish stop words.
tokens_vivi = regex.sub(regex_remove_punct, ' ', text_vivi).lower().split()
text_vivi_clean = list(filter(lambda word: word not in stop_words, tokens_vivi))
distinct_words_vivi = set(text_vivi_clean)
print(f"Número de diferentes palabras Viviana: {len(distinct_words_vivi):,.0f}")
Counter(text_vivi_clean).most_common(20)

Número de diferentes palabras Viviana: 4,847


[('amiga', 356),
 ('jajaja', 232),
 ('jajajaja', 174),
 ('re', 153),
 ('literal', 125),
 ('mk', 122),
 ('creo', 107),
 ('demasiado', 103),
 ('quiero', 101),
 ('así', 96),
 ('bien', 92),
 ('voy', 90),
 ('jajajajaja', 89),
 ('vida', 86),
 ('siento', 85),
 ('foto', 82),
 ('vez', 81),
 ('tan', 79),
 ('amo', 77),
 ('día', 75)]

#### Nombres

```python
for c in Counter(text_tere_clean).most_common():
    if list(pp.tag(c[0])[0])==['GivenName'] and not d.check(c[0]) and len(set(c[0])) != 2:
        if not d2.check(c[0]):
            print(c)
            
for c in Counter(text_vivi_clean).most_common():
    if list(pp.tag(c[0])[0])==['GivenName'] and not d.check(c[0]) and len(set(c[0])) != 2:
        if not d2.check(c[0]):
            print(c)
```

In [328]:
### The following code weights words according to their frequency within the sentences, from a linguistic perspective.
# Code adapted from https://github.com/amueller/word_cloud/blob/master/wordcloud/tokenization.py
# Read more at http://nlp.stanford.edu/fsnlp/promo/colloc.pdf (Dunning's likelihood ratio)

def pairwise(iterable):
    """Return an iterator over consecutive overlapping pairs: s -> (s0, s1), (s1, s2), ..."""
    iterators = tee(iterable, 2)
    # Move the second copy one step ahead; the default keeps empty input from raising.
    next(iterators[1], None)
    return zip(*iterators)


def score(count_bigram, count1, count2, n_words):
    """Return the likelihood-ratio collocation score of a bigram.

    count_bigram: occurrences of the bigram.
    count1, count2: occurrences of its first and second word.
    n_words: total number of words.

    Returns 0 when either word count is not strictly below n_words; otherwise
    -2 times the difference between the log-likelihoods (computed with `l`)
    under a shared probability and under two separate probabilities.
    """
    if not (n_words > count1 and n_words > count2):
        return 0

    remaining_words = n_words - count1
    count2_outside_bigram = count2 - count_bigram

    p = count2 / n_words
    p1 = count_bigram / count1
    p2 = count2_outside_bigram / remaining_words

    shared_probability = l(count_bigram, count1, p) + l(count2_outside_bigram, remaining_words, p)
    log_ratio = (shared_probability
                 - l(count_bigram, count1, p1)
                 - l(count2_outside_bigram, remaining_words, p2))
    return -2 * log_ratio


def l(k, n, x):
    """Return k*log(x) + (n-k)*log(1-x), with both log arguments floored at 1e-10."""
    floor = 1e-10
    log_hit = log(max(x, floor))
    log_miss = log(max(1 - x, floor))
    return log_hit * k + log_miss * (n - k)


def process_tokens(words, normalize_plurals=True):
    """Merge case variants (and optionally plurals) of the given tokens.

    Parameters
    ----------
    words : iterable of str
        Tokens to count.
    normalize_plurals : bool
        When true, a token ending in "s" (but not "ss") is merged into the
        token without that final "s", provided the latter also occurs.

    Returns
    -------
    fused_cases : dict
        Most frequent spelling of each token -> total count over all variants.
    standard_cases : dict
        Lower-cased token -> the spelling chosen to represent it.
    """
    # Lower-cased token -> {exact spelling: count}.
    d = defaultdict(dict)
    for word in words:
        word_lower = word.lower()
        case_dict = d[word_lower]
        case_dict[word] = case_dict.get(word, 0) + 1
    if normalize_plurals:
        # Plural key -> singular key, for every plural that gets merged.
        merged_plurals = {}
        # Iterate over a snapshot of the keys because entries are deleted below.
        for key in list(d.keys()):
            if key.endswith('s') and not key.endswith("ss"):
                key_singular = key[:-1]
                if key_singular in d:
                    dict_plural = d[key]
                    dict_singular = d[key_singular]
                    # Add the count of every plural spelling to the spelling
                    # obtained by dropping its last character.
                    for word, count in dict_plural.items():
                        singular = word[:-1]
                        dict_singular[singular] = (
                            dict_singular.get(singular, 0) + count)
                    merged_plurals[key] = key_singular
                    del d[key]
    fused_cases = {}
    standard_cases = {}
    item1 = itemgetter(1)
    for word_lower, case_dict in d.items():
        # The spelling with the highest count represents the whole group.
        first = max(case_dict.items(), key=item1)[0]
        fused_cases[first] = sum(case_dict.values())
        standard_cases[word_lower] = first
    if normalize_plurals:
        # Merged plurals resolve to the representative spelling of their singular.
        for plural, singular in merged_plurals.items():
            standard_cases[plural] = standard_cases[singular.lower()]
    return fused_cases, standard_cases


def unigrams_and_bigrams(words, normalize_plurals=True):
    """Count single words and the bigrams that score as collocations.

    Every pair of consecutive words is scored with `score`; a bigram scoring
    above 30 is added to the result with its own count, and that count is
    subtracted from both of its words. Entries whose count drops to zero or
    below are removed.

    Parameters
    ----------
    words : list of str
        Tokens in their original order.
    normalize_plurals : bool
        Passed on to `process_tokens` for both single words and bigrams.

    Returns
    -------
    dict
        Word or bigram string -> count.
    """
    n_words = len(words)
    bigrams = list(pairwise(words))
    counts_unigrams, standard_form = process_tokens(words, normalize_plurals=normalize_plurals)
    counts_bigrams, standard_form_bigrams = process_tokens([" ".join(bigram) for bigram in bigrams],
                                                           normalize_plurals=normalize_plurals)
    # Snapshot of the word counts: scores use these values, not the ones
    # already reduced by previously accepted bigrams.
    counts = counts_unigrams.copy()

    for bigram_string, count in counts_bigrams.items():
        bigram = tuple(bigram_string.split(" "))
        # Representative spelling of each word of the bigram.
        word1 = standard_form[bigram[0].lower()]
        word2 = standard_form[bigram[1].lower()]

        if score(count, counts[word1], counts[word2], n_words) > 30:
            counts_unigrams[word1] -= counts_bigrams[bigram_string]
            counts_unigrams[word2] -= counts_bigrams[bigram_string]
            counts_unigrams[bigram_string] = counts_bigrams[bigram_string]

    # Iterate over a copy of the keys because entries are deleted in the loop.
    words = list(counts_unigrams.keys())
    for word in words:
        if counts_unigrams[word] <= 0:
            del counts_unigrams[word]
    return counts_unigrams


def get_word_cloud(text, cat):
    """Build a frequency table of the words and bigrams found in ``text``.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    cat : object
        Label written to every row of the ``type`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``count`` and ``type``, ordered by ``count``
        descending. When no token survives the filters the frame is empty but
        still has all three columns.
    """
    # Membership is tested once per token, so look words up in a set.
    stopwords = set(stop_words)

    tokens = []
    # A token is a word character followed by one or more word characters or
    # apostrophes, so every match has at least two characters.
    for word in re.findall(r"\w[\w']+", text):
        if word.lower() in stopwords:
            continue
        if word.lower().endswith("'s"):
            word = word[:-2]
        # Checked after stripping "'s" so that e.g. "90's" is also discarded.
        if word.isdigit():
            continue
        tokens.append(word)

    ranked = sorted(
        unigrams_and_bigrams(tokens, True).items(),
        key=lambda item: item[1],
        reverse=True,
    )
    # Naming the columns in the constructor keeps an empty result valid;
    # assigning two names to a zero-column frame afterwards raises ValueError.
    frame = pd.DataFrame(ranked, columns=['word', 'count'])
    frame['type'] = cat
    return frame

In [329]:
# One word-frequency table per user, stacked into a single frame.
wc = pd.concat(
    [
        get_word_cloud(user_text, user_label)
        for user_text, user_label in ((text_tere, 'Teresa'), (text_vivi, 'Viviana'))
    ]
)

In [330]:
# Write the combined word-frequency table to disk. The frame was concatenated
# without `ignore_index`, so the index column in the CSV repeats per user.
wc.to_csv("wcTere.csv")

# Emojis

In [331]:
def get_emojis(text):
    """Return, in order of appearance, every grapheme cluster of ``text`` that
    contains at least one character found in ``emoji.UNICODE_EMOJI``.

    The text is split with the ``\\X`` pattern of the ``regex`` package, so a
    multi-code-point emoji (skin tone, ZWJ sequence) is returned as one item.
    """
    # NOTE(review): relies on `emoji.UNICODE_EMOJI` being a flat container of
    # emoji characters; newer `emoji` releases changed this attribute —
    # confirm against the installed version.
    clusters = regex.findall(r'\X', text)
    return [
        cluster
        for cluster in clusters
        if any(char in emoji.UNICODE_EMOJI for char in cluster)
    ]

In [332]:
# Count the emojis found in `text_tere`, most frequent first.
emojis_tere = pd.DataFrame(
    Counter(get_emojis(text_tere)).most_common(),
    columns=['emoji', 'count'],
)

In [333]:
# Count the emojis found in `text_vivi`, most frequent first.
emojis_vivi = pd.DataFrame(
    Counter(get_emojis(text_vivi)).most_common(),
    columns=['emoji', 'count'],
)

In [334]:
# Per-user summary: number of distinct emojis and total emoji occurrences.
print(
    "Teresa",
    f"Diferentes emojis usados: {emojis_tere.shape[0]}",
    f"Número de emojis usados: {emojis_tere['count'].sum():,.0f}",
    sep="\n",
)

print(
    "\nViviana",
    f"Diferentes emojis usados: {emojis_vivi.shape[0]}",
    f"Número de emojis usados: {emojis_vivi['count'].sum():,.0f}",
    sep="\n",
)

Teresa
Diferentes emojis usados: 86
Número de emojis usados: 1,023

Viviana
Diferentes emojis usados: 75
Número de emojis usados: 645


In [335]:
# Emojis used fewer than `threshold` times are listed as "other" emojis.
# Defined here, before first use, so the cell runs on a fresh kernel; the
# later summary cell uses the same value.
threshold = 10

# The masks must exist before anything that reads them is printed.
mask_tere = emojis_tere['count'] < threshold
mask_vivi = emojis_vivi['count'] < threshold


def _print_emoji_groups(label, counts, rare_mask):
    """Print `label`, how many emojis fall under the threshold, then the rare
    emojis and the frequent emojis as two space-separated lines."""
    print(label)
    print(f"Otros emojis: {int(rare_mask.sum())}")
    for symbol in counts.loc[rare_mask, 'emoji']:
        print(symbol, end=" ")

    print(end="\n\n")

    for symbol in counts.loc[~rare_mask, 'emoji']:
        print(symbol, end=" ")


_print_emoji_groups("Teresa", emojis_tere, mask_tere)

print(end="\n\n")

_print_emoji_groups("Viviana", emojis_vivi, mask_vivi)

# Tag each full (unthresholded) table with its user and export both together.
emojis_tere['user'] = 'Teresa'
emojis_vivi['user'] = 'Viviana'
pd.concat([emojis_tere, emojis_vivi]).to_csv("emojisTereALL.csv")

Teresa
Otros emojis: 68
👌🏽 🤤 ⁉️ 😕 😶 🎉 😛 ❌ 😑 😁 👎🏽 🌈 💵 ✅ 🥺 😟 😘 🙌🏽 💫 🙃 💸 💴 🥳 🤩 💜 🏳‍🌈 😊 💁🏼 🙈 💷 🙄 🥴 😮 💁🏻 😇 💩 🏳 🧯 🏃🏽‍♀ 👾 ⚔ 🗡 🔪 😳 💛 💶 😏 🍆 😐 👏🏽 🥶 🥵 💪🏽 💁🏽 👉🏽 😪 💗 🙊 💚 💁🏽‍♀ 😄 🤙🏽 🤘🏽 👩🏼‍🎤 👩🏼‍🔬 🤰🏼 🤱🏼 😫 

❤ 🤦🏽‍♀ 😱 😭 😍 💔 🧠 ☹️ 🔥 😡 💘 🙏🏽 🤞🏽 💓 🖕🏽 💖 💞 👍🏽 

Viviana
Otros emojis: 58
😢 ♥️ 👀 🔫 👍 💜 😔 🐘 👏 🥰 ❌ 🤗 💁‍♀️ 😨 🎉 💩 🍑 😞 🍁 🥵 ✅ 🤪 😁 🐬 🙈 🏳️ 🌈 😤 🙁 💯 ✂️ 😰 🌹 💐 🌷 👙 🍻 💦 💧 🚿 🍺 😃 🥚 😮 😿 💉 😕 🤷‍♀️ 🌳 👎 😻 🍷 🕯 😉 🤒 🦟 😆 😩 

😭 ❤️ ☹️ 💔 😂 😍 🤦‍♀️ 😱 😡 🤣 👌 💕 🥺 😏 🤞 😬 🙏 

In [336]:
threshold = 10  # emojis used fewer than this many times are folded into "Others"


def _emoji_counts_with_others(text, user, min_count):
    """Count the emojis in `text` and collapse the infrequent ones.

    Emojis whose count is below `min_count` are replaced by a single trailing
    "Others" row holding the sum of their counts. The returned frame has the
    columns `emoji`, `count` and `user`, with `user` set on every row.
    """
    counts = pd.DataFrame(
        Counter(get_emojis(text)).most_common(),
        columns=['emoji', 'count'],
    )
    is_rare = counts['count'] < min_count
    others = pd.DataFrame(
        [["Others", counts[is_rare]['count'].sum()]],
        columns=counts.columns,
    )
    summary = pd.concat([counts[~is_rare], others], ignore_index=True)
    summary['user'] = user
    return summary


emojis_tere = _emoji_counts_with_others(text_tere, 'Teresa', threshold)
emojis_vivi = _emoji_counts_with_others(text_vivi, 'Viviana', threshold)

df_emojis = pd.concat([emojis_tere, emojis_vivi], ignore_index=True)
# Constant label shared by every row; it is the first level of the treemap path.
df_emojis['emojis'] = 'Emojis'

In [337]:
# Treemap of emoji usage: root -> user -> emoji, sized by count and coloured
# by user.
fig = px.treemap(
    df_emojis,
    path=['emojis', 'user', 'emoji'],
    values='count',
    color='user',
    color_discrete_map={
        # '(?)' is the key Plotly Express uses for a node whose children do not
        # share one `color` value (here the root). The colour must be a valid
        # CSS name: "lightgray", not "light_gray".
        '(?)': "lightgray",
        'Teresa': color_tere,
        'Viviana': color_vivi,
    },
    hover_name="user",
)
fig.show()

In [261]:
# Write the thresholded per-user emoji table (including the "Others" rows and
# the constant `emojis` column) to disk.
df_emojis.to_csv("emojisTere.csv")