In [1]:
from lxml import etree
from bs4 import BeautifulSoup
from parsel import Selector

import pandas as pd
import numpy as np

import regex as re

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
pio.renderers.default = "vscode"

In [2]:
# with open('john-hole.html') as fp:
#     soup = BeautifulSoup(fp, 'html.parser')
with open('john-hole.html') as f:
    html = f.read()


In [3]:
html = html.replace('<b>', '')
html = html.replace('</b>', '')
html = html.replace('\n', '')

In [4]:
selector = Selector(html)

In [5]:
# selector.xpath('//div/span[@class="bubble"]/text()').getall()
message_objs = {
    'texts' : [],
    'image' : [],
    'video' : [],
    'sender_number' : [],
    'reactions' : []
}
dates = []
for message in selector.xpath('//div[@class="message"]'):
    dates.append(re.sub(r'\(.*\)', '', message.xpath('./div/p/span[@class="timestamp"]/text()').get()))
    message_objs['sender_number'].append(message.xpath('./div/p/span[@class="sender"]/text()').get())
    message_objs['texts'].append(message.xpath('./div/div/span[@class="bubble"]/text()').get())
    message_objs['image'].append(message.xpath('.//div/div/div[@class="attachment"]/img/@src').get())
    message_objs['video'].append(message.xpath('.//div/div/div[@class="attachment"]/video/source/@src').get())
    reactions = message.xpath('./div/div[@class="tapbacks"]').xpath('./div[@class="tapback"]/span/text()').getall()
    message_objs['reactions'].append([reaction.replace('\t', '') for reaction in reactions])

In [6]:
for key in message_objs:
    print(f"{key}: {len(message_objs[key])}")

texts: 15735
image: 15735
video: 15735
sender_number: 15735
reactions: 15735


In [7]:
message_objs['dates'] = dates
df = pd.DataFrame(message_objs)

In [8]:
df

Unnamed: 0,texts,image,video,sender_number,reactions,dates
0,I can’t put down the mowater I can’t put down ...,,,+14169857677,"[Laughed by +16133160960, Laughed by +16138044...","Jan 01, 2025 12:54:33 PM"
1,,attachments/302/48791.jpeg,,+16138044084,[Laughed by +16138588921],"Jan 01, 2025 1:36:32 PM"
2,Stop firing at Ukraine dawg,,,+16138588921,"[Laughed by +16133160960, Laughed by +16138044...","Jan 01, 2025 1:37:49 PM"
3,New jack pic that doesn’t look like jack dropped,,,+16138044084,[Laughed by +16133160960],"Jan 01, 2025 1:36:39 PM"
4,Stop firing at Ukraine dawg,,,+16138588921,"[Laughed by +16133160960, Laughed by +16138044...","Jan 01, 2025 1:37:49 PM"
...,...,...,...,...,...,...
15730,Mane I was on reddit looking at peoples lists ...,,,+16138588921,[],"Feb 01, 2025 12:32:56 AM"
15731,When jack johnson played Ottawa blues fest u g...,,,itsjaxonmusic@gmail.com,[],"Feb 01, 2025 12:37:55 AM"
15732,Holy fuck LOL,,,+16138588921,[],"Feb 01, 2025 12:38:19 AM"
15733,Lowkey heartbreaking,,,itsjaxonmusic@gmail.com,[],"Feb 01, 2025 12:42:40 AM"


In [9]:
# Dropping duplicates
df = df.loc[df[['texts', 'sender_number', 'dates']].drop_duplicates().index]

In [10]:
df.dates = df.dates.str.strip()

In [11]:
df.set_index('dates', inplace=True)

In [12]:
df.index = pd.to_datetime(df.index, format="%b %d, %Y %I:%M:%S %p")

In [13]:
df

Unnamed: 0_level_0,texts,image,video,sender_number,reactions
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-01-01 12:54:33,I can’t put down the mowater I can’t put down ...,,,+14169857677,"[Laughed by +16133160960, Laughed by +16138044..."
2025-01-01 13:36:32,,attachments/302/48791.jpeg,,+16138044084,[Laughed by +16138588921]
2025-01-01 13:37:49,Stop firing at Ukraine dawg,,,+16138588921,"[Laughed by +16133160960, Laughed by +16138044..."
2025-01-01 13:36:39,New jack pic that doesn’t look like jack dropped,,,+16138044084,[Laughed by +16133160960]
2025-01-01 13:38:49,LOOOOL,,,+16138044084,[]
...,...,...,...,...,...
2025-02-01 00:32:56,Mane I was on reddit looking at peoples lists ...,,,+16138588921,[]
2025-02-01 00:37:55,When jack johnson played Ottawa blues fest u g...,,,itsjaxonmusic@gmail.com,[]
2025-02-01 00:38:19,Holy fuck LOL,,,+16138588921,[]
2025-02-01 00:42:40,Lowkey heartbreaking,,,itsjaxonmusic@gmail.com,[]


In [14]:
df.sender_number.unique()

array(['+14169857677', '+16138044084', '+16138588921', '+16133160960',
       'hashtagyoloswagbeef@gmail.com', '+15198782573', 'Me',
       '+16472994537', '+14372341484', '+19024973878',
       'gregorynip@icloud.com', '+12368865666', '+16136202840',
       '+14168368964', '+14168347529', '+16048685587', '+16476226650',
       '+12899330628', '+14167995612', '+16474591137',
       'josh.maycock@yahoo.com', '+13432045724',
       'melaniesmyth13@gmail.com', '+16134004076', '+16138093940',
       '+16475754548', '+14163054266', '+16133559273', '+16477019241',
       'stephazel108@gmail.com', '+16135581208', '+16479917601',
       'itsjaxonmusic@gmail.com', '+16475394413', '+16139797154',
       'edenkettleson@gmail.com', '+15197192478',
       'ernestormz99@gmail.com', 'ryanmatson18@gmail.com', '+18103048098',
       '+16138594804', '+16137628867'], dtype=object)

In [15]:
senders = {
    '+16138044084' : 'Ryan',
    'ryan.matson@icloud.com' : 'Ryan',
    '+16472994537' : 'Greg',
    '+16476226650' : 'Ali',
    '+15198782573' : 'John',
    '+14168347529' : 'Tommy',
    '+14383514096' : 'Jake',
    '+16138588921' : 'Ben',
    'ben_maycock@yahoo.com' : 'Ben',
    '+16048685587' : 'Michaela',
    '+19024973878' : 'Ryan 2',
    'ryan.keays@gmail.com' : 'Ryan 2',
    'darcydrums@yahoo.ca' : 'Darcy',
    '+447918720754' : 'Darcy',
    '+16136202840' : 'Darcy',
    'Me' : 'Henry',
    '+16474679692' : 'Pias',
    '+16133160960' : 'Jack',
    'itsjaxonmusic@gmail.com' : 'Jack',
    '+436764425651' : 'Miklos',
    '+14167055056' : 'Ernesto',
    '+14168368964' : 'Rayhan',
    '+12263763043' : 'Maggie',
    '+14169857677' : 'Riley',
    '+14372341484' : 'M****',
    '+14372555533' : 'Colin',
    'ernestormz99@gmail.com' : 'Ernesto',
    '+16133559273' : 'Igor',
    '+16475754548' : 'Nicole',
    '+16137628867' : 'Mel',
    'gregorynip@icloud.com' : 'Greg', 
    'edenkettleson@gmail.com' : 'Eden',
    '+12368865666' : 'Eden',
    '+16474591137' : 'Krystiana',
    '+14163054266' : 'Austin',
    'ryanmatson18@gmail.com' : 'Ryan',
    '+16134004076' : 'Nick'
}

In [16]:
df['reactions_recieved'] = df['reactions'].apply(lambda x : len(x))

In [17]:
df['reactions_recieved']

dates
2025-01-01 12:54:33    2
2025-01-01 13:36:32    1
2025-01-01 13:37:49    3
2025-01-01 13:36:39    1
2025-01-01 13:38:49    0
                      ..
2025-02-01 00:32:56    0
2025-02-01 00:37:55    0
2025-02-01 00:38:19    0
2025-02-01 00:42:40    0
2025-02-01 00:43:04    0
Name: reactions_recieved, Length: 13709, dtype: int64

In [18]:
df['sender'] = df['sender_number'].replace(senders)

In [19]:
text_count = df.groupby('sender').count().sort_values(by='texts', ascending=False)

In [20]:
text_count['reactions_recieved'] = df.groupby('sender').sum()['reactions_recieved']

In [21]:
text_count['messages'] = text_count['texts'] + text_count['image']
text_count = text_count.sort_values(by='messages', ascending=False)

In [22]:
# Tommy reacted but did not send any messages

# tommy_dict = [{col: 0 for col in text_count.columns}]
# tommy_dict[0]['sender'] = 'Tommy'
# tommy_df = pd.DataFrame(tommy_dict)
# tommy_df = tommy_df.set_index('sender')
# text_count = pd.concat([text_count, tommy_df])

In [23]:
text_count[['texts', 'image', 'messages']]

Unnamed: 0_level_0,texts,image,messages
sender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ryan,1477,271,1748
Austin,1605,140,1745
Krystiana,1546,125,1671
Ben,1246,62,1308
Igor,1178,104,1282
Riley,1038,143,1181
John,652,384,1036
Nick,840,98,938
Jack,642,92,734
+14167995612,505,64,569


In [24]:
text_count['reactions_sent'] = pd.Series()
text_count['reactions_sent'] = text_count['reactions_sent'].fillna(0)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [25]:
text_count['hahas_sent'] = pd.Series()
text_count['hahas_sent'] = text_count['hahas_sent'].fillna(0)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [26]:
def reactions_sent(x):
  for reaction in x:
    for sender in df.sender_number.unique():
      if sender in reaction:
        try:
          text_count.loc[senders.get(sender, sender), 'reactions_sent'] += 1
        except KeyError:
          pass

In [27]:
def hahas_sent(x):
  for reaction in x:
    for sender in df.sender_number.unique():
      if (sender in reaction) and ("Laughed" in reaction):
        try:
          text_count.loc[senders.get(sender, sender), 'hahas_sent'] += 1
        except KeyError:
          pass

In [28]:
df['reactions'].apply(reactions_sent)
df['reactions'].apply(hahas_sent)

dates
2025-01-01 12:54:33    None
2025-01-01 13:36:32    None
2025-01-01 13:37:49    None
2025-01-01 13:36:39    None
2025-01-01 13:38:49    None
                       ... 
2025-02-01 00:32:56    None
2025-02-01 00:37:55    None
2025-02-01 00:38:19    None
2025-02-01 00:42:40    None
2025-02-01 00:43:04    None
Name: reactions, Length: 13709, dtype: object

In [29]:
text_count[['texts', 'image', 'reactions_recieved', 'reactions_sent', 'hahas_sent', 'messages']]

Unnamed: 0_level_0,texts,image,reactions_recieved,reactions_sent,hahas_sent,messages
sender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ryan,1477,271,1059,594,478,1748
Austin,1605,140,550,662,364,1745
Krystiana,1546,125,326,277,203,1671
Ben,1246,62,595,659,536,1308
Igor,1178,104,629,380,293,1282
Riley,1038,143,452,773,662,1181
John,652,384,670,182,136,1036
Nick,840,98,410,756,610,938
Jack,642,92,431,746,663,734
+14167995612,505,64,182,361,192,569


In [53]:
# text_count.loc[text_count['messages'] > 5]['messages'].to_json('message-ranking.json')

In [51]:
px.bar(text_count.loc[text_count['messages'] > 5]['messages'], labels={'value' : '# of Messages', 'variable' : 'Message Type', 'sender' : 'Sender'})

In [58]:
text_count.loc[text_count['messages'] > 5][['texts', 'image']].to_json('text-image-ranking.json')

In [55]:
px.bar(text_count.loc[text_count['messages'] > 5][['texts', 'image']], labels={'value' : '# of Messages', 'variable' : 'Message Type', 'sender' : 'Sender'}, title="Messages Sent")





In [32]:
df.head(40)

Unnamed: 0_level_0,texts,image,video,sender_number,reactions,reactions_recieved,sender
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-01-01 12:54:33,I can’t put down the mowater I can’t put down ...,,,+14169857677,"[Laughed by +16133160960, Laughed by +16138044...",2,Riley
2025-01-01 13:36:32,,attachments/302/48791.jpeg,,+16138044084,[Laughed by +16138588921],1,Ryan
2025-01-01 13:37:49,Stop firing at Ukraine dawg,,,+16138588921,"[Laughed by +16133160960, Laughed by +16138044...",3,Ben
2025-01-01 13:36:39,New jack pic that doesn’t look like jack dropped,,,+16138044084,[Laughed by +16133160960],1,Ryan
2025-01-01 13:38:49,LOOOOL,,,+16138044084,[],0,Ryan
2025-01-01 13:39:33,,,attachments/302/48792.mov,+16138044084,"[Laughed by +16133160960, Laughed by +14169857...",2,Ryan
2025-01-01 13:40:16,InMowaterJosh (Cabernet Sauvignon)Josh (Maycoc...,,,+16133160960,"[Laughed by +16138044084, Laughed by +16138588...",3,Jack
2025-01-01 13:41:54,LOOOL,,,+16138588921,[],0,Ben
2025-01-01 13:40:47,Dude Josh coming over to Evan’s house at,,,+16138044084,"[Laughed by +16138588921, Laughed by +14169857...",2,Ryan
2025-01-01 13:40:50,And watch shawshank,,,+16138044084,[],0,Ryan


In [33]:
list(df['texts'])

['I can’t put down the mowater I can’t put down the mowater',
 None,
 'Stop firing at Ukraine dawg',
 'New jack pic that doesn’t look like jack dropped',
 'LOOOOL',
 None,
 'InMowaterJosh (Cabernet Sauvignon)Josh (Maycock)Shawshank redemptionCRT Television static OutA&W Iced teaHot tubs that are too smallGreening out at new years ',
 'LOOOL',
 'Dude Josh coming over to Evan’s house at ',
 'And watch shawshank ',
 'Most shocking thing I’ve ever heard in my life ',
 'Genuinely ',
 '￼',
 'Ryan dropped his full beer and it landed perfectly like this ',
 'Actually couldn’t believe this',
 'The sound it made too',
 'Evan coming in from the new iPhone XR',
 'Man you guys missed a crazy sleepover at the Maycock residence ',
 'Did josh put on tv static and tuck you in',
 'Yea man I gave him his new years kiss ',
 'Anyone want to chill ',
 'Les Linklader ',
 'LOLL',
 'me and ryan will later',
 'You guys getting active ?',
 'Ben’s fiending to get active rn',
 'Bruh I’m fucking depressed on New Ye

In [34]:
df['texts'].fillna('')

dates
2025-01-01 12:54:33    I can’t put down the mowater I can’t put down ...
2025-01-01 13:36:32                                                     
2025-01-01 13:37:49                          Stop firing at Ukraine dawg
2025-01-01 13:36:39     New jack pic that doesn’t look like jack dropped
2025-01-01 13:38:49                                               LOOOOL
                                             ...                        
2025-02-01 00:32:56    Mane I was on reddit looking at peoples lists ...
2025-02-01 00:37:55    When jack johnson played Ottawa blues fest u g...
2025-02-01 00:38:19                                        Holy fuck LOL
2025-02-01 00:42:40                                Lowkey heartbreaking 
2025-02-01 00:43:04                                Actually. Not lowkey 
Name: texts, Length: 13709, dtype: object

In [35]:
df['word_count'] = df['texts'].fillna('').apply(lambda x : len(x.split(' ')))

In [36]:
word_count = df.groupby('sender').sum().sort_values(by='word_count', ascending=False)

In [37]:
text_count['word_count'] = word_count.word_count

In [54]:
fig = go.Figure(data=[go.Bar(
            x=word_count.index, y=word_count['word_count'],
            text=word_count['word_count'],
            textposition='auto',
        )])

fig.update_layout(
    title="Word Count",
    xaxis_title="Sender",
    yaxis_title="Words",
)

fig.show()

In [66]:
text_count.loc[text_count['hahas_sent'] > 5]['hahas_sent'].sort_values(ascending=False).to_json("hahas-sent.json")

In [67]:
fig = go.Figure(data=[go.Bar(
            x=text_count['hahas_sent'].sort_values(ascending=False).index, y=text_count['hahas_sent'].sort_values(ascending=False),
            text=text_count['hahas_sent'].sort_values(ascending=False),
            textposition='auto',
        )])

fig.update_layout(
    title="Hahas Sent",
    xaxis_title="Sender",
    yaxis_title="Haha Count",
)

fig.show()

In [40]:
(text_count.loc[text_count['messages'] > 5]['reactions_recieved'] / text_count.loc[text_count['messages'] > 5]['messages']).sort_values(ascending=False)

sender
Ali                              1.590909
Greg                             0.889764
Henry                            0.680000
John                             0.646718
Darcy                            0.617978
melaniesmyth13@gmail.com         0.616541
Ryan                             0.605835
Jack                             0.587193
hashtagyoloswagbeef@gmail.com    0.561338
Rayhan                           0.555556
Igor                             0.490640
Eden                             0.476923
josh.maycock@yahoo.com           0.463415
Ben                              0.454893
Nick                             0.437100
Riley                            0.382727
+14167995612                     0.319859
Austin                           0.315186
+12899330628                     0.298246
Ernesto                          0.254545
Krystiana                        0.195093
Nicole                           0.111111
+16475394413                     0.086957
dtype: float64

In [41]:
df.loc[df.sender == 'Ali']

Unnamed: 0_level_0,texts,image,video,sender_number,reactions,reactions_recieved,sender,word_count
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-01-03 12:49:48,￼￼￼￼￼￼￼￼￼￼￼￼￼￼￼￼￼￼￼I love you John! Happy bday💋,attachments/302/48848.jpeg,,16476226650,[],0,Ali,6
2025-01-06 14:44:03,,attachments/302/48966.jpeg,,16476226650,"[Laughed by +14169857677, Laughed by Me]",2,Ali,1
2025-01-12 00:26:57,Maya!!! Maya the great,,,16476226650,[],0,Ali,5
2025-01-14 21:28:40,System attack,,,16476226650,[],0,Ali,2
2025-01-17 00:33:29,,attachments/302/49872.jpeg,,16476226650,"[Laughed by gregorynip@icloud.com, Laughed by ...",2,Ali,1
2025-01-23 10:57:48,Oh man… there’s no way it’s coming back,,,16476226650,[Laughed by edenkettleson@gmail.com],1,Ali,9
2025-01-23 16:52:30,thank you,,,16476226650,[],0,Ali,2
2025-01-23 16:52:46,also Ali is very commonly a guy's name too,,,16476226650,[Emphasized by +14169857677],1,Ali,9
2025-01-23 16:53:17,ever heard of him?,,,16476226650,[],0,Ali,4
2025-01-24 09:25:12,￼￼￼￼￼￼￼￼￼￼￼￼Happy birthday little Henry!!!!!!,attachments/302/50617.jpeg,,16476226650,[],0,Ali,4


In [42]:
df.loc[df.reactions_recieved == df.reactions_recieved.max()].texts.to_string()

'dates\n2025-01-25 18:32:58    ￼'

In [43]:
text_count['ratio'] = (text_count['reactions_recieved'] / text_count['messages']).sort_values(ascending=False)

In [44]:
# text_count[["texts", "image", "reactions_recieved", "messages", "reactions_sent", "hahas_sent", "ratio", "word_count"]].to_json('data.json', orient='index')

## This is where all of the formatting ends and the file has been outputted

In [45]:
text_count.sort_values(ascending=False, by='ratio')[['messages', 'reactions_recieved']]

Unnamed: 0_level_0,messages,reactions_recieved
sender,Unnamed: 1_level_1,Unnamed: 2_level_1
Ali,44,70
+16135581208,5,6
+13432045724,4,4
M****,1,1
Tommy,4,4
stephazel108@gmail.com,1,1
Greg,127,113
Henry,175,119
John,1036,670
Darcy,89,55


In [46]:
px.bar(text_count['ratio'].sort_values(ascending=False), title="Text-to-Reactions Recieved Ratio")

In [47]:
df.loc[df.sender == "Ali"].sort_values("reactions_recieved", ascending=False)[['texts', 'reactions_recieved']].values

array([['Loving john is a treacherous journey full of emotions ', 6],
       ['I’m so glad I’m not Johns friend. I would hate to be betrayed like this ',
        6],
       [None, 5],
       ['Oh. Sorry. I don’t see you like that ', 4],
       ['I am currently working at a company called Fable. They help make digital products accessible for people with disabilities ',
        4],
       ['Behind the scenes of john holing', 4],
       [None, 3],
       ['I resent that', 3],
       ['This chat is actually so freaking funny. I was up late last night and just cackling at some shit',
        3],
       ['Infancy. Its hard. You can tell at times he really wants to be able to speak . ',
        3],
       ['Elevated Darcy’s crate', 3],
       ['Omg… Ryan stop!!! I’m a girl and I’m taken ', 3],
       ['This looks exactly like Luigi ', 3],
       ['I feel crazy too much ', 2],
       ['This is Ryan’s type ', 2],
       ['￼Good afternoon', 2],
       [None, 2],
       [None, 2],
       [None, 2

In [48]:
df.loc[df.reactions_recieved.idxmax()]['reactions']

['Loved by +16133559273',
 'Loved by +14167995612',
 'Loved by +14163054266',
 'Loved by +14169857677',
 'Loved by +16133559273',
 'Loved by +16476226650',
 'Loved by +14169857677',
 'Loved by +16133160960',
 'Loved by +14167995612',
 'Loved by +14163054266']

In [49]:
px.line(
  df[(df.sender == 'Ryan 1') | (df.sender == 'Greg')], x=df[(df.sender == 'Ryan 1') | (df.sender == 'Greg')].index, y='word_count', color='sender',
  labels={'word_count' : 'Word Count', 'x' : 'Date'}
)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

