In [1]:
import sqlite3
import pandas as pd
from datetime import datetime
from datetime import date
import matplotlib.pyplot as plt
%matplotlib inline

import importlib
import sys
sys.path.append('../src/')
import helper as hp

Version 3.1
April 2024

# 1. Connect to the database and load the data tables

In [2]:
# Location of the Messages database to analyse. On macOS the live database normally
# sits at /Users/<YOUR MAC USERNAME>/Library/Messages/chat.db; point this at your own file.
CHAT_DB_PATH = '/Users/yorgos/Desktop/chat.db'

# Open the database read-only through a SQLite URI.
# A plain sqlite3.connect(path) silently creates a new, empty database when the path is
# wrong (the mistake then only surfaces later as "no such table") and it leaves the
# message history open to accidental writes. With mode=ro a wrong path fails right here.
# Note: a path containing '?', '#' or '%' has to be percent-encoded inside a URI.
conn = sqlite3.connect('file:' + CHAT_DB_PATH + '?mode=ro', uri=True)
cur = conn.cursor()

# query the database to get all the table names
cur.execute(" select name from sqlite_master where type = 'table' ")

# you can see all the tables included in the database here.
for name in cur.fetchall():
    print(name)

('_SqliteDatabaseProperties',)
('chat_message_join',)
('deleted_messages',)
('sqlite_sequence',)
('chat_recoverable_message_join',)
('handle',)
('sync_deleted_chats',)
('kvtable',)
('sync_deleted_attachments',)
('sync_deleted_messages',)
('unsynced_removed_recoverable_messages',)
('recoverable_message_part',)
('chat_handle_join',)
('message_attachment_join',)
('message_processing_task',)
('message',)
('chat',)
('attachment',)
('sqlite_stat1',)


# 2. Query and Load the data tables we need

In [3]:
# create pandas dataframes with the tables we need.

# The SQL below uses single-quoted string literals ('%s', 'unixepoch', ...). Double-quoted
# literals are only accepted through a legacy SQLite quirk and are parsed as identifiers
# by SQLite builds that disable it.
#
# How the date expression works: `date` is divided by 1e9 (it is treated as nanoseconds),
# the unix time of 2001-01-01 is added, and the sum is rendered with the 'unixepoch' and
# 'localtime' modifiers. Despite its name, `date_utc` therefore holds LOCAL time.

## Mac OSX versions below High Sierra (date is not divided by 1e9 there)
#messages = pd.read_sql_query('''select *, datetime(date + strftime('%s', '2001-01-01'), 'unixepoch', 'localtime')  as date_utc from message''', conn)

## High Sierra and above
# this is the table with all the messages, newest first.
messages = pd.read_sql_query(
    """select *, datetime(date/1000000000 + strftime('%s', '2001-01-01'), 'unixepoch', 'localtime')  as date_utc from message ORDER BY date DESC""",
    conn,
)

# handles and contact info
handles = pd.read_sql_query("select * from handle", conn)

# table mapping each message_id to its chat_id
chat_message_joins = pd.read_sql_query("select * from chat_message_join", conn)

# table mapping each chat_id to the handles that are part of that chat.
chat_handle_join = pd.read_sql_query("select * from chat_handle_join", conn)

In [4]:
# Give the generic ROWID / id columns descriptive names so later joins read naturally:
#   message.ROWID -> message_id, handle.ROWID -> handle_id, handle.id -> contact_info
messages = messages.rename(columns={'ROWID': 'message_id'})
handles = handles.rename(columns={'ROWID': 'handle_id', 'id': 'contact_info'})

# 3. Edit and add information to the datasets

In [5]:
# Attach the chat_id to every message through the message -> chat mapping table.
# A left join keeps messages that have no entry in chat_message_join.
messages = messages.merge(chat_message_joins, on='message_id', how='left')

In [6]:
# Some messages have an empty text field; as a best effort, recover the text from the
# attributedBody field instead (the extraction itself lives in hp.clean_text).
messages['inferred_text'] = messages['attributedBody'].apply(hp.clean_text)

In [8]:
# Combine the observed text and inferred_text columns into one column that captures all
# the text information we have: keep 'text' where it is populated and fall back to
# 'inferred_text' where 'text' is NULL.
# fillna does this in a single vectorised pass, replacing a Python-level lambda that ran
# once per row, and it also works on an empty frame.
messages['text_combined'] = messages['text'].fillna(messages['inferred_text'])

#### The following step can take 3-5 minutes

In [9]:
# Resolve chat membership once per distinct chat instead of once per message.
# Many messages share a chat_id, and building a full row with `messages.iloc[i]` plus a
# helper call for every single message is what made this step take minutes.
# Assumes hp.get_handle_and_contact_list is a pure lookup that returns two lists for a
# given chat_id (same inputs -> same output) -- TODO confirm against helper.py.
members_by_chat = {}
handle_lists = []
contact_lists = []

for chat_id in messages['chat_id']:
    # NaN never compares equal to itself, so every missing chat_id would become its own
    # dict key; fold all of them into a single None key.
    cache_key = None if pd.isnull(chat_id) else chat_id
    if cache_key not in members_by_chat:
        # the helper still receives the original value (including NaN), as before
        members_by_chat[cache_key] = hp.get_handle_and_contact_list(chat_id, chat_handle_join, handles)
    handle_list, contact_list = members_by_chat[cache_key]
    # store copies so that rows never share one list object
    handle_lists.append(list(handle_list))
    contact_lists.append(list(contact_list))

# for each message, get the handle_ids that were part of that chat
messages['chat_members_handles'] = handle_lists

# same thing as above, but get the actual contact info (phone number or email) instead of just the handle_id
messages['chat_members_contact_info'] = contact_lists


In [10]:
# Record the size of the chat each message belongs to, derived from its member handle list.
# This is mostly for analytics purposes.
messages['chat_size'] = messages['chat_members_handles'].apply(hp.get_chat_size)

In [11]:
# Translate each message's handle_id (who sent or received the message) into contact info.
# `handles` is forwarded to the helper as its second positional argument.
messages['contact_info'] = messages['handle_id'].apply(
    hp.convert_handle_id_to_contact_info,
    args=(handles,),
)

In [12]:
# Keep only the fields used downstream, for easier exploration and readability of the dataset.
columns = [
    'message_id',
    'is_from_me',
    'text_combined',
    'text',
    'inferred_text',
    'date_utc',
    'handle_id',
    'contact_info',
    'chat_id',
    'chat_members_contact_info',
    'chat_size',
    'chat_members_handles',
    'attributedBody',
]
# .copy() detaches the trimmed frame from `messages`, so later column assignments are safe
df_messages_trimmed = messages.loc[:, columns].copy()

In [13]:
# preview the first two rows of the trimmed dataframe
df_messages_trimmed.iloc[:2]

Unnamed: 0,message_id,is_from_me,text_combined,text,inferred_text,date_utc,handle_id,contact_info,chat_id,chat_members_contact_info,chat_size,chat_members_handles,attributedBody
0,370197,1,Don’t forget us when you’re at the top,Don’t forget us when you’re at the top,Donâ t forget us when youâ re at the top &,2024-04-20 09:20:44,0,,,[],0,[],b'\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84...
1,370196,1,"Congrats my dude, donâ",,"Congrats my dude, donâ",2024-04-20 09:20:35,72,16469209965.0,84.0,[+16469209965],1,[72],b'\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84...


# 4. One last fix for sent messages: Fix the field with the recipient of the message.

When handle_id = 0 and the chat is 1-1, assign the contact info obtained via message_id -> chat_id -> handle_id -> contact_info as the message's contact info.

Some contacts have two different handle_ids, one for SMS and one for iMessage, and these do get tangled sometimes. That is why this update has to happen at the contact-info level and not at the handle_id level: the two handles are different, but they carry the same contact info.


In [14]:
# Build the corrected contact column row by row. The helper receives the message's
# handle-level contact_info, the contact list of the chat it belongs to, and the
# message_id; the reconciliation rule itself lives in hp.update_contact_info.
df_messages_trimmed.loc[:, 'updated_contact_info'] = df_messages_trimmed.apply(
    lambda row: hp.update_contact_info(
        row['contact_info'],
        row['chat_members_contact_info'],
        row['message_id'],
    ),
    axis=1,
)


# 5. Contact grouping and organization: Transform phone numbers and email into names for easier readability

When doing exploration and analysis, it can be difficult to immediately remember which phone number belongs to whom.
To solve this issue and make the analysis and exploration more readable, we can create a mapping from each phone number to an actual name.

This is also needed because sometimes the same person can message you from their phone number and their email - these will be different handle_ids in Apple's dataset and will be considered two different people. 
Doing this step allows us to gather all the messages sent from the same person across phone number and email, so the analysis is more accurate.

In [15]:
# Map each phone number / email to a human-readable name.
# Several entries may point to the same person; their messages are then grouped together.
contact_grouping = {
    '+1234567788': 'Jane Doe',
    'example@gmail.com': 'Jane Doe',  # note here that you can assign multiple contact information to the same person
    'email@yahoo.com': 'John Doe',
}

# Turn the mapping into a two-column lookup table and left-join it onto the messages,
# so every message keeps its row whether or not its contact has a name assigned.
df_messages = df_messages_trimmed.merge(
    pd.DataFrame(list(contact_grouping.items()), columns=['updated_contact_info', 'name']),
    on='updated_contact_info',
    how='left',
)

# messages without an assigned contact name fall back to the phone number/email as the name.
df_messages['name'] = df_messages['name'].fillna(df_messages['updated_contact_info'])


In [16]:
# Top-10 names by number of messages exchanged (message count per name, largest first).
df_messages.groupby('name').size().sort_values(ascending=False).head(10)

name
+19317973385            94975
group-chat              89291
+15183898676            59902
anthiask@hotmail.com    23136
+306932384392           15128
+16518959121             9066
+13057942891             9038
+16309159447             6796
+15044536827             5383
+35799763166             2818
dtype: int64

# 6. Add analysis friendly columns

Finally, to make the analysis easier, I add date related columns here. 

In [17]:
# Derive the date columns with vectorised datetime operations instead of four
# element-wise Python lambdas (one pass each over every message).
# date_utc holds the strings produced by SQLite's datetime() in the load query.
df_messages['timestamp'] = pd.to_datetime(df_messages['date_utc'])
# calendar date as datetime.date objects
df_messages['date'] = df_messages['timestamp'].dt.date
# cast to int64 so month/year keep the integer dtype the per-row int(...) version produced
df_messages['month'] = df_messages['timestamp'].dt.month.astype('int64')
df_messages['year'] = df_messages['timestamp'].dt.year.astype('int64')


In [18]:
# preview the first two rows, now including the name and date columns
df_messages.head(n=2)

Unnamed: 0,message_id,is_from_me,text_combined,text,inferred_text,date_utc,handle_id,contact_info,chat_id,chat_members_contact_info,chat_size,chat_members_handles,attributedBody,updated_contact_info,name,timestamp,date,month,year
0,370197,1,Don’t forget us when you’re at the top,Don’t forget us when you’re at the top,Donâ t forget us when youâ re at the top &,2024-04-20 09:20:44,0,,,[],0,[],b'\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84...,group-chat,group-chat,2024-04-20 09:20:44,2024-04-20,4,2024
1,370196,1,"Congrats my dude, donâ",,"Congrats my dude, donâ",2024-04-20 09:20:35,72,16469209965.0,84.0,[+16469209965],1,[72],b'\x04\x0bstreamtyped\x81\xe8\x03\x84\x01@\x84...,+16469209965,+16469209965,2024-04-20 09:20:35,2024-04-20,4,2024


### see the entire history of your messages with someone

In [19]:
# Chronological message history with one contact, matched on the handle-level contact info.
# NOTE(review): rows with handle_id 0 have no contact_info, so they are not matched here;
# filtering on 'updated_contact_info' or 'name' may be what you want -- confirm.
contact_info = '123456788'
df_messages[df_messages['contact_info'].eq(contact_info)].sort_values('date_utc')

# if you assigned names to phone numbers you can use that field as well
# name='Jane Doe'
# df_messages.loc[df_messages['name']==name].sort_values(by='date_utc')


Unnamed: 0,message_id,is_from_me,text_combined,text,inferred_text,date_utc,handle_id,contact_info,chat_id,chat_members_contact_info,chat_size,chat_members_handles,attributedBody,updated_contact_info,name,timestamp,date,month,year


### 7. Detect messages that are iMessage Reactions (aka Tapback).

iMessage Reactions were introduced in iOS 10, launched September 13th, 2016.

https://en.wikipedia.org/wiki/IOS_10

https://support.apple.com/guide/messages/use-tapbacks-icht504f698a/mac

In [20]:
# Label each message with the reaction (Tapback) it represents, based on its combined text.
# Note that only Reactions from English or Greek language settings are detected;
# you can modify hp.detect_reaction to detect the reactions in other languages.
df_messages['reaction'] = df_messages['text_combined'].apply(hp.detect_reaction)

In [21]:
# boolean mask of the rows whose reaction value is not 0
ind = df_messages['reaction'].ne(0)
# number of messages per reaction value
df_messages.groupby(by='reaction').size()

reaction
0             354615
Disliked         174
Emphasized      2107
Laughed at      2677
Liked           2331
Loved           8292
dtype: int64

### 8. Save the dataset for future analysis.

In [19]:
# Remove the special character '\r' from the message text, as it interferes with the
# to_csv command. Cleaning only 'text' is not enough: 'text_combined' was built from
# 'text' earlier and 'inferred_text' holds message text too, so all three are cleaned.
# Non-string values (e.g. missing text) are passed through unchanged.
for text_column in ['text', 'text_combined', 'inferred_text']:
    df_messages[text_column] = df_messages[text_column].map(
        lambda value: value.replace('\r', '') if isinstance(value, str) else value
    )

In [20]:
# Persist the processed messages for later analysis; the row index is not written.
# NOTE(review): the file contains personal message content and contact details --
# keep it out of version control.
df_messages.to_csv(path_or_buf="../data/df_messages.csv", index=False)