### Get imports done, set up some RegEx, functions etc

In [112]:
# Set encoding, import the necessary modules etc
encoding = 'utf-8'
import pandas as pd
import requests
import json
import re
import time

In [113]:
# Define where the direct-message.js and direct-message-group.js files are located.
# If you don't have them in the same directory as your Notebook, give an absolute path here.
DIRECT_MESSAGE_PATH = 'direct-message.js'
DIRECT_MESSAGE_GROUP_PATH = 'direct-message-group.js'

# Define the name of the exported csv file here
CSV_EXPORT_FILE = 'twitter_dms.csv'

# The URL to look up a Twitter user 
ID_LOOKUP_URL = 'https://twitter.com/intent/user?user_id='

In [114]:
# Set up two empty lists that later will contain our parsed messages, ready to be 
# moved into a Pandas dataframe
dms_message_items = []
dmg_message_items = []

In [115]:
# Defines some regex we need in order to parse the backup
RECIPIENT_ID_REGEX = "(?<=\"recipientId\" : \")(.*)(?=\",)"
SENDER_ID_REGEX = "(?<=\"senderId\" : \")(.*)(?=\",)"
TEXT_REGEX = "(?<=\"text\" : \")(.*)(?=\",)"
MEDIA_URL_REGEX = "(?<=\"mediaUrls\" : )(.*)(?=,)"
MESSAGE_ID_REGEX = "(?<=\"id\" : \")(.*)(?=\",)"
CREATED_AT_REGEX = "(?<=\"createdAt\" : \")(.*)(?=\")"
HANDLE_REGEX = "(?<=<p><span class=\"nickname\">)(.*)(?=<\/span><\/p>)"

In [116]:
# This will help us grab the data out of the archive and parse it 
# into the correct 'category' for later assembly.
def parse_archive_data(REGEX, rawdata):
    data = re.search(REGEX, rawdata)
    if data:
        data = data.group(0)
        return data
    else:
        return "NULL"

### Handle the direct-message.js file

In [117]:
# Read in the direct-message data dump into a list called dms
dms = open(DIRECT_MESSAGE_PATH, encoding='UTF-8').read().split("\"messageCreate\" : ")

# Iterate through the dms list and split out the various parts into 
# their own lists for later re-assembly
for i, dm in enumerate(dms):
    # Get rid of the crap in the first entry
    dms.pop(0)

    # Let's set up some empty lists to store data in as we iterate through the dump
    recipient_id_list = []
    sender_id_list = []
    text_list = []
    media_url_list = []
    message_id_list = []
    created_at_list = []

    for i in range(0, len(dms)):

        # Grab the recipient ID
        recipient_id_list.append(parse_archive_data(RECIPIENT_ID_REGEX, dms[i]))

        # Grab the sender ID
        sender_id_list.append(parse_archive_data(SENDER_ID_REGEX, dms[i]))

        # Grab the text of the DM
        text_list.append(parse_archive_data(TEXT_REGEX, dms[i]))

        # Grab the media URL, if applicable
        media_url_list.append(parse_archive_data(MEDIA_URL_REGEX, dms[i]))

        # Grab the message ID
        message_id_list.append(parse_archive_data(MESSAGE_ID_REGEX, dms[i]))

        # Grab when the DM was created
        created_at_list.append(parse_archive_data(CREATED_AT_REGEX, dms[i]))

        i+=1

# Loop through and assemble each individual item into a proper JSON structure
for msg in range(0,len(message_id_list)):
    json_message = {
        'SENDER_ID': sender_id_list[msg],
        'TEXT': text_list[msg],
        'RECIPIENT_ID': recipient_id_list[msg],
        'MEDIA_URL': media_url_list[msg],
        'MESSAGE_ID': message_id_list[msg],
        'CREATED_AT': created_at_list[msg],
        'TYPE': 'DMS'
        }
    dms_message_items.append(json_message)

### Handle the direct-message-group.js file

In [118]:
# Read in the direct-message data dump into a list called dms
dmg = open(DIRECT_MESSAGE_GROUP_PATH, encoding='UTF-8').read().split("\"messageCreate\" : ")

# Iterate through the dms list and split out the various parts into 
# their own lists for later re-assembly
for i, dm in enumerate(dmg):
    # Get rid of the crap in the first entry
    dmg.pop(0)

    # Let's set up some empty lists to store data in as we iterate through the dump
    recipient_id_list = []
    sender_id_list = []
    text_list = []
    media_url_list = []
    message_id_list = []
    created_at_list = []

    for i in range(0, len(dmg)):

        # Grab the recipient ID
        recipient_id_list.append(parse_archive_data(RECIPIENT_ID_REGEX, dmg[i]))

        # Grab the sender ID
        sender_id_list.append(parse_archive_data(SENDER_ID_REGEX, dmg[i]))

        # Grab the text of the DM
        text_list.append(parse_archive_data(TEXT_REGEX, dmg[i]))

        # Grab the media URL, if applicable
        media_url_list.append(parse_archive_data(MEDIA_URL_REGEX, dmg[i]))

        # Grab the message ID
        message_id_list.append(parse_archive_data(MESSAGE_ID_REGEX, dmg[i]))

        # Grab when the DM was created
        created_at_list.append(parse_archive_data(CREATED_AT_REGEX, dmg[i]))

        i+=1

# Loop through and assemble each individual item into a proper JSON structure
for msg in range(0,len(message_id_list)):
    json_message = {
        'SENDER_ID': sender_id_list[msg],
        'TEXT': text_list[msg],
        'RECIPIENT_ID': recipient_id_list[msg],
        'MEDIA_URL': media_url_list[msg],
        'MESSAGE_ID': message_id_list[msg],
        'CREATED_AT': created_at_list[msg],
        'TYPE': 'DMG'
        }
    dmg_message_items.append(json_message)

### This is where we concatenate the lists, gets them into Pandas and save it all out as a CSV file

In [119]:
# Create one list from DM's and Group DM's.
all_dms = dms_message_items + dmg_message_items

# Create the Pandas dataframe
df = pd.DataFrame.from_dict(all_dms)

In [120]:
# Let's have a look at how our dataframe looks
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2934 entries, 0 to 2933
Data columns (total 7 columns):
CREATED_AT      2934 non-null object
MEDIA_URL       2934 non-null object
MESSAGE_ID      2934 non-null object
RECIPIENT_ID    2934 non-null object
SENDER_ID       2934 non-null object
TEXT            2934 non-null object
TYPE            2934 non-null object
dtypes: object(7)
memory usage: 160.5+ KB


Unnamed: 0,CREATED_AT,MEDIA_URL,MESSAGE_ID,RECIPIENT_ID,SENDER_ID,TEXT,TYPE
count,2934,2934,2934,2934,2934,2934,2934
unique,2934,48,2934,76,123,2914,2
top,2016-08-03T14:55:27.898Z,[ ],834955938549415939,3390728889,3390728889,Lol,DMS
freq,1,2887,1,1676,875,5,2544


In [121]:
# The CREATED_AT field should be date format so let's sort that now
df['CREATED_AT'] = pd.to_datetime(df['CREATED_AT'])
df['CREATED_AT'].head(5)

0   2016-06-15 20:00:08.226000+00:00
1   2016-06-14 06:40:20.763000+00:00
2   2016-05-26 07:06:12.175000+00:00
3   2016-05-21 08:39:42.205000+00:00
4   2016-05-21 08:39:03.074000+00:00
Name: CREATED_AT, dtype: datetime64[ns, UTC]

In [127]:
# Let's try and get the Sender and Recipient Twitter handles into our dataframe as well

# rirst off, a list of the unique ID's from the Sender and Recipient columns
unique_ids = pd.unique(df[['SENDER_ID', 'RECIPIENT_ID']].values.ravel('K'))

# We'll need an empty dictionary to store our info
json_handles = {}

# Now we'll iterate through all of them and try and grab the information from Twitter.
# There's a short pause in order to try and not miss info
for id in unique_ids:
    if id != 'NULL':
        time.sleep(.6)
        r = requests.get(ID_LOOKUP_URL + id)
        user_handle = parse_archive_data(HANDLE_REGEX, r.text)
        if id not in json_handles.keys():
            json_handles[id] = user_handle
            
# We should now be able to use this dictionary to populate two new columns in the dataframe
df['SENDER_HANDLE'] = df['SENDER_ID'].map(json_handles)
df['RECIPIENT_HANDLE'] = df['RECIPIENT_ID'].map(json_handles)

In [134]:
# Let's have another look at how our dataframe looks
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2934 entries, 0 to 2933
Data columns (total 9 columns):
CREATED_AT          2934 non-null datetime64[ns, UTC]
MEDIA_URL           2934 non-null object
MESSAGE_ID          2934 non-null object
RECIPIENT_ID        2934 non-null object
SENDER_ID           2934 non-null object
TEXT                2934 non-null object
TYPE                2934 non-null object
SENDER_HANDLE       2934 non-null object
RECIPIENT_HANDLE    2544 non-null object
dtypes: datetime64[ns, UTC](1), object(8)
memory usage: 206.4+ KB


Unnamed: 0,CREATED_AT,MEDIA_URL,MESSAGE_ID,RECIPIENT_ID,SENDER_ID,TEXT,TYPE,SENDER_HANDLE,RECIPIENT_HANDLE
count,2934,2934,2934.0,2934.0,2934.0,2934,2934,2934,2544
unique,2934,48,2934.0,76.0,123.0,2914,2,119,65
top,2019-09-26 16:58:33.505000+00:00,[ ],8.349559385494158e+17,3390728889.0,3390728889.0,Lol,DMS,@Arron_banks,@Arron_banks
freq,1,2887,1.0,1676.0,875.0,5,2544,875,1676
first,2015-08-03 10:50:46.609000+00:00,,,,,,,,
last,2019-11-11 19:02:33.208000+00:00,,,,,,,,


In [122]:
# Save the dataframe as a csv file
df.to_csv(CSV_EXPORT_FILE, index=False)

### We now have a CSV file and a dataframe and the world is our oyster...

#### Go forth and play with that data!