In [243]:
import numpy as np
import pandas as pd
import itertools
import json
import os
from collections import Counter

# Network data pre-processing
This notebook is a WIP, and will contain following sections:
1. **Data inspection**
2. **Data restructure**
2. **Data selection**
3. **Data transformation**
4. **Data output**

### **1. Data inspection**

### Loading the data

In [60]:
path = str(os.path.abspath(os.path.join('',"../../data/toy_data/")))
data = pd.read_json(path + "/data1.json",orient="records", dtype=False, lines=True, encoding="utf-8")

In [244]:
data.head()

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,place_coordinates,hashtags,user_location,user_created_at,user_geo_enabled,is_retweet,parent_tweet_id_str,parent_user_id_str,has_media,mentions
0,2019-10-10 19:59:51+00:00,1182385272383791104,1182385272383791104,RT @JuliaHB1: FFS\n\nParalympic medallist Jame...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,[],The Pub,Sat Aug 09 16:04:21 +0000 2014,False,1,1182278788333395968,459390022,False,[459390022]
1,2019-10-10 19:59:51+00:00,1182385273738424321,1182385273738424321,RT @BasedPoland: More videos are emerging from...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,,...,,['Brazil'],#wwg1wga,Tue Feb 23 22:29:20 +0000 2010,True,1,1182348104844156928,753974664041533440,False,[753974664041533440]
2,2019-10-10 19:59:51+00:00,1182385274560634880,1182385274560634880,RT @pictoline: “Lo que está en movimiento es c...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,...,,[],Guatemala,Mon Aug 08 14:17:45 +0000 2011,True,1,1182370222847791105,3243658266,False,[3243658266]
3,2019-10-10 19:59:51+00:00,1182385274501894146,1182385274501894146,RT @350: “What we’re pushing to get back to ar...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,...,,[],"Milan, MI",Tue Oct 08 08:07:43 +0000 2019,False,1,1180799866873810944,14266598,False,[14266598]
4,2019-10-10 19:59:51+00:00,1182385274720047109,1182385274720047109,RT @mollyfprince: I genuinely don’t understand...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,[],"Pereira, Colombia",Wed Aug 07 15:07:39 +0000 2019,False,1,1182288645753098243,911254908921298944,True,"[911254908921298944, 1182248816105463809, 1182..."


In [61]:
data.shape

(200, 36)

In [62]:
data.dtypes

created_at                   datetime64[ns, UTC]
id                                         int64
id_str                                    object
text                                      object
source                                    object
truncated                                   bool
in_reply_to_status_id                    float64
in_reply_to_status_id_str                 object
in_reply_to_user_id                      float64
in_reply_to_user_id_str                   object
in_reply_to_screen_name                   object
user                                      object
geo                                       object
coordinates                               object
place                                     object
contributors                              object
retweeted_status                          object
is_quote_status                             bool
quote_count                                int64
reply_count                                int64
retweet_count       

In [63]:
data['id_str'] = data['id_str'].astype(str)
data['in_reply_to_user_id_str'] = data['in_reply_to_user_id_str'].astype(str)

### **2. Data Restructuring**
#### Reducing complexity / flattening data

In [64]:
data['user_id_str']           = [row['id_str'] for row in data['user']]
data['user_name']             = [row['name'] for row in data['user']]
data['user_screen_name']      = [row['screen_name'] for row in data['user']]
data['user_defined_location'] = [row['location'] if pd.notnull(row['location']) else None for row in data['user']]
data['user_followers_count']  = [row['followers_count'] for row in data['user']]
data['place_country_code']    = [row['country_code'] if row != None else None for row in data['place']]
data['place_name']            = [row['name'] if row != None else None for row in data['place']]
data['place_type']            = [row['place_type'] if row != None else None for row in data['place']]
data['place_coordinates']     = [dict(row['bounding_box'])['coordinates'][0][0] if row != None else None for row in data['place']]
data['hashtags']              = [[htags['text'].lower() for htags in row['hashtags']] for row in data['entities'] if row['hashtags'] != ""]
data['user_location']         = [row['location'] for row in data['user']]
data['user_created_at']       = [row['created_at'] for row in data['user']]
data['user_geo_enabled']      = [row['geo_enabled'] for row in data['user']]
data['is_retweet']            = [1 if b else 0 for b in data['retweeted_status'].notnull()]

#### Extracting parent tweet ids and user ids for retweets

In [188]:
parent_tweet_ids = []
parent_user_ids  = []
for index, row in data.iterrows():
    if row['is_retweet']:
        tweet = row['retweeted_status']
        parent_tweet_ids.append(tweet['id_str'])
        parent_user_ids.append(tweet['user']['id_str'])
    else:
        parent_tweet_ids.append(None)
        parent_user_ids.append(None)
        
data['parent_tweet_id_str'] = parent_tweet_ids
data['parent_user_id_str']  = parent_user_ids

#### Extracting boolean value for original video / image content

In [189]:
data['has_media'] = pd.notna(data['extended_entities'])

#### Extract ids of the mentioned users for each tweet

In [190]:
import re

def extract_mentions(x):
    return [str(n.replace("id': ","")) for n in re.findall("id': [0-9]*", str(x))]

data['mentions'] = data['entities'].apply(lambda x: extract_mentions(x))

#### Extract hashtags

In [191]:
def extract_hashtags(x):
    return [n.replace("'text': ","") for n in re.findall("'text': '[a-zA-Z0-9_]*'", str(x))]

data['hashtags'] = data['entities'].apply(lambda x: extract_hashtags(x))

### **3. Data selection**

#### Geolocation data
Quick look at the location data (Twitter's page about geotagging available [here](https://developer.twitter.com/en/docs/tutorials/filtering-tweets-by-location))

In [153]:
print("Tweet-level location data:") 
print("\t* Coordinates present in {0} ({1} %) of the tweets".format(
                                np.sum(data['place_coordinates'].notnull()) , 
                                np.sum(data['place_coordinates'].notnull()) / data.shape[0] * 100))

print("\t* Place type present in {0} ({1} %) of the tweets".format(
                                np.sum(data['place_type'].notnull()), 
                                np.sum(data['place_type'].notnull()) / data.shape[0] * 100))

print("\t* Place name present in {0} ({1} %) of the tweets".format(
                                np.sum(data['place_name'].notnull()), 
                                np.sum(data['place_name'].notnull()) / data.shape[0] * 100))

print("\t* Country code present in {0} ({1} %) of the tweets".format(
                                np.sum(data['place_country_code'].notnull()), 
                                np.sum(data['place_country_code'].notnull()) / data.shape[0] * 100))

Tweet-level location data:
	* Coordinates present in 2 (1.0 %) of the tweets
	* Place type present in 2 (1.0 %) of the tweets
	* Place name present in 2 (1.0 %) of the tweets
	* Country code present in 2 (1.0 %) of the tweets


In [184]:
print("User-level location data present in {} ({} %) of the tweets".format(
                                    np.sum(data['user_location'].notna()), 
                                    np.sum(data['user_location'].notna() / data.shape[0] * 100)))

User-level location data present in 137 (68.5 %) of the tweets


In [221]:
print("Preview:")
user_locs = data['user_location'][data['user_location'].notna()]
user_locs[0:20]

Preview:


0                                               The Pub
1                                              #wwg1wga
2                                             Guatemala
3                                             Milan, MI
4                                     Pereira, Colombia
5                                             Smethwick
6                                            Texas, USA
8                                               Wroclaw
10                                  Charlottesville, VA
13                                        New York, USA
15                                  South East, England
16    Mi’kma’ki, the ancestral and #unceded territor...
19                           frigiliana (Malaga) ESPAÑA
20                                           Denver, CO
21                                            Barcelona
23                                               Canada
26                                      Goiânia, Brasil
28                                              

Let's check the set membership of the entries in a set of English country names.

In [296]:
all_countries = set([e[0] for e in pd.read_csv('countries.csv', index_col=0).values])

In [297]:
np.sum([loc in all_countries for loc in user_locs.values])

11

Based on this initial preview, the tweet-level location data is sparse and the user-defined location data is unstructured and ambiguous. 
#### Dropping out unnecessary attributes

In [245]:
selection = ['created_at',                                  # Timestamp for possible time comparisons
             'id_str',                                      # Id of the tweet for collecting replies / retweets
             'hashtags',                                    # Hashtags for coloring the nodes
             'user_id_str',                                 # Identify / define nodes in the network 
             'mentions',                                    # Define interaction-edges for mentions
             'parent_tweet_id_str',
             'parent_user_id_str',                          # Retweets
             'place_coordinates','place_name','place_type',
             'user_defined_location',                       # Keep location/place data for filtering by city 
             'in_reply_to_user_id_str']                     # Define interaction-edges for replies
data_sel  = data[selection]

In [246]:
data_sel.shape

(200, 12)

In [247]:
n = 30
k = 10
data_sel.iloc[n:n+k]

Unnamed: 0,created_at,id_str,hashtags,user_id_str,mentions,parent_tweet_id_str,parent_user_id_str,place_coordinates,place_name,place_type,user_defined_location,in_reply_to_user_id_str
30,2019-10-10 19:59:55+00:00,1182385288850620416,[],1114171414230241281,[47753979],,,,,,,47753979.0
31,2019-10-10 19:59:55+00:00,1182385288993222665,['wtylewizji'],826731067172335616,[],,,,,,"Poznań, Polska",
32,2019-10-10 19:59:55+00:00,1182385289374982144,['Rom'],871823728145039361,"[22926365, 1006419421244678144]",1.181194087891968e+18,22926365.0,,,,"Ludwigshafen am Rhein, Germany",
33,2019-10-10 19:59:55+00:00,1182385289383292930,[],1125031585726849025,"[1156281409193086976, 16465385]",1.182300609833255e+18,1.1562814091930867e+18,,,,,
34,2019-10-10 19:59:55+00:00,1182385289530097665,[],69903520,[],,,"[-76.712759, 44.16054]",Kingston,city,Pluto America,
35,2019-10-10 19:59:55+00:00,1182385289806745600,[],536068379,[],,,,,,"Victoria, Australia",
36,2019-10-10 19:59:55+00:00,1182385290016677888,[],1358892625,"[6134882, 1177946826407849987, 118237722104814...",1.182377221048144e+18,6134882.0,,,,"Vienna, Austria",
37,2019-10-10 19:59:55+00:00,1182385290029215745,[],20725516,[],,,,,,West Yorks via North London,
38,2019-10-10 19:59:55+00:00,1182385290209480704,[],965014538130083840,[1653217514],1.1823724291229368e+18,1653217514.0,,,,,
39,2019-10-10 19:59:55+00:00,1182385290423558144,[],1481735839,[212973087],1.1807568712480604e+18,212973087.0,,,,,


### **4. Data transformation** 

#### Nodes
Let's transform the node data so that we will have a set of unique node ids attached to most relevant node attributes. We select only location as the node attribute. This enables us flexible filtering of data based on location during the analysis.

In [248]:
nodes = dict(data[['user_id_str','user_defined_location']].values)

#### Edges
Let's define a method for transforming the data into a list of edges with edge attributes. For this we will conveniency class Counter from collections.

In [249]:
def init_edge(src,trg,edges):
    if (str(src),str(trg)) not in edges: edges[(src,trg)] = Counter({'orig_mentions':0, 'retweets':0, 'replies':0})            

def transform_to_edges(data: pd.DataFrame) -> dict:
    edges = {} # Dict where keys are tuples containing the source and target ids
               # values are Counters with attributes: orig_mentions, retweets, replies
    
    for i,row in data.iterrows():
        src      = row['user_id_str']
        par_id   = row['parent_user_id_str']
        reply_id = row['in_reply_to_user_id_str'] 
        mentions = list(row['mentions'])
        
        # Note that mentions are superset of retweets and replies, thus we 
        # first remove both of them, and then authentic mentions left.
        
        # Case 1. replies
        if reply_id in mentions:
            init_edge(src,reply_id,edges)
            edges[(src,reply_id)]['replies'] += 1
            mentions.remove(reply_id)
        
        # Case 2. retweets
        if par_id is not None:
            init_edge(src,par_id,edges)
            edges[(src,par_id)]['retweets'] += 1
            mentions.remove(par_id) 
            
        # Case 3. mentions
        for trg in mentions:
            init_edge(src,trg,edges)
            edges[(src,trg)]['orig_mentions'] += 1            
        
    return edges

In [250]:
edges = transform_to_edges(data_sel)
print(len(edges), 'edges found in the data set.')

227 edges found in the data set.


Inspect the resulting data:

In [267]:
k, rang = 110, 120
items = list(edges.items())
for i in range(k,rang):
    print(i,'Index {0}\nreplies:  {1}\nretweets: {2}\nmentions: {3}\n\n'.format(i,items[i][1]['replies'], items[i][1]['retweets'], items[i][1]['orig_mentions']))
    if k == rang: break

110 Index 110
replies:  0
retweets: 1
mentions: 0


111 Index 111
replies:  0
retweets: 1
mentions: 0


112 Index 112
replies:  0
retweets: 1
mentions: 0


113 Index 113
replies:  0
retweets: 1
mentions: 0


114 Index 114
replies:  1
retweets: 0
mentions: 0


115 Index 115
replies:  0
retweets: 1
mentions: 0


116 Index 116
replies:  1
retweets: 0
mentions: 0


117 Index 117
replies:  0
retweets: 0
mentions: 1


118 Index 118
replies:  0
retweets: 0
mentions: 1


119 Index 119
replies:  0
retweets: 0
mentions: 1




### **5. Data Output** 

In [268]:
edgelist = [(i[0],i[1],dict(k)) for i,k in edges.items()]
missing_tweets = list(out_retweet_ids)

In [269]:
with open('nodelist.json', 'w') as file:
    json.dump(nodes, file, allow_nan=False)
    
with open('edgelist.json', 'w') as file:
    json.dump(edgelist, file, allow_nan=False)