In [230]:
import numpy as np
import pandas as pd
import itertools
import json
import os
from collections import Counter

# Network data pre-processing
This notebook is a WIP, and will contain following sections:
1. **Data inspection**
2. **Data restructure**
2. **Data selection**
3. **Data transformation**
4. **Data output**

### **1. Data inspection**

### Loading the data

In [231]:
path =  str(os.path.abspath(os.path.join('',"../../data/toy_data/")))
data = pd.read_json(path + "/data1.json",orient="records", lines=True, encoding="utf-8")

In [232]:
data.shape

(200, 36)

In [233]:
data.dtypes

created_at                   datetime64[ns, UTC]
id                                         int64
id_str                                     int64
text                                      object
source                                    object
truncated                                   bool
in_reply_to_status_id                    float64
in_reply_to_status_id_str                float64
in_reply_to_user_id                      float64
in_reply_to_user_id_str                  float64
in_reply_to_screen_name                   object
user                                      object
geo                                      float64
coordinates                              float64
place                                     object
contributors                             float64
retweeted_status                          object
is_quote_status                             bool
quote_count                                int64
reply_count                                int64
retweet_count       

### **2. Restructuring data**
#### Reducing complexity / flattening data

In [234]:
data['user_id']               = [row['id'] for row in data['user']]
data['user_name']             = [row['name'] for row in data['user']]
data['user_screen_name']      = [row['screen_name'] for row in data['user']]
data['user_defined_location'] = [row['location'] if pd.notnull(row['location']) else None for row in data['user']]
data['user_followers_count']  = [row['followers_count'] for row in data['user']]
data['place_country_code']    = [row['country_code'] if row != None else None for row in data['place']]
data['place_name']            = [row['name'] if row != None else None for row in data['place']]
data['place_type']            = [row['place_type'] if row != None else None for row in data['place']]
data['place_coordinates']     = [dict(row['bounding_box'])['coordinates'][0][0] if row != None else None for row in data['place']]
data['hashtags']              = [[htags['text'].lower() for htags in row['hashtags']] for row in data['entities'] if row['hashtags'] != ""]
data['user_created_at']       = [row['created_at'] for row in data['user']]
data['user_geo_enabled']      = [row['geo_enabled'] for row in data['user']]
data['is_retweet']            = [1 if b else 0 for b in data['retweeted_status'].notnull()]

#### Extracting parent tweet ids for retweets

In [235]:
parent_tweet_ids = []
for index, row in data.iterrows():
    if row['is_retweet']:
        tweet = row['retweeted_status']
        parent_tweet_ids.append(tweet['id_str'])
    else:
        parent_tweet_ids.append(None)
        
data['parent_tweet_id'] = parent_tweet_ids

#### Extracting boolean value for original video / image content

In [236]:
data['has_media'] = pd.notna(data['extended_entities'])

#### Extract ids of the mentioned users for each tweet

In [237]:
import re

def extract_mentions(x):
    return [n.replace("id': ","") for n in re.findall("id': [0-9]*", str(x))]

data['mentions'] = data['entities'].apply(lambda x: extract_mentions(x))

#### Extract hashtags

In [238]:
def extract_hashtags(x):
    return [n.replace("'text': ","") for n in re.findall("'text': '[a-zA-Z0-9_]*'", str(x))]

data['hashtags'] = data['entities'].apply(lambda x: extract_hashtags(x))

### **3. Data selection**

In [239]:
selection = ['created_at',                                  # Timestamp for possible time comparisons
             'id',                                          # Id of the tweet for collecting replies / retweets
             'hashtags',                                    # Hashtags for coloring the nodes
             'user_id',
             'user_name',                                   # Identify / define nodes in the network
             'mentions',                                    # Define interaction-edges for mentions
             'retweeted',                                   # Judge if tweet is an origin-tweet in the graph
             'parent_tweet_id',                             # If retweet, find original tweet
             'place_coordinates','place_name','place_type',
             'user_defined_location',                       # Keep location/place data for filtering by city 
             'in_reply_to_user_id']                         # Define interaction-edges for replies
data_sel  = data[selection]

In [240]:
data_sel.shape

(200, 13)

In [241]:
n = 30
k = 10
data_sel.iloc[n:n+k]

Unnamed: 0,created_at,id,hashtags,user_id,user_name,mentions,retweeted,parent_tweet_id,place_coordinates,place_name,place_type,user_defined_location,in_reply_to_user_id
30,2019-10-10 19:59:55+00:00,1182385288850620416,[],1114171414230241281,volcano70,[47753979],False,,,,,,47753979.0
31,2019-10-10 19:59:55+00:00,1182385288993222665,['wtylewizji'],826731067172335616,Robert Palikot,[],False,,,,,"Poznań, Polska",
32,2019-10-10 19:59:55+00:00,1182385289374982144,['Rom'],871823728145039361,riesenpanda,"[22926365, 1006419421244678144]",False,1.181194087891968e+18,,,,"Ludwigshafen am Rhein, Germany",
33,2019-10-10 19:59:55+00:00,1182385289383292930,[],1125031585726849025,Provezza 🌊🌪,"[1156281409193086976, 16465385]",False,1.182300609833255e+18,,,,,
34,2019-10-10 19:59:55+00:00,1182385289530097665,[],69903520,Mrs. Putin☕️ Tamara Alexis,[],False,,"[-76.712759, 44.16054]",Kingston,city,Pluto America,
35,2019-10-10 19:59:55+00:00,1182385289806745600,[],536068379,D Matheson,[],False,,,,,"Victoria, Australia",
36,2019-10-10 19:59:55+00:00,1182385290016677888,[],1358892625,Ivan Darias Alfonso,"[6134882, 1177946826407849987, 118237722104814...",False,1.182377221048144e+18,,,,"Vienna, Austria",
37,2019-10-10 19:59:55+00:00,1182385290029215745,[],20725516,Ben McKenna,[],False,,,,,West Yorks via North London,
38,2019-10-10 19:59:55+00:00,1182385290209480704,[],965014538130083840,Hector Mendoza,[1653217514],False,1.1823724291229368e+18,,,,,
39,2019-10-10 19:59:55+00:00,1182385290423558144,[],1481735839,Jostein Gråfelder,[212973087],False,1.1807568712480604e+18,,,,,


### **4. Data transformation** 

Let's define a method for transforming the data into a list of edges with edge attributes. For this we will conveniency class Counter from collections.

In [242]:
in_retweet_ids  = set(data['parent_tweet_id'].values)
out_retweet_ids = set() 

In [243]:
import math

def init_edge(src,trg,edges):
    if (src,trg) not in edges: edges[(src,trg)] = Counter({'orig_mentions':0, 'retweets':0, 'replies':0})
            

def transform_to_edges(data: pd.DataFrame) -> dict:
    edges = {} # Dict where keys are tuples containing the source and target ids
               # values are Counters with attributes: orig_mentions, retweets, replies
    
    for i,row in data.iterrows():
        src      = row['user_id']
        rtd      = row['is_retweet']
        par_id   = row['parent_tweet_id']
        reply_id = row['in_reply_to_user_id']
        attr     = None
        
        mentions = list(row['mentions'])
        
        # Case 1. reply
        if reply_id in mentions:
            init_edge(src,reply_id,edges)
            edges[(src,reply_id)]['replies'] += 1
            mentions.remove(reply_id)
            
        # Case 2. retweet
        if rtd:
            if len(mentions) == 1:
                # Handle the special case: if only one mention left, then it must be the retweet
                trg = int(mentions.pop())
                init_edge(src,trg,edges)
                edges[(src,trg)]['retweets'] += 1
                
            elif par_id in retweet_ids_in:
                trg = data['user_id'].loc[data['id'] == par_id]
                init_edge(src,trg,edges)
                edges[(src,trg)]['retweets'] += 1
                mentions.remove(trg)
            else:
                out_retweet_ids.add(par_id)
                # Can't remove mention as we don't know which one of the left mentions 
                # is the right user id. This can be resolved when / if the tweet id is 
                # queried on Twitter Search API, meanwhile one of the retweets will be 
                # mislabeled as mention.
                
        # Add left ones as original (or mislabeled) tweet with a mention
        for trg in mentions:
            init_edge(src,trg,edges)
            edges[(src,trg)]['orig_mentions'] += 1
        
    return edges

In [244]:
edges = transform_to_edges(data)

In [245]:
print("{} edges found, {} retweets were mislabeled as mentions.".format(len(edges),len(out_retweet_ids)))

227 edges found, 27 retweets were mislabeled as mentions.


Inspect the resulting data:

In [246]:
k, rang = 0, 10

for i,counters in edges.items():
    print('Index {0}\nreplies:  {1}\nretweets: {2}\nmentions: {3}\n\n'.format(i,counters['replies'], counters['retweets'], counters['orig_mentions']))
    k += 1
    if k == rang: break

Index (2738228687, 459390022)
replies:  0
retweets: 1
mentions: 0


Index (116893165, 753974664041533440)
replies:  0
retweets: 1
mentions: 0


Index (350914441, 3243658266)
replies:  0
retweets: 1
mentions: 0


Index (1181481224587694080, 14266598)
replies:  0
retweets: 1
mentions: 0


Index (1159118914514771969, '911254908921298944')
replies:  0
retweets: 0
mentions: 1


Index (1159118914514771969, '1182248816105463809')
replies:  0
retweets: 0
mentions: 1


Index (1159118914514771969, '1182248889098874880')
replies:  0
retweets: 0
mentions: 1


Index (1159118914514771969, '1035407102')
replies:  0
retweets: 0
mentions: 1


Index (258950247, 415763567)
replies:  0
retweets: 1
mentions: 0


Index (930876414, 16031927)
replies:  0
retweets: 1
mentions: 0




### **5. Data Output** 

In [247]:
edgelist = [(i[0],i[1],dict(k)) for i,k in edges.items()]
missing_tweets = list(out_retweet_ids)

In [248]:
with open('edgelist.json', 'w') as file:
    json.dump(edgelist, file, allow_nan=False)
    
with open('missing_tweets.json', 'w') as file:
    json.dump(missing_tweets, file, allow_nan=False)