In [41]:
import numpy as np
import pandas as pd
import itertools

# Network data pre-processing
This notebook is a WIP, and will contain following sections:
1. **Data inspection**
2. **Data restructure**
2. **Data selection**
3. **Data transformation**
4. **Data output**

### **1. Data inspection**

### Loading the data

In [42]:
import os

path =  str(os.path.abspath(os.path.join('',"../../data/toy_data/")))
data = pd.read_json(path + "/data1.json",orient="records", lines=True, encoding="utf-8")

In [43]:
data.shape

(200, 36)

In [44]:
data.dtypes

created_at                   datetime64[ns, UTC]
id                                         int64
id_str                                     int64
text                                      object
source                                    object
truncated                                   bool
in_reply_to_status_id                    float64
in_reply_to_status_id_str                float64
in_reply_to_user_id                      float64
in_reply_to_user_id_str                  float64
in_reply_to_screen_name                   object
user                                      object
geo                                      float64
coordinates                              float64
place                                     object
contributors                             float64
retweeted_status                          object
is_quote_status                             bool
quote_count                                int64
reply_count                                int64
retweet_count       

### **2. Restructuring data**
#### Reducing complexity / flattening data

In [45]:
data['user_id']               = [row['id'] for row in data['user']]
data['user_name']             = [row['name'] for row in data['user']]
data['user_screen_name']      = [row['screen_name'] for row in data['user']]
data['user_defined_location'] = [row['location'] if pd.notnull(row['location']) else None for row in data['user']]
data['user_followers_count']  = [row['followers_count'] for row in data['user']]
data['place_country_code']    = [row['country_code'] if row != None else None for row in data['place']]
data['place_name']            = [row['name'] if row != None else None for row in data['place']]
data['place_type']            = [row['place_type'] if row != None else None for row in data['place']]
data['place_coordinates']     = [dict(row['bounding_box'])['coordinates'][0][0] if row != None else None for row in data['place']]
data['hashtags']              = [[htags['text'].lower() for htags in row['hashtags']] for row in data['entities'] if row['hashtags'] != ""]
data['user_created_at']       = [row['created_at'] for row in data['user']]
data['user_geo_enabled']      = [row['geo_enabled'] for row in data['user']]
data['is_retweet']            = [1 if b else 0 for b in data['retweeted_status'].notnull()]

#### Extracting parent tweet ids for retweets

In [46]:
parent_tweet_ids = []
for index, row in data.iterrows():
    if row['is_retweet']:
        tweet = row['retweeted_status']
        parent_tweet_ids.append(tweet['id_str'])
    else:
        parent_tweet_ids.append(None)
        
data['parent_tweet_id'] = parent_tweet_ids

#### Extracting boolean value for original video / image content

In [47]:
data['has_media'] = pd.notna(data['extended_entities'])

#### Extract ids of the mentioned users for each tweet

In [48]:
import re

def extract_mentions(x):
    return [n.replace("id': ","") for n in re.findall("id': [0-9]*", str(x))]

data['mentions'] = data['entities'].apply(lambda x: extract_mentions(x))

#### Extract hashtags

In [50]:
def extract_hashtags(x):
    return [n.replace("'text': ","") for n in re.findall("'text': '[a-zA-Z0-9_]*'", str(x))]

data['hashtags'] = data['entities'].apply(lambda x: extract_hashtags(x))

### **3. Data selection**

In [62]:
selection = ['created_at',                                  # Timestamp for possible time comparisons
             'id',                                          # Id of the tweet for collecting replies / retweets
             'hashtags',                                    # Hashtags for coloring the nodes
             'user_id',                                     # Identify / define nodes in the network
             'mentions',                                    # Define interaction-edges for mentions
             'retweeted',                                   # Judge if tweet is an origin-tweet in the graph
             'parent_tweet_id',                             # If retweet, find original tweet
             'place_coordinates','place_name','place_type',
             'user_defined_location',                       # Keep location/place data for filtering by city 
             'in_reply_to_user_id']                         # Define interaction-edges for replies
data_sel  = data[selection]

In [63]:
data_sel.shape

(200, 12)

In [61]:
n = 30
k = 10
data_sel.iloc[n:n+k]

Unnamed: 0,created_at,id,hashtags,user_id,mentions,retweeted,parent_tweet_id,place_coordinates,place_name,place_type,in_reply_to_user_id
30,2019-10-10 19:59:55+00:00,1182385288850620416,[],1114171414230241281,[47753979],False,,,,,47753979.0
31,2019-10-10 19:59:55+00:00,1182385288993222665,['wtylewizji'],826731067172335616,[],False,,,,,
32,2019-10-10 19:59:55+00:00,1182385289374982144,['Rom'],871823728145039361,"[22926365, 1006419421244678144]",False,1.181194087891968e+18,,,,
33,2019-10-10 19:59:55+00:00,1182385289383292930,[],1125031585726849025,"[1156281409193086976, 16465385]",False,1.182300609833255e+18,,,,
34,2019-10-10 19:59:55+00:00,1182385289530097665,[],69903520,[],False,,"[-76.712759, 44.16054]",Kingston,city,
35,2019-10-10 19:59:55+00:00,1182385289806745600,[],536068379,[],False,,,,,
36,2019-10-10 19:59:55+00:00,1182385290016677888,[],1358892625,"[6134882, 1177946826407849987, 118237722104814...",False,1.182377221048144e+18,,,,
37,2019-10-10 19:59:55+00:00,1182385290029215745,[],20725516,[],False,,,,,
38,2019-10-10 19:59:55+00:00,1182385290209480704,[],965014538130083840,[1653217514],False,1.1823724291229368e+18,,,,
39,2019-10-10 19:59:55+00:00,1182385290423558144,[],1481735839,[212973087],False,1.1807568712480604e+18,,,,


### **4. Data transformation** 

In [86]:
data_sel.groupby(by=['user_id'], axis=0).count()[0:]

Unnamed: 0_level_0,created_at,id,hashtags,mentions,retweeted,parent_tweet_id,place_coordinates,place_name,place_type,user_defined_location,in_reply_to_user_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4922631,1,1,1,1,1,0,0,0,0,1,0
14226882,1,1,1,1,1,0,0,0,0,1,0
15677734,1,1,1,1,1,1,0,0,0,1,0
16434865,1,1,1,1,1,1,0,0,0,1,0
17190323,1,1,1,1,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
1163212071015768065,1,1,1,1,1,0,0,0,0,1,0
1169600083349909506,1,1,1,1,1,1,0,0,0,0,0
1174425577476022272,1,1,1,1,1,0,0,0,0,0,0
1181481224587694080,1,1,1,1,1,1,0,0,0,1,0


### **5. Data output** 

In [58]:
#TODO