Social Network Analysis

This script takes JSON-formatted tweets as input and
extracts a social network of Twitter users, where the link/edge
between two nodes represents:

    from_user ---(retweets)---> to_user

The output file is a GML file, which can be opened in Gephi.
It is a "directed" and "weighted" social network.

A node with high in-degree centrality is frequently retweeted
and is considered a social influencer, since their tweets get "cited" often.

In [10]:
import json
from collections import defaultdict

# Raw string so the Windows backslashes are taken literally; a plain literal
# only works because "\c" happens to be an unrecognised escape sequence,
# which newer Python versions warn about.
input_file_name = r'E:\coronavirus-tweets\coronavirus_tweets_20200127.txt'
# Name of the GML file the network is written to.
output_file_name = 'coronavirus_tweets.gml'

# input_file_list = ['E:\coronavirus-tweets\coronavirus_tweets_20200127.txt',
#                     'E:\coronavirus-tweets\coronavirus_tweets_20200128.txt',
#                     'E:\coronavirus-tweets\coronavirus_tweets_20200128_2.txt',
#                     'E:\coronavirus-tweets\coronavirus_tweets_20200129_1.txt',
#                     'E:\coronavirus-tweets\coronavirus_tweets_20200129_2.txt',
#                     'E:\coronavirus-tweets\coronavirus_tweets_20200127.txt',
#                     'E:\coronavirus-tweets\coronavirus_tweets_20200127.txt',
#                   ]

import os

# Raw string keeps the Windows backslash literal without relying on "\c"
# being an unrecognised escape sequence.
directory = r'E:\coronavirus-tweets'

# Collect the full path of every .txt file in the directory.
# os.listdir() returns entries in arbitrary order, so sort the names to make
# the processing order (and therefore the node ids assigned later)
# reproducible from run to run.
input_file_list = [
    os.path.join(directory, filename)
    for filename in sorted(os.listdir(directory))
    if filename.endswith(".txt")
]

# Bare expression so the notebook displays the resulting list.
input_file_list

['E:\\coronavirus-tweets\\coronavirus_tweets_20200127.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200128.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200128_2.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200129_1.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200129_2.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200129_3.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200130.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200131.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200201.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200202.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200203.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200203_1.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200203_2.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200204_1.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200204_2.txt',
 'E:\\coronavirus-tweets\\coronavirus_tweets_20200204_3.txt',
 'E:\\coronavirus-twee

In [11]:
# From the input files (one JSON object per line), keep only the tweet
# "text" (as UTF-8 bytes) and the author's "screen_name" for tweets that are
# tagged with country code 'SG' or, when no usable place is attached, whose
# author's profile location mentions 'Singapore'.

tweets_texts = []
tweets_users = []


def _singapore_tweet_fields(line):
    """Return ``(text_bytes, screen_name)`` for a Singapore tweet, else None.

    The tweet's ``place.country_code`` decides when it can be read. The
    author's ``user.location`` is consulted only when that lookup fails
    (for example ``place`` is null or missing). Lines that are not valid
    JSON, or that lack the fields needed, yield None.
    """
    try:
        tweet = json.loads(line)
    except ValueError:
        # Not valid JSON (json.JSONDecodeError is a ValueError): skip the line.
        return None

    try:
        in_singapore = tweet['place']['country_code'] == 'SG'
    except (KeyError, TypeError):
        # No usable place information: fall back to the profile location.
        try:
            in_singapore = 'Singapore' in tweet['user']['location']
        except (KeyError, TypeError):
            return None

    if not in_singapore:
        return None

    try:
        # Extract both fields before anything is stored, so the two parallel
        # lists can never end up with different lengths.
        return tweet['text'].encode('utf-8'), tweet['user']['screen_name']
    except (KeyError, TypeError, AttributeError, ValueError):
        # Missing or non-string fields, or text that cannot be encoded.
        return None


for eachfile in input_file_list:
    # The context manager closes each file as soon as it has been read.
    # errors="replace" keeps the scan best-effort if a file holds stray bytes.
    with open(eachfile, "r", encoding="utf-8", errors="replace") as tweets_file:
        for line in tweets_file:
            fields = _singapore_tweet_fields(line)
            if fields is None:
                continue
            tweet_text, screen_name = fields
            tweets_texts.append(tweet_text)
            tweets_users.append(screen_name)

In [12]:
# Sanity check: show how many tweet texts were collected
print(len(tweets_texts))

6084


In [13]:
# Create the GML output file and emit the opening of the top-level
# "graph [" record. The handle stays open here; the node and edge records
# are written to it afterwards and it is closed once the graph is complete.
output_file = open(output_file_name, "w")
output_file.writelines(("graph\n", "[\n"))

2

In [14]:
# Build the retweet network from the collected tweets and finish the GML file.
# For every tweet, each "RT" token immediately followed by an "@name" token
# adds one to the weight of the edge from_user -> name
# (i.e. from_user "retweets" name).
pairwise_counter_dictionary = defaultdict(int)  # (source id, target id) -> count
users = {}  # user name -> integer node id, assigned in first-seen order


def _node_id(name):
    """Return the node id for *name*, allocating the next id on first sight."""
    # Ids are handed out sequentially from 0, so the next free id is len(users).
    if name not in users:
        users[name] = len(users)
    return users[name]


def _gml_escape(label):
    """Encode '&' and '"' as entities so a label cannot end its GML string early."""
    return label.replace("&", "&amp;").replace('"', "&quot;")


try:
    # Fail before writing anything if the two parallel lists are out of step.
    if len(tweets_texts) != len(tweets_users):
        raise ValueError(
            "tweets_texts and tweets_users differ in length: {} != {}".format(
                len(tweets_texts), len(tweets_users)))

    for text, from_user in zip(tweets_texts, tweets_users):
        prev_token = ""
        # Texts are stored as UTF-8 bytes: split on whitespace first, then
        # decode token by token.
        for raw_token in text.split():
            token = raw_token.decode('utf-8')

            if prev_token == 'RT' and token.startswith('@'):
                to_user = token.replace(":", "").replace("@", "")

                # Register the retweeter before the retweeted user so ids
                # are allocated in first-seen order.
                from_user_id = _node_id(from_user)
                to_user_id = _node_id(to_user)

                pairwise_counter_dictionary[from_user_id, to_user_id] += 1

            prev_token = token

    # Assemble the whole remainder of the file, then write it in one call.
    gml_lines = []

    # all nodes
    for name, node_id in users.items():
        gml_lines.append(
            '  node\n  [\n    id {}\n    label "{}"\n  ]\n'.format(
                node_id, _gml_escape(name)))

    # all edges; "value" carries the retweet count (edge weight)
    for (source_id, target_id), weight in pairwise_counter_dictionary.items():
        gml_lines.append(
            "  edge\n  [\n    source {}\n    target {}\n    value {}\n  ]\n".format(
                source_id, target_id, weight))

    gml_lines.append("]\n")  # closes the top-level graph record
    output_file.writelines(gml_lines)
finally:
    # Close the file even when something above fails, so the handle is
    # never leaked and a partial file is not silently appended to later.
    output_file.close()

In [15]:
# Save the notebook session to 'notebook_env_twitternetwork.db'; the same
# file name is passed to dill.load_session in the restore step.
import dill
dill.dump_session('notebook_env_twitternetwork.db')

In [None]:
# Restore the notebook session saved by dill.dump_session.
# Imported here as well so this cell also works in a fresh kernel, where the
# cell that saved the session (and imported dill) has not been run.
import dill
# NOTE(review): dill builds on pickle, so loading a session file can execute
# arbitrary code -- only load session files you created yourself.
dill.load_session('notebook_env_twitternetwork.db')