Network Generation + Sentiment Analysis

In [1]:
# The needed NLTK imports
import nltk # needed for download statements below (if your machine requires them)
from nltk.corpus import stopwords # The list of stopwords 
from nltk.tokenize import word_tokenize # Word tokenization functionality (string -> list of seperate words)
from nltk.tokenize import RegexpTokenizer # allows tokenization with regex expressions
from nltk.stem import WordNetLemmatizer # Allows for lemmatization 
from nltk.sentiment import SentimentIntensityAnalyzer
from statistics import mean 

# Below are 2 downloads you might need to run once.
# There may be others (I didnt fully track them), at the bottom the error message will tell you which ones to download

# nltk.download('wordnet')
# nltk.download('omw-1.4') 
# nltk.download('vader_lexicon')
    
import re # for regex functions

import networkx as nx # networkx
import jsonlines as jl # for working with jsonl type files 


# Helpers with the text processing, seperated for more readability in the function
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')
tokeniser = RegexpTokenizer(r'\w+')
stop = set(stopwords.words('english'))
# Adding 'covid' to list of deleted words
stop.add('covid')
stop.add('coronavirus') 
stop.add('corona') 
stop.add('virus') 
stop.add('amp') 
lemmatizer = WordNetLemmatizer()

# setting up sentiment analysis
sia = SentimentIntensityAnalyzer()
all_sentiment = []
mask_sentiment= []
doctor_sentiment= []
nurse_sentiment= []
vaccine_sentiment= []
vaccinate_sentiment= []
lockdown_sentiment= []
restriction_sentiment= []
school_sentiment= []
government_sentiment= []
china_sentiment= []

uk_sentiment= []
canada_sentiment= []
italy_sentiment= []
india_sentiment= []
freedom_sentiment= []



# Pre-processing the tweet
def text_processing(tweet_text):

    tweet_text = tweet_text.lower() # lowercase

    tweet_text = tweet_text.replace('new zealand', 'newzealand') # new zealand case

    tweet_text = re.sub("(\S+://\S+)", "", tweet_text) # URLs

    tweet_text = re.sub("(\d+)", "", tweet_text) # Numbers

    tweet_text = re.sub("[.,!?:;-=-'...\"@#_]", " ", tweet_text) # symbols + punctuation

    tweet_text = deEmojify(tweet_text) # emojis

    tweet_text = tokeniser.tokenize(tweet_text) # Tokenization

    tweet_text = [word for word in tweet_text if word not in stop] # Stop words

    for i in range (0, len(tweet_text)): # Verb lemmatization
        tweet_text[i] = lemmatizer.lemmatize(tweet_text[i], pos='v')
        
    tweet_text = list(set(tweet_text)) # Removing duplicate words
    
    return tweet_text # returned processed tweet as a list

   
# Initializing the networkx graph
G = nx.Graph()

# Adding nodes and edges from the tweets to the nx graph
# Set hydrated jsonl tweet file below
with jl.open('15.may_29_2020_world_death_toll_1000000.jsonl') as json_Lines:
    for json_obj in json_Lines:
        for i in range(0,len(json_obj["data"])):
            
            tweet = json_obj["data"][i]["text"]            

            processed_tweet = text_processing(tweet)             
            
            # SENTIMENT ANALYSIS
            # getting the compound polarity score of the tweet
            sentiment = sia.polarity_scores(tweet)['compound']
            
            # Removing null sentiments
            if sentiment != 0:
                all_sentiment.append(sentiment)

            
            # Sentiment of tweets with particular keywords in them
            
            if "mask" in tweet:              
                if sentiment != 0:
                    mask_sentiment.append(sentiment)
            
            if "doctor" in tweet:              
                if sentiment != 0:
                    doctor_sentiment.append(sentiment)
            
            if "nurse" in tweet:              
                if sentiment != 0:
                    nurse_sentiment.append(sentiment)
                        

            if "vaccine" in tweet:              
                if sentiment != 0:
                    vaccine_sentiment.append(sentiment)
            
            if "vaccinate" in tweet:              
                if sentiment != 0:
                    vaccinate_sentiment.append(sentiment)
            
            if "lockdown" in tweet:              
                if sentiment != 0:
                    lockdown_sentiment.append(sentiment)
            
            if "restriction" in tweet:              
                if sentiment != 0:
                    restriction_sentiment.append(sentiment)
            
            if "school" in tweet:              
                if sentiment != 0:
                    school_sentiment.append(sentiment)
            
            if "government" in tweet:              
                if sentiment != 0:
                    government_sentiment.append(sentiment)
            
            if "china" in tweet:              
                if sentiment != 0:
                    china_sentiment.append(sentiment)
                                    
            if " uk " in tweet:              
                if sentiment != 0:
                    uk_sentiment.append(sentiment)
                    
            if "canada" in tweet:              
                if sentiment != 0:
                    canada_sentiment.append(sentiment)
                    
            if "italy" in tweet:              
                if sentiment != 0:
                    italy_sentiment.append(sentiment)
                    
            if "india" in tweet:              
                if sentiment != 0:
                    india_sentiment.append(sentiment)
                    
            if "freedom" in tweet:              
                if sentiment != 0:
                    freedom_sentiment.append(sentiment)
                    
            
            # add all unique words as nodes from the tweets
            # add_nodes_from will not add duplicates
            G.add_nodes_from(processed_tweet) 
            
                       
            # now need to add edges between all the word nodes we have just added
            # for each word of the tweet
            for j in range(0,len(processed_tweet)):
                
                #For each word after that word
                for k in range(j + 1,len(processed_tweet)): 
                    
                    # if the edge exists already for the 2 words, increase weight by one
                    if G.has_edge(processed_tweet[j], processed_tweet[k]): 
                        G.edges[processed_tweet[j], processed_tweet[k]]['weight'] += 1
                    
                    # else add a new edge between words, of weight 1
                    else:                                                    
                        G.add_edge(processed_tweet[j], processed_tweet[k], weight=1)
                        
                        

# printing the number of words and edges (Ensuring nothing went catastrophically wrong)                          

print("-----------------------------")
print(len(G.nodes))
print("-----------------------------")
print(len(G.edges))
print("-----------------------------")

print("Total Sentiment", mean(all_sentiment))

print("Mask Sentiment", mean(mask_sentiment))
print("Doctor Sentiment", mean(doctor_sentiment))
print("Nurse Sentiment", mean(nurse_sentiment))
print("Vaccine Sentiment", mean(vaccine_sentiment))
print("Vaccinate Sentiment", mean(vaccinate_sentiment))
print("Lockdown Sentiment", mean(lockdown_sentiment))
print("Restriction Sentiment", mean(restriction_sentiment))
print("School Sentiment", mean(school_sentiment))
print("Government Sentiment", mean(government_sentiment))

print("China Sentiment", mean(china_sentiment))
print("UK Sentiment", mean(uk_sentiment))
print("Canada Sentiment", mean(canada_sentiment))
print("Italy Sentiment", mean(italy_sentiment))
print("India Sentiment", mean(india_sentiment))
print("Freedom Sentiment", mean(freedom_sentiment))


#UNCOMMENT OUT BELOW IF YOU WANT TO GENERATE A GEXF FILE FOR GEPHI VISUALIZATION

# making a gexf file (for Gephi) of the networkx network
# nx.write_gexf(G, "large_network.gexf")

-----------------------------
368439
-----------------------------
21091280
-----------------------------
Total Sentiment 0.01650823248140731
Mask Sentiment 0.08409740941137892
Doctor Sentiment -0.05267082399103139
Nurse Sentiment 0.10144083541147132
Vaccine Sentiment 0.09008215927901417
Vaccinate Sentiment -0.157518125
Lockdown Sentiment 0.03800913097894107
Restriction Sentiment 0.16075354424357755
School Sentiment 0.10206796187683284
Government Sentiment -0.055149519343493554
China Sentiment -0.1419472584856397
UK Sentiment -0.1420010101010101
Canada Sentiment 0.18068385416666666
Italy Sentiment 0.056527777777777774
India Sentiment 0.13400011560693642
Freedom Sentiment 0.2532687


Degree Distribution

In [None]:
import networkx as nx # networkx
import numpy as np # numpy
import matplotlib.pyplot as plt
import matplotlib as mpl
import logging
from networkx.algorithms.community import greedy_modularity_communities
import networkx.algorithms.community as nx_comm

# reading from gexf file
G = nx.read_gexf("large_network.gexf")

# Function to plot a degree distribution graph (P(k) vs k graph)
# Code taken from CPSC 572 tutorial 
def plot_degree_dist(G):
    
    degrees = [G.degree(n) for n in G.nodes()]
    kmin = min(degrees)
    kmax = max(degrees)
    
    if kmin>0:
        bin_edges = np.logspace(np.log10(kmin), np.log10(kmax)+1, num=20)
    else:
        bin_edges = np.logspace(0, np.log10(kmax)+1, num=30)
    density, _ = np.histogram(degrees, bins=bin_edges, density=True)

    fig = plt.figure(figsize=(6,4))
    log_be = np.log10(bin_edges)    
    x = 10**((log_be[1:] + log_be[:-1])/2) 
    
    plt.loglog(x, density, marker='o', linestyle='none')    
     
    plt.xlabel(r"degree $k$", fontsize=16)
    plt.ylabel(r"$P(k)$", fontsize=16)

    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    
    # Show the plot
    plt.show()
    

# SHOW DEGREE DISTRIBUTION
plot_degree_dist(G)



Top X eigencentral nodes

In [None]:
import networkx as nx # networkx
import numpy as np # numpy
import matplotlib.pyplot as plt
import matplotlib as mpl
import logging
from networkx.algorithms.community import greedy_modularity_communities
import networkx.algorithms.community as nx_comm

# reading from gexf file
G = nx.read_gexf("large_network.gexf")

# # A list of the top eigan nodes and their eigan values
eigans =  sorted(nx.eigenvector_centrality(G, weight='weight').items(), key=lambda x:x[1])
eigans.reverse()

# range (X)
for i in range(200):
    print(i+1, ":", eigans[i])

Top X other words used in the same tweet with a designated word 

In [4]:
import networkx as nx # networkx
import numpy as np # numpy
import matplotlib.pyplot as plt
import matplotlib as mpl
import logging
from networkx.algorithms.community import greedy_modularity_communities
import networkx.algorithms.community as nx_comm

# reading from gexf file
G = nx.read_gexf("small_network.gexf")

edges = sorted(G.edges(data=True), key=lambda t: t[2].get('weight', 1))
edges.reverse()

# designated word goes here
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'mask' or edges[i][1] == 'mask':
        j += 1
        print(j, ":", edges[i])

print("\n\n")
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'doctor' or edges[i][1] == 'doctor':
        j += 1
        print(j, ":", edges[i])

print("\n\n")
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'nurse' or edges[i][1] == 'nurse':
        j += 1
        print(j, ":", edges[i])
        
print("\n\n")
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'vaccine' or edges[i][1] == 'vaccine':
        j += 1
        print(j, ":", edges[i])
 
print("\n\n")
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'vaccinate' or edges[i][1] == 'vaccinate':
        j += 1
        print(j, ":", edges[i])
      
print("\n\n")    
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'lockdown' or edges[i][1] == 'lockdown':
        j += 1
        print(j, ":", edges[i])
      
print("\n\n")    
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'restriction' or edges[i][1] == 'restriction':
        j += 1
        print(j, ":", edges[i])
   
print("\n\n")
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'school' or edges[i][1] == 'school':
        j += 1
        print(j, ":", edges[i])
    
print("\n\n")    
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'government' or edges[i][1] == 'government':
        j += 1
        print(j, ":", edges[i])
     
print("\n\n")    
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'china' or edges[i][1] == 'china':
        j += 1
        print(j, ":", edges[i])
    
print("\n\n")    
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'uk' or edges[i][1] == 'uk':
        j += 1
        print(j, ":", edges[i])
    
print("\n\n")    
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'canada' or edges[i][1] == 'canada':
        j += 1
        print(j, ":", edges[i])
     
print("\n\n")    
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'italy' or edges[i][1] == 'italy':
        j += 1
        print(j, ":", edges[i])
    
print("\n\n")    
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'india' or edges[i][1] == 'india':
        j += 1
        print(j, ":", edges[i])

print("\n\n")        
j = 0
X = 5 #number of other top words to find
for i in range(len(edges)):
    if j == X:
        break
    # tupal so you have to check both sides of the node
    if edges[i][0] == 'freedom' or edges[i][1] == 'freedom':
        j += 1
        print(j, ":", edges[i])
        
    

1 : ('mask', 'wear', {'id': '160330', 'weight': 49.0})
2 : ('face', 'mask', {'id': '59167', 'weight': 49.0})
3 : ('mask', 'distance', {'id': '160390', 'weight': 19.0})
4 : ('mask', 'use', {'id': '160292', 'weight': 17.0})
5 : ('people', 'mask', {'id': '152556', 'weight': 17.0})



1 : ('test', 'doctor', {'id': '119897', 'weight': 11.0})
2 : ('patients', 'doctor', {'id': '8622', 'weight': 10.0})
3 : ('get', 'doctor', {'id': '11453', 'weight': 9.0})
4 : ('doctor', 'nurse', {'id': '351173', 'weight': 7.0})
5 : ('die', 'doctor', {'id': '343747', 'weight': 7.0})



1 : ('home', 'nurse', {'id': '804', 'weight': 17.0})
2 : ('doctor', 'nurse', {'id': '351173', 'weight': 7.0})
3 : ('state', 'nurse', {'id': '352083', 'weight': 6.0})
4 : ('health', 'nurse', {'id': '189815', 'weight': 6.0})
5 : ('pandemic', 'nurse', {'id': '45922', 'weight': 5.0})



1 : ('vaccine', 'astrazeneca', {'id': '303421', 'weight': 11.0})
2 : ('china', 'vaccine', {'id': '182495', 'weight': 10.0})
3 : ('say', 'vaccine', {'

Position of pandemic keywords in eigan nodes

In [3]:
import networkx as nx # networkx
import numpy as np # numpy
import matplotlib.pyplot as plt
import matplotlib as mpl
import logging
from networkx.algorithms.community import greedy_modularity_communities
import networkx.algorithms.community as nx_comm

# reading from gexf file
G = nx.read_gexf("small_network.gexf")

# # A list of the top eigan nodes and their eigan values
eigans =  sorted(nx.eigenvector_centrality(G, weight='weight').items(), key=lambda x:x[1])
eigans.reverse()

# range (X)
for i in range(len(eigans)):

    if eigans[i][0] == 'mask':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'doctor':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'nurse':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'vaccine':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'vaccinate':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'lockdown':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'restriction':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'school':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'government':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'china':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'uk':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'canada':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'italy':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'india':
        print(i+1, ":", eigans[i])

    if eigans[i][0] == 'freedom':
        print(i+1, ":", eigans[i])





12 : ('lockdown', 0.11872186686201039)
19 : ('uk', 0.0979377322016567)
26 : ('india', 0.08337886987352307)
34 : ('government', 0.07696015528150099)
96 : ('mask', 0.043740849053814444)
155 : ('china', 0.031403762359685033)
199 : ('school', 0.026766348004618974)
208 : ('doctor', 0.025981965628130117)
293 : ('vaccine', 0.020456203020902137)
446 : ('nurse', 0.014640031604647217)
1385 : ('freedom', 0.004143464242908818)
1422 : ('canada', 0.003993890308765853)
1453 : ('italy', 0.0038647767824698596)
2672 : ('restriction', 0.0017486540666958495)
9679 : ('vaccinate', 0.0003870582477794255)
