In [None]:
import os
import re
import string
import warnings 

import matplotlib.pyplot as plt 
import networkx as nx          #Drawing Network
import nltk
import numpy as np 
import pandas as pd 
import seaborn as sns
import tweepy                  #Getting Twitter data like tweets, followers, friends
from textblob import TextBlob  #Sentiment Analysis

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
#Credentials obtained by setting up a Twitter developer account.
#Each value is taken from the environment variable TWITTER_<NAME> when it is set, so real keys
#never have to be saved inside the notebook; otherwise the original placeholder is used.
credential_placeholder = '-----------------'
t_credentials = dict()
for credential_name in ('CONSUMER_KEY','CONSUMER_SECRET','ACCESS_KEY','ACCESS_SECRET'):
    t_credentials[credential_name] = os.environ.get('TWITTER_'+credential_name, credential_placeholder)

#load Twitter API credentials
consumer_key = t_credentials['CONSUMER_KEY']
consumer_secret = t_credentials['CONSUMER_SECRET']
access_key = t_credentials['ACCESS_KEY']
access_secret = t_credentials['ACCESS_SECRET']

#Build the authenticated client; wait_on_rate_limit=True is passed so tweepy waits when rate limited.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)

In [None]:
#DISABLED CELL: the code that originally collected the tweets through the Twitter search API.
#It is wrapped in a triple-quoted string, so running the cell only evaluates that string and
#makes no API calls. The collected data is already stored in brexit_tweets_april.csv, which is
#read in the next cell, so there is no need to run this code again.
#For each tweet it records the columns listed in tweet_cols, with the hashtags lower-cased and
#prefixed by '#' and the mentioned screen names prefixed by '@'.
'''

hash_tags = '#brexit -filter:retweets'
tweet_cols = ['created_at','id','screen_name','location','followers_count','friends_count','retweeted','retweet_count',
              'text','tags','mentions']
tweet_df = pd.DataFrame(columns = tweet_cols)
for tweet in tweepy.Cursor(api.search,q=hash_tags, result_type='recent', # Example Values: mixed, recent, popular
                           lang="en",tweet_mode='extended',until='2019-04-30',wait_on_rate_limit=True).items(400000):
    tags=[]
    for i in range(len(tweet.entities['hashtags'])):
        tags.append('#'+tweet.entities['hashtags'][i]['text'].lower())
    mentions = []
    for i in range(len(tweet.entities['user_mentions'])):
        mentions.append('@'+tweet.entities['user_mentions'][i]['screen_name'])
    df = pd.DataFrame([[tweet.created_at,tweet.id,tweet.user.screen_name,tweet.user.location,tweet.user.followers_count,
                        tweet.user.friends_count,tweet.retweeted,tweet.retweet_count,tweet.full_text,tags,mentions]],columns = tweet_cols)
    tweet_df = tweet_df.append(df)
    tweet_df_rows = tweet_df.shape[0]
    if(tweet_df_rows%100==0):
        print(str(tweet_df_rows)+'---'+str(df['created_at']))
tweet_df.reset_index(drop=True,inplace=True)
tweet_df = tweet_df.sort_values(['created_at'],ascending=[False])
print(tweet_df.shape)
print(tweet_df['created_at'].min())
print(tweet_df['created_at'].max())
print(tweet_df['retweet_count'].max())
tweet_df.head()
'''

In [None]:
#Load the saved April tweets and prepare the working frame in one chain.
#The last header is read as 'mentions\r' (presumably the file has Windows line endings while
#lineterminator is '\n' - TODO confirm), so a stripped copy is stored under the clean name
#'mentions' and the raw column is dropped together with the columns not used later.
#dummy_count is a constant 1 per tweet; summing it per user gives that user's tweet count.
tweet_df = (
    pd.read_csv('Outputs/brexit_tweets_april.csv',lineterminator='\n')
    .assign(mentions=lambda loaded: loaded['mentions\r'].str.strip(), dummy_count=1)
    .drop(['Unnamed: 0','id','location','retweeted','mentions\r'],axis='columns')
)
print(tweet_df.shape)
print(tweet_df['created_at'].min())
print(tweet_df['created_at'].max())
tweet_df.head()

In [None]:
#Number of distinct accounts (screen names) in the whole April data set.
len(set(tweet_df['screen_name']))

In [None]:
#Minimum total retweets a user needs within a week to be kept in that week's stats table.
#The value is also embedded in the names of the CSV and JPEG files read and written below.
retweet_threshold = 10  #300 worked well for first 3 weeks. Produced 42 nodes

def clean_tweet(tweet):
    """Strip mentions, links and punctuation from the text of a tweet.

    Three kinds of text are each replaced by a space:
      * ``@`` followed by ASCII letters/digits (a mention),
      * any single character that is not an ASCII letter, digit, space or tab,
      * ``scheme://...`` up to the next whitespace (a link).
    The remaining words are then re-joined with single spaces, so the result has
    no leading, trailing or repeated whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text (an empty string if nothing is left).
    """
    # Raw string: \w, \/ and \S are regex escapes; in a normal string literal they are
    # invalid Python escape sequences and produce a warning.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", tweet).split())

def fill_sentiments(tweet_df):
    """Score every tweet in `tweet_df` with TextBlob and store the results.

    Each value of the 'text' column is first passed through `clean_tweet`
    (the original code named its variable `cleaned_tweet` but scored the raw
    text), then analysed with TextBlob.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Must contain a 'text' column holding strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with two float columns added or
        overwritten: 'tweet_sentiment_polarity' and
        'tweet_sentiment_subjectivity'.
    """
    polarities = []
    subjectivities = []
    for raw_text in tweet_df['text']:
        sentiment = TextBlob(clean_tweet(raw_text)).sentiment
        polarities.append(sentiment.polarity)
        subjectivities.append(sentiment.subjectivity)
    # Assigning whole arrays is positional, so it does not depend on the index labels.
    tweet_df['tweet_sentiment_polarity'] = np.array(polarities, dtype=float)
    tweet_df['tweet_sentiment_subjectivity'] = np.array(subjectivities, dtype=float)
    return tweet_df

### Week 1 tweets

In [None]:
#Week 1: every tweet created up to and including 6 April.
#created_at is compared as text (the CSV is read without date parsing), which presumably works
#because the timestamps are stored as 'YYYY-MM-DD HH:MM:SS' - TODO confirm.
week1_mask = tweet_df['created_at']<='2019-04-06 23:59:59'
#reset_index() (not inplace) returns a new frame, so fill_sentiments() adds its columns to an
#independent frame rather than to a slice of tweet_df (which raises SettingWithCopyWarning).
tweet_df_week1 = fill_sentiments(tweet_df[week1_mask].reset_index(drop=True))
print(tweet_df_week1.shape)
print(tweet_df_week1['created_at'].min())
print(tweet_df_week1['created_at'].max())
tweet_df_week1.head()

In [None]:
#Per-user aggregates for week 1. One groupby is shared by all six metrics; the single-metric
#frames keep their original names.
tweets_by_user_week1 = tweet_df_week1.groupby('screen_name')
retweets_df_week1 = tweets_by_user_week1[['retweet_count']].sum()
followers_df_week1 = tweets_by_user_week1[['followers_count']].max() #Max of the week
friends_df_week1 = tweets_by_user_week1[['friends_count']].max()  #Max of the week
sentiment_polarity_df_week1 = tweets_by_user_week1[['tweet_sentiment_polarity']].sum()
sentiment_subjectivity_df_week1 = tweets_by_user_week1[['tweet_sentiment_subjectivity']].sum()
tweet_count_df_week1 = tweets_by_user_week1[['dummy_count']].sum()

#Combine the metrics side by side (all share the screen_name index) and expose the index as a column.
week1_stats_df = pd.concat([tweet_count_df_week1,retweets_df_week1,followers_df_week1,friends_df_week1,
                            sentiment_polarity_df_week1,sentiment_subjectivity_df_week1],axis='columns')
week1_stats_df['screen_name'] = week1_stats_df.index
week1_stats_df = week1_stats_df[['screen_name','dummy_count','retweet_count','followers_count','friends_count',
                                 'tweet_sentiment_polarity','tweet_sentiment_subjectivity']]
week1_stats_df = week1_stats_df.rename(columns={"retweet_count":"total_retweet_count","followers_count":"max_followers_count",
                                                "friends_count":"max_friends_count","dummy_count":"total_tweet_count",
                                                "tweet_sentiment_polarity":"agg_sentiment_polarity",
                                                "tweet_sentiment_subjectivity":"agg_sentiment_subjectivity"})
#Keep only users at or above the retweet threshold. .copy() makes the filtered result an independent
#frame, so the column assignments below do not raise SettingWithCopyWarning.
week1_stats_df = week1_stats_df[week1_stats_df['total_retweet_count']>=retweet_threshold].copy()
#Turn the summed sentiment scores into per-tweet averages.
week1_stats_df['agg_sentiment_polarity'] = week1_stats_df['agg_sentiment_polarity']/week1_stats_df['total_tweet_count']
week1_stats_df['agg_sentiment_subjectivity'] = week1_stats_df['agg_sentiment_subjectivity']/week1_stats_df['total_tweet_count']
week1_stats_df = week1_stats_df.reset_index(drop=True)
print(week1_stats_df['agg_sentiment_polarity'].max())
print(week1_stats_df['agg_sentiment_polarity'].min())
print(week1_stats_df.shape)
week1_stats_df.tail()

### Week 2 tweets

In [None]:
#Week 2: tweets created after 6 April and up to and including 13 April.
#created_at is compared as text (the CSV is read without date parsing), which presumably works
#because the timestamps are stored as 'YYYY-MM-DD HH:MM:SS' - TODO confirm.
week2_mask = (tweet_df['created_at']>'2019-04-06 23:59:59') & (tweet_df['created_at']<='2019-04-13 23:59:59')
#reset_index() (not inplace) returns a new frame, so fill_sentiments() adds its columns to an
#independent frame rather than to a slice of tweet_df (which raises SettingWithCopyWarning).
tweet_df_week2 = fill_sentiments(tweet_df[week2_mask].reset_index(drop=True))
print(tweet_df_week2.shape)
print(tweet_df_week2['created_at'].min())
print(tweet_df_week2['created_at'].max())
tweet_df_week2.head()

In [None]:
#Per-user aggregates for week 2. One groupby is shared by all six metrics; the single-metric
#frames keep their original names.
tweets_by_user_week2 = tweet_df_week2.groupby('screen_name')
retweets_df_week2 = tweets_by_user_week2[['retweet_count']].sum()
followers_df_week2 = tweets_by_user_week2[['followers_count']].max() #Max of the week
friends_df_week2 = tweets_by_user_week2[['friends_count']].max()  #Max of the week
sentiment_polarity_df_week2 = tweets_by_user_week2[['tweet_sentiment_polarity']].sum()
sentiment_subjectivity_df_week2 = tweets_by_user_week2[['tweet_sentiment_subjectivity']].sum()
tweet_count_df_week2 = tweets_by_user_week2[['dummy_count']].sum()

#Combine the metrics side by side (all share the screen_name index) and expose the index as a column.
week2_stats_df = pd.concat([tweet_count_df_week2,retweets_df_week2,followers_df_week2,friends_df_week2,
                            sentiment_polarity_df_week2,sentiment_subjectivity_df_week2],axis='columns')
week2_stats_df['screen_name'] = week2_stats_df.index
week2_stats_df = week2_stats_df[['screen_name','dummy_count','retweet_count','followers_count','friends_count',
                                 'tweet_sentiment_polarity','tweet_sentiment_subjectivity']]
week2_stats_df = week2_stats_df.rename(columns={"retweet_count":"total_retweet_count","followers_count":"max_followers_count",
                                                "friends_count":"max_friends_count","dummy_count":"total_tweet_count",
                                                "tweet_sentiment_polarity":"agg_sentiment_polarity",
                                                "tweet_sentiment_subjectivity":"agg_sentiment_subjectivity"})
#Keep only users at or above the retweet threshold. .copy() makes the filtered result an independent
#frame, so the column assignments below do not raise SettingWithCopyWarning.
week2_stats_df = week2_stats_df[week2_stats_df['total_retweet_count']>=retweet_threshold].copy()
#Turn the summed sentiment scores into per-tweet averages.
week2_stats_df['agg_sentiment_polarity'] = week2_stats_df['agg_sentiment_polarity']/week2_stats_df['total_tweet_count']
week2_stats_df['agg_sentiment_subjectivity'] = week2_stats_df['agg_sentiment_subjectivity']/week2_stats_df['total_tweet_count']
week2_stats_df = week2_stats_df.reset_index(drop=True)
print(week2_stats_df['agg_sentiment_polarity'].max())
print(week2_stats_df['agg_sentiment_polarity'].min())
print(week2_stats_df.shape)
week2_stats_df.tail()

### Week 3 tweets

In [None]:
#Week 3: tweets created after 13 April and up to and including 20 April.
#created_at is compared as text (the CSV is read without date parsing), which presumably works
#because the timestamps are stored as 'YYYY-MM-DD HH:MM:SS' - TODO confirm.
week3_mask = (tweet_df['created_at']>'2019-04-13 23:59:59') & (tweet_df['created_at']<='2019-04-20 23:59:59')
#reset_index() (not inplace) returns a new frame, so fill_sentiments() adds its columns to an
#independent frame rather than to a slice of tweet_df (which raises SettingWithCopyWarning).
tweet_df_week3 = fill_sentiments(tweet_df[week3_mask].reset_index(drop=True))
print(tweet_df_week3.shape)
print(tweet_df_week3['created_at'].min())
print(tweet_df_week3['created_at'].max())
tweet_df_week3.head()

In [None]:
#Per-user aggregates for week 3. One groupby is shared by all six metrics; the single-metric
#frames keep their original names.
tweets_by_user_week3 = tweet_df_week3.groupby('screen_name')
retweets_df_week3 = tweets_by_user_week3[['retweet_count']].sum()
followers_df_week3 = tweets_by_user_week3[['followers_count']].max() #Max of the week
friends_df_week3 = tweets_by_user_week3[['friends_count']].max()  #Max of the week
sentiment_polarity_df_week3 = tweets_by_user_week3[['tweet_sentiment_polarity']].sum()
sentiment_subjectivity_df_week3 = tweets_by_user_week3[['tweet_sentiment_subjectivity']].sum()
tweet_count_df_week3 = tweets_by_user_week3[['dummy_count']].sum()

#Combine the metrics side by side (all share the screen_name index) and expose the index as a column.
week3_stats_df = pd.concat([tweet_count_df_week3,retweets_df_week3,followers_df_week3,friends_df_week3,
                            sentiment_polarity_df_week3,sentiment_subjectivity_df_week3],axis='columns')
week3_stats_df['screen_name'] = week3_stats_df.index
week3_stats_df = week3_stats_df[['screen_name','dummy_count','retweet_count','followers_count','friends_count',
                                 'tweet_sentiment_polarity','tweet_sentiment_subjectivity']]
week3_stats_df = week3_stats_df.rename(columns={"retweet_count":"total_retweet_count","followers_count":"max_followers_count",
                                                "friends_count":"max_friends_count","dummy_count":"total_tweet_count",
                                                "tweet_sentiment_polarity":"agg_sentiment_polarity",
                                                "tweet_sentiment_subjectivity":"agg_sentiment_subjectivity"})
#Keep only users at or above the retweet threshold. .copy() makes the filtered result an independent
#frame, so the column assignments below do not raise SettingWithCopyWarning.
week3_stats_df = week3_stats_df[week3_stats_df['total_retweet_count']>=retweet_threshold].copy()
#Turn the summed sentiment scores into per-tweet averages.
week3_stats_df['agg_sentiment_polarity'] = week3_stats_df['agg_sentiment_polarity']/week3_stats_df['total_tweet_count']
week3_stats_df['agg_sentiment_subjectivity'] = week3_stats_df['agg_sentiment_subjectivity']/week3_stats_df['total_tweet_count']
week3_stats_df = week3_stats_df.reset_index(drop=True)
print(week3_stats_df['agg_sentiment_polarity'].max())
print(week3_stats_df['agg_sentiment_polarity'].min())
print(week3_stats_df.shape)
week3_stats_df.tail()

### Week 4 tweets

In [None]:
#Week 4: tweets created after 23 April.
#NOTE(review): week 3 ends on 20 April but this window only starts after 23 April, so tweets
#created on 21-23 April are not assigned to any week - confirm that this gap is intended.
#created_at is compared as text (the CSV is read without date parsing), which presumably works
#because the timestamps are stored as 'YYYY-MM-DD HH:MM:SS' - TODO confirm.
week4_mask = tweet_df['created_at']>'2019-04-23 23:59:59'
#reset_index() (not inplace) returns a new frame, so fill_sentiments() adds its columns to an
#independent frame rather than to a slice of tweet_df (which raises SettingWithCopyWarning).
tweet_df_week4 = fill_sentiments(tweet_df[week4_mask].reset_index(drop=True))
print(tweet_df_week4.shape)
print(tweet_df_week4['created_at'].min())
print(tweet_df_week4['created_at'].max())
tweet_df_week4.head()

In [None]:
#Per-user aggregates for week 4. One groupby is shared by all six metrics; the single-metric
#frames keep their original names.
tweets_by_user_week4 = tweet_df_week4.groupby('screen_name')
retweets_df_week4 = tweets_by_user_week4[['retweet_count']].sum()
followers_df_week4 = tweets_by_user_week4[['followers_count']].max() #Max of the week
friends_df_week4 = tweets_by_user_week4[['friends_count']].max()  #Max of the week
sentiment_polarity_df_week4 = tweets_by_user_week4[['tweet_sentiment_polarity']].sum()
sentiment_subjectivity_df_week4 = tweets_by_user_week4[['tweet_sentiment_subjectivity']].sum()
tweet_count_df_week4 = tweets_by_user_week4[['dummy_count']].sum()

#Combine the metrics side by side (all share the screen_name index) and expose the index as a column.
week4_stats_df = pd.concat([tweet_count_df_week4,retweets_df_week4,followers_df_week4,friends_df_week4,
                            sentiment_polarity_df_week4,sentiment_subjectivity_df_week4],axis='columns')
week4_stats_df['screen_name'] = week4_stats_df.index
week4_stats_df = week4_stats_df[['screen_name','dummy_count','retweet_count','followers_count','friends_count',
                                 'tweet_sentiment_polarity','tweet_sentiment_subjectivity']]
week4_stats_df = week4_stats_df.rename(columns={"retweet_count":"total_retweet_count","followers_count":"max_followers_count",
                                                "friends_count":"max_friends_count","dummy_count":"total_tweet_count",
                                                "tweet_sentiment_polarity":"agg_sentiment_polarity",
                                                "tweet_sentiment_subjectivity":"agg_sentiment_subjectivity"})
#Keep only users at or above the retweet threshold. .copy() makes the filtered result an independent
#frame, so the column assignments below do not raise SettingWithCopyWarning.
week4_stats_df = week4_stats_df[week4_stats_df['total_retweet_count']>=retweet_threshold].copy()
#Turn the summed sentiment scores into per-tweet averages.
week4_stats_df['agg_sentiment_polarity'] = week4_stats_df['agg_sentiment_polarity']/week4_stats_df['total_tweet_count']
week4_stats_df['agg_sentiment_subjectivity'] = week4_stats_df['agg_sentiment_subjectivity']/week4_stats_df['total_tweet_count']
week4_stats_df = week4_stats_df.reset_index(drop=True)
print(week4_stats_df['agg_sentiment_polarity'].max())
print(week4_stats_df['agg_sentiment_polarity'].min())
print(week4_stats_df.shape)
week4_stats_df.tail()

### Check how many people have tweeted #Brexit for each week of April

In [None]:
#Distinct tweeters per week, taken from the unfiltered weekly frames
#(the retweet_threshold is deliberately not applied here).
week1_tweeters = set(tweet_df_week1['screen_name'])
week2_tweeters = set(tweet_df_week2['screen_name'])
week3_tweeters = set(tweet_df_week3['screen_name'])
week4_tweeters = set(tweet_df_week4['screen_name'])
for week_no, weekly_tweeters in enumerate([week1_tweeters, week2_tweeters, week3_tweeters, week4_tweeters], start=1):
    print("Number of #brexit (en) tweeters in week %d is: %d" % (week_no, len(weekly_tweeters)))

#Accounts that tweeted in every one of the four weeks.
all_week_tweeters = set.intersection(week1_tweeters, week2_tweeters, week3_tweeters, week4_tweeters)
print("Number of common #brexit (en) tweeters for all weeks is: %d" % len(all_week_tweeters))

### Screen only the most popular Twitter users, whose aggregate retweets for the week >= retweet_threshold

In [None]:
#Users that passed the retweet threshold in each week (rows of the weekly stats tables).
week1_popular_tweeters = set(week1_stats_df['screen_name'])
week2_popular_tweeters = set(week2_stats_df['screen_name'])
week3_popular_tweeters = set(week3_stats_df['screen_name'])
week4_popular_tweeters = set(week4_stats_df['screen_name'])
for week_no, weekly_popular in enumerate([week1_popular_tweeters, week2_popular_tweeters,
                                          week3_popular_tweeters, week4_popular_tweeters], start=1):
    print("Number of #brexit (en + retweet_threshold) tweeters in week %d is: %d" % (week_no, len(weekly_popular)))

#Users that passed the threshold in all four weeks; these become the nodes of the network.
all_week_popular_tweeters = set.intersection(week1_popular_tweeters, week2_popular_tweeters,
                                             week3_popular_tweeters, week4_popular_tweeters)
print("Number of common #brexit (en + retweet_threshold) tweeters for all weeks is: %d" % len(all_week_popular_tweeters))

In [None]:
#Sorted so that the node order (and therefore the order and orientation of the pairings built
#from it) is the same in every kernel session; iterating a set of strings directly is not.
nodes = sorted(all_week_popular_tweeters)
def create_pairings(source):
    """Return every unordered pair of elements of `source`, each exactly once.

    Pairs are two-element lists [earlier, later], ordered by the position of
    the first element and then of the second. `source` must support len() and
    integer indexing. Fewer than two elements yields an empty list.
    """
    return [
        [first, source[later_pos]]
        for first_pos, first in enumerate(source)
        for later_pos in range(first_pos+1, len(source))
    ]

#One entry per unordered pair of popular users; each pair is a candidate edge of the network.
pairings = create_pairings(nodes)
print("%d pairings" % len(pairings))

In [None]:
#One row per candidate pair. The three relation flags start as False and are meant to be
#filled in from the Twitter friendship lookup (or replaced by the saved CSV loaded below).
screen_name_cols = ['source_screen_name','destination_screen_name']
network_df = pd.DataFrame(pairings, columns = screen_name_cols).assign(
    has_mutual_following=False,  #both users follow each other
    source_follow_dest=False,
    dest_follow_source=False,    #source is a friend of dest
)
print(network_df.shape)
network_df.head()

#### Get the mutual following info among the list of popular Twitter users

In [None]:
#DISABLED CELL: looks up the follow relation of every pair in network_df through the Twitter API
#and saves the result as a CSV. It is wrapped in a triple-quoted string, so running the cell
#only evaluates that string and makes no API calls; the saved CSV is loaded in the next cell.
#This takes time: about 1 hour for every 750 pairs of nodes.
'''
for index,row in network_df.iterrows():
    ff_rel = api.show_friendship(source_screen_name=row['source_screen_name'], target_screen_name=row['destination_screen_name'])
    network_df.at[index,'has_mutual_following'] = (ff_rel[0].followed_by == True and ff_rel[0].following == True)
    network_df.at[index,'source_follow_dest'] = (ff_rel[0].following == True)
    network_df.at[index,'dest_follow_source'] = (ff_rel[0].followed_by == True)

print(network_df.shape)
network_df.head()
network_df.to_csv('Outputs\mutual_folling_info_retweet_thresh_'+str(retweet_threshold)+'.csv')
'''

In [None]:
#Load the saved follow relations for the current retweet_threshold.
#The path uses '/' like the tweet CSV above: it works on Windows as well as on other systems
#and avoids the invalid '\m' escape. The file name (including 'folling') matches the saved file.
network_df = pd.read_csv('Outputs/mutual_folling_info_retweet_thresh_'+str(retweet_threshold)+'.csv') #load from saved
#'Unnamed: 0' is the index column written by to_csv(); it is not needed.
network_df = network_df.drop(columns='Unnamed: 0')
print(network_df.shape)
network_df.head()

In [None]:
#Number of pairs flagged as following each other.
network_df['has_mutual_following'].sum()

In [None]:
#Total number of one-directional follow relations among the pairs
#(a pair that follows each other contributes to both sums).
network_df['source_follow_dest'].sum() + network_df['dest_follow_source'].sum()

In [None]:
#Restrict each week's stats to the users that were popular in all four weeks.
#reset_index() (not inplace) returns a new, renumbered frame instead of modifying a filtered slice.
network_stats_df_week1 = week1_stats_df[week1_stats_df['screen_name'].isin(all_week_popular_tweeters)].reset_index(drop=True)

network_stats_df_week2 = week2_stats_df[week2_stats_df['screen_name'].isin(all_week_popular_tweeters)].reset_index(drop=True)

network_stats_df_week3 = week3_stats_df[week3_stats_df['screen_name'].isin(all_week_popular_tweeters)].reset_index(drop=True)

network_stats_df_week4 = week4_stats_df[week4_stats_df['screen_name'].isin(all_week_popular_tweeters)].reset_index(drop=True)

#Save one CSV per week. The paths use '/' (as the tweet CSV does): it works on Windows as well as on
#other systems and avoids the invalid '\w' escape in the string literal.
network_stats_df_week1.to_csv('Outputs/week_1_network_retweet_thresh_'+str(retweet_threshold)+'.csv')
network_stats_df_week2.to_csv('Outputs/week_2_network_retweet_thresh_'+str(retweet_threshold)+'.csv')
network_stats_df_week3.to_csv('Outputs/week_3_network_retweet_thresh_'+str(retweet_threshold)+'.csv')
network_stats_df_week4.to_csv('Outputs/week_4_network_retweet_thresh_'+str(retweet_threshold)+'.csv')

print(network_stats_df_week1.shape)
network_stats_df_week1.head()

### Network for week 1

In [None]:
#Plot the mutual-following network for week 1 with networkX and save the node and edge lists.
#Node size = total retweets in the week; node colour = average sentiment polarity.
nodes = list(network_stats_df_week1['screen_name'])
pd.DataFrame(nodes,columns=['Twitter_users']).to_csv('nodes_list_retweet_thresh_'+str(retweet_threshold)+'.csv')
size_of_nodes = list(network_stats_df_week1['total_retweet_count'])
color_of_nodes = list(network_stats_df_week1['agg_sentiment_polarity']*2) #Multiply by 2 to see more contrast in colors
#Edges: only the pairs in which both users follow each other.
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
#The path uses '/' so it works on every platform and avoids the invalid '\e' escape.
pd.DataFrame(mutual_follow_lol,columns=['Twitter_user_1','Twitter_user_2']).to_csv('Outputs/edges_list_retweet_thresh_'+str(retweet_threshold)+'.csv')
mutual_follow_edges = [(source, dest) for source, dest in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
#seed belongs to spring_layout, which picks the random starting positions; nx.draw() does not
#use it, so the layout was previously different on every run.
pos = nx.spring_layout(G, k=1.5*1/np.sqrt(len(G.nodes())), iterations=30, seed=10)
#nodelist=nodes keeps node_size and node_color matched to the nodes they were computed for.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes,
        cmap=plt.get_cmap('YlOrBr'))
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on 1st week of April
             Size of nodes indicates number of retweets
             Edges indicates mutual following relation
             Color of nodes indicates sentiment''',fontsize=100)
plt.savefig("network_graph_week1_retweet_thresh_"+str(retweet_threshold)+".jpeg") #save as jpeg
plt.show() #display
#A more intense colour means a more positive sentiment towards #Brexit

### Network for week 2

In [None]:
#Plot the mutual-following network for week 2 with networkX.
#Node size = total retweets in the week; node colour = average sentiment polarity.
nodes = list(network_stats_df_week2['screen_name'])
size_of_nodes = list(network_stats_df_week2['total_retweet_count'])
color_of_nodes = list(network_stats_df_week2['agg_sentiment_polarity']*2) #Multiply by 2 to see more contrast in colors
#Edges: only the pairs in which both users follow each other.
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, dest) for source, dest in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
#seed belongs to spring_layout, which picks the random starting positions; nx.draw() does not
#use it, so the layout was previously different on every run.
pos = nx.spring_layout(G, k=1.5*1/np.sqrt(len(G.nodes())), iterations=20, seed=10)
#nodelist=nodes keeps node_size and node_color matched to the nodes they were computed for.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes,
        cmap=plt.get_cmap('YlOrBr'))
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on 2nd week of April
             Size of nodes indicates number of retweets
             Edges indicates mutual following relation
             Color of nodes indicates sentiment''',fontsize=100)
plt.savefig("network_graph_week2_retweet_thresh_"+str(retweet_threshold)+".jpeg") #save as jpeg
plt.show() #display
#A more intense colour means a more positive sentiment towards #Brexit

### Network for week 3

In [None]:
#Plot the mutual-following network for week 3 with networkX.
#Node size = total retweets in the week; node colour = average sentiment polarity.
nodes = list(network_stats_df_week3['screen_name'])
size_of_nodes = list(network_stats_df_week3['total_retweet_count'])
color_of_nodes = list(network_stats_df_week3['agg_sentiment_polarity']*2) #Multiply by 2 to see more contrast in colors
#Edges: only the pairs in which both users follow each other.
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, dest) for source, dest in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
#seed belongs to spring_layout, which picks the random starting positions; nx.draw() does not
#use it, so the layout was previously different on every run.
pos = nx.spring_layout(G, k=1.5*1/np.sqrt(len(G.nodes())), iterations=25, seed=10)
#nodelist=nodes keeps node_size and node_color matched to the nodes they were computed for.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes,
        cmap=plt.get_cmap('YlOrBr'))
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on 3rd week of April
             Size of nodes indicates number of retweets
             Edges indicates mutual following relation
             Color of nodes indicates sentiment''',fontsize=100)
plt.savefig("network_graph_week3_retweet_thresh_"+str(retweet_threshold)+".jpeg") #save as jpeg
plt.show() #display
#A more intense colour means a more positive sentiment towards #Brexit

### Network for week 4

In [None]:
#Plot the mutual-following network for week 4 with networkX.
#Node size = total retweets in the week; node colour = average sentiment polarity.
nodes = list(network_stats_df_week4['screen_name'])
size_of_nodes = list(network_stats_df_week4['total_retweet_count'])
color_of_nodes = list(network_stats_df_week4['agg_sentiment_polarity']*2) #Multiply by 2 to see more contrast in colors
#Edges: only the pairs in which both users follow each other.
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, dest) for source, dest in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
#seed belongs to spring_layout, which picks the random starting positions; nx.draw() does not
#use it, so the layout was previously different on every run.
pos = nx.spring_layout(G, k=1.5*1/np.sqrt(len(G.nodes())), iterations=25, seed=10)
#nodelist=nodes keeps node_size and node_color matched to the nodes they were computed for.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes,
        cmap=plt.get_cmap('YlOrBr'))
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on 4th week of April
             Size of nodes indicates number of retweets
             Edges indicates mutual following relation
             Color of nodes indicates sentiment''',fontsize=100)
plt.savefig("network_graph_week4_retweet_thresh_"+str(retweet_threshold)+".jpeg") #save as jpeg
plt.show() #display
#A more intense colour means a more positive sentiment towards #Brexit

In [None]:
#Centrality Plot
#Same week-1 network, but every node is drawn as a small gold marker except for a few
#hand-picked rows, which are enlarged and given their own colour.
df_centrality = network_stats_df_week1.copy()
df_centrality['color'] = 'gold'
df_centrality['total_retweet_count'] = 500   #reused as the marker size for ordinary nodes
#Row label in network_stats_df_week1 -> highlight colour. The labels are hard-coded and only
#valid for the data set / retweet_threshold they were picked for.
highlighted_rows = {
    226: 'red',      #Degree Centrality & Eigenvector centrality
    164: 'lime',     #Betweenness Centrality
    122: 'magenta',  #Closeness Centrality
    #121: 'cyan',    #Follower/following ratio
}
for row_label, highlight_color in highlighted_rows.items():
    #Assigning through .at with a label that does not exist would append a new, mostly empty row
    #(and so a NaN node); skip labels that are not present.
    if row_label in df_centrality.index:
        df_centrality.at[row_label,'total_retweet_count'] = 10000
        df_centrality.at[row_label,'color'] = highlight_color

color_of_nodes = list(df_centrality['color'])
nodes = list(df_centrality['screen_name'])
size_of_nodes = list(df_centrality['total_retweet_count'])
#Edges: only the pairs in which both users follow each other.
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, dest) for source, dest in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
#seed belongs to spring_layout, which picks the random starting positions; nx.draw() does not use it.
pos = nx.spring_layout(G, k=3*1/np.sqrt(len(G.nodes())), iterations=25, seed=10)
#The colours are explicit colour names, so no colormap is passed.
#nodelist=nodes keeps node_size and node_color matched to the nodes they were computed for.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes)
plt.title('''Centrality Plots''',fontsize=100)
plt.savefig("centrality.jpeg") #save as jpeg
plt.show() #display

## Graph Algorithms (Undirected)

### Connectivity

In [None]:
#ALL NODE CONNECTIVITY
#Compute node connectivity between all pairs of nodes. (This call takes time)
#network_all_node_pair_connectivity = approx.all_pairs_node_connectivity(G)

In [None]:
#LOCAL NODE CONNECTIVITY
#Approximate node connectivity between one source node and one target node of G.
#The two screen names are hard-coded and must both be nodes of G.
from networkx.algorithms import approximation as approx
network_local_node_connectivity = approx.local_node_connectivity(G,'SkyNews','BBCPolitics')
network_local_node_connectivity

In [None]:
#NODE CONNECTIVITY
#Approximate node connectivity of the whole graph (uses `approx`, imported in the cell above).
network_node_connectivity = approx.node_connectivity(G)
network_node_connectivity

### Clustering

In [None]:
#BIPARTITE CLUSTERING
#Compute a bipartite clustering coefficient for nodes.
#nx.algorithms.bipartite.clustering(G) #The graph is not bipartite, so it produces error

In [None]:
#CLUSTER TRIANGLES
#For every node, the number of triangles that include it as one vertex.
network_clustering_triangles = nx.triangles(G)
#Show the 10 nodes that take part in the most triangles.
sorted(network_clustering_triangles.items(), key=lambda node_and_count: node_and_count[1], reverse=True)[:10]

In [None]:
#CLUSTER TRANSITIVITY
#Graph transitivity: the fraction of all possible triangles that are present in G (one number).
network_clustering_transitivity = nx.transitivity(G)
network_clustering_transitivity

In [None]:
#SQUARE CLUSTERING
#Squares clustering coefficient of every node.
network_square_clustering = nx.square_clustering(G)
#Show the 10 nodes with the highest coefficient.
sorted(network_square_clustering.items(), key=lambda node_and_coeff: node_and_coeff[1], reverse=True)[:10]

In [None]:
#CLUSTERING
#Clustering coefficient of every node.
network_clustering = nx.clustering(G)
#Show the 10 nodes with the highest coefficient.
sorted(network_clustering.items(), key=lambda node_and_coeff: node_and_coeff[1], reverse=True)[:10]

In [None]:
#AVERAGE CLUSTERING
#Average clustering coefficient of G (one number for the whole graph).
network_average_clustering = nx.average_clustering(G)
network_average_clustering

In [None]:
#GENERALIZED DEGREE
#Compute the generalized degree for nodes.
#For each node, the generalized degree shows how many edges of given triangle multiplicity the node is connected to.
network_generalized_degree = nx.generalized_degree(G)
#Print the result for two hard-coded accounts; both screen names must be nodes of G.
print(network_generalized_degree['BBCPolitics'])
print(network_generalized_degree['AJBillingham4'])

### Centrality

In [None]:
#DEGREE CENTRALITY
#Degree centrality of every node; show the 10 highest.
network_degree_centrality = nx.degree_centrality(G)
sorted(network_degree_centrality.items(), key=lambda node_and_score: node_and_score[1], reverse=True)[:10]

In [None]:
#EIGENVECTOR CENTRALITY
#Eigenvector centrality of every node; show the 10 highest.
network_eigenvector_centrality = nx.eigenvector_centrality(G)
sorted(network_eigenvector_centrality.items(), key=lambda node_and_score: node_and_score[1], reverse=True)[:10]

In [None]:
#CLOSENESS CENTRALITY
#Closeness centrality of every node; show the 10 highest.
network_closeness_centrality = nx.closeness_centrality(G)
sorted(network_closeness_centrality.items(), key=lambda node_and_score: node_and_score[1], reverse=True)[:10]

In [None]:
#BETWEENNESS CENTRALITY
#Betweenness centrality of every node; show the 10 highest.
network_betweenness_centrality = nx.betweenness_centrality(G)
sorted(network_betweenness_centrality.items(), key=lambda node_and_score: node_and_score[1], reverse=True)[:10]

In [None]:
#EDGE BETWEENNESS CENTRALITY
#Betweenness centrality of every edge; show the 10 highest.
network_edge_betweenness_centrality = nx.edge_betweenness_centrality(G)
sorted(network_edge_betweenness_centrality.items(), key=lambda edge_and_score: edge_and_score[1], reverse=True)[:10]

### Communicability

In [None]:
#COMMUNICABILITY
#Communicability between all pairs of nodes in G.
#The communicability between nodes u and v is a sum over the walks of different lengths that
#start at u and end at v.
network_communicatability = nx.communicability(G)
#network_communicatability['BBCPolitics'] #Shows the communicability of BBCPolitics with all other nodes
#Top 10 nodes by communicability with the hard-coded account 'BBCPolitics' (must be a node of G).
sorted(network_communicatability['BBCPolitics'].items(), key=lambda node_and_score: node_and_score[1], reverse=True)[:10]

In [None]:
#COMMUNICABILITY BETWEENNESS CENTRALITY
#communicability() - communicability between pairs of nodes in G.
#communicability_betweenness_centrality() - communicability betweenness centrality for each node in G.
#The result is only stored here; nothing is displayed by this cell.
network_communicatability_bw_centrality = nx.communicability_betweenness_centrality(G)

### Link Analysis

In [None]:
#PAGERANK
#PageRank ranks the nodes of G based on the structure of the incoming links.
#G is an undirected nx.Graph here, so it carries no separate link direction.
network_pagerank = nx.pagerank(G)
#Show the 10 highest-ranked nodes.
sorted(network_pagerank.items(), key=lambda node_and_rank: node_and_rank[1], reverse=True)[:10] 

In [None]:
#HITS
#Return HITS hubs and authorities values for nodes.
#The HITS algorithm computes two numbers for a node. Authorities estimates the node value based on the incoming links.
#Hubs estimates the node value based on outgoing links.
#Both results are only stored here; nothing is displayed by this cell.
network_hubs,network_authorities = nx.hits(G)

### Trees

In [None]:
#MINIMUM SPANNING TREE
#G's edges were added as plain pairs, i.e. without any weight attribute.
network_min_spanning_tree = nx.minimum_spanning_tree(G)
#Show the first 10 tree edges in sorted order.
sorted(network_min_spanning_tree.edges(data=True))[:10]

In [None]:
#MAXIMUM SPANNING TREE
#G's edges were added as plain pairs, i.e. without any weight attribute.
network_max_spanning_tree = nx.maximum_spanning_tree(G)
#Show the first 10 tree edges in sorted order.
sorted(network_max_spanning_tree.edges(data=True))[:10]

In [None]:
#MINIMUM SPANNING EDGES
#Edges of a minimum spanning tree; the result is consumed once by list() below.
network_min_spanning_edges = nx.minimum_spanning_edges(G)
#Show the first 10 edges in sorted order.
sorted(list(network_min_spanning_edges))[:10]

In [None]:
#MAXIMUM SPANNING EDGES
#Edges of a maximum spanning tree; the result is consumed once by list() below.
network_max_spanning_edges = nx.maximum_spanning_edges(G)
#Show the first 10 edges in sorted order.
sorted(list(network_max_spanning_edges))[:10]

### Vitality

In [None]:
#CLOSENESS VITALITY
#Returns the closeness vitality for nodes in the graph.
#The closeness vitality of a node is the change in the sum of distances between all node pairs when excluding that node.
#The result is only stored here; nothing is displayed by this cell.
network_vitality = nx.closeness_vitality(G)#Requires closely connected graph, else returns nan

### Wiener index

In [None]:
#WIENER INDEX
#The Wiener index of a graph is the sum of the shortest-path distances between each pair of reachable nodes.
#For pairs of nodes in undirected graphs, only one orientation of the pair is counted.
nx.wiener_index(G)

### Community Detection

In [None]:
#KERNIGHAN–LIN BISECTION COMMUNITY
#Splits the nodes of G into two groups.
network_community_kl = nx.community.kernighan_lin.kernighan_lin_bisection(G) 
#It may give better results if we give the weights of edges
#Number of groups returned.
len(network_community_kl)

In [None]:
# GREEDY MODULARITY COMMUNITIES
# Clauset-Newman-Moore greedy modularity maximization: every node starts in its own
# community, and the pair of communities whose merge most increases modularity is
# joined repeatedly until no such pair exists.
# The call below passes no edge-weight argument.
modularity_communities = nx.community.greedy_modularity_communities(G)
network_community_modularity = modularity_communities
len(network_community_modularity)

In [None]:
# K-CLIQUE COMMUNITIES (percolation method)
# A k-clique community is the union of all cliques of size k that can be reached
# through adjacent k-cliques, i.e. cliques sharing k-1 nodes.
# k is fixed at 5; the generator is materialised so it can be indexed later.
network_community_k_clique = list(nx.community.k_clique_communities(G, 5))
len(network_community_k_clique)

In [None]:
# LABEL PROPAGATION COMMUNITIES
# Materialise the community sets, then store them in reverse order.
lpa_communities = list(nx.community.label_propagation_communities(G))
lpa_communities.reverse()
network_community_lpa = lpa_communities
len(network_community_lpa)

In [None]:
# GIRVAN-NEWMAN COMMUNITIES
# girvan_newman is a generator of successive partitions; only the first one is used.
girvan_newman_levels = nx.community.girvan_newman(G)
first_level = next(girvan_newman_levels)
network_community_gn = [set(members) for members in first_level]
len(network_community_gn)

In [None]:
# LOUVAIN METHOD
#!pip install python-louvain
import community
# partition maps node -> community ID.
partition = community.best_partition(G)
# One set of nodes per ID in 0..k-1, where k is the number of distinct IDs in the partition.
louvain_community_count = len(set(partition.values()))
network_community_louvain = [
    {node for node, community_id in partition.items() if community_id == wanted_id}
    for wanted_id in range(louvain_community_count)
]

len(network_community_louvain)

In [None]:
# Attach, for every community algorithm, a community ID column and a node-size column
# to a copy of the week-1 node statistics.
community_df = network_stats_df_week1.copy()

# (ID column, size column, detected communities, minimum number of members to highlight)
# Kernighan-Lin (two-way partition) and k-clique have no size filter; the others ignore
# communities with fewer than 3 members.
community_columns = [
    ('kernighanLin_ID', 'kernighanLin_size', network_community_kl, 0),
    ('GModularity_ID', 'GModularity_size', network_community_modularity, 3),
    ('kClique_ID', 'kClique_size', network_community_k_clique, 0),
    ('labelProp_ID', 'labelProp_size', network_community_lpa, 3),
    ('girvanNew_ID', 'girvanNew_size', network_community_gn, 3),
    ('louvain_ID', 'louvain_size', network_community_louvain, 3),
]

for id_column, size_column, communities, min_members in community_columns:
    # screen_name -> position of its community; when a name is in several
    # communities the last one wins.
    member_to_id = {}
    for community_id, members in enumerate(communities):
        if len(members) >= min_members:
            for member in members:
                member_to_id[member] = community_id
    screen_names = community_df['screen_name']
    # '' marks a node outside every highlighted community; such nodes keep size 500,
    # highlighted nodes get size 2500.
    community_df[id_column] = pd.Series(
        [member_to_id.get(name, '') for name in screen_names],
        index=community_df.index,
        dtype=object,
    )
    community_df[size_column] = [2500 if name in member_to_id else 500 for name in screen_names]

print(community_df.shape)
community_df.head()

In [None]:
# Translate the community IDs stored in the *_ID columns into matplotlib color names.
color_list = ['blue','red','green','brown','orange','crimson','cyan','pink','darkslategray','darkgreen','olive']
community_id_columns = ['kernighanLin_ID','kClique_ID','GModularity_ID',
                        'labelProp_ID','girvanNew_ID','louvain_ID']


def _community_color(raw_id):
    """Return the color name for one community ID.

    '' (no community) maps to 'gold'. A numeric ID indexes color_list and wraps
    around, so an algorithm that finds more communities than there are colors
    still yields valid color names (distinct communities may then share a color).
    Any other value (e.g. a color name left by an earlier run of this cell) is
    returned unchanged as a string.
    """
    label = str(raw_id)
    if label == '':
        return 'gold'
    if label.isdecimal():
        return color_list[int(label) % len(color_list)]
    return label


for id_column in community_id_columns:
    community_df[id_column] = community_df[id_column].map(_community_color)
community_df.head()

In [None]:
# Draw the mutual-follow network with nodes colored/sized by Kernighan-Lin community.
nodes = list(community_df['screen_name'])
size_of_nodes = list(community_df['kernighanLin_size'])
color_of_nodes = list(community_df['kernighanLin_ID'])  # one color name per row of community_df
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, destination) for source, destination in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
# The seed goes to spring_layout (the randomised step) so the layout is reproducible;
# `seed` is not a drawing option and newer networkx releases reject unknown draw keywords.
pos = nx.spring_layout(G, k=2*1/np.sqrt(len(G.nodes())), iterations=30, seed=1)
# nodelist fixes the drawing order so node_size / node_color line up with `nodes`.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes)
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on each week of April
             Kernighan Lin Community''',fontsize=100)
plt.show() #display

In [None]:
# Draw the mutual-follow network with nodes colored/sized by k-clique community.
nodes = list(community_df['screen_name'])
size_of_nodes = list(community_df['kClique_size'])
color_of_nodes = list(community_df['kClique_ID'])  # one color name per row of community_df
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, destination) for source, destination in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
# The seed goes to spring_layout (the randomised step) so the layout is reproducible;
# `seed` is not a drawing option and newer networkx releases reject unknown draw keywords.
pos = nx.spring_layout(G, k=2*1/np.sqrt(len(G.nodes())), iterations=30, seed=1)
# nodelist fixes the drawing order so node_size / node_color line up with `nodes`.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes)
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on each week of April
             K-Clique Community''',fontsize=100)
plt.show() #display

In [None]:
# Draw the mutual-follow network with nodes colored/sized by greedy-modularity community
# and save the figure to greedy_modularity.png.
nodes = list(community_df['screen_name'])
size_of_nodes = list(community_df['GModularity_size'])
color_of_nodes = list(community_df['GModularity_ID'])  # one color name per row of community_df
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, destination) for source, destination in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
# The seed goes to spring_layout (the randomised step) so the layout is reproducible;
# `seed` is not a drawing option and newer networkx releases reject unknown draw keywords.
pos = nx.spring_layout(G, k=1.5*1/np.sqrt(len(G.nodes())), iterations=25, seed=10)
# nodelist fixes the drawing order so node_size / node_color line up with `nodes`.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes)
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on each week of April
             Greedy Modularity Community''',fontsize=100)
plt.savefig('greedy_modularity.png')
plt.show() #display

In [None]:
# Draw the mutual-follow network with nodes colored/sized by label-propagation community.
nodes = list(community_df['screen_name'])
size_of_nodes = list(community_df['labelProp_size'])
color_of_nodes = list(community_df['labelProp_ID'])  # one color name per row of community_df
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, destination) for source, destination in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
# The seed goes to spring_layout (the randomised step) so the layout is reproducible;
# `seed` is not a drawing option and newer networkx releases reject unknown draw keywords.
pos = nx.spring_layout(G, k=1.5*1/np.sqrt(len(G.nodes())), iterations=25, seed=10)
# nodelist fixes the drawing order so node_size / node_color line up with `nodes`.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes)
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on each week of April
             Label Propagation Community''',fontsize=100)
plt.show() #display

In [None]:
# Draw the mutual-follow network with nodes colored/sized by Girvan-Newman community.
nodes = list(community_df['screen_name'])
size_of_nodes = list(community_df['girvanNew_size'])
color_of_nodes = list(community_df['girvanNew_ID'])  # one color name per row of community_df
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, destination) for source, destination in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
# The seed goes to spring_layout (the randomised step) so the layout is reproducible;
# `seed` is not a drawing option and newer networkx releases reject unknown draw keywords.
pos = nx.spring_layout(G, k=1.5*1/np.sqrt(len(G.nodes())), iterations=25, seed=10)
# nodelist fixes the drawing order so node_size / node_color line up with `nodes`.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes)
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on each week of April
             Girvan-Newman Community''',fontsize=100)
plt.show() #display

In [None]:
# Draw the mutual-follow network with nodes colored/sized by Louvain community.
nodes = list(community_df['screen_name'])
size_of_nodes = list(community_df['louvain_size'])
color_of_nodes = list(community_df['louvain_ID'])  # one color name per row of community_df
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, destination) for source, destination in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
# The seed goes to spring_layout (the randomised step) so the layout is reproducible;
# `seed` is not a drawing option and newer networkx releases reject unknown draw keywords.
pos = nx.spring_layout(G, k=1.5*1/np.sqrt(len(G.nodes())), iterations=25, seed=10)
# nodelist fixes the drawing order so node_size / node_color line up with `nodes`.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes)
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on each week of April
             Louvain Method''',fontsize=100)
plt.show() #display

In [None]:
# LEIDEN METHOD
#!pip install leidenalg
import leidenalg
#!pip install igraph
import igraph as ig

# Hand the networkx graph to igraph through a GraphML file on disk.
nx.write_graphml(G,'graph.graphml')
Gix = ig.read('graph.graphml',format="graphml")
# Each element of the partition is a list of igraph vertex indices.
network_community_leiden = list(leidenalg.find_partition(Gix, leidenalg.ModularityVertexPartition))

community_df = network_stats_df_week1.copy()
community_df['leiden_ID'] = ''    # '' marks a node outside every highlighted community
community_df['leiden_size'] = 500

# Number the communities with at least 3 members consecutively (0, 1, 2, ...).
# NOTE(review): membership is tested against community_df's row labels, which assumes
# row i of community_df is igraph vertex i -- confirm the node order survives the
# GraphML round trip.
group_id = 0
for members in network_community_leiden:
    if len(members) >= 3:  #Only consider communities having at least 3 members
        in_community = community_df.index.isin(members)
        community_df.loc[in_community, 'leiden_ID'] = group_id
        community_df.loc[in_community, 'leiden_size'] = 2500
        # Advance once per kept community (the original increment was mis-indented
        # and did not match any enclosing block).
        group_id = group_id+1

color_list = ['blue','red','green','brown','orange','crimson','cyan','pink','darkslategray','darkgreen','olive']


def _leiden_color(raw_id):
    """Map one Leiden group ID to a color name.

    '' becomes 'gold'; numeric IDs index color_list and wrap around so more than
    len(color_list) groups still produce valid colors; anything else is kept as text.
    """
    label = str(raw_id)
    if label == '':
        return 'gold'
    if label.isdecimal():
        return color_list[int(label) % len(color_list)]
    return label


community_df['leiden_ID'] = community_df['leiden_ID'].map(_leiden_color)
community_df.head()

In [None]:
# Draw the mutual-follow network with nodes colored/sized by Leiden community
# and save the figure to leiden_alg.png.
nodes = list(community_df['screen_name'])
size_of_nodes = list(community_df['leiden_size'])
color_of_nodes = list(community_df['leiden_ID'])  # one color name per row of community_df
mutual_follow_lol = network_df.loc[network_df['has_mutual_following']==True,
                                   ['source_screen_name','destination_screen_name']].values.tolist()
mutual_follow_edges = [(source, destination) for source, destination in mutual_follow_lol]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(mutual_follow_edges)
plt.figure(figsize=(75,75))
# The seed goes to spring_layout (the randomised step) so the layout is reproducible;
# `seed` is not a drawing option and newer networkx releases reject unknown draw keywords.
pos = nx.spring_layout(G, k=1.5*1/np.sqrt(len(G.nodes())), iterations=25, seed=10)
# nodelist fixes the drawing order so node_size / node_color line up with `nodes`.
nx.draw(G,pos=pos,nodelist=nodes,with_labels = True,font_size=50,node_size=size_of_nodes,node_color=color_of_nodes)
plt.title('''Popular Social Network Graph of people who have tweeted #Brexit on each week of April
             Leiden Method''',fontsize=100)
plt.savefig('leiden_alg.png')
plt.show() #display

## Validation of Community algorithms based on Sentiments about Brexit

In [None]:
# Place the four weekly sentiment columns side by side, keeping week 1's screen_name.
# Rows are aligned on the DataFrame index, not on screen_name.
# NOTE(review): this presumes all four weekly frames list users in the same row order -- confirm.
weekly_pieces = [
    network_stats_df_week1[['screen_name','agg_sentiment_polarity']]
    .rename(columns={'agg_sentiment_polarity':'week1_sentiment'})
]
later_weeks = [network_stats_df_week2, network_stats_df_week3, network_stats_df_week4]
for week_number, week_frame in enumerate(later_weeks, start=2):
    # Validate that screen_name is present, as the original selection did.
    week_sentiment = week_frame[['screen_name','agg_sentiment_polarity']]['agg_sentiment_polarity']
    weekly_pieces.append(week_sentiment.rename('week' + str(week_number) + '_sentiment'))
all_week_sentiment_df = pd.concat(weekly_pieces, axis=1)
print(all_week_sentiment_df.shape)
all_week_sentiment_df.head()

In [None]:
# Build one row per network user with its community ID under every algorithm,
# then join the weekly sentiment columns for validation.
node_df = (
    pd.read_csv('nodes_list_retweet_thresh_10.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'Twitter_users':'screen_name'})
)
nw_community_df = (
    tweet_df[['screen_name','retweet_count']]
    .merge(node_df, on='screen_name', how='inner')
    .groupby('screen_name').sum()
    .reset_index()
)
nw_community_df['retweet_count'] = nw_community_df['retweet_count'].astype(float)

# ID column -> communities found by that algorithm (Kernighan-Lin is a two-way partition).
communities_by_column = {
    'kernighanLin_ID': network_community_kl,
    'GModularity_ID': network_community_modularity,
    'kClique_ID': network_community_k_clique,
    'labelProp_ID': network_community_lpa,
    'girvanNew_ID': network_community_gn,
    'louvain_ID': network_community_louvain,
}
for id_column, communities in communities_by_column.items():
    # screen_name -> community position; if a name sits in several communities the last wins.
    member_to_id = {
        member: community_id
        for community_id, members in enumerate(communities)
        for member in members
    }
    # '' is kept for users that belong to no community.
    nw_community_df[id_column] = pd.Series(
        [member_to_id.get(name, '') for name in nw_community_df['screen_name']],
        index=nw_community_df.index,
        dtype=object,
    )

# Leiden IDs are read from leiden_df.csv rather than computed here.
leiden_ids = pd.read_csv('leiden_df.csv')[['screen_name','leiden_ID']]
nw_community_df = nw_community_df.merge(leiden_ids, on='screen_name', how='inner')
nw_community_df = nw_community_df.drop(['retweet_count'],axis=1)

community_validation_df = nw_community_df.merge(all_week_sentiment_df, on='screen_name', how='inner')
print(community_validation_df.shape)
community_validation_df.head()

In [None]:
from sklearn import preprocessing
# Min-max scale the four weekly sentiment columns in place (each column independently).
weekly_sentiment_columns = ['week1_sentiment','week2_sentiment','week3_sentiment','week4_sentiment']
scaler = preprocessing.MinMaxScaler()
scaled_sentiments = scaler.fit_transform(community_validation_df[weekly_sentiment_columns])
community_validation_df[weekly_sentiment_columns] = scaled_sentiments
community_validation_df.head()

#OR

#community_validation_df[['week1_sentiment','week2_sentiment','week3_sentiment','week4_sentiment']] = community_validation_df[['week1_sentiment','week2_sentiment','week3_sentiment','week4_sentiment']]+1

In [None]:
# Mean coefficient of variation (std / mean) of sentiment within communities,
# one number per community-detection algorithm.
comm_cols = ['mean_sentiment_CV']
community_alg = {'Kernighan-Lin':'kernighanLin_ID','Greedy Modularity':'GModularity_ID','k-Clique':'kClique_ID',
                 'Label-Propagation':'labelProp_ID','Girvan-Newman':'girvanNew_ID','Louvain Method':'louvain_ID',
                 'Leiden Method':'leiden_ID'}
weekly_sentiment_columns = ['week1_sentiment','week2_sentiment','week3_sentiment','week4_sentiment']

cv_rows = []
for key,value in community_alg.items():
    df = community_validation_df[[value] + weekly_sentiment_columns].copy()
    per_community = df.groupby(value)
    # CV per community and week; rows are communities, columns are weeks.
    cv_per_community = per_community.std()/per_community.mean()
    # Average over communities first (one value per week), then over the weeks.
    # Explicit .mean() calls avoid np.mean(DataFrame), whose axis handling differs
    # between pandas versions.
    mean_cv = cv_per_community.mean().mean()
    cv_rows.append([mean_cv])

# Build the summary frame once; DataFrame.append no longer exists in pandas 2.x.
community_sent_cv_df = pd.DataFrame(cv_rows, columns=comm_cols, index=list(community_alg))
community_sent_cv_df

In [None]:
# Same coefficient of variation as above, but reported separately for each week.
comm_cols = ['mean_sentiment_CV_week1','mean_sentiment_CV_week2','mean_sentiment_CV_week3','mean_sentiment_CV_week4']
community_alg = {'Kernighan-Lin':'kernighanLin_ID','Greedy Modularity':'GModularity_ID','k-Clique':'kClique_ID',
                 'Label-Propagation':'labelProp_ID','Girvan-Newman':'girvanNew_ID','Louvain Method':'louvain_ID',
                 'Leiden Method':'leiden_ID'}
weekly_sentiment_columns = ['week1_sentiment','week2_sentiment','week3_sentiment','week4_sentiment']

cv_rows = []
for key,value in community_alg.items():
    df = community_validation_df[[value] + weekly_sentiment_columns].copy()
    per_community = df.groupby(value)
    # Series indexed by week column: mean over communities of std / mean.
    mean_cv = (per_community.std()/per_community.mean()).mean()
    # Select by label so the week order is explicit instead of relying on
    # positional integer indexing of a label-indexed Series.
    cv_rows.append(mean_cv[weekly_sentiment_columns].tolist())

# Build the summary frame once; DataFrame.append no longer exists in pandas 2.x.
community_sent_cv_df = pd.DataFrame(cv_rows, columns=comm_cols, index=list(community_alg))
community_sent_cv_df

In [None]:
# First entry of mean_cv as left by the last loop iteration above; .iloc makes the
# positional lookup explicit (mean_cv is indexed by column labels, not integers).
mean_cv.iloc[0]

In [None]:
# Spot check: per-week mean coefficient of variation for the label-propagation communities.
df = community_validation_df[['labelProp_ID','week1_sentiment','week2_sentiment','week3_sentiment','week4_sentiment']].copy()
labelprop_groups = df.groupby('labelProp_ID')
value = labelprop_groups.std()/labelprop_groups.mean()
# Column-wise mean over communities; explicit .mean() rather than np.mean(DataFrame),
# whose axis handling differs between pandas versions.
value.mean()

In [None]:
# Fourth column mean of `value`, looked up by position explicitly.
value.mean().iloc[3]

In [None]:
# List each algorithm's display name next to its ID column.
for algorithm_name, id_column in community_alg.items():
    print(algorithm_name, id_column)

## JS Visualization Data

#### Undirected

In [None]:
# Distinct Leiden IDs. js_community_df is only created in the next cell, so on a fresh
# top-to-bottom run it does not exist yet; nw_community_df is built by the same steps
# earlier in the notebook and carries the same leiden_ID column.
set(nw_community_df['leiden_ID'])

In [None]:
# One row per network user with its community ID under every algorithm,
# used as node attributes for the JS visualisation.
node_df = (
    pd.read_csv('nodes_list_retweet_thresh_10.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'Twitter_users':'screen_name'})
)
js_community_df = (
    tweet_df[['screen_name','retweet_count']]
    .merge(node_df, on='screen_name', how='inner')
    .groupby('screen_name').sum()
    .reset_index()
)
js_community_df['retweet_count'] = js_community_df['retweet_count'].astype(float)

# ID column -> communities found by that algorithm (Kernighan-Lin is a two-way partition).
communities_by_column = {
    'kernighanLin_ID': network_community_kl,
    'GModularity_ID': network_community_modularity,
    'kClique_ID': network_community_k_clique,
    'labelProp_ID': network_community_lpa,
    'girvanNew_ID': network_community_gn,
    'louvain_ID': network_community_louvain,
}
for id_column, communities in communities_by_column.items():
    # screen_name -> community position; if a name sits in several communities the last wins.
    member_to_id = {
        member: community_id
        for community_id, members in enumerate(communities)
        for member in members
    }
    # '' is kept for users that belong to no community.
    js_community_df[id_column] = pd.Series(
        [member_to_id.get(name, '') for name in js_community_df['screen_name']],
        index=js_community_df.index,
        dtype=object,
    )

# Leiden IDs are read from leiden_df.csv rather than computed here.
leiden_ids = pd.read_csv('leiden_df.csv')[['screen_name','leiden_ID']]
js_community_df = js_community_df.merge(leiden_ids, on='screen_name', how='inner')
js_community_df = js_community_df.drop(['retweet_count'],axis=1)
print(js_community_df.shape)
js_community_df.head()

In [None]:
# Turn each {screen_name: score} centrality dict into a two-column frame and join
# them into one table keyed by screen_name.
# Each frame is built in a single call: growing a frame row by row with
# DataFrame.append is quadratic, and that method no longer exists in pandas 2.x.
degree_centrality_df = pd.DataFrame(list(network_degree_centrality.items()),
                                    columns=['screen_name','degree_centrality'])

eigenvector_centrality_df = pd.DataFrame(list(network_eigenvector_centrality.items()),
                                         columns=['screen_name','eigenvector_centrality'])

closeness_centrality_df = pd.DataFrame(list(network_closeness_centrality.items()),
                                       columns=['screen_name','closeness_centrality'])

betweenness_centrality_df = pd.DataFrame(list(network_betweenness_centrality.items()),
                                         columns=['screen_name','betweenness_centrality'])

# Inner joins: only users present in all four measures are kept.
temp_df = pd.merge(degree_centrality_df,eigenvector_centrality_df,on='screen_name',how='inner')
temp_df1 = pd.merge(temp_df,closeness_centrality_df,on='screen_name',how='inner')
centrality_df = pd.merge(temp_df1,betweenness_centrality_df,on='screen_name',how='inner')
print(centrality_df.shape)
centrality_df.head()

In [None]:
# Community IDs and centrality scores per user (inner join on screen_name).
undirected_comm_centrality = js_community_df.merge(centrality_df, on='screen_name', how='inner')
print(undirected_comm_centrality.shape)
undirected_comm_centrality.head()

In [None]:
# Build, for each of the four weeks, the list of node records for the JS visualisation.
all_week_nw_dfs = [network_stats_df_week1,network_stats_df_week2,network_stats_df_week3,network_stats_df_week4]
undirected_node_json_list_week1 = []
undirected_node_json_list_week2 = []
undirected_node_json_list_week3 = []
undirected_node_json_list_week4 = []
all_undirected_week_lists = [undirected_node_json_list_week1,undirected_node_json_list_week2,undirected_node_json_list_week3,
                             undirected_node_json_list_week4]

# Columns rounded to 6 decimals before export.
rounded_columns = ['agg_sentiment_polarity','degree_centrality','eigenvector_centrality',
                   'closeness_centrality','betweenness_centrality']
# Key in the exported node record -> source column.
node_fields = {'name':'screen_name',
               'total_tweet_count':'total_tweet_count',
               'total_retweet_count':'total_retweet_count',
               'followers_count':'max_followers_count',
               'friends_count':'max_friends_count',
               'brexit_tweet_sentiment':'agg_sentiment_polarity',
               'degree_centrality':'degree_centrality',
               'eigenvector_centrality':'eigenvector_centrality',
               'closeness_centrality':'closeness_centrality',
               'betweenness_centrality':'betweenness_centrality',
               'community_kernighanLin':'kernighanLin_ID',
               'community_GModularity':'GModularity_ID',
               'community_kClique':'kClique_ID',
               'community_labelProp':'labelProp_ID',
               'community_girvanNew':'girvanNew_ID',
               'community_louvain':'louvain_ID',
               'community_leiden':'leiden_ID'}

for week_df, week_nodes in zip(all_week_nw_dfs, all_undirected_week_lists):
    # Weekly statistics joined with community IDs and centrality scores.
    node_df_i = pd.merge(week_df,undirected_comm_centrality,on='screen_name',how='inner')
    for column in rounded_columns:
        node_df_i[column] = node_df_i[column].round(6)
    for row_label, row in node_df_i.iterrows():
        week_nodes.append({field: row[column] for field, column in node_fields.items()})


In [None]:
# Keep only the pairs flagged as following each other, renumbered from 0.
is_mutual = network_df['has_mutual_following']==True
mutual_following_df = network_df[is_mutual].reset_index(drop=True)
print(mutual_following_df.shape)
mutual_following_df.head()

In [None]:
# Node table: one row per user with summed retweet counts. Its row position is the
# node index that the link records refer to.
node_df = pd.read_csv('nodes_list_retweet_thresh_10.csv').drop(columns=['Unnamed: 0']).rename(columns={'Twitter_users':'screen_name'})
retweet_node_df = pd.merge(tweet_df[['screen_name','retweet_count']],node_df,on='screen_name',how='inner')
retweet_node_df = retweet_node_df.groupby('screen_name').sum().reset_index()
retweet_node_df['retweet_count'] = retweet_node_df['retweet_count'].astype(float)

# Undirected links: every mutual-follow pair with a constant weight of 1.
retweet_edge_df = mutual_following_df[['source_screen_name','destination_screen_name']].copy()
retweet_edge_df.reset_index(drop=True,inplace=True)
retweet_edge_df['edge_weight'] = 1

# screen_name -> row index in retweet_node_df, built once instead of filtering the node
# table for every edge. Names are unique here because they come from a groupby.
# A name missing from the node table raises KeyError naming that user.
node_index_by_name = dict(zip(retweet_node_df['screen_name'], retweet_node_df.index))

df = retweet_edge_df.copy()
df['source_index'] = [node_index_by_name[name] for name in df['source_screen_name']]
df['dest_index'] = [node_index_by_name[name] for name in df['destination_screen_name']]

undirected_links_json_df = df[['source_index','dest_index','edge_weight']]
print(undirected_links_json_df.shape)
undirected_links_json_df.head()

In [None]:
# One {'source', 'target', 'weight'} record per undirected link.
undirected_links_json_list = [
    {'source':link['source_index'],
     'target':link['dest_index'],
     'weight':link['edge_weight']}
    for link_label, link in undirected_links_json_df.iterrows()
]

In [None]:
import json


def _json_default(value):
    """Convert values json cannot encode itself: numpy scalars become plain Python numbers."""
    if hasattr(value, 'item'):
        return value.item()
    raise TypeError('Object of type %s is not JSON serializable' % type(value).__name__)


# Write one file per week: that week's node records plus the shared undirected links.
all_undirected_nodes_week_lists = [undirected_node_json_list_week1,undirected_node_json_list_week2,undirected_node_json_list_week3,
                             undirected_node_json_list_week4]
for i in range(4):
    js_json_data = {'nodes':all_undirected_nodes_week_lists[i],'links':undirected_links_json_list}
    with open('undirected_json_data_week_'+str(i+1)+'.json', 'w') as f:
        # json.dump emits real JSON; print() wrote the Python repr of the dict
        # (single-quoted strings, numpy scalar reprs), which JSON parsers reject.
        json.dump(js_json_data, f, default=_json_default)

#### Directed Edges

In [None]:
# Build a directed "follower -> followed" edge list from the pairwise follow table.
# Forward slash in the path: the original 'Outputs\m...' relied on an invalid escape
# sequence and only resolved on Windows; '/' works on every platform and matches the
# other reads from the Outputs directory.
network_df = pd.read_csv('Outputs/mutual_folling_info_retweet_thresh_'+str(retweet_threshold)+'.csv')
network_df = network_df.drop(columns=['Unnamed: 0'])

# Pairs where the source follows the destination, as stored.
source_follow_dest_df = (
    network_df[network_df['source_follow_dest']==True]
    .reset_index(drop=True)
    .drop(columns=['dest_follow_source'])
)

# Pairs where the destination follows the source: swap the two name columns so that
# every row again reads "source follows destination".
dest_follow_source_df = (
    network_df[network_df['dest_follow_source']==True]
    .reset_index(drop=True)
    .drop(columns=['source_follow_dest'])
    .rename(columns={'source_screen_name':'destination_screen_name',
                     'destination_screen_name':'source_screen_name',
                     'dest_follow_source':'source_follow_dest'})
)
dest_follow_source_df = dest_follow_source_df[['source_screen_name','destination_screen_name','has_mutual_following','source_follow_dest']]

following_df = pd.concat([source_follow_dest_df,dest_follow_source_df]).reset_index(drop=True)
print(following_df.shape)
following_df.head()

In [None]:
import ast

# Users that belong to the network.
nodes_df = pd.read_csv('nodes_list_retweet_thresh_'+str(retweet_threshold)+'.csv').drop(['Unnamed: 0'],axis='columns')

# Reload the tweets; the last CSV column arrives as 'mentions\r' and is stripped into 'mentions'.
tweet_df = pd.read_csv('Outputs/brexit_tweets_april.csv',lineterminator='\n')
tweet_df['mentions'] = tweet_df['mentions\r'].str.strip()
tweet_df['dummy_count'] = 1
tweet_df = tweet_df.drop(['Unnamed: 0','id','location','retweeted','mentions\r'],axis='columns')

nodes = list(nodes_df['Twitter_users'])

# Tweets written by network users, with the stringified mention list parsed into a real list.
ret_thresh_tweet_df = tweet_df[tweet_df['screen_name'].isin(nodes)].reset_index(drop=True)
ret_thresh_tweet_df['mentions_list'] = ret_thresh_tweet_df['mentions'].apply(ast.literal_eval)

# One row per user: the list of per-tweet mention lists, plus a flattened copy.
df = (
    ret_thresh_tweet_df[['screen_name','mentions_list']]
    .groupby('screen_name')['mentions_list']
    .apply(list)
    .reset_index()
)
df = df[['screen_name','mentions_list']]
# mention[1:] drops the first character of each mention (the '@'), so the names
# can be compared with screen names.
df['mentions_flatten_list'] = df['mentions_list'].apply(
    lambda per_tweet_mentions: [mention[1:] for tweet_mentions in per_tweet_mentions for mention in tweet_mentions]
)

# Attach each source user's mentions to its outgoing follow edges.
following__edge_weight_df = pd.merge(following_df,df[['screen_name','mentions_flatten_list']],
                                     left_on='source_screen_name',right_on='screen_name',how='inner')
following__edge_weight_df = following__edge_weight_df.drop(['screen_name'],axis='columns')
# Edge weight = 0.5 + number of times the source mentioned the destination.
following__edge_weight_df['edge_weight'] = [
    0.5 + mentioned_names.count(destination_name)
    for mentioned_names, destination_name in zip(following__edge_weight_df['mentions_flatten_list'],
                                                 following__edge_weight_df['destination_screen_name'])
]

following__edge_weight_df = following__edge_weight_df.sort_values(['source_screen_name','destination_screen_name'],
                                                                 ascending=[True,False])
print(following__edge_weight_df.shape)
following__edge_weight_df.head(6)

In [None]:
#VERTICES DATA
# One row per user with summed retweet counts; the row position is the node index
# used by the link records. The table is also written to map_equation_vertices.csv.
node_df = pd.read_csv('nodes_list_retweet_thresh_10.csv').drop(columns=['Unnamed: 0']).rename(columns={'Twitter_users':'screen_name'})
retweet_node_df = pd.merge(tweet_df[['screen_name','retweet_count']],node_df,on='screen_name',how='inner')
retweet_node_df = retweet_node_df.groupby('screen_name').sum().reset_index()
retweet_node_df['retweet_count'] = retweet_node_df['retweet_count'].astype(float)
retweet_node_df.to_csv('map_equation_vertices.csv')

#LINK DATA
# reset_index(drop=True) returns a new frame, so the columns added below do not write
# into a slice of following__edge_weight_df (the original triggered SettingWithCopyWarning).
retweet_edge_df = following__edge_weight_df[['source_screen_name','destination_screen_name','edge_weight']].reset_index(drop=True)

# screen_name -> row index in retweet_node_df, built once instead of filtering the node
# table for every edge. Names are unique here because they come from a groupby.
# A name missing from the node table raises KeyError naming that user.
node_index_by_name = dict(zip(retweet_node_df['screen_name'], retweet_node_df.index))

df = retweet_edge_df.copy()
df['source_index'] = [node_index_by_name[name] for name in df['source_screen_name']]
df['dest_index'] = [node_index_by_name[name] for name in df['destination_screen_name']]

js_link_df = df[['source_index','dest_index','edge_weight']]
#js_link_df.to_csv('map_equation_edges.csv')
print(js_link_df.shape)
js_link_df.head()

In [None]:
# One {'source', 'target', 'weight'} record per directed link.
directed_link_json_list = [
    {'source':link['source_index'],
     'target':link['dest_index'],
     'weight':link['edge_weight']}
    for link_label, link in js_link_df.iterrows()
]
# Peek at the first two records.
directed_link_json_list[:2]

In [None]:
import json


def _json_default(value):
    """Convert values json cannot encode itself: numpy scalars become plain Python numbers."""
    if hasattr(value, 'item'):
        return value.item()
    raise TypeError('Object of type %s is not JSON serializable' % type(value).__name__)


# Write one file per week: that week's node records plus the shared directed links.
all_nodes_week_lists = [undirected_node_json_list_week1,undirected_node_json_list_week2,
                        undirected_node_json_list_week3,undirected_node_json_list_week4]
for i in range(4):
    js_json_data = {'nodes':all_nodes_week_lists[i],'links':directed_link_json_list}
    with open('directed_json_data_week_'+str(i+1)+'.json', 'w') as f:
        # json.dump emits real JSON; print() wrote the Python repr of the dict
        # (single-quoted strings, numpy scalar reprs), which JSON parsers reject.
        json.dump(js_json_data, f, default=_json_default)

In [None]:
# Number of directed link records built above.
len(directed_link_json_list)