In [17]:
import csv
from operator import itemgetter

import networkx as nx
from networkx.algorithms import community
import numpy as np
import pandas as pd


In [2]:
# Load the first 2000 rows of the clickstream TSV; the file is read as
# tab-separated with no header row (columns are named in a later cell).
#
# Two parser defaults are overridden because the first two columns hold
# Wikipedia page titles, which are arbitrary text:
#   * quoting=csv.QUOTE_NONE  -- a title may contain a literal double quote;
#     with the default quoting the parser would treat it as a field delimiter
#     and strip it or merge fields.
#   * keep_default_na=False   -- titles such as "NaN", "NA" or "null" are real
#     page names; the default NA handling would turn them into float NaN and
#     they would later show up as a bogus `nan` node in the graph.
df = pd.read_csv(
    'clickstream-enwiki-2020-01.tsv',
    sep='\t',
    header=None,
    nrows=2000,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

In [3]:
# Name the four positional columns: source page, target page, the kind of
# relation between them, and the associated count.
df.columns = ['From','To','RelationType','Count']

In [4]:
# Keep only the rows whose relation type is exactly 'link'.
df = df.loc[df['RelationType'].eq('link')]

In [6]:
# Preview the first rows of the filtered frame (the original index labels are
# kept, so gaps in the index correspond to rows removed by the filter).
df.head()

Unnamed: 0,From,To,RelationType,Count
0,Eddie_Albert,The_Dude_Goes_West,link,17
2,Gale_Storm,The_Dude_Goes_West,link,15
5,Ascoli_Calcio_1898_F.C.,Gianluca_Scamacca,link,87
7,2019–20_Coppa_Italia,Gianluca_Scamacca,link,333
8,2018_UEFA_European_Under-19_Championship,Gianluca_Scamacca,link,23


In [7]:
# Pull the source and target page titles out as plain NumPy arrays.
nodes_from, nodes_to = df['From'].to_numpy(), df['To'].to_numpy()


In [8]:
# All page titles that appear on either end of a row, sources first and then
# targets. Duplicates are kept here.
nodes = np.hstack((nodes_from, nodes_to))

In [9]:
# One (source, target) tuple per remaining row. Columns are addressed by name
# rather than by position, so the result does not depend on column order, and
# no per-row temporaries are left behind in the notebook namespace.
edges = list(zip(df['From'], df['To']))


In [10]:
# Build the graph: register every page title as a node, then connect the
# (source, target) pairs. Adding a node or edge that already exists is a no-op,
# so the duplicates in `nodes` / `edges` are harmless.
# NOTE(review): nx.Graph is undirected and `edges` carries no 'Count' value, so
# both the From -> To direction and the counts are discarded -- confirm that
# this is intended.
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

In [11]:
# Graph density: ratio of edges present to the maximum possible number of edges.
density = nx.density(G)
density

0.001247852343680885

In [12]:
# Transitivity: fraction of all possible triangles that are actually present.
triadic_closure = nx.transitivity(G)
triadic_closure

0

In [13]:
# Map every node to its degree and store that value on the node itself under
# the 'degree' attribute.
degree_dict = {node: deg for node, deg in G.degree()}
nx.set_node_attributes(G, values=degree_dict, name='degree')

In [15]:
# Spot-check: show the attribute dict stored on a single node.
G.nodes['Gianluca_Scamacca']

{'degree': 7}

In [18]:
# Rank (node, degree) pairs from highest to lowest degree and show the top 20.
sorted_degree = list(degree_dict.items())
sorted_degree.sort(key=itemgetter(1), reverse=True)

print("Top 20 nodes by degree:")
for ranked_pair in sorted_degree[:20]:
    print(ranked_pair)


Top 20 nodes by degree:
('Ejaculation', 205)
('Bombardier_CRJ100/200', 145)
('Purépecha', 53)
('Odesa_International_Airport', 50)
('Flora_Robson', 43)
('Bobby_Keys', 42)
('Blyth_Spartans_A.F.C.', 40)
('Ostracon', 39)
('Anal_canal', 39)
('Hukbalahap_Rebellion', 39)
('Squaw_Valley_Ski_Resort', 37)
('Carl_Orff', 34)
('Vaginismus', 32)
('The_Client_List_(TV_series)', 29)
('A_Christmas_Carol_(2004_film)', 23)
('1968_Winter_Olympics', 23)
('Motorized_scooter', 21)
('Vacuum_expectation_value', 20)
('1963_NFL_Championship_Game', 17)
('Provinces_of_New_Zealand', 17)


In [19]:
# Compute two centrality measures for every node of G.
betweenness_dict = nx.betweenness_centrality(G)
eigenvector_dict = nx.eigenvector_centrality(G)

# Store each measure on the nodes under its own attribute name.
nx.set_node_attributes(G, values=betweenness_dict, name='betweenness')
nx.set_node_attributes(G, values=eigenvector_dict, name='eigenvector')


In [20]:
# Rank (node, betweenness) pairs from highest to lowest and show the top 20.
sorted_betweenness = sorted(
    betweenness_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top 20 nodes by betweenness centrality:")
for ranked_pair in sorted_betweenness[:20]:
    print(ranked_pair)


Top 20 nodes by betweenness centrality:
('Ejaculation', 0.03140840933251273)
('Bombardier_CRJ100/200', 0.009735355004755777)
('Anal_canal', 0.008629170629814059)
('Vaginismus', 0.006048695425129152)
('Levator_ani', 0.005262993128768314)
('Pudendal_nerve', 0.003578955221096999)
('Squaw_Valley_Ski_Resort', 0.0013386113131539194)
('Purépecha', 0.0012849922602062702)
('Odesa_International_Airport', 0.0011423189541020908)
('1968_Winter_Olympics', 0.0009404315634383335)
('The_Client_List_(TV_series)', 0.0009320390160204406)
('Flora_Robson', 0.0008420522575952554)
('Bobby_Keys', 0.0008028870363117552)
('A_Christmas_Carol_(2004_film)', 0.0007837706783043324)
('Blyth_Spartans_A.F.C.', 0.000727354109550719)
('Ostracon', 0.000690986404073183)
('Hukbalahap_Rebellion', 0.000690986404073183)
('Masters_and_Johnson', 0.0006888216596995189)
('Vagina', 0.0006888216596995189)
('Sexual_intercourse', 0.0006888216596995189)


In [21]:
# Take the 20 highest-betweenness entries ...
top_betweenness = sorted_betweenness[:20]

# ... and print each one next to its degree, so nodes that rank high on
# betweenness despite a low degree stand out.
for name, score in top_betweenness:
    degree = degree_dict[name]  # degree computed earlier for the same node
    print("Name:", name, "| Betweenness Centrality:", score, "| Degree:", degree)


Name: Ejaculation | Betweenness Centrality: 0.03140840933251273 | Degree: 205
Name: Bombardier_CRJ100/200 | Betweenness Centrality: 0.009735355004755777 | Degree: 145
Name: Anal_canal | Betweenness Centrality: 0.008629170629814059 | Degree: 39
Name: Vaginismus | Betweenness Centrality: 0.006048695425129152 | Degree: 32
Name: Levator_ani | Betweenness Centrality: 0.005262993128768314 | Degree: 3
Name: Pudendal_nerve | Betweenness Centrality: 0.003578955221096999 | Degree: 2
Name: Squaw_Valley_Ski_Resort | Betweenness Centrality: 0.0013386113131539194 | Degree: 37
Name: Purépecha | Betweenness Centrality: 0.0012849922602062702 | Degree: 53
Name: Odesa_International_Airport | Betweenness Centrality: 0.0011423189541020908 | Degree: 50
Name: 1968_Winter_Olympics | Betweenness Centrality: 0.0009404315634383335 | Degree: 23
Name: The_Client_List_(TV_series) | Betweenness Centrality: 0.0009320390160204406 | Degree: 29
Name: Flora_Robson | Betweenness Centrality: 0.0008420522575952554 | Degree:

In [22]:
# Partition the nodes of G into communities using greedy modularity maximisation.
communities = community.greedy_modularity_communities(G)

In [23]:
# Map every node to the index of the community that contains it.
modularity_dict = {
    name: community_id
    for community_id, members in enumerate(communities)
    for name in members
}

# Store the community index on each node, alongside the other metrics.
nx.set_node_attributes(G, modularity_dict, 'modularity')


In [25]:
# Nodes whose 'modularity' attribute is 0, i.e. the members of community 0.
class0 = [n for n, attrs in G.nodes(data=True) if attrs['modularity'] == 0]

# Eigenvector centrality of each of those nodes, read back from the graph.
class0_eigenvector = {n: G.nodes[n]['eigenvector'] for n in class0}

# Rank them from most to least central and report the top five.
class0_sorted_by_eigenvector = sorted(
    class0_eigenvector.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Modularity Class 0 Sorted by Eigenvector Centrality:")
for name, score in class0_sorted_by_eigenvector[:5]:
    print("Name:", name, "| Eigenvector Centrality:", score)

Modularity Class 0 Sorted by Eigenvector Centrality:
Name: Ejaculation | Eigenvector Centrality: 0.7065150468259145
Name: Levator_ani | Eigenvector Centrality: 0.051902400924233344
Name: Masters_and_Johnson | Eigenvector Centrality: 0.05129749747456322
Name: Vagina | Eigenvector Centrality: 0.05129749747456322
Name: Sexual_intercourse | Eigenvector Centrality: 0.05129749747456322


In [39]:
#for i,c in enumerate(communities): # Loop through the list of communities
#    if len(c) > 2: # Filter out modularity classes with 2 or fewer nodes
#        print('Class '+str(i)+':', list(c)) # Print out the classes and their members

In [28]:
from networkx.readwrite import json_graph

In [30]:
# Convert the graph, including the node attributes set above, to node-link
# form: a plain data structure intended for JSON serialisation.
json_data = json_graph.node_link_data(G)

In [32]:
import json

In [42]:
#json.dumps(json_data)