# Setup

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

In [2]:
# read in data
account_lookup = pd.read_csv("data/musae_git_target.csv")  # contains GitHub user IDs and corresponding account names
connections = pd.read_csv("data/musae_git_edges.csv")  # containing all of the connections between different GitHub accounts/users

# Data Cleaning and Transformation

In [3]:
# drop unnecessary "ml_target" column in account_lookup
account_lookup.drop(columns=["ml_target"], inplace=True)

In [4]:
# basic info about account_lookup data
print(f"DATAFRAME SHAPE: {account_lookup.shape}\n--------------------\nDATAFRAME INFO")
account_lookup.info()
print("--------------------\nDATAFRAME DESCRIPTION\n")
print(account_lookup.describe())

DATAFRAME SHAPE: (37700, 2)
--------------------
DATAFRAME INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37700 entries, 0 to 37699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      37700 non-null  int64 
 1   name    37700 non-null  object
dtypes: int64(1), object(1)
memory usage: 589.2+ KB
--------------------
DATAFRAME DESCRIPTION

                 id
count  37700.000000
mean   18849.500000
std    10883.196911
min        0.000000
25%     9424.750000
50%    18849.500000
75%    28274.250000
max    37699.000000


In [5]:
# basic info about connections data
print(f"DATAFRAME SHAPE: {connections.shape}\n--------------------\nDATAFRAME INFO")
connections.info()
print("--------------------\nDATAFRAME DESCRIPTION\n")
print(connections.describe())

DATAFRAME SHAPE: (289003, 2)
--------------------
DATAFRAME INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289003 entries, 0 to 289002
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   id_1    289003 non-null  int64
 1   id_2    289003 non-null  int64
dtypes: int64(2)
memory usage: 4.4 MB
--------------------
DATAFRAME DESCRIPTION

               id_1           id_2
count  289003.00000  289003.000000
mean    14812.59490   23778.812767
std     10155.54884    9690.937213
min         0.00000      16.000000
25%      6134.00000   16689.000000
50%     13413.00000   25268.000000
75%     22612.00000   31890.000000
max     37694.00000   37699.000000


In [6]:
# first join - matchup id_1 with corresponding account name
connections = connections.merge(account_lookup, left_on="id_1", right_on="id")

# drop extraneous "id" column
connections.drop(columns=["id"], inplace=True)

# rename "name" column to clarify which ID it corresponds to
connections.rename(columns={"name": "name_1"}, inplace=True)

In [7]:
# second join - matchup id_2 with corresponding account name
connections = connections.merge(account_lookup, left_on="id_2", right_on="id")

# drop extraneous "id" column
connections.drop(columns=["id"], inplace=True)

# rename "name" column to clarify which ID it corresponds to
connections.rename(columns={"name": "name_2"}, inplace=True)

In [8]:
# display transformed dataframe
connections

Unnamed: 0,id_1,id_2,name_1,name_2
0,0,23977,Eiryyy,airtoxin
1,1,34526,shawflying,ghosind
2,1,2370,shawflying,jasondu
3,1,14683,shawflying,chaoslawful
4,1,29982,shawflying,dead-horse
...,...,...,...,...
288998,37527,37596,SusmoyBarman1,rusenask
288999,37529,37601,brannondorsey,khacluan
289000,37644,2347,shriphani,bamos
289001,25879,2347,jovanidash21,bamos


# Graph Building and Export

In [9]:
# create graph
g = nx.Graph()

In [10]:
# add nodes
for i in account_lookup.index:
    g.add_node(account_lookup["id"][i], name=account_lookup["name"][i])

In [11]:
# add edges
for i in connections.index:
    left_user = connections["id_1"][i]
    right_user = connections["id_2"][i]
    
    # check for current weight
    current_weight = g.get_edge_data(left_user, right_user, default={"weight":0})["weight"]

    # add edge
    g.add_edge(left_user, right_user, weight=current_weight+1)

In [12]:
# check that all connections were added to the graph
print("Expected Nodes:", account_lookup.shape[0], "\t Total Nodes:", len(g.nodes))
print("Expected Edges:", connections.shape[0], "\t Total Edges:", len(g.edges))

Expected Nodes: 37700 	 Total Nodes: 37700
Expected Edges: 289003 	 Total Edges: 289003


In [13]:
# calculate degree centrality for all nodes
degree_centrality = nx.degree_centrality(g)
n = 20

print("TOP", n, "NODES BY DEGREE CENTRALITY\n-------------------------")
# display top n nodes by degree centrality - how many connections they have
for u in sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:n]:
    print("ID:", u, "   NAME:", g.nodes[u]["name"], "-", degree_centrality[u])

TOP 20 NODES BY DEGREE CENTRALITY
-------------------------
ID: 31890    NAME: dalinhuang99 - 0.25088198625958247
ID: 27803    NAME: nfultz - 0.18793601952306427
ID: 35773    NAME: addyosmani - 0.08817210005570439
ID: 19222    NAME: Bunlong - 0.07846361972466113
ID: 13638    NAME: gabrielpconceicao - 0.06546592747818245
ID: 36652    NAME: rfthusn - 0.062150189660203185
ID: 18163    NAME: nelsonic - 0.05103583649433672
ID: 9051    NAME: getify - 0.04766704687126979
ID: 35008    NAME: mdo - 0.0416721928963633
ID: 10001    NAME: ronenhamias - 0.041592615188731794
ID: 36628    NAME: SuriyaaKudoIsc - 0.03997453513355792
ID: 7027    NAME: jeresig - 0.039390965277593566
ID: 19253    NAME: JonnyBanana - 0.03724236717154301
ID: 2078    NAME: mbostock - 0.03628743467996499
ID: 33671    NAME: shayan-taheri - 0.03573039072654447
ID: 5629    NAME: pengliheng - 0.03490808774768561
ID: 73    NAME: kentcdodds - 0.03360831852303774
ID: 3712    NAME: isaacs - 0.033528740815406244
ID: 11051    NAME: sdra

In [14]:
# calculate betweenness centrality for all nodes
between_centrality = nx.betweenness_centrality(g, k=300)
n = 20

print("TOP", n, "NODES BY BETWEENNESS CENTRALITY\n-------------------------")
# display top n nodes by betweenness centrality - how well-connected they are
for u in sorted(between_centrality, key=between_centrality.get, reverse=True)[:n]:
    print("ID:", u, "   NAME:", g.nodes[u]["name"], "-", between_centrality[u])

TOP 20 NODES BY BETWEENNESS CENTRALITY
-------------------------
ID: 31890    NAME: dalinhuang99 - 0.2883355436610856
ID: 27803    NAME: nfultz - 0.21716297509061358
ID: 19222    NAME: Bunlong - 0.056237828802086
ID: 35773    NAME: addyosmani - 0.04138721736391726
ID: 13638    NAME: gabrielpconceicao - 0.030142471694000725
ID: 10001    NAME: ronenhamias - 0.027264326088247953
ID: 36652    NAME: rfthusn - 0.024265066417751283
ID: 19253    NAME: JonnyBanana - 0.024052394418631908
ID: 18163    NAME: nelsonic - 0.02295150300818241
ID: 33671    NAME: shayan-taheri - 0.021909019036689455
ID: 35008    NAME: mdo - 0.01888895986935473
ID: 73    NAME: kentcdodds - 0.018140055480647246
ID: 2078    NAME: mbostock - 0.017215316342604224
ID: 36628    NAME: SuriyaaKudoIsc - 0.016305099114292777
ID: 25477    NAME: mcanthony - 0.015336358607216145
ID: 28957    NAME: ahmetabdi - 0.014732091751420514
ID: 7027    NAME: jeresig - 0.014103052738472321
ID: 14242    NAME: sibelius - 0.013856185378007644
ID: 5

In [18]:
# create subgraphs centered around top 4 accounts with the highest degree and eigenvector centralities
sub_g_dalin = nx.ego_graph(g, 31890, radius=1)
sub_g_nfultz = nx.ego_graph(g, 27803, radius=1)
sub_g_bunlong = nx.ego_graph(g, 19222, radius=1)
sub_g_addyosmani = nx.ego_graph(g, 35773, radius=1)

# print each user's number of connections
print("dalinhuang99's direct connections: ", len(sub_g_dalin.nodes))
print("nfultz's direct connections: ", len(sub_g_nfultz.nodes))
print("Bunlong's direct connections: ", len(sub_g_bunlong.nodes))
print("addyosmani's direct connections: ", len(sub_g_addyosmani.nodes))

dalinhuang99's direct connections:  9459
nfultz's direct connections:  7086
Bunlong's direct connections:  2959
addyosmani's direct connections:  3325


In [19]:
# -- export to graphml files --
# nx.write_graphml(g, "git_network.graphml")
# nx.write_graphml(sub_g_dalin, "git_dalin.graphml")
# nx.write_graphml(sub_g_nfultz, "git_nfultz.graphml")
# nx.write_graphml(sub_g_bunlong, "git_bunlong.graphml")
# nx.write_graphml(sub_g_addyosmani, "git_addyosmani.graphml")