In [1]:
import pandas as pd
import networkx as nx # documentation at https://networkx.org/documentation/
import matplotlib.pyplot as plt
from scipy import stats
import collections

In [2]:
# read in original dataset and results from Random Edge (sampled at k=0.1)

dataOriginal = pd.read_csv("web-Stanford.txt", sep="\t")
print(dataOriginal.shape)

#dataRE = pd.read_csv("Random_Edge_Output.csv")
#print(dataRE.shape)

(2312497, 2)


In [3]:
# dataRE

Unnamed: 0.1,Unnamed: 0,FromNode,ToNode
0,1106494,38194,34573
1,917528,253227,183862
2,1735707,134268,265957
3,1189018,126750,43843
4,1951112,142185,215730
...,...,...,...
231244,1881687,226897,261616
231245,1456095,68300,222332
231246,599113,60754,52796
231247,310791,115140,259094


Goals, calculate D-stat for:

1. in-degree,
2. out-degree,
3. dist of weakly-connected components (WCC),
4. dist of strongly connected components,
5. hop plot,
6. hop plot on WCC,
7. dis of clustering coeff

### 1. in-degree

count total number of connections into a given node

In [3]:
in_degree_OG = dataOriginal.groupby("ToNode").count()["FromNode"]
in_degree_RE = dataOriginal.groupby("ToNode").count()["FromNode"]

In [4]:
# ks_2samp returns test statistic D, the maximum vertical distance between the 2 eCDFs of the samples as used in the paper
# scipy ks_2samp https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html

stats.ks_2samp(in_degree_OG, in_degree_RE)

KstestResult(statistic=-0.0, pvalue=1.0)

In [5]:
stats.ks_2samp(in_degree_OG, in_degree_RE)

-0.0

### 2. out-degree

count total number of connections coming out of a given node

In [58]:
out_degree_OG = dataOriginal.groupby("FromNode").count()["ToNode"]
out_degree_RE = dataRE.groupby("FromNode").count()["ToNode"]

In [60]:
stats.ks_2samp(out_degree_OG, out_degree_RE)

KstestResult(statistic=0.6563402611877971, pvalue=0.0)

### 3. WCC 

The distribution of sizes of weakly connected components (“wcc”): a set of nodes is weakly connected if for any pair of nodes u and v there exists an undirected path from u to v.

In [83]:
# create directed graphs

G_Original = nx.DiGraph()
G_Original = nx.from_pandas_edgelist(dataOriginal, "FromNode", "ToNode", create_using=nx.DiGraph())

G_RE = nx.DiGraph()
G_RE = nx.from_pandas_edgelist(dataRE, "FromNode", "ToNode", create_using=nx.DiGraph())

In [84]:
# use WCC method from networkx https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.weakly_connected_components.html#networkx.algorithms.components.weakly_connected_components

wcc_OG = nx.weakly_connected_components(G_Original)
wcc_sizes_OG = collections.Counter([len(wcc) for wcc in wcc_OG]).keys()

wcc_RE = nx.weakly_connected_components(G_RE)
wcc_sizes_RE = collections.Counter([len(wcc) for wcc in wcc_RE]).keys()

In [89]:
stats.ks_2samp(list(wcc_sizes_OG), list(wcc_sizes_RE))

KstestResult(statistic=0.17651612903225805, pvalue=0.02292064878272837)

### 4. SCC
The distribution of sizes of strongly connected components (“scc”): a set of nodes is strongly connected, if for any pair of nodes u and v, there exists a directed path from u to v and from v to u

In [93]:
# use SCC method from networkx https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.strongly_connected_components.html#networkx.algorithms.components.strongly_connected_components

scc_OG = nx.strongly_connected_components(G_Original)
scc_sizes_OG = collections.Counter([len(wcc) for wcc in scc_OG]).keys()

scc_RE = nx.strongly_connected_components(G_RE)
scc_sizes_RE = collections.Counter([len(wcc) for wcc in scc_RE]).keys()

In [92]:
stats.ks_2samp(list(scc_sizes_OG), list(scc_sizes_RE))

KstestResult(statistic=0.96, pvalue=1.9602208745084226e-11)

### 5. hop plot
Hop-plot: the number P(h) of reachable pairs of nodes at distance h or less; h is the number of hops

## Code Scrapbook


In [None]:
# # create graph objects

# G_Original = nx.DiGraph()
# G_Original = nx.from_pandas_edgelist(dataOriginal, "FromNode", "ToNode")

# G_RE = nx.DiGraph()
# G_RE = nx.from_pandas_edgelist(dataRE, "FromNode", "ToNode")

# use networkx degree attribute https://networkx.org/documentation/stable/reference/generated/networkx.classes.function.degree.html#networkx.classes.function.degree

# degree_OG = [deg for node, deg in nx.degree(G_Original)] # .degree returns [node, degree]
# degree_RE = [deg for node, deg in nx.degree(G_RE)]


In [None]:
# # plot WCC
# # https://www.programcreek.com/python/example/120125/networkx.weakly_connected_components

# size_seq = sorted(wcc_sizes_RE.keys())
# size_hist = [wcc_sizes_RE[x] for x in size_seq]

# plt.figure(figsize=(16, 12))
# plt.clf()
# plt.loglog(size_seq, size_hist, 'ro-')
# plt.title("WCC Size Distribution")
# plt.xlabel("Size")
# plt.ylabel("Number of WCCs")
# plt.show()