# Compute the statistics of datasets using SNAP

In [1]:
import snap
import time

In [2]:
dataroot = "../datasets/"

In [3]:
datasets = ['reddit', 'weibo', 'amazon', 'yelp', 'tfinance',
            'elliptic', 'tolokers', 'questions', 'dgraphfin', 'tsocial']

In [4]:
times = []
for dataset in datasets:
    print(dataset)
    
    alg_st = time.time()
    Graph = snap.LoadEdgeList(snap.TUNGraph, dataset+".snap", 0, 1, Separator=' ')
    
    
    # compute statistics
    outfile = "info-"+dataset+".txt"
    
    Graph.PrintInfo("Python type TUNGraph", outfile, False)
    
    
    result, DegToCCfV = Graph.GetClustCfAll()
#     for item in DegToCCfV:
#         print("degree: %d, clustering coefficient: %f" % (item.GetVal1(), item.GetVal2()))
#     print("average clustering coefficient", result[0])
#     print("closed triads", result[1])
#     print("open triads", result[2])

    alg_ed = time.time()
    
    print("Elasped Time:\t%f sec" % (alg_ed-alg_st))
    
    with open(outfile, "a") as f:
        f.write("  Average clustering coefficient:\t%f\n" % (result[0]))

reddit
Elasped Time:	1.590011 sec
weibo
Elasped Time:	6.866438 sec
amazon
Elasped Time:	368.240388 sec
yelp
Elasped Time:	37.755798 sec
tfinance
Elasped Time:	2178.727380 sec
elliptic
Elasped Time:	0.641634 sec
tolokers
Elasped Time:	11.775784 sec
questions
Elasped Time:	2.071067 sec
dgraphfin
Elasped Time:	188.594150 sec
tsocial
Elasped Time:	1122.659544 sec


## Transform statistics into Pandas.DataFrame

In [34]:
import numpy as np
import pandas as pd

In [35]:
datasets = ['reddit', 'weibo', 'amazon', 'yelp', 'tfinance',
            'elliptic', 'tolokers', 'questions', 'dgraphfin', 'tsocial']

In [36]:
## Get all metric names
metrics = []
with open(f"info-{datasets[0]}.txt", "r") as f:
    lines = f.readlines()
    lines = lines[1:]
    for line in lines:
        line = line.strip().split(":")
        metrics.append(line[0])
print(metrics)

['Nodes', 'Edges', 'Zero Deg Nodes', 'Zero InDeg Nodes', 'Zero OutDeg Nodes', 'NonZero In-Out Deg Nodes', 'Unique directed edges', 'Unique undirected edges', 'Self Edges', 'BiDir Edges', 'Closed triangles', 'Open triangles', 'Frac. of closed triads', 'Connected component size', 'Strong conn. comp. size', 'Approx. full diameter', '90% effective diameter', 'Average clustering coefficient']


In [43]:
df_results = pd.DataFrame({"Metrics": metrics}, columns=["Metrics"]+datasets)

In [44]:
for d_idx, dataset in enumerate(datasets):
    print(dataset)
    
    with open(f"info-{dataset}.txt", "r") as f:
        lines = f.readlines()
        lines = lines[1:]

    for line in lines:
        line = line.strip().split(":")
        metric = line[0]
        value = float(line[-1])
        
        m_idx = metrics.index(metric)
        df_results[dataset][m_idx] = value
        

reddit
weibo
amazon
yelp
tfinance
elliptic
tolokers
questions
dgraphfin
tsocial


In [45]:
df_results

Unnamed: 0,Metrics,reddit,weibo,amazon,yelp,tfinance,elliptic,tolokers,questions,dgraphfin,tsocial
0,Nodes,10984.0,8405.0,11944.0,45954.0,39357.0,203769.0,11758.0,48921.0,3700550.0,5781065.0
1,Edges,89500.0,385676.0,4429520.0,3892933.0,21261900.0,438124.0,530758.0,202461.0,7697810.0,78886573.0
2,Zero Deg Nodes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Zero InDeg Nodes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Zero OutDeg Nodes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,NonZero In-Out Deg Nodes,10984.0,8405.0,11944.0,45954.0,39357.0,203769.0,11758.0,48921.0,3700550.0,5781065.0
6,Unique directed edges,168016.0,762947.0,8847096.0,7739912.0,42484443.0,672479.0,1049758.0,356001.0,11695070.0,151992081.0
7,Unique undirected edges,89500.0,385676.0,4429520.0,3892933.0,21261900.0,438124.0,530758.0,202461.0,7697810.0,78886573.0
8,Self Edges,10984.0,8405.0,11944.0,45954.0,39357.0,203769.0,11758.0,48921.0,3700550.0,5781065.0
9,BiDir Edges,168016.0,762947.0,8847096.0,7739912.0,42484443.0,672479.0,1049758.0,356001.0,11695070.0,151992081.0


In [54]:
def make_pretty(styler, caption, precision):
    styler.set_caption(f"{caption}")
#     styler.format(rain_condition)
#     styler.format_index(lambda v: v.strftime("%A"))
    styler.format(precision=precision)
#     styler.highlight_max(subset=datasets ,color = 'pink', axis = 0)
#     styler.background_gradient(axis=None, vmin=1, vmax=5, cmap="YlGnBu")
    styler.format(precision=precision+4, subset=pd.IndexSlice[[12,13,14,16,17], :])
    return styler
def show_df_results(df, caption, precision):
    return df.style.pipe(make_pretty, caption, precision)

In [55]:
show_df_results(df_results, "", 0)

Unnamed: 0,Metrics,reddit,weibo,amazon,yelp,tfinance,elliptic,tolokers,questions,dgraphfin,tsocial
0,Nodes,10984.0,8405.0,11944.0,45954.0,39357.0,203769.0,11758.0,48921.0,3700550.0,5781065.0
1,Edges,89500.0,385676.0,4429520.0,3892933.0,21261900.0,438124.0,530758.0,202461.0,7697810.0,78886573.0
2,Zero Deg Nodes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Zero InDeg Nodes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Zero OutDeg Nodes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,NonZero In-Out Deg Nodes,10984.0,8405.0,11944.0,45954.0,39357.0,203769.0,11758.0,48921.0,3700550.0,5781065.0
6,Unique directed edges,168016.0,762947.0,8847096.0,7739912.0,42484443.0,672479.0,1049758.0,356001.0,11695070.0,151992081.0
7,Unique undirected edges,89500.0,385676.0,4429520.0,3892933.0,21261900.0,438124.0,530758.0,202461.0,7697810.0,78886573.0
8,Self Edges,10984.0,8405.0,11944.0,45954.0,39357.0,203769.0,11758.0,48921.0,3700550.0,5781065.0
9,BiDir Edges,168016.0,762947.0,8847096.0,7739912.0,42484443.0,672479.0,1049758.0,356001.0,11695070.0,151992081.0


In [56]:
df_results.to_csv(f"statistics.csv", index=False)

In [2]:
# print extended statistics to file 'info-pngraph.txt'
Graph = snap.GenRndGnm(snap.TNGraph, 100, 1000)
Graph.PrintInfo("Python type TNGraph", "info-pngraph.txt", False)

In [3]:
# print basic statistics to standard output
Network = snap.GenRndGnm(snap.TNEANet, 100, 1000)
Network.PrintInfo("Python type TNEANet")

In [4]:
Graph = snap.GenRndGnm(snap.TNGraph, 100, 1000)
result, DegToCCfV = Graph.GetClustCfAll()
for item in DegToCCfV:
    print("degree: %d, clustering coefficient: %f" % (item.GetVal1(), item.GetVal2()))
print("average clustering coefficient", result[0])
print("closed triads", result[1])
print("open triads", result[2])

degree: 13, clustering coefficient: 0.161713
degree: 14, clustering coefficient: 0.190842
degree: 15, clustering coefficient: 0.196546
degree: 16, clustering coefficient: 0.172619
degree: 17, clustering coefficient: 0.212796
degree: 18, clustering coefficient: 0.182633
degree: 19, clustering coefficient: 0.174297
degree: 20, clustering coefficient: 0.176079
degree: 21, clustering coefficient: 0.194883
degree: 22, clustering coefficient: 0.178811
degree: 23, clustering coefficient: 0.187705
degree: 24, clustering coefficient: 0.199244
degree: 25, clustering coefficient: 0.176347
degree: 26, clustering coefficient: 0.184743
degree: 27, clustering coefficient: 0.192745
degree: 28, clustering coefficient: 0.230769
degree: 29, clustering coefficient: 0.186097
('average clustering coefficient', 0.1866464767232778)
('closed triads', 1114)
('open triads', 14550)


In [5]:
Graph = snap.GenRndGnm(snap.TNGraph, 100, 1000)
diam = Graph.GetBfsFullDiam(100, False)
print(diam)

3


In [2]:
## test weighted edgelist
with open("graph.txt", "w") as f:
    f.write("0 1 2\n")
    f.write("0 2 2\n")
    f.write("0 3 2\n")
    f.write("0 4 2\n")
    f.write("0 5 2\n")
Graph = snap.LoadEdgeList(snap.TUNGraph, "graph.txt", 0, 1, '\t')

In [3]:
print(Graph)

<snap.snap.PUNGraph; proxy of <Swig Object of type 'PUNGraph *' at 0x7f8b0c184540> >


In [4]:
Graph.PrintInfo("Python type TNGraph", "info-pngraph.txt", False)

RuntimeError: Execution stopped: (0<=ValN)&&(ValN<Vals) [Reason:'Index:-1 Vals:0 MxVals:0 Type:4TVecI7TKeyDatI4TInt4TFltEiE'], file ../../snap/glib-core/ds.h, line 487

In [5]:
labels = {}
for NI in Graph.Nodes():
        labels[NI.GetId()] = str(NI.GetId())
Graph.DrawGViz(snap.gvlDot, "output.png", " ", labels)

In [6]:
Graph = snap.GenRndGnm(snap.TNGraph, 10, 50)
labels = {}
for NI in Graph.Nodes():
        labels[NI.GetId()] = str(NI.GetId())
Graph.DrawGViz(snap.gvlDot, "output.png", " ", labels)

In [7]:
snap.SaveEdgeList(Graph, "test.txt", "Save as tab-separated list of edges")
Graph = snap.LoadEdgeList(snap.TNGraph, "test.txt", 0, 1)

In [2]:
Graph = snap.LoadEdgeList(snap.TUNGraph, "reddit.snap", 0, 1, Separator=' ')

In [3]:
Graph.PrintInfo("Python type TNGraph", "info-pngraph.txt", False)

In [4]:
result, DegToCCfV = Graph.GetClustCfAll()
for item in DegToCCfV:
    print("degree: %d, clustering coefficient: %f" % (item.GetVal1(), item.GetVal2()))
print("average clustering coefficient", result[0])
print("closed triads", result[1])
print("open triads", result[2])

degree: 2, clustering coefficient: 2.000000
degree: 3, clustering coefficient: 1.000000
degree: 4, clustering coefficient: 0.833333
degree: 5, clustering coefficient: 0.600000
degree: 6, clustering coefficient: 0.533333
degree: 7, clustering coefficient: 0.428571
degree: 8, clustering coefficient: 0.392857
degree: 9, clustering coefficient: 0.333333
degree: 10, clustering coefficient: 0.311111
degree: 11, clustering coefficient: 0.272727
degree: 12, clustering coefficient: 0.257576
degree: 13, clustering coefficient: 0.230769
degree: 14, clustering coefficient: 0.219780
degree: 15, clustering coefficient: 0.200000
degree: 16, clustering coefficient: 0.191667
degree: 17, clustering coefficient: 0.176471
degree: 18, clustering coefficient: 0.169935
degree: 19, clustering coefficient: 0.157895
degree: 20, clustering coefficient: 0.152632
degree: 21, clustering coefficient: 0.142857
degree: 22, clustering coefficient: 0.138528
degree: 23, clustering coefficient: 0.130435
degree: 24, cluste

In [5]:
diam = Graph.GetBfsFullDiam(100, False)
print(diam)

7
