# Project Wiki-talks

We chose to analyze the wiki-talk datasets of the Vietnamese and Swedish Wikipedias because these could be uploaded to Jupyter.


In [2]:
import csv #import the Python CSV library
import networkx as nx #import NetworkX
import numpy as np #import numpy for ...
import community #import community (https://pypi.python.org/pypi/python-louvain/0.3)
import powerlaw #import powerlaw library for testing fits
#force drawing of graphs inline for ipython notebook
%matplotlib inline 
import matplotlib.pyplot as plt #import matplotlib for plotting/drawing grpahs


## Vietnamese


In [3]:
with open('vi-wiki-talk.csv', 'rb') as file_handle:
    next(file_handle, '')   # skip the header line (NOTE the first list in the CSV file doesn't contain an edge)
    Vietnamese = nx.read_edgelist(file_handle, delimiter='\t', create_using=nx.DiGraph(), 
                         nodetype=str, data=(('time', str),), encoding="utf-8")
print "done"

done


In [4]:
N = Vietnamese.order() #G.order(), gives number of nodes 
L = Vietnamese.size() #G.size(), gives number of edges

avg_deg = float(L) / N #calculate average degree

#print out statistics
print "Vietnamese statistics"
print "Nodes: ", N
print "Edges: ", L
print "Average degree: ", avg_deg

Vietnamese statistics
Nodes:  338714
Edges:  426086
Average degree:  1.25795213661


### In-degree and out-degree

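Let's see how much the in-degree and out-degree of each node differ: the cell below reports the percentage of nodes whose two degrees are equal, differ by 1, differ by 2, or differ by more than 2, together with the percentage of nodes that only received or only sent messages.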


In [22]:
# Plain node -> degree dictionaries. Wrapping in dict() keeps the dictionary
# operations used below (.values(), lookups by node) working both when
# in_degree()/out_degree() already return a dict and when they return a
# degree view, as newer networkx releases do.
in_degrees_vi = dict(Vietnamese.in_degree())
out_degrees_vi = dict(Vietnamese.out_degree())

def _tally_degree_differences(in_degrees, out_degrees):
    """Count the nodes in each in-/out-degree comparison category.

    Both arguments are dicts mapping node -> degree. Returns a dict of counts.
    """
    tally = {'same': 0, 'differ_one': 0, 'differ_two': 0,
             'in_not_out': 0, 'out_not_in': 0}
    for node, in_degree in in_degrees.items():
        out_degree = out_degrees[node]
        if out_degree == 0 and in_degree != 0:
            tally['in_not_out'] += 1
        if in_degree == 0 and out_degree != 0:
            tally['out_not_in'] += 1
        gap = abs(out_degree - in_degree)
        if gap == 0:
            tally['same'] += 1
        elif gap == 1:
            tally['differ_one'] += 1
        elif gap == 2:
            tally['differ_two'] += 1
    return tally


def count_similar_degrees(in_degrees, out_degrees):
    """Print how closely each node's in-degree matches its out-degree.

    For every node in ``in_degrees`` the difference between its in-degree and
    its out-degree is classified as 0, 1, 2 or more than 2. The share of nodes
    in each class is printed as a percentage of all nodes, followed by the
    percentage of nodes that only received edges (out-degree 0, in-degree > 0)
    and of nodes that only sent edges (in-degree 0, out-degree > 0).

    Parameters
    ----------
    in_degrees, out_degrees : dict or iterable of (node, degree) pairs
        In-degree and out-degree per node. Anything accepted by ``dict()``
        works, so both a plain dict and a networkx degree view can be passed.

    Returns
    -------
    None
        The results are only printed.

    Raises
    ------
    KeyError
        If a node of ``in_degrees`` is missing from ``out_degrees``.
    """
    in_degrees = dict(in_degrees)
    out_degrees = dict(out_degrees)
    total = len(in_degrees)
    tally = _tally_degree_differences(in_degrees, out_degrees)
    more_than_two = (total - tally['same'] - tally['differ_one']
                     - tally['differ_two'])

    def percent(count):
        # An empty graph has no nodes to classify; report 0.0 instead of
        # dividing by zero.
        if total == 0:
            return 0.0
        return float(count) / total * 100

    # print(...) with one pre-formatted string produces the same text under
    # Python 2's print statement and Python 3's print function.
    print("Nodes with the same in-degree and out-degree:  %s" % percent(tally['same']))
    print("Nodes in-degree and out-degree differ 1:  %s" % percent(tally['differ_one']))
    print("Nodes in-degree and out-degree differ 2:  %s" % percent(tally['differ_two']))
    print("Nodes with more than 2 difference:  %s" % percent(more_than_two))
    print("Nodes which only received, not send:  %s" % percent(tally['in_not_out']))
    print("Nodes which only sent, not received:  %s" % percent(tally['out_not_in']))



count_similar_degrees(in_degrees_vi, out_degrees_vi)
print "smallest in-degree: ", min(in_degrees_vi)
print "smallest out-degree: ", min(out_degrees_vi)
print "highest in_degree: ", max(in_degrees_vi)
print "highest out_degree: ", max(out_degrees_vi)
print "amount of nodes with in_degree 0: ", sum(1 for x in in_degrees_vi.values() if x == 0)
print "amount of nodes with out-degree 0: ", sum(1 for x in out_degrees_vi.values() if x == 0)

 Nodes with the same in-degree and out-degree:  0.692914966609
Nodes in-degree and out-degree differ 1:  88.1640558111
Nodes in-degree and out-degree differ 2:  5.29916094404
Nodes with more than 2 difference:  5.84386827825
Nodes which only received, not send:  97.7346670052
Nodes which only sent, not received:  0.0239139805263
smallest in-degree:  0
smallest out-degree:  0
highest in_degree:  99999
highest out_degree:  99999
amount of nodes with in_degree 0:  81
amount of nodes with out-degree 0:  331041


## Swedish

In [15]:
with open('sv-wiki-talk.csv', 'rb') as file_handle:
    next(file_handle, '')   # skip the header line (NOTE the first list in the CSV file doesn't contain an edge)
    Swedish = nx.read_edgelist(file_handle, delimiter='\t', create_using=nx.DiGraph(), 
                         nodetype=str, data=(('time', str),), encoding="utf-8")
print "done"

done


In [16]:
N_sv = Swedish.order()
L_sv = Swedish.size()

avg_deg_sv = float(L_sv) / N_sv

print "Swedish statistics"
print "Nodes: ", N_sv
print "Edges: ", L_sv
print "Average degree: ", avg_deg_sv

Swedish statistics
Nodes:  120833
Edges:  261494
Average degree:  2.16409424578


In [24]:
in_degrees_sv = Swedish.in_degree()  # dictionary node:degree
out_degrees_sv = Swedish.out_degree()

count_similar_degrees(in_degrees_sv, out_degrees_sv)
print "smallest in-degree: ", min(in_degrees_sv)
print "smallest out-degree: ", min(out_degrees_sv)
print "highest in_degree: ", max(in_degrees_sv)
print "highest out_degree: ", max(out_degrees_sv)
print "amount of nodes with in_degree 0: ", sum(1 for x in in_degrees_sv.values() if x == 0)
print "amount of nodes with out-degree 0: ", sum(1 for x in out_degrees_sv.values() if x == 0)

Nodes with the same in-degree and out-degree:  2.90566318804
Nodes in-degree and out-degree differ 1:  51.3692451565
Nodes in-degree and out-degree differ 2:  29.9173239099
Nodes with more than 2 difference:  15.8077677456
Nodes which only received, not send:  86.2595483022
Nodes which only sent, not received:  0.897105923051
smallest in-degree:  0
smallest out-degree:  0
highest in_degree:  99995
highest out_degree:  99995
amount of nodes with in_degree 0:  1084
amount of nodes with out-degree 0:  104230
