# Project Wiki-talks

We choose to analyze the wiki-talks datasets of Vietnamese and Swedish Wikipedia as these were able to upload to Jupyter.


In [2]:
import csv #import the Python CSV library
import networkx as nx #import NetworkX
import numpy as np #import numpy for ...
import community #import community (https://pypi.python.org/pypi/python-louvain/0.3)
import powerlaw #import powerlaw library for testing fits
#force drawing of graphs inline for ipython notebook
%matplotlib inline 
import matplotlib.pyplot as plt #import matplotlib for plotting/drawing grpahs


## Vietnamese


In [3]:
with open('vi-wiki-talk.csv', 'rb') as file_handle:
    next(file_handle, '')   # skip the header line (NOTE the first list in the CSV file doesn't contain an edge)
    Vietnamese = nx.read_edgelist(file_handle, delimiter='\t', create_using=nx.DiGraph(), 
                         nodetype=str, data=(('time', str),), encoding="utf-8")
print "done"

done


In [4]:
N = Vietnamese.order() #G.order(), gives number of nodes 
L = Vietnamese.size() #G.size(), gives number of edges

avg_deg = float(L) / N #calculate average degree

#print out statistics
print "Vietnamese statistics"
print "Nodes: ", N
print "Edges: ", L
print "Average degree: ", avg_deg

Vietnamese statistics
Nodes:  338714
Edges:  426086
Average degree:  1.25795213661


### In-degree and out-degree

Let's see if the in-degree and out-degree differ a maximum of 100.


In [9]:
in_degrees_vi = Vietnamese.in_degree()  # dictionary node:degree
out_degrees_vi = Vietnamese.out_degree()

def count_similar_degrees(in_degrees, out_degrees):
    """Count all the nodes for which the in-degree is within a 100 range of the out-degree."""
    counter_same = 0
    counter_one = 0
    counter_two = 0
    counter_in_not_out = 0
    counter_out_not_in
    for node, in_degree in in_degrees.iteritems():
        try:
            out_degree = out_degrees_vi[node]
            if out_degree == in_degree:
                counter_same += 1
            elif out_degree in [in_degree - 1, in_degree + 1]:
                counter_one += 1
            elif out_degree in [in_degree - 2, in_degree + 2]:
                counter_two += 1
        except KeyError:
            counter_in_not_out += 1
    for key in out_degrees:
        try:
            in_degrees[key]
        except KeyError:
            
    print "Nodes with the same in-degree and out-degree: ", counter_same
    print "Nodes in-degree and out-degree differ 1: ", counter_one
    print "Nodes in-degree and out-degree differ 2: ", counter_two
    print "Nodes with more than 2 difference: ",  len(in_degrees) - counter_same - counter_one - counter_two - counter_in_not_out
    print "Nodes without reactions to incoming: ", counter_in_not_out
    print "Nodes which only sent, not received: ", counter_out_not_in
    return counter_same



count_similar_degrees(in_degrees_vi, out_degrees_vi)
print "smallest in-degree: ", min(in_degrees_vi)
print "smallest out-degree: ", min(out_degrees_vi)
print "amount of nodes with in_degree 0: ", sum(1 for x in in_degrees_vi.values() if x == 0)
print "amount of nodes with out-degree 0: ", sum(1 for x in out_degrees_vi.values() if x == 0)

Nodes with the same in-degree and out-degree:  2347
Nodes in-degree and out-degree differ 1:  298624
Nodes in-degree and out-degree differ 2:  17949
Nodes with more than 2 difference:  19794
Nodes without reactions to incoming:  0
Nodes which only sent, not received:  338714
smallest in-degree:  0
smallest out-degree:  0
amount of nodes with in_degree 0:  81
amount of nodes with out-degree 0:  331041


## Swedish

In [6]:
with open('sv-wiki-talk.csv', 'rb') as file_handle:
    next(file_handle, '')   # skip the header line (NOTE the first list in the CSV file doesn't contain an edge)
    Swedish = nx.read_edgelist(file_handle, delimiter='\t', create_using=nx.DiGraph(), 
                         nodetype=str, data=(('time', str),), encoding="utf-8")
print "done"

done


In [7]:
N_sv = Swedish.order()
L_sv = Swedish.size()

avg_deg_sv = float(L_sv) / N_sv

print "Swedish statistics"
print "Nodes: ", N_sv
print "Edges: ", L_sv
print "Average degree: ", avg_deg_sv

Swedish statistics
Nodes:  120833
Edges:  261494
Average degree:  2.16409424578


In [8]:
in_degrees_sv = Swedish.in_degree()  # dictionary node:degree
out_degrees_sv = Swedish.out_degree()

count_similar_degrees(in_degrees_sv, out_degrees_sv)
print "smallest in-degree: ", min(in_degrees_sv)
print "smallest out-degree: ", min(out_degrees_sv)
print "amount of nodes with in_degree 0: ", sum(1 for x in in_degrees_sv.values() if x == 0)
print "amount of nodes with out-degree 0: ", sum(1 for x in out_degrees_sv.values() if x == 0)

Nodes with the same in-degree and out-degree:  1729
Nodes in-degree and out-degree differ 1:  45002
Nodes in-degree and out-degree differ 2:  29665
Nodes with more than 2 difference:  17854
Nodes without reactions to incoming:  26583
Nodes which only sent, not received:  120833
smallest in-degree:  0
smallest out-degree:  0
amount of nodes with in_degree 0:  1084
amount of nodes with out-degree 0:  104230
