# Building and analysing a GitHub developer social network

In [None]:
import graph_tool_extras as gte
from pathlib import Path

import distribution as dst

from itertools import combinations
from random import random, choices, seed

from math import log

## Introduction

In this notebook, we build and analyse a network of GitHub developers in which the nodes are developers and the edges, where they exist, are mutual follower relationships.

https://www.kaggle.com/datasets/rozemberczki/musae-github-social-network

The dataset was created with the aim of supporting the following comparative research about different social networks:

https://arxiv.org/pdf/1909.13021.pdf

In [None]:
# The edge list is expected inside an 'archive' folder under the current
# working directory.
FOLDER_PATH = Path.cwd().joinpath('archive')
EDGES_FILE_PATH = FOLDER_PATH.joinpath('musae_git_edges.csv')

## Randomly limiting the Data

Given that the dataset exceeded the maximum size allowed for this project, the data was limited by randomly selecting node indexes. The selected list is converted to a set so that the membership test performed for every line while parsing the file runs in constant time on average.

In [None]:
# Fix the RNG seed so the same node ids are drawn on every run.
seed(10)

# Candidate node ids: 0 through 37700 inclusive.
listaNodesIndex = range(37701)

# NOTE: `choices` samples WITH replacement, so these draws can repeat ids
# and the set built below may hold fewer distinct ids than `k`.
chosenNodes = choices(listaNodesIndex, k=int(37700 / 1.2))

# A set gives average O(1) membership tests when filtering the edge file.
chosenNodesSet = {*chosenNodes}

## Understanding the data

An extensive social network of GitHub developers was collected from the public API in June 2019. Nodes are developers who have starred at least 10 repositories, and edges are mutual follower relationships between them. The vertex features are extracted based on the location, repositories starred, employer and e-mail address. The task related to the graph is binary node classification - one has to predict whether the GitHub user is a web or a machine learning developer. This targeting feature was derived from the job title of each user.

(Text extracted from https://www.kaggle.com/datasets/gitanjali1425/github-social-network-graph-dataset)

## Creating functions to build the network

In [None]:
def get_or_add_vertex(g, id):
    """Return the vertex of ``g`` registered under ``id``, adding it if absent.

    ``g.vertex_by_id(id)`` is tried first; only when it returns ``None`` is
    ``g.add_vertex_by_id(id)`` called, and its result is returned instead.
    """
    existing = g.vertex_by_id(id)
    if existing is not None:
        return existing
    return g.add_vertex_by_id(id)

In [None]:
def get_or_add_edge(g, author_a, author_b):
    """Return the edge of ``g`` between ids ``author_a`` and ``author_b``.

    ``g.edge_by_ids`` is tried first; only when it returns ``None`` is the
    edge created with ``g.add_edge_by_ids`` and that new edge returned.
    """
    existing = g.edge_by_ids(author_a, author_b)
    if existing is not None:
        return existing
    return g.add_edge_by_ids(author_a, author_b)

## Reading the data and building the network

In [None]:
# Start from an empty undirected graph: a mutual-follower relationship has
# no direction.
g = gte.Graph(directed=False)

In [None]:
# Stream the edge list line by line and keep only the edges whose two
# endpoints were both selected in chosenNodesSet.
# An explicit encoding keeps parsing independent of the platform default.
with open(EDGES_FILE_PATH, encoding='utf-8') as file:

    # Skip the first line (presumably the CSV header). The default value
    # avoids a StopIteration if the file is empty.
    next(file, None)

    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which would otherwise make int() fail.
        if not line.strip():
            continue

        parts = line.split(',')

        # int() ignores surrounding whitespace, including the line ending.
        author_a = int(parts[0])
        author_b = int(parts[1])

        if author_a in chosenNodesSet and author_b in chosenNodesSet:
            # Both endpoints are created before the edge that joins them.
            get_or_add_vertex(g, author_a)
            get_or_add_vertex(g, author_b)
            get_or_add_edge(g, author_a, author_b)

In [None]:
# Rebind g to whatever gte.clean returns for the graph built above.
# NOTE(review): what gte.clean removes or normalises is not visible here;
# confirm against graph_tool_extras.
g = gte.clean(g)

In [None]:
g

In [None]:
# Save the cleaned graph (no layout yet) to 'github_social.net.gz'.
gte.save(g, 'github_social.net.gz')

## Configuring the layout and rendering the network

In [None]:
from graph_tool import draw
import netpixi

In [None]:
# Compute vertex positions with graph-tool's SFDP layout, using its
# default parameters.
layout = draw.sfdp_layout(g)

In [None]:
# NOTE(review): presumably applies the computed positions to g's vertices
# so they are stored with the graph; confirm against graph_tool_extras.
gte.move(g, layout)

In [None]:
# Save again under a separate file name, after the layout step, so the
# earlier 'github_social.net.gz' file is left as it was.
gte.save(g, 'github_social_layout.net.gz')

In [None]:
# Render the saved file with netpixi; the returned handle r is used by the
# following cells to restyle vertices and edges.
# NOTE(review): the effect of infinite=True is not visible here; confirm
# in the netpixi documentation.
r = netpixi.render('github_social_layout.net.gz', infinite=True)

## Improving network visualization

In [None]:
# Default appearance for vertices in the rendering.
# NOTE(review): colours read as 0xRRGGBB and the "b" prefix read as
# "border" are assumptions; confirm in the netpixi documentation.
r.vertex_default(
    size=1,
    color=0xff7700,   # orange, if interpreted as 0xRRGGBB
    bwidth=0.2,
    bcolor=0x0000ff,  # blue, if interpreted as 0xRRGGBB
)

In [None]:
# Default appearance for edges in the rendering.
# NOTE(review): presumably curve1=curve2=0 draws straight edges; confirm in
# the netpixi documentation.
r.edge_default(
    width=0.2,
    color=0xffffff,  # white, if interpreted as 0xRRGGBB
    curve1=0,
    curve2=0,
)

## Calculating Density and Transitivity

In [None]:
# Display the graph's density as the cell output.
# NOTE(review): presumably existing edges divided by possible edges;
# confirm the exact definition in graph_tool_extras.
g.density()

In [None]:
# Display the graph's transitivity as the cell output.
# NOTE(review): presumably the global clustering coefficient; confirm the
# exact definition in graph_tool_extras.
g.transitivity()

## Analysing Degree Distribution

In [None]:
# Collect the total degree of the graph's vertices.
# NOTE(review): the later describe()/hist() calls suggest a pandas-like
# Series is returned; confirm.
degrees = g.get_total_degrees()

In [None]:
degrees.describe()

In [None]:
degrees.hist();

In [None]:
# NOTE(review): presumably tests whether the degree distribution departs
# from a normal distribution; confirm the return value's meaning in the
# `distribution` module.
dst.not_normal(degrees)

In [None]:
# NOTE(review): presumably compares a power-law fit of the degrees against
# a log-normal fit; confirm in the `distribution` module.
dst.more_powerlaw_than_lognormal(degrees)

In [None]:
# NOTE(review): presumably compares a power-law fit of the degrees against
# an exponential fit; confirm in the `distribution` module.
dst.more_powerlaw_than_exponential(degrees)

## Analyzing Distance Distribution

In [None]:
# NOTE(review): presumably the shortest-path distances between vertex
# pairs, which could be slow on a graph of this size; confirm.
distances = g.get_distances()

In [None]:
distances.describe()

In [None]:
distances.hist();

In [None]:
# Natural logarithm of the number of vertices.
# NOTE(review): presumably a reference value to compare with the distance
# statistics above; confirm the intended interpretation.
log(g.num_vertices())