# Building and analysing a paper co-authorship network

In [5]:
import graph_tool_extras as gte
from pathlib import Path

## Introduction

This notebook builds a co-authorship network of authors of condensed-matter papers. It uses the data file referenced in the cell below, which comes from a Stanford Network Analysis Project dataset available at https://networks.skewed.de/net/arxiv_authors#CondMat_draw.

In [12]:
# Location of the raw edge list, resolved against the current working directory.
# NOTE(review): the file name reads like the SNAP web-Google graph, while the
# notebook text describes cond-mat co-authorship — confirm this is the intended file.
FOLDER_PATH = Path.cwd().joinpath('archive')
EDGES_FILE_PATH = FOLDER_PATH.joinpath('web-Google.txt')

## Understanding the data

| Concept of vertices                                                                                                 | Concept of edges                                                                                                                             | Operationalization of vertices                                                                                                                                                                                                                                                           | Operationalization of edges                                                                                                                                                                                                                                                  |
|---------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Author of a paper | An edge between i and j indicates co-authorship between i and j. | Each vertex represents an author of a paper in the cond-mat category on the arXiv platform. The data was collected from January 1993 to April 2003. | An edge between authors i and j indicates a co-authorship relationship between them. When k other authors also contributed to the paper, each of those k authors is connected to both i and j, as well as to one another, through edges. |


## Creating functions to build the network

In [13]:
def get_or_add_vertex(g, id):
    """Return the vertex of ``g`` identified by ``id``, creating it when absent.

    ``g.vertex_by_id(id)`` is tried first; only when it yields ``None`` is
    ``g.add_vertex_by_id(id)`` called, and its result is returned instead.
    """
    vertex = g.vertex_by_id(id)
    if vertex is not None:
        return vertex
    return g.add_vertex_by_id(id)

In [14]:
def get_or_add_edge(g, author_a, author_b):
    """Return the edge of ``g`` from ``author_a`` to ``author_b``, creating it when absent.

    ``g.edge_by_ids(author_a, author_b)`` is tried first; only when it yields
    ``None`` is ``g.add_edge_by_ids(author_a, author_b)`` called, and its
    result is returned instead.
    """
    edge = g.edge_by_ids(author_a, author_b)
    if edge is not None:
        return edge
    return g.add_edge_by_ids(author_a, author_b)

## Reading the data and building the network

In [20]:
# Empty directed graph; the loading cell below fills it from the edge list.
# NOTE(review): co-authorship is a symmetric relation — confirm directed=True is intended.
g = gte.Graph(directed=True)

In [30]:
# Read the tab-separated edge list: every data line is "<id of a>\t<id of b>".
N_HEADER_LINES = 4  # leading lines of the file skipped before edge data is read

with open(EDGES_FILE_PATH) as file:
    # Skip the header. The default passed to next() keeps a file shorter than
    # the header from raising StopIteration.
    for skipped in range(N_HEADER_LINES):
        next(file, None)

    for line in file:
        # Strip every field so the trailing newline (or stray whitespace)
        # never ends up inside a vertex id.
        fields = [field.strip() for field in line.split('\t')]

        # A blank or truncated line carries no usable pair; skip it instead of
        # failing with an IndexError partway through the file.
        if len(fields) < 2 or not fields[0] or not fields[1]:
            continue

        author_a, author_b = fields[0], fields[1]

        # The get_or_add_* helpers reuse existing vertices/edges, so running
        # this cell again does not duplicate anything already in g.
        get_or_add_vertex(g, author_a)
        get_or_add_vertex(g, author_b)
        get_or_add_edge(g, author_a, author_b)

['0', '11342\n']
['11342']


In [41]:
# Rebind g to whatever graph_tool_extras' clean() returns for the loaded graph.
# NOTE(review): because g is overwritten, re-running this cell cleans an
# already-cleaned graph — presumably harmless, confirm against gte.clean.
g = gte.clean(g)

In [42]:
# Save the cleaned graph (before any layout is computed) under a relative file name.
gte.save(g, 'cond_mat_authors.net.gz')

## Configuring the layout and rendering the network

In [43]:
from graph_tool import draw
import netpixi

In [44]:
# Compute vertex positions for g with graph-tool's sfdp_layout.
# NOTE(review): no random seed is set in the visible cells, so the positions
# may differ from run to run — confirm whether reproducibility matters here.
layout = draw.sfdp_layout(g)

In [45]:
# Hand the computed layout to graph_tool_extras' move(); presumably this stores
# the positions on g so the next save includes them — confirm against gte.move.
gte.move(g, layout)

In [46]:
# Save g again under a second file name; this is the file the render cell below reads.
gte.save(g, 'cond_mat_authors_layout.net.gz')

In [47]:
# Render the saved file with netpixi; r is the handle the styling cells below call.
# NOTE(review): infinite=True presumably removes the bounded canvas — confirm in the netpixi docs.
r = netpixi.render('cond_mat_authors_layout.net.gz', infinite=True)

## Improving network visualization

In [51]:
# Default appearance applied to every vertex of the rendered network.
# Colours are integer literals; the names below assume a 0xRRGGBB reading — confirm.
r.vertex_default(
    size=1,
    color=0xff0000,  # red fill
    bwidth=1,
    bcolor=0x007700,  # dark green (presumably the border colour, paired with bwidth)
)

In [52]:
# Default appearance applied to every edge of the rendered network.
# Colours are integer literals; the name below assumes a 0xRRGGBB reading — confirm.
r.edge_default(
    width=0.2,
    color=0xffffff,  # white
    curve1=0,  # both curve parameters are zero — presumably straight edges, confirm
    curve2=0,
)

## Analyzing the network