# Network analysis

## 1. Read dataset linking wikipedia articles with publications and create a bipartite graph

In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
base_path = '../data/raw'
processed_path = '../data/processed'

In [3]:
# read TSV data
df = pd.read_csv(os.path.join(base_path,'enwiki.tsv'), sep='\t', parse_dates=['timestamp'],infer_datetime_format=True)

# Convert mistakenly converted type nan to string 'NaN' (wikipedia page name)
df.page_title = df.page_title.fillna("NaN")

df.head(5)

Unnamed: 0,page_id,page_title,rev_id,timestamp,type,id
0,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20078357
1,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0604502
2,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0003329
3,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,0708.1752
4,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20064946


**Create a bipartite graph**

Prepare lists of nodes and edges from the dataframe

In [12]:
# list of unique web page ids (web_page nodes)
wp_ids = df.page_id.unique()

# list of unique web page names
wp_titles = df.page_title.unique()

# list of unique publications (publication nodes)
pub_ids = df.id.unique()

# list of references (edges)
edges = [(page, pub) for page, pub in zip(wp_titles, pub_ids)]

In [47]:
wp_node_list = [(title, {'bipartite':'web_page', 'pid': pid,'address':'/wiki/'+title.replace(' ','_'), 'ptype':'topic','depth':0}) for pid, title in zip(wp_ids, wp_titles)]

In [48]:
pub_node_list = [(pub_id, {'bipartite':'publication'}) for pub_id in pub_ids]

Create a bipartite directed graph 

In [49]:
G = nx.DiGraph()
G.add_nodes_from(wp_node_list)#, bipartite='web_page')
G.add_nodes_from(pub_node_list)#, bipartite='publication')
G.add_edges_from(edges)

Make lists of web_page and publication nodes (same as above but, now extracted directly from the graph)

In [50]:
wpage_nodes = [node for node, data in G.nodes(data=True) if data['bipartite']=='web_page']
pub_nodes = [node for node, data in G.nodes(data=True) if data['bipartite']=='publication']

Calculate bipartite degree centrality

In [7]:
dcent = nx.bipartite.degree_centrality(G, wpage_nodes)

Assign degree of centrality to the two partitions and rank nodeds within each partition

In [8]:
# webpage ranking
wpage_dcent = [(node, dcent[node]) for node in wpage_nodes]
wpage_rank = sorted(wpage_dcent, key=lambda x: x[1], reverse=True)

# publication ranking
pub_dcent = [(node, dcent[node]) for node in pub_nodes]
pub_rank = sorted(pub_dcent, key=lambda x: x[1], reverse=True)

## 2. Create category hierarchy starting at articles based on Wikipedia scraping

Add category pages to the graph

In [57]:
import requests
import bs4
from collections import deque

Define a queue for pages to be explored in a breadth-first search

In [52]:
wpage_nodes = [(node, data) for node, data in G.nodes(data=True) if data['bipartite']=='web_page']

# Add all base pages to the queue
queue = deque(wpage_nodes)

Also define a set of page addresses for quick search of node presence.

In [53]:
node_titles = set([node for node, data in G.nodes(data=True) if data['bipartite']=='web_page'])

Base URLs for wiki pages and wiki API

In [54]:
wiki_url = 'http://en.wikipedia.org'
api_query = 'https://en.wikipedia.org/w/api.php?action=query&format=json&titles='

Define a function for scraping wiki pages for categories

In [55]:
def get_cats(address):
    """
    Accept a wiki page element and return a list of category elements found on the page.
    """

    r = requests.get(wiki_url+address)
    
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    
    cats = soup.select('#mw-normal-catlinks ul li a')
    
    return cats

Cycle over queue and append new nodes and edges to the graph

In [None]:
it = 0
while len(queue) > 0:
    
    # pop the first node in queue
    node = queue.popleft()
    
    # get the address of the page
    address = node[1]['address']
    depth = node[1]['depth']
    
    # print each 100 cycles
    if it%100 == 0:
        print('Iteration:', it, len(queue), len(G.nodes()), len(G.edges()), node[0], node[1]['ptype'])
    it += 1
    
    if depth < 3:
    
        # scrape the page and return tags for categories
        cats = get_cats(address)
    
        # create nodes for the categories (if new)
        for c in cats:
        
            # get category page title (== node identification)
            title = c.get_text()
        
            # create a new edge
            G.add_edge(title, node[0])
        
            if title not in node_titles:
            
                # get referenced page title
                title = c.get_text()
            
                # get category page address
                cat_ref = c.get('href')
            
                # create a new node tuple
                new_depth = depth + 1
                new_node = (title, {'address':cat_ref, 'ptype':'category', 'depth':new_depth})
                #print(new_node)
            
                # create a new node
                G.add_node(title, address=cat_ref, ptype='category')
 
                # update queue and node_titles set
                queue.append(new_node)
                node_titles.add(title)

Iteration: 0 857734 3048498 857765 Industrial techno topic
Iteration: 100 858218 3049082 858421 William White (architect) topic
Iteration: 200 858617 3049581 858998 Cavendish banana topic
Iteration: 300 859000 3050064 859595 Lawfare topic
Iteration: 400 859360 3050524 860188 Accademia di San Luca topic


## 3. Analyze the network - degree centrality for page/category subgraphs

In [9]:
def create_subgraph(G, node):
    nodes = nx.single_source_shortest_path(G,node).keys()
    return G.subgraph(nodes)