# Publication recommendation system

## 1. Read dataset linking wikipedia articles with publications and create a bipartite graph of the relation

In [5]:
import os
import numpy as np
import pandas as pd
import networkx as nx
%matplotlib inline
import matplotlib.pyplot as plt

Read wikipedia references data from a TSV file

In [6]:
# directory holding the raw input data (enwiki.tsv is read from here)
base_path = '../data/raw'
# presumably the directory for derived data; not used by the cells visible in this notebook
processed_path = '../data/processed'

In [7]:
# Read the TSV of wikipedia references (one row per reference).
# Only empty fields are treated as missing: with pandas' default NA markers, page titles
# or publication ids that literally read 'NaN', 'NA', 'null', ... are turned into float nan.
# `infer_datetime_format` is left out because it is deprecated since pandas 2.0.
df = pd.read_csv(
    os.path.join(base_path, 'enwiki.tsv'),
    sep='\t',
    parse_dates=['timestamp'],
    keep_default_na=False,
    na_values=[''],
)

# A title that is still missing was an empty field in the file; as before, map it back
# to the string 'NaN' (the name of a wikipedia page)
df['page_title'] = df['page_title'].fillna("NaN")

df.head(5)

Unnamed: 0,page_id,page_title,rev_id,timestamp,type,id
0,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20078357
1,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0604502
2,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0003329
3,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,0708.1752
4,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20064946


Base URLs for wiki pages and wiki API

In [8]:
# Base address of the English wikipedia; page addresses such as '/wiki/Title' are appended to it.
# https is used (like the two URLs below) instead of plain http.
wiki_url = 'https://en.wikipedia.org'
# wikipedia API query prefix; a page title is expected to be appended after 'titles='
api_query = 'https://en.wikipedia.org/w/api.php?action=query&format=json&titles='
# address of the Special:CategoryTree form
cat_tree = 'https://en.wikipedia.org/wiki/Special:CategoryTree'

**Create a bipartite graph connecting wiki pages and publications**

Prepare lists of nodes and edges from the dataframe

In [9]:
# unique web page ids (web_page nodes)
wp_ids = df['page_id'].unique()

# unique web page names, plus a set of them for fast membership checks
wp_titles = df['page_title'].unique()
wp_set = set(wp_titles)

# (type, id) pair of the publication referenced in every row (publication nodes);
# pub_set holds the distinct pairs
pub_ids = list(zip(df['type'], df['id']))
pub_set = set(pub_ids)

# references (edges): page title -> (type, id) of the referenced publication
edges = list(zip(df['page_title'], pub_ids))

In [10]:
# One node per distinct page title. The (id, title) pairs are taken from the same rows
# rather than by zipping two independently de-duplicated arrays, which silently mis-pairs
# ids and titles as soon as they are not in a strict one-to-one relation.
wp_pages = df[['page_id', 'page_title']].drop_duplicates(subset='page_title')

wp_node_list = [
    (title, {'bipartite': 'web_page',
             'pid': pid,
             'address': '/wiki/' + title.replace(' ', '_'),
             'ptype': 'topic',
             'depth': 0})
    for pid, title in zip(wp_pages['page_id'].values, wp_pages['page_title'].values)
]

In [11]:
# One node per distinct publication, in first-seen order.
# pub_ids has one entry per reference row, so it is de-duplicated here first.
pub_node_list = [(pub_id, {'bipartite': 'publication'}) for pub_id in dict.fromkeys(pub_ids)]

Create a directed graph (the topic pages and publications form a bipartite graph)

In [12]:
# Directed graph: every edge points from a wiki page to a publication it references
G = nx.DiGraph()
for node_batch in (wp_node_list, pub_node_list):
    G.add_nodes_from(node_batch)
G.add_edges_from(edges)

## 2. Create category hierarchy starting at articles based on Wikipedia scraping

Add category pages to the graph

In [13]:
import requests
import bs4
from collections import deque

Function to obtain a link and (possibly) title to a publication with a given ID

In [14]:
def pub_info(pub_id, timeout=10):
    """
    Return a link to, and (if it can be scraped) the title of, the publication with given pub_id

    Parameters
    ----------
    pub_id: tuple (str, str)
            publication id type (doi, arxiv, isbn, pmid, pmc) and number
    timeout: float
             seconds to wait for the web server before giving up on the title

    Returns
    -------
    link: str
          link to a web page with publication information
    title: str
           publication title or 'N/A' if not available

    Raises
    ------
    KeyError
        if the id type is not one of the supported types
    """

    id_type, id_number = pub_id[0], pub_id[1]

    # urls for publications with different types of ids
    urls = {'doi':'https://doi.org/',
            'arxiv':'https://arxiv.org/abs/',
            'isbn':'https://isbnsearch.org/isbn/',
            'pmid':'https://www.ncbi.nlm.nih.gov/pubmed/',
            'pmc':'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC'
           }

    # NOTE(review): pmc ids are assumed to be stored as bare digits; a leading 'PMC'
    # is dropped so that both forms give the same link - confirm against the dataset
    if id_type == 'pmc' and id_number.upper().startswith('PMC'):
        id_number = id_number[3:]

    link = urls[id_type] + id_number

    # try to download the page; without a timeout a stalled server would block forever
    try:
        r = requests.get(link, timeout=timeout)
    except requests.exceptions.RequestException:
        return link, 'N/A'

    # try to find the title of a publication; if not successful, return N/A.
    # The title is read from an <h1> tag: the last one on arxiv pages, the first one elsewhere.
    try:
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        headings = soup.select('h1')
        heading = headings[-1] if id_type == 'arxiv' else headings[0]
        title = heading.get_text()
    except Exception:
        # best effort (e.g. page without <h1>); unlike a bare except this
        # does not swallow KeyboardInterrupt
        title = 'N/A'

    return link, title

In [15]:
def get_cats(address):
    """
    Download a wikipedia page and return the category links found on it.

    Parameters
    ----------
    address: str
             Address of a wikipedia web page (without the wiki_url base)

    Returns
    -------
    cats: List of BeautifulSoup tag objects
          Anchor tags inside the '#mw-normal-catlinks' element (the not-hidden
          categories), for subsequent processing
    """

    page_html = requests.get(wiki_url+address).text

    # the links to the categories are list items of the normal-catlinks box
    return bs4.BeautifulSoup(page_html, 'html.parser').select('#mw-normal-catlinks ul li a')

Function to find categories of selected pages and add them to a given graph.
Returns the updated graph

In [16]:
# (name, attribute dict) of every node marked as a web page
wpage_nodes = [
    (name, attrs)
    for name, attrs in G.nodes(data=True)
    if attrs['bipartite'] == 'web_page'
]

In [17]:
# titles of all web_page nodes, kept in a set for fast membership tests
node_titles = {
    name
    for name, attrs in G.nodes(data=True)
    if attrs['bipartite'] == 'web_page'
}

In [21]:
def add_cats(G):
    """
    Scrape the categories of every web page node of G and add them to the graph.

    For each node whose 'bipartite' attribute is 'web_page', the page at the node's
    'address' attribute is scraped with get_cats(). Every category found becomes a new
    node (attributes: address, ptype='category') unless G already has a node with that
    title, and an edge pointing from the category to the page is added.

    Parameters
    ----------
    G: networkx graph object
       graph with web_page nodes; it is modified in place, so it must not be a
       read-only subgraph view

    Returns
    -------
    G: the same graph object with category nodes and edges added
    """

    # Snapshot the page nodes first, because nodes are added to G inside the loop.
    # .get() skips nodes without a 'bipartite' attribute (e.g. categories added earlier).
    wpage_nodes = [(node, data) for node, data in G.nodes(data=True)
                   if data.get('bipartite') == 'web_page']

    for page_title, page_data in wpage_nodes:

        # scrape the page and return tags for categories
        cats = get_cats(page_data['address'])

        for c in cats:

            # get category page title (== node identification)
            # NOTE(review): a category named exactly like an existing page shares that
            # page's node, because nodes are identified by title only
            title = c.get_text()

            # create a node for the category (if new); membership is checked on the
            # graph itself, so the function works for any graph passed in
            if title not in G:
                G.add_node(title, address=c.get('href'), ptype='category')

            # create a new edge pointing from higher level category to the present page
            G.add_edge(title, page_title)

    return G

Go over category nodes and find the topic articles derived from them using the category tree form.
The form is queried by web scraping.

In [49]:
def get_cat_children(cat_name):
    """
    Obtains wikipedia articles that are children of a given category

    The Special:CategoryTree form is requested for the category and the links to
    article pages are collected from the returned html.

    Parameters
    ----------
    cat_name: str
              Category name

    Returns
    -------
    article_list: list of BeautifulSoup tag objects
                  anchor tags with class 'CategoryTreeLabelPage'
    """

    cat_tree = 'https://en.wikipedia.org/wiki/Special:CategoryTree'

    # Let requests build the query string: it encodes spaces as '+' like the manual
    # version did, and also escapes characters such as '&', '#' or non-ASCII letters
    # that would otherwise corrupt the query.
    params = {'target': cat_name,
              'mode': 'all',
              'namespaces': '0',
              'title': 'Special:CategoryTree'}

    r = requests.get(cat_tree, params=params)

    soup = bs4.BeautifulSoup(r.text, 'html.parser')

    article_list = soup.find_all('a', {'class':'CategoryTreeLabelPage'})

    return article_list

In [50]:
# get_cat_children is the function defined above (get_cat_tree does not exist)
article_list = get_cat_children('Quantum Monte Carlo')

In [51]:
# show the title and the relative address of every article found
for link_tag in article_list:
    link_title = link_tag.get_text()
    link_address = link_tag.get('href')
    print(link_title)
    print(link_address,'\n')

Quantum Monte Carlo
/wiki/Quantum_Monte_Carlo 

Auxiliary-field Monte Carlo
/wiki/Auxiliary-field_Monte_Carlo 

CASINO
/wiki/CASINO 

Diffusion Monte Carlo
/wiki/Diffusion_Monte_Carlo 

Gaussian quantum Monte Carlo
/wiki/Gaussian_quantum_Monte_Carlo 

Path integral molecular dynamics
/wiki/Path_integral_molecular_dynamics 

Path integral Monte Carlo
/wiki/Path_integral_Monte_Carlo 

Reptation Monte Carlo
/wiki/Reptation_Monte_Carlo 

Time-dependent variational Monte Carlo
/wiki/Time-dependent_variational_Monte_Carlo 

Variational Monte Carlo
/wiki/Variational_Monte_Carlo 



Function to rank publications in a given subgraph by their bipartite degree centrality.

In [19]:
def rank_publications(G):
    """
    Calculate bipartite degree centrality ranking of publications within a given graph G.

    Parameters
    ----------
    G: networkx graph object
       usually a subgraph of the main graph containing publication nodes (bipartite=publication)

    Returns
    -------
    pub_rank: list of tuples (graph node, degree centrality) ordered by degree centrality,
              highest first; empty if G has no publication nodes
    """

    # Select publication and topic page nodes.
    # .get() skips nodes without a 'bipartite' attribute (e.g. category nodes
    # created by add_cats) instead of raising KeyError.
    pub_nodes = [node for node, data in G.nodes(data=True) if data.get('bipartite') == 'publication']
    wpage_nodes = [node for node, data in G.nodes(data=True) if data.get('bipartite') == 'web_page']

    # Degree centrality divides by the size of the opposite node set, so the
    # degenerate cases are handled before calling networkx.
    if not pub_nodes:
        return []
    if not wpage_nodes:
        # no page can reference the publications, so they are all ranked with zero
        return [(node, 0.0) for node in pub_nodes]

    # Create a bipartite subgraph of topic nodes and publications
    SG = G.subgraph(pub_nodes + wpage_nodes)

    # calculate degree centrality
    dcent = nx.bipartite.degree_centrality(SG, wpage_nodes)

    # save publication nodes with their degree centrality
    pub_dcent = [(node, dcent[node]) for node in pub_nodes]

    # sort publication nodes from highest to lowest
    pub_rank = sorted(pub_dcent, key=lambda x: x[1], reverse=True)

    return pub_rank

Function to find the N most highly referenced publications related to the original publication

In [20]:
from collections import Counter

def find_most_relevant(G, publication, n_highest_pubs=10):
    """
    Finds and prints N most relevant publications to the original publication (based on citations)

    Relevance is the number of wiki pages referencing the original publication that
    also reference the other publication.

    Parameters
    ----------
    G: networkx directed graph
       graph with edges pointing from wiki pages to the publications they reference
    publication: tuple (str, str)
                 Identification of the original publication based on id type and number
    n_highest_pubs: int
                    number of publications to list

    Returns
    -------
    Prints list on the screen
    """

    # Nothing to recommend for a publication that is not a node of the graph
    if publication not in G:
        print('Publication', publication, 'not found in the graph.')
        return

    # Original publication and its wikipedia citations
    print('Original publication:', publication, '\nTitle:', pub_info(publication)[1],'\n\n')
    # predecessors() returns an iterator in networkx 2.x; a list is needed for
    # len(), printing and concatenation below
    page_list = list(G.predecessors(publication))
    print(len(page_list), 'pages referring to the publication:\n', page_list,'\n\n')

    # Level 1 - citing topic pages
    # Find all publications referenced by wiki pages that also cite the original publication
    infos = []
    for page in page_list:
        infos.extend(G.successors(page))

    # create a subgraph of related pages and publications;
    # copy it because subgraph() gives a read-only view in networkx 2.x and add_cats adds nodes
    SG = G.subgraph(page_list + infos).copy()

    # Add category nodes of the citing pages (scraped from wikipedia)
    add_cats(SG)

    # Level 2 - Categories from citing topic pages
    # TODO: find pages linked to the topic pages through a common category and get
    # additional publications through their descendants (exclude those in level 1).
    # The category nodes added to SG above are not used in the ranking yet.

    # Select N most highly cited publications
    pub_counts = Counter(infos)
    del pub_counts[publication]
    most_cited = pub_counts.most_common(n_highest_pubs)

    # Print information about the highly cited publications
    for i, (pub, n_citations) in enumerate(most_cited, 1):
        link, title = pub_info(pub)
        print('Rank:', i, '\nCitations:', n_citations)
        print('ID:', pub)
        print('Source:', link)
        print('Title:', title,'\n')

**Tests of the recommendation system**

* so far only links through topic wikipedia pages
* todo: add category information - important for publications with fewer linking wikipedia pages

In [25]:
# Test 1: a random astronomical publication
# (the graph G is the first argument of find_most_relevant)
find_most_relevant(G, ('arxiv', '1208.3048'), 10)

Original publication: ('arxiv', '1208.3048') 
Title: Title:
Radial velocities for the Hipparcos-Gaia Hundred-Thousand-Proper-Motion  project 


243 pages referring to the publication:
 ['8 Cancri', 'Mu1 Cancri', '12 Cancri', '28 Cancri', 'Beta Sextantis', 'Kappa Canis Majoris', 'Nu3 Canis Majoris', 'Xi2 Canis Majoris', 'Gamma Canis Minoris', 'Delta1 Canis Minoris', 'Delta3 Canis Minoris', 'Epsilon Canis Minoris', 'Zeta Canis Minoris', 'Nu Arietis', 'Omicron Arietis', 'Upsilon Tauri', 'Sigma Tauri', 'Pi Tauri', 'Psi Tauri', 'Psi Velorum', 'Upsilon2 Eridani', 'Omega Herculis', 'Theta Cassiopeiae', 'Epsilon Trianguli Australis', 'Kappa Trianguli Australis', 'Iota Trianguli Australis', 'Theta Trianguli Australis', 'Eta Trianguli Australis', 'Pi Tucanae', 'Lambda1 Tucanae', 'Lambda2 Tucanae', 'Upsilon Ursae Majoris', 'Kappa Volantis', 'Omega Ursae Majoris', 'Tau Ursae Majoris', 'Rho Ursae Majoris', '44 Andromedae', 'HR 515', 'Mu Aurigae', 'Xi Aurigae', '45 Aurigae', 'Delta Ursae Minoris', '

In [26]:
# Test 2: Publication introducing a popular molecular model
# (the graph G is the first argument of find_most_relevant)
find_most_relevant(G, ('doi', '10.1021/j100308a038'), 10)

Original publication: ('doi', '10.1021/j100308a038') 
Title: The missing term in effective pair potentials 


2 pages referring to the publication:
 ['Water model', 'Solvent models'] 


Rank: 1 
Citations: 2
ID: ('pmid', '25660403')
Source: https://www.ncbi.nlm.nih.gov/pubmed/25660403
Title: PubMed 

Rank: 2 
Citations: 2
ID: ('doi', '10.1039/C5CP00288E')
Source: https://doi.org/10.1039/C5CP00288E
Title: N/A 

Rank: 3 
Citations: 1
ID: ('doi', '10.1021/jp973084f')
Source: https://doi.org/10.1021/jp973084f
Title: All-Atom Empirical Potential for Molecular Modeling and Dynamics Studies of Proteins †   

Rank: 4 
Citations: 1
ID: ('doi', '10.1063/1.2038787')
Source: https://doi.org/10.1063/1.2038787
Title: N/A 

Rank: 5 
Citations: 1
ID: ('doi', '10.1063/1.2360276')
Source: https://doi.org/10.1063/1.2360276
Title: N/A 

Rank: 6 
Citations: 1
ID: ('doi', '10.1021/jz501780a')
Source: https://doi.org/10.1021/jz501780a
Title: Building Water Models: A Different Approach 

Rank: 7 
Citations: 1