# Publication recommendation system

## 1. Read the dataset linking Wikipedia articles with publications and create a bipartite graph of the relation

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import networkx as nx
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

### Read Wikipedia references from a TSV file

In [4]:
# Directory holding the raw input files; enwiki.tsv is read from here.
base_path = '../data/raw'
# Directory for processed data.
# NOTE(review): not referenced in the cells visible in this part of the notebook — confirm it is used later.
processed_path = '../data/processed'

In [5]:
# Load the Wikipedia reference table (tab-separated) and parse `timestamp` as datetimes.
# `infer_datetime_format` is deliberately not passed: it is deprecated since pandas 2.0,
# where it has no effect and only triggers a warning.
df = pd.read_csv(
    os.path.join(base_path, 'enwiki.tsv'),
    sep='\t',
    parse_dates=['timestamp'],
)

# read_csv treats the literal text "NaN" as a missing value, so the title of the
# Wikipedia page "NaN" is loaded as a float nan; restore it as the string 'NaN'.
# NOTE(review): other default NA markers (e.g. "NA", "N/A", "NULL") are parsed as missing
# as well and would also be relabelled "NaN" here — confirm no such page titles occur.
df['page_title'] = df['page_title'].fillna("NaN")

df.head(5)

Unnamed: 0,page_id,page_title,rev_id,timestamp,type,id
0,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20078357
1,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0604502
2,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0003329
3,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,0708.1752
4,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20064946


In [6]:
# Title of a reference book; it is assigned here but not used by the query in this cell.
book_title = 'Designing Great Beers: The Ultimate Guide to Brewing Classic Beer Styles'

# Show every reference recorded for the Wikipedia page titled "Lager".
# (A lookup by publication identifier would compare the `id` column instead,
#  e.g. against '9781491957660'.)
df.loc[df['page_title'].eq('Lager')]

Unnamed: 0,page_id,page_title,rev_id,timestamp,type,id
1377870,19555312,Lager,679306283,2015-09-03 19:19:28,isbn,9780984075614
1377871,19555312,Lager,30671727,2005-12-09 03:28:04,isbn,0195154797
1377872,19555312,Lager,679305035,2015-09-03 19:12:13,isbn,0761184988
1377873,19555312,Lager,771614071,2017-03-22 15:31:18,doi,10.1073/pnas.1105430108


In [39]:
# Case-insensitive search for pages whose title contains the given text.
# regex=False treats the search text as a literal substring, so titles containing
# regex metacharacters (e.g. "C++" or parentheses) can be searched without escaping
# and without raising a pattern error.
book_title = 'winnie the pooh'
df[df.page_title.str.contains(book_title, case=False, regex=False)]

Unnamed: 0,page_id,page_title,rev_id,timestamp,type,id
270399,6284586,Winnie the Pooh and Christmas Too,450818249,2011-09-16 15:10:40,isbn,0760756341
3647176,1495388,The Many Adventures of Winnie the Pooh,264717192,2009-01-17 19:13:13,isbn,0025839004
3647177,1495388,The Many Adventures of Winnie the Pooh,268524905,2009-02-04 19:11:37,isbn,157806712X
3647178,1495388,The Many Adventures of Winnie the Pooh,264336692,2009-01-15 22:04:53,isbn,0452259932
3647179,1495388,The Many Adventures of Winnie the Pooh,268524905,2009-02-04 19:11:37,isbn,0896592324


### Create a directed bipartite graph of references from Wikipedia pages to publications

**Create a bipartite graph connecting wiki pages and publications**

In [43]:
# Make the project's source directory importable, then import GraphRank from the
# project module that provides functions for reading data from Wikipedia and for
# working with the graph-based recommendation system.
# NOTE(review): '../src' is a relative path, so this presumably relies on the kernel's
# working directory being the notebook's own folder — confirm.
sys.path.append('../src')
from recomm.graph_rank import GraphRank

Create a GraphRank object - a graph-based model for publication recommendation.

In [44]:
# Instantiate the recommender; no constructor arguments are passed.
gr = GraphRank()

In [45]:
# Build the graph from the reference table. The string arguments are names of df
# columns: page title, page id, reference type and reference id.
# NOTE(review): how each positional argument is interpreted is defined by
# GraphRank.build_graph, which is not shown here — confirm the expected order.
gr.build_graph(df, 'page_title', 'page_id', 'type', 'id')

In [46]:
# Inspect the attributes stored on the 'Lager' page node.
# `Graph.node` was deprecated and then removed in NetworkX 2.4; `Graph.nodes[...]` is
# the supported accessor (available since NetworkX 2.0) and returns the same attribute dict.
# NOTE(review): assumes gr.G is a NetworkX graph — confirm in GraphRank.
gr.G.nodes['Lager']

{'address': '/wiki/Lager',
 'bipartite': 'web_page',
 'depth': 0,
 'pid': 19555312,
 'ptype': 'topic'}

**Tests of the recommendation system**

In [None]:
# Query recommendations for a publication given as a (type, id) tuple. This ISBN appears
# among the references of "The Many Adventures of Winnie the Pooh" in the table above.
# NOTE(review): the second argument (10) is presumably the number of results requested — confirm.
gr.find_most_relevant(('isbn','0025839004'), 10)

Original publication: ('isbn', '0025839004') 
Title: Forbidden 


7 pages referring to the publication:
 ['Time Cat: The Remarkable Journeys of Jason and Gareth', 'Genevieve Foster', 'Jean Ingelow', 'The Chronicles of Prydain', 'Padraic Colum', 'The Princess and the Goblin', 'The Many Adventures of Winnie the Pooh'] 


Rank: 1 
Citations: 1
ID: ('isbn', '0826415164')
Source: https://isbnsearch.org/isbn/0826415164
Title: Forbidden 

Rank: 2 
Citations: 1
ID: ('isbn', '0395653800')
Source: https://isbnsearch.org/isbn/0395653800
Title: Forbidden 

Rank: 3 
Citations: 1
ID: ('isbn', '157806712X')
Source: https://isbnsearch.org/isbn/157806712X
Title: Forbidden 

Rank: 4 
Citations: 1
ID: ('isbn', '0452259932')
Source: https://isbnsearch.org/isbn/0452259932
Title: Forbidden 

Rank: 5 
Citations: 1
ID: ('isbn', '0896592324')
Source: https://isbnsearch.org/isbn/0896592324
Title: Forbidden 

Number of categories: 107


In [None]:
# Query recommendations for a publication identified by DOI, passed as a (type, id) tuple.
# NOTE(review): the second argument (10) is presumably the number of results requested — confirm.
gr.find_most_relevant(('doi', '10.1021/j100308a038'), 10)