# User's papers

Select the author's papers by running the following command in a shell (in this example the raw file is `publications-v8.txt` and the author is Gabriella Pasi):  
`awk -v RS='' -v ORS='\n\n' '/(G\. |Gabriella )Pasi\W/' publications-v8.txt > ../data/pasi_papers.txt`  
The resulting file must be located in the `../data/` folder (relative to this notebook), since that is where the cells below read it from.

In [None]:
import os, re, time
import pandas as pd

import MySQLdb

In [2]:
def list2paper(l, r_index=None, r_author=None, r_title=None, r_abstract=None, r_cite=None):
    """
    Transform a raw data paper formatted as a list into dict

    Each element of `l` is one line of a single paper record. With the default
    patterns a line is recognised by its prefix:

    - `#index` : paper identifier
    - `#@`     : comma-separated list of authors (each name is stripped)
    - `#*`     : title
    - `#!`     : abstract
    - `#%`     : identifier of one cited paper (may appear several times)

    Lines matching none of the patterns are ignored. If a single-valued field
    appears more than once, the last occurrence wins; citations accumulate.

    The optional `r_*` arguments are compiled regular expressions overriding
    the corresponding default pattern; the extracted value is group 1.

    Returns a dict with the keys 'index', 'authors', 'title', 'abstract' and
    'citations'. Missing fields keep their default (None or an empty list).
    """
    p = {'index': None, 'authors': [], 'title': None, 'abstract': None, 'citations': []}

    # Raw string literals: '\*' in a plain literal is an invalid escape sequence.
    if r_index is None:
        r_index = re.compile(r'^#index(.*)')
    if r_author is None:
        r_author = re.compile(r'^#@(.*)')
    if r_title is None:
        r_title = re.compile(r'^#\*(.*)')
    if r_abstract is None:
        r_abstract = re.compile(r'^#!(.*)')
    if r_cite is None:
        r_cite = re.compile(r'^#%(.*)')

    for s in l:
        # Every pattern is tried on every line (no elif): caller-supplied
        # patterns are allowed to overlap.
        m_index = r_index.match(s)
        if m_index is not None:
            p['index'] = m_index.group(1)

        m_author = r_author.match(s)
        if m_author is not None:
            p['authors'] = [a.strip() for a in m_author.group(1).split(',')]

        m_title = r_title.match(s)
        if m_title is not None:
            p['title'] = m_title.group(1)

        m_abstract = r_abstract.match(s)
        if m_abstract is not None:
            p['abstract'] = m_abstract.group(1)

        m_cite = r_cite.match(s)
        if m_cite is not None:
            p['citations'].append(m_cite.group(1))

    return p

In [3]:
def get_author_papers(author_name, author_slug, input_file):
    """
    Return the list of papers written by the author (list of dicts)

    The input file contains paper records separated by blank lines. Each
    record is parsed with `list2paper`, and only the papers whose author list
    contains exactly `author_name` are kept.

    `author_slug` is accepted for interface compatibility but is not used.

    Returns a tuple `(author_papers, papers_without_abstract)`: the author's
    papers that have an abstract, and the author's papers that do not.
    """
    # Split the file into a list of lists (each sublist is a paper)
    papers = []
    with open(input_file, 'r') as f:
        p = []
        for l in f:
            if l.strip() != '':
                p.append(l)
            elif p:
                # A blank line closes the current record; consecutive blank
                # lines do not create empty records.
                papers.append(p)
                p = []
        # Keep the last record even when the file does not end with a blank line
        if p:
            papers.append(p)

    author_papers = []
    papers_without_abstract = []

    for p in (list2paper(l) for l in papers):
        if author_name not in p['authors']:
            continue
        if p['abstract'] is not None:
            author_papers.append(p)
        else:
            papers_without_abstract.append(p)

    return author_papers, papers_without_abstract

In [4]:
# Parse the pre-filtered dump and keep Gabriella Pasi's papers that have an
# abstract; the second return value (her papers without abstract) is discarded.
author_papers, _ = get_author_papers('Gabriella Pasi', 'pasi', '../data/pasi_papers.txt')

# Generate citations file

In [5]:
def generate_citations_file(author_papers, author_slug):
    """
    Write the citation relations of the given papers to a CSV file

    `author_papers` is a list of paper dicts providing the keys 'index' and
    'citations'. One row `(citing, cited)` is written per citation to
    `../data/<author_slug>_citations.csv`, preceded by a `citing,cited` header.
    When there is no citation at all, the file contains only the header.
    """
    citations = [[p['index'], c] for p in author_papers for c in p['citations']]

    # Naming the columns on the frame (rather than through `header=`) keeps
    # to_csv working when `citations` is empty.
    citations_df = pd.DataFrame(citations, columns=['citing', 'cited'])
    citations_df.to_csv(os.path.join('..', 'data', author_slug + '_citations.csv'), index=False)

In [6]:
# Write the (citing, cited) pairs of the author's papers to ../data/pasi_citations.csv
generate_citations_file(author_papers, 'pasi')

# Get cited papers

In [7]:
def get_cited_papers(cited):
    """
    Fetch from the database the cited papers that have a non-empty abstract

    `cited` is a sized sequence of paper ids (list, tuple, NumPy array...).

    Returns the `(id, title, abstract)` rows as given by `cursor.fetchall()`,
    or an empty tuple when `cited` is empty.
    """
    # An empty id list would produce the invalid SQL "IN ()"; nothing to fetch.
    # len() is used rather than truthiness so NumPy arrays are accepted too.
    if len(cited) == 0:
        return ()

    # NOTE(review): connection settings are hard-coded for a local database.
    db = MySQLdb.connect(user='root', passwd='root', db='dblp-v8')
    try:
        c = db.cursor()
        # One "%s" placeholder per id; the values are passed separately so the
        # driver takes care of escaping them.
        placeholders = ','.join(["%s"] * len(cited))
        c.execute("SELECT id, title, abstract FROM papers p WHERE p.abstract != '' AND p.id IN (" + placeholders + ")", tuple(cited))
        return c.fetchall()
    finally:
        # Always release the connection, even if the query fails
        db.close()

In [8]:
# Reload the citation pairs from the CSV file, then fetch from the database
# every distinct cited paper that has a non-empty abstract.
citations = pd.read_csv('../data/pasi_citations.csv')
cited = citations['cited'].unique()
cited_papers = get_cited_papers(cited)

# Non-relevant papers

In [9]:
def get_bad_papers(input_file):
    """
    Return the list of "bad" papers written (list of dicts)

    The input file contains paper records separated by blank lines. Each
    record is parsed with `list2paper`; only the papers that have an abstract
    are returned.
    """
    # Split the file into a list of lists (each sublist is a paper)
    papers = []
    with open(input_file, 'r') as f:
        p = []
        for l in f:
            if l.strip() != '':
                p.append(l)
            elif p:
                # A blank line closes the current record; consecutive blank
                # lines do not create empty records.
                papers.append(p)
                p = []
        # Keep the last record even when the file does not end with a blank line
        if p:
            papers.append(p)

    parsed = (list2paper(l) for l in papers)
    return [p for p in parsed if p['abstract'] is not None]

In [10]:
# Load the "bad" (non-relevant) papers that have an abstract
bad_papers = get_bad_papers('../data/bad_papers.txt')

In [11]:
def get_bad_cited_papers(bad_papers):
    """
    Fetch from the database the papers cited by the "bad" papers

    `bad_papers` is a list of paper dicts providing the key 'citations'.
    Only cited papers with a non-empty abstract are selected.

    Returns the `(id, title, abstract)` rows as given by `cursor.fetchall()`,
    or an empty tuple when the bad papers cite nothing.
    """
    # Distinct cited ids, in order of first appearance
    seen = set()
    cited = []
    for p in bad_papers:
        for c in p['citations']:
            if c not in seen:
                seen.add(c)
                cited.append(c)

    # An empty id list would produce the invalid SQL "IN ()"; nothing to fetch
    if not cited:
        return ()

    # NOTE(review): connection settings are hard-coded for a local database.
    db = MySQLdb.connect(user='root', passwd='root', db='dblp-v8')
    try:
        c = db.cursor()
        # One "%s" placeholder per id; the values are passed separately so the
        # driver takes care of escaping them.
        placeholders = ','.join(["%s"] * len(cited))
        c.execute("SELECT id, title, abstract FROM papers p WHERE p.abstract != '' AND p.id IN (" + placeholders + ")", tuple(cited))
        return c.fetchall()
    finally:
        # Always release the connection, even if the query fails
        db.close()

In [12]:
# Fetch from the database the papers cited by the bad papers (non-empty abstract only)
bad_cited_papers = get_bad_cited_papers(bad_papers)

In [13]:
# Turn the (id, title, abstract) rows into dicts using the same keys as the parsed papers
bad_cited_papers = [{'index': p[0], 'title': p[1], 'abstract': p[2]} for p in bad_cited_papers]

# Dataset generation

In [14]:
def format_data(author_papers, citations, cited_papers, bad_papers, bad_cited_papers):
    """
    Convert the collected data into tuples of plain tuples

    - `author_papers`: list of paper dicts (keys 'index', 'title', 'abstract')
    - `citations`: pandas DataFrame whose first two columns are the citing
      and the cited paper
    - `cited_papers`: returned unchanged
    - `bad_papers`, `bad_cited_papers`: lists of paper dicts, concatenated

    Returns `(a_papers, cites, cited_papers, b_papers)` where `a_papers` and
    `b_papers` are tuples of `(index, title, abstract)` and `cites` is a
    tuple of `(citing, cited)` pairs.
    """
    a_papers = tuple([(p['index'], p['title'], p['abstract']) for p in author_papers])
    # `.values` replaces DataFrame.as_matrix(), which no longer exists in
    # recent pandas releases; it yields the same row-wise array.
    cites = tuple([(c[0], c[1]) for c in citations.values])
    b_papers = tuple([(p['index'], p['title'], p['abstract']) for p in (bad_papers + bad_cited_papers)])

    return a_papers, cites, cited_papers, b_papers

In [15]:
from dataset_tools import *

In [16]:
# Convert the collected data to the tuple-based layout produced by format_data.
# NOTE: this rebinds author_papers and bad_papers to tuples of tuples, so the
# cell cannot be run twice without re-running the cells that created them.
author_papers, cites, cited_papers, bad_papers = format_data(author_papers, citations, cited_papers, bad_papers, bad_cited_papers)

In [17]:
# Generate global vocabulary: merge the vocabulary built from the author's
# papers with the one built from the bad papers and drop duplicate tokens.
# Going through set() means the order of `tokens` is not defined.
# NOTE(review): generate_vocab comes from dataset_tools; the `+` below
# presumably relies on it returning lists — confirm.
author_vocab = generate_vocab(author_papers)
global_vocab = generate_vocab(bad_papers)
tokens = list(set(author_vocab + global_vocab))

In [18]:
# Value passed to build_dataset as its last argument (its exact meaning is
# defined in dataset_tools).
num_entries = 6
# Output path without extension; the message below adds ".npz".
# NOTE(review): presumably dataset_to_file appends the extension itself — confirm.
output_file = '../data/datasets/dataset-pasi'

print("Preparing dataset...")
papers_feat, citations_feat, bad_feat, ngrams = prepare_dataset(author_papers, cites, cited_papers, bad_papers, tokens)

print("")
print("Building computable dataset...")
inputs = build_dataset(papers_feat, citations_feat, bad_feat, num_entries)

print("")
print("Saving dataset to file: " + output_file + ".npz")
dataset_to_file(inputs, ngrams, output_file)
print("Done.")

Preparing dataset...
Processed 74 papers in 0.513s
Processed 766 citation relations in 0.001s
Processed 351 cited papers in 1.919s
Processed 786 papers in 4.070s
Done.

Building computable dataset...
Generated dataset with 351 samples in 0.523s

Saving dataset to file: ../data/datasets/dataset-pasi.npz
Done.
