In [1]:
# http://stanford.edu/~mgorkove/cgi-bin/rpython_tutorials/Using%20Python%20to%20Convert%20PDFs%20to%20Text%20Files.php

In [2]:
from joblib import dump, load

In [3]:
import pandas as pd 
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [4]:
from scrapePDF import convertMultiple

In [5]:
def recommend_lda(model, lda_X, tf_article, papers):
    dists = np.zeros((lda_X.shape[0],))
    article = model.transform(tf_article)
    
    for idx, row in enumerate(lda_X):
        dists[idx] = np.linalg.norm(row-article)
    index = np.argsort(dists)[1]
    return papers['title'][index]
    

In [6]:
## some global parameters 
no_topics = 5
no_top_words = 10

In [7]:
authors = pd.read_csv('authors.csv')

In [8]:
paper_authors = pd.read_csv('paper_authors.csv')

In [12]:
papers = pd.read_csv('papers.csv')

In [19]:
paper_authors.head()

Unnamed: 0,id,paper_id,author_id
0,1,63,94
1,2,80,124
2,3,80,125
3,4,80,126
4,5,80,127


In [25]:
authors.head()

Unnamed: 0,id,name
0,1,Hisashi Suzuki
1,10,David Brady
2,100,Santosh S. Venkatesh
3,1000,Charles Fefferman
4,10000,Artur Speiser


In [31]:
list(authors[authors['id'].isin([1, 10])]['name'])

['Hisashi Suzuki', 'David Brady']

In [8]:
papers.columns

Index(['id', 'year', 'title', 'event_type', 'pdf_name', 'abstract',
       'paper_text'],
      dtype='object')

In [9]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=500, stop_words='english')
tf = tf_vectorizer.fit_transform(papers['paper_text'][1:500])

In [10]:
tf_feature_names = tf_vectorizer.get_feature_names()

In [11]:
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)



In [12]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
data model learning set models using algorithm used training neural
Topic 1:
function information model neural data matrix noise spike time vector
Topic 2:
model figure input visual cells time cell neural image network
Topic 3:
network neural input units networks time training output hidden figure
Topic 4:
learning function networks set algorithm training error state network neural


In [89]:
lda_X = lda.transform(tf)

In [93]:
dump(lda_X, 'lda_X.joblib')

['lda_X.joblib']

In [94]:
lda_X = load('lda_X.joblib')

In [95]:
dump(lda, 'lda_model.joblib') 

['lda_model.joblib']

In [96]:
lda = load('lda_model.joblib') 

In [97]:
dump(tf_vectorizer, 'tf_vectorizer.joblib')

['tf_vectorizer.joblib']

In [68]:
tf_vectorizer = load('tf_vectorizer.joblib')

In [83]:
dump(tf, 'tf_matrix.joblib')

['tf_matrix.joblib']

In [84]:
tf = load('tf_matrix.joblib')

In [69]:
## Test the scraping PDF funtionality 

In [85]:
import os 

In [86]:
os.getcwd()

'C:\\Users\\myli\\Desktop\\paper-tiger\\ML'

In [87]:
pdfDir = os.getcwd() + '\\pdf\\'
text = convertMultiple(pdfDir)

In [73]:
tf_text = tf_vectorizer.transform([text])

In [92]:
recommend_lda(lda, lda_X, tf_text, papers)

'Neural Network Recognizer for Hand-Written Zip Code Digits'