In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import pandas as pd
import numpy as np
import json
import pickle

In [3]:
# Data: download arxiv-metadata-oai-snapshot.json from https://www.kaggle.com/Cornell-University/arxiv

def line_generator(fname='/home/tomek/projects/paper-finder/arxiv-metadata-oai-snapshot.json'):
    """Lazily yield the lines of a text file, one at a time.

    Parameters
    ----------
    fname : str or path-like
        File to read. The default is a machine-specific absolute path; pass
        the location of your own copy of the snapshot when running elsewhere.

    Yields
    ------
    str
        Each line as read from the file, including its trailing newline
        when one is present.
    """
    # The file is JSON, which is UTF-8 encoded by specification; decode it
    # explicitly instead of depending on the platform's default encoding.
    with open(fname, 'r', encoding='utf-8') as f:
        yield from f

def make_dataframe(n=1000, fname=None):
    """Parse the first ``n`` lines of the JSON-lines snapshot into a DataFrame.

    Parameters
    ----------
    n : int or float, default 1000
        Maximum number of lines to parse. A non-positive value yields an
        empty frame.
    fname : str or path-like, optional
        File to read. When omitted, ``line_generator``'s own default path
        is used.

    Returns
    -------
    pandas.DataFrame
        Built from the list of objects decoded by ``json.loads``, one per
        line read.
    """
    records = []
    # Without this guard the loop below would still append one record before
    # its limit check runs, so n <= 0 would return a single row.
    if n <= 0:
        return pd.DataFrame(records)

    gen = line_generator() if fname is None else line_generator(fname)
    try:
        for idx, line in enumerate(gen):
            records.append(json.loads(line))
            if idx >= n - 1:
                break
    finally:
        # Closing the generator releases the underlying file handle right
        # away instead of waiting for garbage collection after the break.
        gen.close()

    return pd.DataFrame(records)

# Smaller dataframe
Make a dataframe that fits into 16 GB of RAM.

In [4]:
# Load at most the first million records of the snapshot.
df = make_dataframe(n=1e6)
# Two-column view (identifier + abstract text) that is pickled on its own further down.
df_id_abst = df.loc[:, ['id', 'abstract']]

In [5]:
# We're probably interested in cs.[CV|CL|LG|AI|NE]/stat.ML
def _category_mask(tag):
    """Return a boolean NumPy array flagging rows of ``df`` whose ``categories`` string contains ``tag``."""
    return df.categories.apply(lambda s: tag in s).values


mask_ai = _category_mask('cs.AI')
mask_cv = _category_mask('cs.CV')
mask_cl = _category_mask('cs.CL')
mask_lg = _category_mask('cs.LG')
mask_ne = _category_mask('cs.NE')
# The tag must not carry a trailing space: with one, any row whose categories
# string ends in "stat.ML" is never matched.
mask_stat_ml = _category_mask('stat.ML')

In [6]:
# A row is kept when at least one of the per-category masks is set.
# np.any gives the same boolean OR as the former np.sum(..., dtype=np.bool)
# without using the np.bool alias, which NumPy deprecated in 1.20 and
# removed in 1.24 (where it raises AttributeError).
mask_all = np.vstack([mask_ai, mask_cv, mask_cl, mask_lg, mask_ne, mask_stat_ml])
mask_all = np.any(mask_all, axis=0)

In [7]:
# Region of interest: only the rows flagged by the combined category mask.
df_roi = df.loc[mask_all]

In [8]:
# Save the category-filtered frame to disk.
# NOTE(review): the file name says "id_abst", but df_roi is sliced from df and
# so carries every column, not only id/abstract — confirm which is intended.
with open('df_arxiv_id_abst_ai_cv_cl_lg_ne_ml.pkl', mode='wb') as out_file:
    pickle.dump(df_roi, out_file)

## Fit a TF-IDF vectorizer

In [9]:
# Fully spelled-out vectorizer configuration: unigrams and bigrams, English
# stop words removed, tokens restricted to identifier-like words.
# NOTE(review): `v` is not fitted or used in the cells visible here; the model
# that gets fitted and saved is `tfidf_v`.
v = TfidfVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='replace',
    strip_accents='unicode',
    lowercase=True,
    analyzer='word',
    stop_words='english',
    token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
    ngram_range=(1, 2),
    max_features=None,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_df=1.0,
    min_df=1,
)

In [10]:
tfidf_v = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
%time tfidf_v.fit(df.abstract)

# Alternatively we can fit the vectorizer only on the subset topic
# tfidf_v = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
# %time tfidf_v.fit(df.abstract[mask_cv == True])

CPU times: user 1min 20s, sys: 874 ms, total: 1min 21s
Wall time: 1min 21s


TfidfVectorizer(stop_words='english')

In [11]:
# Save the id/abstract frame covering every loaded record.
with open('df_arxiv_id_abst_full.pkl', mode='wb') as out_file:
    pickle.dump(df_id_abst, out_file)

In [12]:
# Save the fitted unigram vectorizer so it can be reloaded without refitting.
with open('tf_idf_vectorizer_ngram1.pkl', mode='wb') as out_file:
    pickle.dump(tfidf_v, out_file)

In [13]:
# Vectorize every loaded abstract with the fitted model, then save the result.
X_tfidf = tfidf_v.transform(df['abstract'])

with open('X_tfidf.pkl', mode='wb') as matrix_file:
    pickle.dump(X_tfidf, matrix_file)