In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [1]:
import pandas as pd
import json
import pickle

In [2]:
def line_generator(fname='arxiv-metadata-oai-snapshot.json'):
    """Lazily yield the lines of a text file, one at a time.

    The file is streamed rather than read into memory in one go, and it is
    closed when the generator is exhausted or closed.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to read. Defaults to
        'arxiv-metadata-oai-snapshot.json' in the working directory.

    Yields
    ------
    str
        Each line of the file, including its trailing newline when present.

    Raises
    ------
    OSError
        If the file cannot be opened. Because this is a generator, the error
        surfaces on the first iteration rather than at call time.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, which can fail or mis-decode non-ASCII text.
    with open(fname, 'r', encoding='utf-8') as f:
        yield from f

def make_dataframe(n=1000):
    """Build a DataFrame from the first `n` records yielded by `line_generator()`.

    Each non-blank line is parsed as one JSON document and becomes one row.

    Parameters
    ----------
    n : int or None, default 1000
        Maximum number of records to load. ``None`` loads every record;
        zero or a negative value gives an empty DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Function-scope import keeps this cell self-contained (itertools is stdlib).
    from itertools import islice

    # A post-append `idx >= n - 1` check would still return one row for
    # n <= 0; islice stops before consuming anything in that case.
    limit = None if n is None else max(int(n), 0)

    # Skip whitespace-only lines (e.g. a trailing blank line) so they neither
    # crash json.loads nor count towards the limit.
    non_blank = (line for line in line_generator() if line.strip())
    records = [json.loads(line) for line in islice(non_blank, limit)]
    return pd.DataFrame(records)

In [3]:
# Load the first 100 records into `df` and time the call.
# NOTE(review): the multi-minute TF-IDF fit timing recorded further down
# suggests `df` held far more than 100 rows when those cells were run --
# confirm the intended `n` before relying on Restart & Run All.
%time df = make_dataframe(100)

CPU times: user 8.96 ms, sys: 728 µs, total: 9.69 ms
Wall time: 12.2 ms


# Smaller dataframe
Make a dataframe that fits into 16 GB of RAM.

In [5]:
# Narrow `df` to just its `id` and `abstract` columns.
df_id_abst = df.loc[:, ['id', 'abstract']]

In [6]:
# Serialise the id/abstract frame to 'id+abstract.pkl' in the working directory.
with open('id+abstract.pkl', mode='wb') as pkl_out:
    pickle.dump(df_id_abst, pkl_out)

In [7]:
# We're probably interested in cs.[CV|CL|LG|AI|NE]/stat.ML; only the cs.AI and
# cs.CV masks are built here.
# Each mask is a boolean Series aligned with `df`: True where the row's
# `categories` value contains the given code.
# (presumably `categories` holds space-separated category strings, making `in`
# a substring test -- confirm against the data)
mask_ai = df['categories'].apply(lambda cats: 'cs.AI' in cats)
# The chained `= ai =` assignment was dropped: it bound a name `ai` to the
# cs.CV mask, which is misleading, and nothing in view reads it.
mask_cv = df['categories'].apply(lambda cats: 'cs.CV' in cats)

In [8]:
# Row subsets of `df` selected by the cs.AI and cs.CV boolean masks.
df_ai = df.loc[mask_ai]
df_cv = df.loc[mask_cv]

In [9]:
# Serialise the cs.AI subset to 'full_ai.pkl' in the working directory.
with open('full_ai.pkl', mode='wb') as pkl_out:
    pickle.dump(df_ai, pkl_out)

In [10]:
# Serialise the cs.CV subset to 'full_cv.pkl' in the working directory.
with open('full_cv.pkl', mode='wb') as pkl_out:
    pickle.dump(df_cv, pkl_out)

# Try fitting a tf-idf vectorizer

In [11]:
# Fully spelled-out TF-IDF configuration, one keyword per line for easy tweaking.
# NOTE(review): `v` is not fitted or read anywhere in the cells visible here.
v = TfidfVectorizer(
    # --- input decoding ---
    input='content',
    encoding='utf-8',
    decode_error='replace',
    # --- text normalisation / tokenisation ---
    strip_accents='unicode',
    lowercase=True,
    analyzer='word',
    stop_words='english',
    token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
    # --- vocabulary: unigrams and bigrams, no size or frequency cut-offs ---
    ngram_range=(1, 2),
    max_features=None,
    max_df=1.0,
    min_df=1,
    # --- weighting ---
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

In [15]:
tfidf_v = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
%time tfidf_v.fit(df.abstract)

CPU times: user 2min 38s, sys: 1.78 s, total: 2min 40s
Wall time: 2min 40s


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [14]:
tfidf_v = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
%time tfidf_v.fit(df.abstract[mask_cv == True])

CPU times: user 5.57 s, sys: 58.7 ms, total: 5.63 s
Wall time: 5.64 s


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)