In [0]:
# install nltk and download language packages
!python3 -m nltk.downloader wordnet punkt averaged_perceptron_tagger

In [0]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import KFold, train_test_split
from util import load_json, dump_json

seed = 17

In [11]:
# text processing and cleanup
!python3 preprocess_data.py -a output1.txt -o data1.json
!python3 preprocess_data.py -a output2.txt -o data2.json

49it [00:00, 61.78it/s]
55it [00:00, 69.27it/s]


In [0]:
# put data into a pandas
articles1 = load_json('data1.json')
articles2 = load_json('data2.json')
articles = articles1 + articles2
article_properties = list(sorted(articles[0].keys()))
articles_dict = {article_property: [article[article_property] for article in articles] for article_property in article_properties}
articles_df = pd.DataFrame.from_dict(articles_dict)
articles_train_df, articles_test_df = train_test_split(articles_df, test_size=0.25, random_state=17)

In [32]:
articles_df.head()

Unnamed: 0,abstract,fos,id,references,title,year
0,study simple genetic algorithm rank selection ...,"[combinatorics, genetic algorithm, mathematics...",1576444016,"[1512727602, 1515457845, 1530282223, 154150411...",The quasispecies regime for the simple genetic...,2017
1,introduce parameter discus behavior genetic al...,"[mathematical optimization, fitness proportion...",1279022491,"[1522597852, 1556650278, 1576444016, 163903268...",The quasispecies regime for the simple genetic...,2017
2,paper provide unify iteration complexity analy...,"[gradient descent, regular polygon, mathematic...",1499137793,"[1964401556, 1974524469, 1974661392, 201691023...",Iteration complexity analysis of block coordin...,2017
3,nonconvex optimization arises area computation...,"[mathematics, normal convergence, convergence ...",1456665855,"[1499137793, 1779260694, 1919585499, 192866242...",A Globally Convergent Algorithm for Nonconvex ...,2017
4,abstract extend infinite graph matroidal chara...,"[pancyclic graph, discrete mathematics, combin...",1545645393,"[1966836246, 1973373196, 1975467856, 202933647...",Dual trees must share their ends,2017


In [0]:
# count occurrences of words in training data
count_vectorizer = CountVectorizer()
corpus = [article['abstract'] for _, article in articles_train_df.iterrows()]
count_vectorizer.fit(corpus)

# create a dataframe of counts slowly to not fill up the memory
step_size = 1000
count_cols = ['vocab_{}'.format(word) for word in count_vectorizer.get_feature_names()]
counts_df = pd.DataFrame(columns=count_cols)
for i in range(0, len(articles_train_df), step_size):
  corpus = [article['abstract'] for _, article in articles_train_df.iloc[[k for k in range(i, min(i+step_size+1, len(articles_train_df)))]].iterrows()]
  counts = count_vectorizer.transform(corpus)
  step_counts_df = pd.DataFrame(counts.todense(), columns=count_cols)
  counts_df = pd.concat([counts_df, step_counts_df], ignore_index=True)

In [34]:
counts_df.shape

(78, 1787)

In [0]:
# add the counts dataframe to training dataframe
train_counts_df = pd.concat([articles_train_df, counts_df], axis=1)
train_counts_no_nans_df = train_counts_df[count_cols].fillna(0)

In [0]:
# do the same thing for test, except use the same vectorizer as train
counts_df = pd.DataFrame(columns=count_cols)
for i in range(0, len(articles_test_df), step_size):
  corpus = [article['abstract'] for _, article in articles_test_df.iloc[[k for k in range(i, min(i+step_size+1, len(articles_test_df)))]].iterrows()]
  counts = count_vectorizer.transform(corpus)
  step_counts_df = pd.DataFrame(counts.todense(), columns=count_cols)
  counts_df = pd.concat([counts_df, step_counts_df], ignore_index=True)

In [0]:
# add the counts dataframe to test dataframe
test_counts_df = pd.concat([articles_test_df, counts_df], axis=1)
test_counts_no_nans_df = test_counts_df[count_cols].fillna(0)

In [38]:
# check for nulls
train_counts_no_nans_df[count_cols].isnull().sum().sum()

0

In [39]:
# vocabulary size
len(count_vectorizer.get_feature_names())

1787

In [0]:
# function that will keep score for a given k for top k candidates
def get_test_score(model, train_df, test_df, scores, top_k=10):
  # check if top_k vectors with highest cosine similarity have one of the cited papers
  score = 0
  num_examples = len(test_df)
  score_indices = np.argsort(scores, axis=1)
  for i, (_, row) in enumerate(test_df.iterrows()):
    citations = row['references']
    if type(citations) == list:
      top_k_docs = score_indices[i][::-1][:top_k]
      top_k_docs_ids = train_df.iloc[top_k_docs]['id'].tolist()
      if any(citation in top_k_docs_ids for citation in citations):
        score += 1
      elif not any(train_df['id'].isin(citations)):
        num_examples -= 1
    else:
      num_examples -= 1
    
    #print(i, num_examples, score, score / num_examples)

  return score / num_examples

In [65]:
# CV grid search for best n_components
n_splits = 2
kf = KFold(n_splits=n_splits, random_state=seed)
scores = []
base_n_components = 12
for n in range(base_n_components, 15):
  fold_scores = []
  for i, (train_index, test_index) in enumerate(kf.split(train_counts_no_nans_df)):
    train_df = train_counts_no_nans_df.iloc[train_index]
    validate_df = train_counts_no_nans_df.iloc[test_index]
    lda = LatentDirichletAllocation(n_components=n, random_state=seed)
    lda.fit(train_df[count_cols])
    X = lda.transform(train_counts_no_nans_df[count_cols])
    X_validate = lda.transform(validate_df[count_cols])
    score = get_test_score(lda, train_counts_df, train_counts_df.iloc[test_index], np.matmul(X_validate, X.T))
    fold_scores += [score]
    print('N:', n, 'Score:', score)
  scores += [sum(fold_scores) / len(fold_scores)]

# found the best n
best_n_components = base_n_components + scores.index(max(scores))

N: 12 Score: 0.09090909090909091
N: 12 Score: 0.07142857142857142
N: 13 Score: 0.045454545454545456
N: 13 Score: 0.2857142857142857
N: 14 Score: 0.09090909090909091
N: 14 Score: 0.21428571428571427


In [66]:
# evaluate lda
best_lda = LatentDirichletAllocation(n_components=best_n_components, random_state=seed)
best_lda.fit(train_counts_no_nans_df[count_cols])
X_train = best_lda.fit_transform(train_counts_no_nans_df[count_cols])
X_test = best_lda.transform(test_counts_no_nans_df[count_cols])
scores = np.matmul(X_test, X_train.T)
print('Top 5:', get_test_score(best_lda, train_counts_df, test_counts_df, np.matmul(X_test, X_train.T), top_k=5))
print('Top 10:', get_test_score(best_lda, train_counts_df, test_counts_df, np.matmul(X_test, X_train.T), top_k=10))
print('Top 20:', get_test_score(best_lda, train_counts_df, test_counts_df, np.matmul(X_test, X_train.T), top_k=20))

Top 5: 0.0
Top 10: 0.16666666666666666
Top 20: 0.25


In [67]:
test_counts_no_nans_df

Unnamed: 0,vocab_000,vocab_00965v1,vocab_10,vocab_100,vocab_10e,vocab_10e3,vocab_10e9,vocab_11,vocab_12,vocab_15,vocab_1503,vocab_185,vocab_1982,vocab_1994,vocab_2007,vocab_2008,vocab_201,vocab_2010,vocab_2012,vocab_2014,vocab_2015,vocab_203,vocab_21,vocab_23,vocab_29,vocab_2d,vocab_2l,vocab_418,vocab_429,vocab_43,vocab_45,vocab_49,vocab_60,vocab_604,vocab_61,vocab_66,vocab_83,vocab_84,vocab_ability,vocab_absence,...,vocab_vital,vocab_vm,vocab_vms,vocab_von,vocab_voting,vocab_vrs,vocab_vulnerable,vocab_wafer,vocab_wait,vocab_warp,vocab_weak,vocab_weaknesses,vocab_web,vocab_weight,vocab_weighted,vocab_wheel,vocab_wide,vocab_widely,vocab_widespread,vocab_wiley,vocab_window,vocab_wireless,vocab_wish,vocab_wlan,vocab_wmsn,vocab_wmsns,vocab_word,vocab_work,vocab_workload,vocab_world,vocab_worthwhile,vocab_wrap,vocab_write,vocab_wsns,vocab_year,vocab_yield,vocab_zhang,vocab_łojasiewicz,vocab_łuczak,vocab_λf
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
