In [0]:
# download the NLTK data packages (wordnet, punkt, averaged_perceptron_tagger); nltk itself must already be installed
!python3 -m nltk.downloader wordnet punkt averaged_perceptron_tagger

In [0]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import KFold, train_test_split
from util import load_json, dump_json

# random seed passed to the cross-validation splitter and the LDA models below
seed = 17

In [41]:
# text processing and cleanup: run the preprocessing script to produce data.json, which the next cell loads
!python3 preprocess_data.py -a output.txt -o data.json

9991it [01:14, 133.64it/s]


In [0]:
# put data into a pandas DataFrame and hold out a small test split
articles = load_json('data.json')

# Column set is taken from the first article only; keys that appear only in
# later articles are not picked up, and a later article missing one of these
# keys raises KeyError in the comprehension below.
article_properties = sorted(articles[0].keys())
articles_dict = {
  article_property: [article[article_property] for article in articles]
  for article_property in article_properties
}
articles_df = pd.DataFrame.from_dict(articles_dict)

# Fraction of articles held out for the final evaluation.
test_fraction = 0.001

# Use the notebook-wide `seed` rather than a second hard-coded literal so that
# changing the seed in one place changes every random component consistently.
# NOTE: train_test_split shuffles, so both splits keep their original
# (non-contiguous) row index from articles_df.
articles_train_df, articles_test_df = train_test_split(
  articles_df, test_size=test_fraction, random_state=seed)

In [44]:
# preview the first five rows of the full (pre-split) article table
articles_df.head()

Unnamed: 0,abstract,fos,id,references,title,year
0,vehicle communication channel characteristic e...,"[computer science, bit error rate, computer ne...",1000096266,"[1801089468, 1963850605, 2064076416, 211953192...",Pilots Aided Channel Estimation for Doubly Sel...,2013
1,develop coupled thermomechanical model include...,"[mathematical optimization, mathematics, conti...",1000117647,"[2115718968, 2151110682]",Mould-taper asymptotics and air gap formation ...,2015
2,increase popularity social network globalize e...,"[data security, secrecy, database, encryption,...",100013375,"[132109442, 1768601545, 2100727725, 2119028650...",Data Protection and Privacy Preservation Using...,2012
3,research design privacy preserve schedule serv...,"[real time compute, computer network, computer...",100013628,"[1979788481, 1995801303, 2016721869, 211567220...",Privacy-preserving scheduling mechanism for eh...,2012
4,design privacy aware system gain attention rec...,"[system engineering, data mining, privacy desi...",100014528,"[34239548, 140430729, 1510827835, 1918077612, ...",Applying Soft Computing Technologies for Imple...,2012


In [0]:
# count occurrences of words in training data
count_vectorizer = CountVectorizer()
corpus = articles_train_df['abstract'].tolist()
count_vectorizer.fit(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
_get_vocabulary_terms = (getattr(count_vectorizer, 'get_feature_names_out', None)
                         or count_vectorizer.get_feature_names)
count_cols = ['vocab_{}'.format(word) for word in _get_vocabulary_terms()]

# Densify the sparse count matrix chunk by chunk and concatenate ONCE at the
# end (concatenating inside the loop copies the growing frame every step).
step_size = 1000
step_counts_dfs = []
for start in range(0, len(corpus), step_size):
  # Half-open slice [start, stop): every training row is transformed exactly
  # once, so counts_df has the same number of rows as articles_train_df.
  stop = start + step_size
  counts = count_vectorizer.transform(corpus[start:stop])
  step_counts_dfs.append(pd.DataFrame(
    counts.toarray(),
    columns=count_cols,
    # Reuse the training rows' own index labels: articles_train_df keeps the
    # shuffled index from train_test_split, and a later column-wise concat
    # aligns on index labels, not on position.
    index=articles_train_df.index[start:stop]))
counts_df = pd.concat(step_counts_dfs)

In [0]:
# attach the per-word count columns to the training articles
# (a column-wise pd.concat matches rows by index label, so both frames must
# carry the same index for each article to receive its own counts)
train_counts_df = pd.concat((articles_train_df, counts_df), axis='columns')

# count columns only, with any missing count replaced by zero
train_counts_no_nans_df = train_counts_df.loc[:, count_cols].fillna(0)

In [49]:
# sanity check: total number of missing values left in the count columns
train_counts_no_nans_df.loc[:, count_cols].isna().sum().sum()

0

In [52]:
# vocabulary size
# vocabulary_ maps each learned term to its column index, so its length is the
# number of features; unlike get_feature_names() (removed in scikit-learn 1.2)
# this attribute is available on every release.
len(count_vectorizer.vocabulary_)

27403

In [0]:
# function that will keep score for a given k for top k candidates
def get_test_score(model, train_df, test_df, scores, top_k=200):
  """Fraction of test rows whose top_k highest-scoring training rows contain a cited paper.

  Args:
    model: unused; kept so existing call sites keep working.
    train_df: DataFrame with an 'id' column; row position j corresponds to
      column j of `scores`.
    test_df: DataFrame with a 'references' column; row position i corresponds
      to row i of `scores`.
    scores: 2-D array-like of similarity scores, one row per test row and one
      column per training row (higher means more similar).
    top_k: number of best-scoring training rows inspected per test row.

  Returns:
    hits / counted rows, or 0.0 when no row is counted. A test row is left out
    of the denominator when its 'references' value is not a list, or when it
    has no hit and none of its references appears in train_df['id'] at all.

  Side effects:
    Prints the running tally (row number, counted rows, hits, ratio) per row.
  """
  score = 0
  num_examples = len(test_df)
  # ascending sort per row; reversed below so the best candidates come first
  score_indices = np.argsort(scores, axis=1)
  train_ids = train_df['id']
  for i, (_, row) in enumerate(test_df.iterrows()):
    citations = row['references']
    if isinstance(citations, list):
      top_k_docs = score_indices[i][::-1][:top_k]
      top_k_docs_ids = train_ids.iloc[top_k_docs].tolist()
      if any(citation in top_k_docs_ids for citation in citations):
        score += 1
      elif not train_ids.isin(citations).any():
        # none of the cited papers is retrievable, so the row cannot be judged
        num_examples -= 1
    else:
      # missing references (e.g. None/NaN): nothing to evaluate
      num_examples -= 1

    # guard the ratio: every row seen so far may have been excluded
    running_score = score / num_examples if num_examples else 0.0
    print(i, num_examples, score, running_score)

  return score / num_examples if num_examples else 0.0

In [0]:
# CV grid search for best n_components
n_splits = 2
# random_state only has an effect when shuffle=True; recent scikit-learn
# releases raise ValueError if random_state is given while shuffle is False.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
scores = []
base_n_components = 12
max_n_components = 15  # exclusive upper bound of the search range
for n in range(base_n_components, max_n_components):
  fold_scores = []
  for train_index, test_index in kf.split(train_counts_no_nans_df):
    train_df = train_counts_no_nans_df.iloc[train_index]
    validate_df = train_counts_no_nans_df.iloc[test_index]
    # fit the topic model on the training fold only
    lda = LatentDirichletAllocation(n_components=n, random_state=seed)
    lda.fit(train_df[count_cols])
    # candidates are ALL training articles (same row order as train_counts_df,
    # which is what get_test_score uses to look up ids); queries are the
    # validation fold
    X = lda.transform(train_counts_no_nans_df[count_cols])
    X_validate = lda.transform(validate_df[count_cols])
    # dot product of topic distributions as the similarity score
    score = get_test_score(lda, train_counts_df, train_counts_df.iloc[test_index], np.matmul(X_validate, X.T))
    fold_scores.append(score)
    print(n, score)
  # mean score across folds for this n
  scores.append(sum(fold_scores) / len(fold_scores))

# found the best n
# scores[j] belongs to n = base_n_components + j; ties go to the smallest n
best_n_components = base_n_components + scores.index(max(scores))

In [0]:
# evaluate lda
# Build the held-out count frame here: no earlier cell defines test_counts_df.
# The vectorizer fitted on the training abstracts is reused so the test
# columns match count_cols exactly.
test_counts = count_vectorizer.transform(articles_test_df['abstract'].tolist())
test_counts_df = pd.concat(
  [articles_test_df.reset_index(drop=True),
   pd.DataFrame(test_counts.toarray(), columns=count_cols)],
  axis=1)

best_lda = LatentDirichletAllocation(n_components=best_n_components, random_state=seed)
# fit_transform already fits the model, so a separate fit() call would only
# train it twice. The NaN-free frame is used, as in the CV cell; it has the
# same row order as train_counts_df, which get_test_score uses for id lookup.
X_train = best_lda.fit_transform(train_counts_no_nans_df[count_cols])
X_test = best_lda.transform(test_counts_df[count_cols])

# get_test_score(model, train_df, test_df, scores, top_k): it expects the
# similarity matrix (test rows x train rows), not the two topic matrices.
test_scores = np.matmul(X_test, X_train.T)
for k in (5, 10, 20):
  print('Top {}:'.format(k), get_test_score(best_lda, train_counts_df, test_counts_df, test_scores, top_k=k))