In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datasketch import MinHash, MinHashLSH, MinHashLSHForest
import string
import time
import numpy as np
import datetime

First we need to set up the Kaggle api for downloading Kaggle datasets

In [None]:
!pip install -q kaggle
!pip install datasketch[scipy]

Collecting datasketch[scipy]
  Downloading datasketch-1.5.3-py2.py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 1.8 MB/s 
Installing collected packages: datasketch
Successfully installed datasketch-1.5.3


In [None]:
 # upload the kaggle.json file provided
 # if it doesn't work for some reason see https://www.kaggle.com/general/74235 to create a new api key

 from google.colab import files
 # Bind the return value: a bare `files.upload()` as the last expression makes the
 # notebook echo the returned {filename: bytes} dict, i.e. the API key in plain text.
 uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [None]:
# create a directory for the kaggle api key

!mkdir ~/.kaggle 
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Next we download the nips-papers dataset

In [None]:
# download and unzip the nips-papers dataset
!kaggle datasets download -d benhamner/nips-papers
!unzip nips-papers.zip

Downloading nips-papers.zip to /content
 93% 132M/141M [00:01<00:00, 118MB/s]
100% 141M/141M [00:01<00:00, 117MB/s]
Archive:  nips-papers.zip
  inflating: authors.csv             
  inflating: database.sqlite         
  inflating: paper_authors.csv       
  inflating: papers.csv              


We create a pandas dataframe to store all papers (year, title and abstract)

In [None]:
# read nips papers into a pandas dataframe
# Only year/title/abstract are used downstream; selecting them at read time avoids
# loading the id, event_type, pdf_name and paper_text columns just to drop them.
df = pd.read_csv('papers.csv', usecols=['year', 'title', 'abstract'])

Here's what it looks like :

In [None]:
# preview the first rows of the papers frame (year, title, abstract)
df.head()

Unnamed: 0,year,title,abstract
0,1987,Self-Organization of Associative Database and ...,Abstract Missing
1,1987,A Mean Field Theory of Layer IV of Visual Cort...,Abstract Missing
2,1988,Storing Covariance by the Associative Long-Ter...,Abstract Missing
3,1994,Bayesian Query Construction for Neural Network...,Abstract Missing
4,1994,"Neural Network Ensembles, Cross Validation, an...",Abstract Missing


Data Collection :

Next we need to extract the titles and abstracts of papers from NIPS 2017 and 2020

In [None]:
# function that extracts titles and abstracts of nips papers from the website given the year
def get_title_and_abstract(year, timeout=30):
  """Scrape the papers listed on https://papers.nips.cc/paper/<year>.

  Args:
    year: proceedings year; it is appended to the listing URL and stored in
      every returned record.
    timeout: seconds to wait on each HTTP request before giving up. Without a
      timeout a single stalled connection would hang the whole scrape.

  Returns:
    A list of [year, title, abstract] lists, one per paper that could be read.

  Raises:
    requests.HTTPError: if the listing page itself cannot be fetched.
  """
  base_url = 'https://papers.nips.cc'
  df_list = []  # list to store title and abstracts

  # a single session reuses the connection across the many per-paper requests
  with requests.Session() as session:
    page = session.get(base_url + '/paper/' + str(year), timeout=timeout)  # page url
    page.raise_for_status()

    # using beautifulsoup to extract page contents
    soup = BeautifulSoup(page.content, 'html.parser')
    paper_list = soup.find_all('li')[2:]  # papers start from the 3rd item
    for item in paper_list:
      anchor = item.find('a')  # first link of the list item
      if anchor is None or not anchor.get('href'):
        continue  # list item without a usable paper link
      title = anchor.get_text()  # extracting the title of the paper
      link = anchor.get('href')  # extracting the partial link of the paper

      paper_page = session.get(base_url + link, timeout=timeout)  # getting the page of the paper
      if not paper_page.ok:
        continue  # skip a paper whose page failed instead of parsing an error page
      psoup = BeautifulSoup(paper_page.content, 'html.parser')
      paragraphs = psoup.find_all('p')
      if len(paragraphs) < 3:
        continue  # the abstract is read from the 3rd <p>; page does not have one
      abstract = paragraphs[2].get_text()  # abstract of the paper
      df_list.append([year, title, abstract])

  return df_list

# get 2017 and 2020 papers (can take a long time to run at times)
paper_columns = ['year', 'title', 'abstract']
df_2017 = pd.DataFrame(get_title_and_abstract(2017), columns=paper_columns)
df_2020 = pd.DataFrame(get_title_and_abstract(2020), columns=paper_columns)

# merge all dataframes exactly once.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index=True
# keeps the row labels unique, which later cells rely on when they use the
# labels from iterrows() as positions for iloc.
df_lat = pd.concat([df_2017, df_2020], ignore_index=True)
df = pd.concat([df, df_lat], ignore_index=True)

# removing papers with no abstract
df = df[df.abstract != 'Abstract Missing']

# remove duplicates
df = df.drop_duplicates()

# reset index so that labels are 0..len(df)-1 again after the filtering above
df = df.reset_index(drop=True)

In [None]:
# Make sure the 2017 and 2020 papers are part of df, without duplicating rows and
# without duplicating index labels: later cells take the label from iterrows()
# and use it as a position in iloc, which is only valid for a clean 0..n-1 index.
# The cell is idempotent, so re-running it leaves df unchanged.
df = pd.concat([df, df_2017, df_2020], ignore_index=True)
df = df[df.abstract != 'Abstract Missing'].drop_duplicates().reset_index(drop=True)

 Pre-processing:

We create word tokens to use as shingles from the papers using their title and abstracts. We convert each paper (title, abstract) into a set of word tokens (shingles) and use minhash to create signatures. We use the datasketch library for this : http://ekzhu.com/datasketch/index.html

In [None]:
# passed as num_perm to every MinHash, MinHashLSH and MinHashLSHForest created below
perm = 256  # number of permutations for minhash

In [None]:
set_dict = {}      # paper index -> set of word tokens (shingles) of that paper
minhash_dict = {}  # paper index -> MinHash signature of that token set (same keys as set_dict)
strip_punct = str.maketrans('', '', string.punctuation)  # translation table that deletes punctuation

for paper_idx, paper in df.iterrows():
  # title and abstract joined, lowercased, punctuation removed
  cleaned = (paper.title + ' ' + paper.abstract).lower().translate(strip_punct)
  tokens = set(cleaned.split())  # unique whitespace-separated words
  set_dict[paper_idx] = tokens

  signature = MinHash(num_perm=perm)
  for token in tokens:
    signature.update(token.encode('utf8'))
  minhash_dict[paper_idx] = signature

Build LSH:

Building an LSH data structure to store each paper from 1987-2016

In [None]:
# initial value; the following cells reassign thresh (0.35, then 0.25) and rebuild the index
thresh = 0.3  # the Jaccard similarity threshold for LSH

In [None]:
# LSH index over the signatures of all papers that are not from 2017 or 2020
lsh = MinHashLSH(threshold=thresh, num_perm=perm)

for paper_id, signature in minhash_dict.items():
  if df['year'].iloc[paper_id] in (2017, 2020):
    continue  # 2017 and 2020 papers are only used as queries
  lsh.insert(paper_id, signature)

Original Papers in 2017: Find the top 5 unique or original papers in 2017, i.e. papers which had the least similarity to papers in previous years (1987-2016).

We query with all the papers from 2017 in order to find the top 5 unique ones. This requires tuning the Jaccard similarity threshold. We first tune the threshold to find 2017 papers which have Jaccard similarity less than the threshold for all previous year papers. That is, they are different from all previous year papers given the threshold. This gives us the papers that are the most unique compared to previous years. We find that for a threshold of 0.35 there are 76 such papers from 2017.

In [None]:
thresh = 0.35  # the Jaccard similarity threshold for LSH

# create LSH index
lsh = MinHashLSH(threshold=thresh, num_perm=perm)

for key, value in minhash_dict.items():
  if df['year'].iloc[key] not in (2017, 2020):  # ignoring 2017 and 2020 papers
    lsh.insert(key, value)  # inserting paper signatures into the LSH data structure

# 2017 papers for which the index returns no candidate at this threshold
indices_2017 = []
for index, row in df.iterrows():
  if row['year'] == 2017:
    result = lsh.query(minhash_dict[index])
    if not result:
      indices_2017.append(index)

# number of 2017 papers without any match (not the size of the last query result)
print(len(indices_2017))

76


Next, we lower the threshold to 0.25 and query with the 76 papers found in the previous step. Each result is a list of papers it matched with, with the given Jaccard similarity (approximately, since LSH). We order the results in the increasing order of their lengths. That is, the first result has the lowest number of matches. This means the first paper in the sorted list is the most unique followed by the 2nd one and so on. We take the first 5 results as our required top 5 unique papers. 

In [None]:
thresh = 0.25  # the Jaccard similarity threshold for LSH

# rebuild the LSH index at the lower threshold, again without 2017 and 2020 papers
lsh = MinHashLSH(threshold=thresh, num_perm=perm)
for paper_id, signature in minhash_dict.items():
  if df['year'].iloc[paper_id] in (2017, 2020):
    continue
  lsh.insert(paper_id, signature)

# candidate papers from the previous step and the number of matches each one gets
indices = list(indices_2017)
lengths = [len(lsh.query(minhash_dict[paper_id])) for paper_id in indices]

# the five candidates with the fewest matches
for rank in np.argsort(lengths)[:5]:
  print(df['title'].iloc[indices[rank]])

Decomposable Submodular Function Minimization: Discrete and Continuous
Submultiplicative Glivenko-Cantelli and Uniform Convergence of Revenues
Semisupervised Clustering, AND-Queries and Locally Encodable Source Coding
On-the-fly Operation Batching in Dynamic Computation Graphs
Modulating early visual processing by language


Classic Papers: Our goal now is to find the top 5 oldest, classic papers in NIPS proceedings
whose topics were relevant even in 2020. Design your scoring function and justify its choice.

We use MinHash LSH Forest to get the top 5 matches for every paper from 2020. Each 2020 paper is then scored as follows: its score is the age, in years counted back from 2020, of the oldest paper among its matches. We sort the scores and take the papers corresponding to the 5 highest scores.

In [None]:
# LSH Forest holding the signatures of every paper except those from 2020
forest = MinHashLSHForest(num_perm=perm)

for paper_id, signature in minhash_dict.items():
  if df['year'].iloc[paper_id] == 2020:
    continue  # 2020 papers are the queries, not part of the forest
  forest.add(paper_id, signature)

forest.index()  # required according to the documentation

def get_score(result, years=None):
  """Score a query result by the age of its oldest match, in years before 2020.

  Args:
    result: iterable of positional row indices of the matched papers.
    years: object whose `.iloc[i]` gives the publication year of row i.
      Defaults to the `year` column of the notebook-level `df`.

  Returns:
    The largest `2020 - year` over all matches, or -1 when `result` is empty.
  """
  if years is None:
    years = df['year']

  max_score = -1
  for index in result:
    s = 2020 - int(years.iloc[index])  # number of years of the match from 2020
    if s > max_score:
      max_score = s
  # returned after the loop, so every match is considered and not only the first one
  return max_score

scores = []           # score of each 2020 query paper (see get_score)
indices_2020 = []     # index of each scored 2020 paper
classic_indices = []  # index of the oldest matched paper for each scored 2020 paper
for index, row in df.iterrows():
  if row['year'] == 2020:
    m = minhash_dict[index]
    result = forest.query(m, 5)
    if not result:
      continue  # no match at all: nothing to score
    scores.append(get_score(result))
    indices_2020.append(index)
    classic_indices.append(min(result, key=lambda i: df['year'].iloc[i]))

# Walk the queries from the highest score down and report the classic paper each
# one matched. Printing the query row itself would always show a 2020 paper.
# A classic paper matched by several 2020 papers is listed only once.
shown = set()
for idx in np.argsort(scores)[::-1]:
  classic = classic_indices[idx]
  if classic in shown:
    continue
  shown.add(classic)
  print(df['year'].iloc[classic])
  print(df['title'].iloc[classic])
  print(' ')
  if len(shown) == 5:
    break

1987
An Artificial Neural Network for Spatio-Temporal Bipolar Patterns: Application to Phoneme Classification
 
2001
Minimax Probability Machine
 
1987
New Hardware for Massive Neural Networks
 
1987
Neural Net and Traditional Classifiers
 
1987
Self-Organization of Associative Database and Its Applications
 
