In [None]:
#imports

import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import precision_score
import nltk
import random
nltk.download('stopwords')

from nltk.corpus import stopwords
import string
stopset = stopwords.words('english') + list(string.punctuation)
nltk.download('punkt')
import re

import io


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Uncomment on a fresh runtime; the captured output below shows faiss-gpu 1.7.2 being installed.
#!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[K     |████████████████████████████████| 85.5 MB 77 kB/s 
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
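# Uncomment on a fresh runtime to install the libomp-dev system package (presumably needed by faiss -- confirm).
#!sudo apt-get install libomp-dev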

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  libomp5
Suggested packages:
  libomp-doc
The following NEW packages will be installed:
  libomp-dev libomp5
0 upgraded, 2 newly installed, 0 to remove and 7 not upgraded.
Need to get 239 kB of archives.
After this operation, 804 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp5 amd64 5.0.1-1 [234 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp-dev amd64 5.0.1-1 [5,088 B]
Fetched 239 kB in 0s (692 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debc

In [None]:
#load train embeddings/convert to array

drive.mount('/content/drive', force_remount=True)

# Clue vectors: keep the DataFrame and derive a float32, C-ordered matrix from it.
clue_embed_df = pd.read_csv('/content/drive/MyDrive/clueembed2.csv')
clue_embed = np.array(clue_embed_df, dtype='float32', order='c')

# Answer vectors: same conversion.
answer_embed_df = pd.read_csv('/content/drive/MyDrive/answerembed2.csv')
answer_embed = np.array(answer_embed_df, dtype='float32', order='c')

Mounted at /content/drive


In [None]:
#set up FAISS L2 index with 50 voronoi cell split, retrieve up to 1000 candidate answer embeddings for each clue

import faiss

nlist = 50  # number of Voronoi cells the answer vectors are partitioned into
# Read the vector width from the data instead of hard-coding 384, so switching
# to an embedding model with a different width needs no edit here.
embed_dim = answer_embed.shape[1]
quantizer = faiss.IndexFlatL2(embed_dim)  # flat L2 index used to assign vectors to cells
index = faiss.IndexIVFFlat(quantizer, embed_dim, nlist)
index.train(answer_embed)  # the IVF index must be trained before vectors are added
index.add(answer_embed)
# NOTE(review): index.nprobe is never set, so the search uses FAISS's default
# number of probed cells (1 according to the FAISS docs -- confirm). The 1000 ids
# are therefore approximate neighbours, and rows with fewer than 1000 candidates
# are padded with -1.
D, I = index.search(clue_embed, 1000)

In [None]:
#training recall

# The check treats answer row i as the correct match for clue row i: a hit means
# the clue's own row number appears among its retrieved ids.
self_hits = [row_id in neighbour_ids for row_id, neighbour_ids in enumerate(I)]
np.mean(self_hits)

0.48433426194900137

In [None]:
# The training clue vectors have been searched; drop the array before the next split is loaded.
del clue_embed

In [None]:
# Drop the index objects from the training pass; the later passes build their own.
del index, quantizer

In [None]:
# The distances are not used after the search; only the neighbour ids in I are kept.
del D

In [None]:
#export I matrix

# I holds one row per training clue and one column per retrieved neighbour id;
# index=False keeps pandas' row labels out of the CSV.
pd.DataFrame(I).to_csv('/content/drive/MyDrive/I_train.csv', index=False)

In [None]:
# The ids have been written to I_train.csv; drop them before the validation pass reuses the name I.
del I

In [None]:
#load validation clue embeddings

val_clue_csv = '/content/drive/MyDrive/valclueembed2.csv'
clue_embed_df = pd.read_csv(val_clue_csv)
# float32, C-ordered matrix for the FAISS search that follows.
clue_embed = np.array(clue_embed_df, dtype='float32', order='c')

In [None]:
#same FAISS process, this time for validation clues

# The training-pass index was deleted, so it is rebuilt here from the same answer
# vectors with the same settings.
nlist = 50  # number of Voronoi cells the answer vectors are partitioned into
# Read the vector width from the data instead of hard-coding 384.
embed_dim = answer_embed.shape[1]
quantizer = faiss.IndexFlatL2(embed_dim)  # flat L2 index used to assign vectors to cells
index = faiss.IndexIVFFlat(quantizer, embed_dim, nlist)
index.train(answer_embed)  # the IVF index must be trained before vectors are added
index.add(answer_embed)
# NOTE(review): index.nprobe is never set, so the FAISS default applies (1 per
# the FAISS docs -- confirm); rows with fewer than 1000 candidates are padded with -1.
D, I = index.search(clue_embed, 1000)

In [None]:
#load train/val/test sets for answers

# One CSV per split, read in train, val, test order.
nyt_split_paths = (
    '/content/drive/MyDrive/nyttrain.csv',
    '/content/drive/MyDrive/nytval.csv',
    '/content/drive/MyDrive/nyttest.csv',
)
nyt_train, nyt_val, nyt_test = map(pd.read_csv, nyt_split_paths)

In [None]:
#answer sets

import ast


def _flatten_answers(answer_column):
    """Parse every cell of an 'answer' column and flatten the results into one array.

    Each cell is a string holding a Python literal (e.g. a list of answers).
    ``ast.literal_eval`` is used instead of ``eval`` so that text read from a
    CSV file can never execute arbitrary code; it accepts only literals.

    Parameters
    ----------
    answer_column : iterable of str
        The raw 'answer' cells of one split.

    Returns
    -------
    numpy.ndarray
        Every parsed answer, in row order.
    """
    return np.array([
        answer
        for cell in answer_column
        for answer in ast.literal_eval(cell)
    ])


train_answers = _flatten_answers(nyt_train['answer'])
val_answers = _flatten_answers(nyt_val['answer'])
test_answers = _flatten_answers(nyt_test['answer'])

In [None]:
#validation recall

# FAISS fills a result row with -1 when it finds fewer neighbours than requested.
# Used as a NumPy index, -1 selects the LAST training answer, which could be counted
# as a spurious hit, so padded slots are dropped before the lookup.
np.mean([
    bool(val_answers[i] in train_answers[I[i][I[i] >= 0]])
    for i in range(len(val_answers))
])

0.36854566345182965

In [None]:
#export I matrix

# I holds one row per validation clue and one column per retrieved neighbour id;
# index=False keeps pandas' row labels out of the CSV.
pd.DataFrame(I).to_csv('/content/drive/MyDrive/I_val.csv', index=False)

In [None]:
# Validation pass is finished: drop its clue vectors, index objects and search results.
del clue_embed, index, quantizer, D, I

In [None]:
#load test clue embeddings

test_clue_csv = '/content/drive/MyDrive/testclueembed2.csv'
clue_embed_df = pd.read_csv(test_clue_csv)
# float32, C-ordered matrix for the FAISS search that follows.
clue_embed = np.array(clue_embed_df, dtype='float32', order='c')

In [None]:
#FAISS process for test clues

# The previous pass deleted its index, so it is rebuilt here from the same answer
# vectors with the same settings.
nlist = 50  # number of Voronoi cells the answer vectors are partitioned into
# Read the vector width from the data instead of hard-coding 384.
embed_dim = answer_embed.shape[1]
quantizer = faiss.IndexFlatL2(embed_dim)  # flat L2 index used to assign vectors to cells
index = faiss.IndexIVFFlat(quantizer, embed_dim, nlist)
index.train(answer_embed)  # the IVF index must be trained before vectors are added
index.add(answer_embed)
# NOTE(review): index.nprobe is never set, so the FAISS default applies (1 per
# the FAISS docs -- confirm); rows with fewer than 1000 candidates are padded with -1.
D, I = index.search(clue_embed, 1000)

In [None]:
#test recall

# FAISS fills a result row with -1 when it finds fewer neighbours than requested.
# Used as a NumPy index, -1 selects the LAST training answer, which could be counted
# as a spurious hit, so padded slots are dropped before the lookup.
np.mean([
    bool(test_answers[i] in train_answers[I[i][I[i] >= 0]])
    for i in range(len(test_answers))
])

0.36020375161707635

In [None]:
#export I matrix

# I holds one row per test clue and one column per retrieved neighbour id;
# index=False keeps pandas' row labels out of the CSV.
pd.DataFrame(I).to_csv('/content/drive/MyDrive/I_test.csv', index=False)

In [None]:
# Test pass is finished: drop its clue vectors, index objects and search results.
del clue_embed, index, quantizer, D, I