In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [98]:
import pandas as pd
import numpy as np
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
from nltk.tokenize.treebank import TreebankWordDetokenizer
import csv
from itertools import islice
import nltk
nltk.download('punkt')

from nltk import word_tokenize,sent_tokenize
from nltk.tokenize import wordpunct_tokenize
import regex as re
import pickle

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using device: cuda

Tesla T4
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


<torch._C.Generator at 0x7f4351323ed0>

In [0]:
# path = "drive/My Drive/FiQA/"

# docs = {}

# with open(path + "FiQA_train_doc_final.tsv",'r') as f:
#     for line in f:
#         # [qid, doc_id, rank]
#         line = line.strip().split('\t')

#         if len(line) == 4:
#             docid = int(line[1])
#             doc = line[2]

#             docs[docid] = []
#             docs[docid].append(doc)

# # take(5, docs.items())

In [0]:
path = "drive/My Drive/FiQA/"

# Doc ID to Doc text
collection = pd.read_csv(path + "FiQA_train_doc_final.tsv", sep="\t")
collection = collection[['docid', 'doc']]
collection = collection.sort_values(by=['docid'])

# Question ID and Question text
query_df = pd.read_csv(path + "FiQA_train_question_final.tsv", sep="\t")

queries = query_df[['qid', 'question']]

all_answers_id = collection['docid'].to_list()
#print(all_answers_id)

# Question ID and Answer ID pair
qid_docid = pd.read_csv(path + "FiQA_train_question_doc_final.tsv", sep="\t")

qid_docid = qid_docid [['qid', 'docid']]

In [79]:
# Cleaning data
empty_docs = []
empty_id = []

for index, row in collection.iterrows():
    if pd.isna(row['doc']):
        empty_docs.append(row['docid'])
        empty_id.append(index)

# print(empty_docs)
print(empty_id)

# Remove empty answers
collection2 = collection.drop(empty_id)

[741, 1158, 1329, 3208, 3937, 5266, 6399, 11237, 11407, 12328, 13459, 14869, 16143, 16326, 18297, 19839, 19944, 20739, 22923, 23949, 24603, 28093, 30966, 31492, 34386, 34539, 34697, 38382, 39802, 42403, 43008, 45178, 46794, 50845, 53111, 54963, 56460, 57452]


In [80]:
# Answers Tokenization
# tokenizer.tokenize(s)
def pre_process(doc):
    
    doc = str(doc)
    
    x = re.sub('[…“”%!&"@#()\-\*\+,/:;<=>?@[\]\^_`{\}~]', ' ', doc)
    
    y = re.sub('[\.\']', "", x)
    
    z = y.lower()
    
    return z

collection2['doc_processed'] = collection2['doc'].apply(pre_process)
collection2['tokenized_ans'] = collection2.apply(lambda row: wordpunct_tokenize(row['doc_processed']), axis=1)
collection2['ans_len'] = collection2.apply(lambda row: len(row['tokenized_ans']), axis=1)

collection2.head(5)

Unnamed: 0,docid,doc,doc_processed,tokenized_ans,ans_len
0,3,I'm not saying I don't like the idea of on-the...,im not saying i dont like the idea of on the j...,"[im, not, saying, i, dont, like, the, idea, of...",76
1,31,So nothing preventing false ratings besides ad...,so nothing preventing false ratings besides ad...,"[so, nothing, preventing, false, ratings, besi...",78
2,56,You can never use a health FSA for individual ...,you can never use a health fsa for individual ...,"[you, can, never, use, a, health, fsa, for, in...",74
3,59,Samsung created the LCD and other flat screen ...,samsung created the lcd and other flat screen ...,"[samsung, created, the, lcd, and, other, flat,...",54
4,63,Here are the SEC requirements: The federal sec...,here are the sec requirements the federal sec...,"[here, are, the, sec, requirements, the, feder...",222


In [82]:
len(collection2)

57600

In [61]:
avg_ans_count = collection2['ans_len'].mean()

print(avg_ans_count)

7821956
135.79784722222223


In [83]:
# Questions Tokenization
queries = queries.copy()
queries['q_processed'] = queries['question'].apply(pre_process)
queries['tokenized_q'] = queries.apply(lambda row: wordpunct_tokenize(row['q_processed']), axis=1)
queries['q_len'] = queries.apply(lambda row: len(row['tokenized_q']), axis=1)

queries.head(5)

Unnamed: 0,qid,question,q_processed,tokenized_q,q_len
0,0,What is considered a business expense on a bus...,what is considered a business expense on a bus...,"[what, is, considered, a, business, expense, o...",10
1,1,Claiming business expenses for a business with...,claiming business expenses for a business with...,"[claiming, business, expenses, for, a, busines...",9
2,2,Transferring money from One business checking ...,transferring money from one business checking ...,"[transferring, money, from, one, business, che...",10
3,3,Having a separate bank account for business/in...,having a separate bank account for business in...,"[having, a, separate, bank, account, for, busi...",13
4,4,Business Expense - Car Insurance Deductible Fo...,business expense car insurance deductible fo...,"[business, expense, car, insurance, deductible...",13


In [63]:
avg_q_count = queries['q_len'].mean()

print(avg_q_count)

72936
10.971119133574007


In [0]:
collection2.to_csv("processed_ans.csv", index=None)
queries.to_csv("processed_q.csv", index=None)

In [102]:
word2index = {0: "PAD"}
word2count = {}

idx = 1

for index, row in collection2.iterrows():
    for word in row['tokenized_ans']:
        if word not in word2index:
            word2index[word] = idx
            idx += 1
            word2count[word] = 1
        else:
            word2count[word] += 1

ans_vocab_size = len(word2index)

print("Answer vocab size: {}".format(ans_vocab_size))

idx = len(word2index)

for index, row in queries.iterrows():
    for word in row['tokenized_q']:
        if word not in word2index:
            word2index[word] = idx
            idx += 1
            word2count[word] = 1
        else:
            word2count[word] += 1

print(len(word2index))

q_vocab_size = len(word2index) - ans_vocab_size
print("Question vocab size: {}".format(q_vocab_size))

Answer vocab size: 84909
85034
Question vocab size: 125


In [96]:
# Reduce the size of the vocabuary
word2idx = {0: "PAD"}
idx = 1

for word, count in word2count.items():
    if count > 3:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1

print(word2idx)



In [97]:
len(word2idx)

28240

In [0]:
with open('vocab.pickle', 'wb') as handle:
    pickle.dump(word2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [121]:
qid_rel = {}

for index, row in qid_docid.iterrows():
    
    if row['qid'] not in qid_rel:
        qid_rel[row['qid']] = []
    qid_rel[row['qid']].append(row['docid'])
    
take(10, qid_rel.items())

[(0, [18850]),
 (1, [14255]),
 (2, [308938]),
 (3, [296717, 100764, 314352, 146317]),
 (4, [196463]),
 (5, [69306]),
 (6, [560251, 188530, 564488]),
 (7, [411063]),
 (8, [566392, 65404]),
 (9, [509122, 184698])]

In [0]:
test = qid_docid[:5]
d = {0: [1, 2], 1: [3, 4], 2: [1, 5], 3: [5, 6]}

In [0]:
# Answer Ranking for each question
doc_ranking = pd.read_csv(path + "run_train_small.tsv", sep="\t", header=None)
doc_ranking = doc_ranking.rename(columns={0: 'qid', 1: 'doc_id', 3:'rank'})

# Create dict for query id and ranked candidates
# key: query ids, values: list of 1000 ranked candidates
qid_ranked_docs = {}

with open(path + "run2_train.tsv",'r') as f:
    for line in f:
        # [qid, doc_id, rank]
        line = line.strip().split('\t')
        qid = int(line[0])
        doc_id = int(line[1])
        rank = int(line[2])
        
        if qid not in qid_ranked_docs:
            # Create a list of size 1000 for each query to store the candidates
            candidates = [0]*1000
            qid_ranked_docs[qid] = candidates
        qid_ranked_docs[qid][rank-1] = doc_id
        
#take(1, qid_ranked_docs.items())

In [47]:
train_set = []

for index, row in test.iterrows():
    for k, v in qid_ranked_docs.items():
        if k == row['qid']:
            tmp = []
            tmp.append(row['qid'])
            tmp.append(row['docid'])
            tmp.append(v)
            train_set.append(tmp)

print(train_set[0])

[0, 18850, [531578, 417981, 324911, 524879, 397608, 216077, 173212, 104464, 326261, 434846, 528838, 234436, 571062, 481692, 207449, 338700, 196374, 153377, 327002, 421301, 11538, 375748, 406418, 238271, 322893, 130631, 73427, 560087, 483385, 156554, 531442, 541809, 192843, 553328, 562777, 209224, 351672, 324513, 18850, 283505, 55200, 367754, 297841, 455984, 540395, 160340, 577284, 565935, 354716, 552845, 287474, 179144, 292748, 310612, 194308, 76618, 100764, 534997, 392484, 155490, 83059, 11132, 557186, 348787, 136071, 192516, 234743, 391619, 468741, 12729, 219313, 365558, 396056, 462831, 146657, 178942, 79411, 292919, 309909, 447231, 400230, 540325, 74688, 354511, 245447, 79397, 120500, 237207, 32072, 588509, 308472, 258155, 388042, 18934, 358631, 381151, 145148, 594531, 81599, 195207, 291717, 537363, 182168, 595759, 431110, 352640, 583956, 38963, 14317, 216783, 83346, 285449, 413681, 278460, 10093, 283079, 257168, 114541, 153541, 559866, 538208, 248761, 334107, 547941, 475273, 67370,

In [61]:
# Create dict for question id and relevant passgages
# keys: query ids, values: list of relevant passages
qid_rel = {}

for index, row in qid_docid.iterrows():
    if row['qid'] not in qid_rel:
        qid_rel[row['qid']] = []
    qid_rel[row['qid']].append(row['docid'])
    if row['docid'] ==  7915:
        print(row['qid'])
    
#take(10, qid_rel.items())

4074


In [64]:
queries.head(5)

queries.iloc[4074]

qid                                                     6218
question    To use a line of credit or withdraw from savings
Name: 4074, dtype: object