In [0]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the data paths used by the
# cells below live under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

In [0]:
from utils import *
from process_data import *

# Project root on the mounted Google Drive. This is a relative path, so it
# only resolves when the working directory is /content (the parent of the
# '/content/drive' mount point used by drive.mount).
path = "drive/My Drive/Thesis/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# Load the raw FiQA answer documents with the dict-returning loader.
answers_tsv = path + "data/raw/FiQA_train_doc_final.tsv"
docs = load_answers_to_dict(answers_tsv)

In [0]:
# Locations of the three raw FiQA training files.
answers_path = path + "data/raw/FiQA_train_doc_final.tsv"
questions_path = path + "data/raw/FiQA_train_question_final.tsv"
pairs_path = path + "data/raw/FiQA_train_question_doc_final.tsv"

# Answers and questions as DataFrames.
collection = load_answers_to_df(answers_path)
queries = load_questions_to_df(questions_path)
# Question ID and Answer ID pair
qid_docid = load_qid_docid_to_df(pairs_path)

In [0]:
# Report the size of each raw table, one line per table.
raw_size_report = (
    ("Number of answers: {}", collection),
    ("Number of questions: {}", queries),
    ("Number of QA pairs: {}", qid_docid),
)
for template, table in raw_size_report:
    print(template.format(len(table)))

Number of answers: 57638
Number of questions: 6648
Number of QA pairs: 17110


# **Clean data**

In [0]:
# Cleaning data
empty_docs, empty_id = get_empty_docs(collection)
# Remove empty answers from collection of answers
collection_cleaned = collection.drop(empty_id)
# Remove empty answers from qa pairs
qid_docid = qid_docid[~qid_docid['docid'].isin(empty_docs)]

print("Number of answers after cleaning: {}".format(len(collection_cleaned)))
print("Number of QA pairs after cleaning: {}".format(len(qid_docid)))

Number of answers after cleaning: 57600
Number of QA pairs after cleaning: 17072


# **Prepare data for Anserini**

In [0]:
# Write collection df to file
save_tsv(path + "data/retrieval/collection_cleaned.tsv", collection_cleaned)

# Convert collection df to JSON file for Anserini's document indexer
collection_to_json(path + "data/retrieval/collection_json/docs.json", path + "data/retrieval/collection_cleaned.tsv")

# **Split question and answer pairs into train, test, and validation sets**

In [0]:
# Split the QA relevance pairs into train / test / validation label sets.
train_label, test_label, valid_label = split_label(qid_docid)

# Save label
# The objects saved here must be the ones unpacked above. The previous code
# passed `train`, `test` and `valid`, which are not defined by this notebook
# and fail with NameError on a fresh kernel.
save_pickle(path + "data/retrieval/train/qid_rel_train.pickle", train_label)
save_pickle(path + "data/retrieval/test/qid_rel_test.pickle", test_label)
save_pickle(path + "data/retrieval/valid/qid_rel_valid.pickle", valid_label)

In [0]:
# Partition the questions using the label splits.
train_questions, test_questions, valid_questions = split_question(
    train_label, test_label, valid_label, queries
)

# Save the questions dataset: one file per split, written in train/test/valid order.
question_outputs = (
    (path + "data/retrieval/train/train_questions", train_questions),
    (path + "data/retrieval/test/test_questions", test_questions),
    (path + "data/retrieval/valid/valid_questions", valid_questions),
)
for out_file, question_set in question_outputs:
    save_tsv(out_file, question_set)

In [0]:
# Number of questions in each set
print("Number of questions in the training set: {}".format(len(train_questions)))
print("Number of questions in the testing set: {}".format(len(test_questions)))
print("Number of questions in the validation set: {}".format(len(valid_questions)))

Number of questions in the training set: 5681
Number of questions in the testing set: 333
Number of questions in the validation set: 632


In [0]:
def process_questions(queries):
    """Return a copy of `queries` with cleaned and tokenized question columns.

    The input frame is left untouched; all work happens on a copy.

    Args:
        queries: DataFrame with a 'question' column.

    Returns:
        The copied DataFrame with three added columns:
          - 'q_processed': `pre_process` applied to 'question'
          - 'tokenized_q': `wordpunct_tokenize` applied to 'q_processed'
          - 'q_len': number of tokens in 'tokenized_q'
    """
    queries = queries.copy()
    queries['q_processed'] = queries['question'].apply(pre_process)
    # Each derived column depends on a single source column, so apply the
    # function to that Series directly instead of building a row object per
    # record with DataFrame.apply(axis=1). This also yields an empty Series
    # (not an empty DataFrame) when the frame has no rows.
    queries['tokenized_q'] = queries['q_processed'].apply(wordpunct_tokenize)
    queries['q_len'] = queries['tokenized_q'].apply(len)

    return queries

def pre_process(doc):
    """Normalize a piece of text for tokenization.

    Steps, in order:
      1. Coerce the input to `str` (so None becomes "None", numbers their repr).
      2. Replace each listed punctuation character with a single space.
      3. Delete periods and apostrophes outright (no space inserted).
      4. Lowercase the result.

    Whitespace is not collapsed, so adjacent punctuation yields runs of spaces.

    Args:
        doc: Any object; it is converted with `str()`.

    Returns:
        The normalized, lowercased string.
    """
    text = str(doc)
    # Raw strings keep the backslashes for the regex engine. The previous
    # non-raw literals relied on invalid string escapes such as '\-' and '\]',
    # which modern Python reports as a warning. The character set is unchanged.
    spaced = re.sub(r'[…“”%!&"@#()\-*+,/:;<=>?\[\]^_`{}~]', ' ', text)
    stripped = re.sub(r"[.']", "", spaced)
    return stripped.lower()


In [0]:
# # Questions Tokenization
# def process_questions(queries):
#     queries = queries.copy()
#     queries['q_processed'] = queries['question'].apply(pre_process)
#     queries['tokenized_q'] = queries.apply(lambda row: wordpunct_tokenize(row['q_processed']), axis=1)
#     queries['q_len'] = queries.apply(lambda row: len(row['tokenized_q']), axis=1)

#     return queries

# train_questions = process_questions(train_questions)
# train_questions = train_questions[['qid', 'q_processed']]

# test_questions = process_questions(test_questions)
# test_questions = test_questions[['qid', 'q_processed']]

# valid_questions = process_questions(valid_questions)
# valid_questions = valid_questions[['qid', 'q_processed']]


# # avg_q_count = queries['q_len'].mean()
# # print(avg_q_count)

# print(len(train_questions))
# print(len(test_questions))
# print(len(valid_questions))

# with open(path + "data/train_questions.tsv",'w') as write_tsv:
#     write_tsv.write(train_questions.to_csv(sep='\t', index=False, header=False))

# with open(path + "data/test_questions.tsv",'w') as write_tsv:
#     write_tsv.write(test_questions.to_csv(sep='\t', index=False, header=False))

# with open(path + "data/valid_questions.tsv",'w') as write_tsv:
#     write_tsv.write(valid_questions.to_csv(sep='\t', index=False, header=False))

In [0]:
# # Answers Tokenization
# collection2['doc_processed'] = collection2['doc'].apply(pre_process)
# collection2['tokenized_ans'] = collection2.apply(lambda row: wordpunct_tokenize(row['doc_processed']), axis=1)
# collection2['ans_len'] = collection2.apply(lambda row: len(row['tokenized_ans']), axis=1)

# Preview the cleaned answers. `collection2` appears only in commented-out
# code and is not assigned anywhere in the visible notebook, so referencing
# it here raised NameError on a fresh kernel.
collection_cleaned.head(5)

# len(collection2)

# avg_ans_count = collection2['ans_len'].mean()

# print(avg_ans_count)

In [0]:
# with open('empty_docs.pickle', 'wb') as handle:
#     pickle.dump(empty_docs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# word2index = {"PAD": 0}
# word2count = {}

# idx = 1

# for index, row in collection2.iterrows():
#     for word in row['tokenized_ans']:
#         if word not in word2index:
#             word2index[word] = idx
#             idx += 1
#             word2count[word] = 1
#         else:
#             word2count[word] += 1
            
# ans_vocab_size = len(word2index)

# print("Answer vocab size: {}".format(ans_vocab_size))

# idx = len(word2index)

# for index, row in queries.iterrows():
#     for word in row['tokenized_q']:
#         if word not in word2index:
#             word2index[word] = idx
#             idx += 1
#             word2count[word] = 1
#         else:
#             word2count[word] += 1

# print(len(word2index))

# q_vocab_size = len(word2index) - ans_vocab_size
# print("Question vocab size: {}".format(q_vocab_size))

In [0]:
# # Reduce the size of the vocabulary
# word2idx = {"PAD": 0}
# word2c = {}
# idx = 1

# for word, count in word2count.items():
#     if count > 3:
#         if word not in word2idx:
#             word2idx[word] = idx
#             idx += 1
#             word2c[word] = count

# print(word2idx)

In [0]:
# # Vocab size
# len(word2idx)

# c = Counter(word2c)
# mc = c.most_common(30)

In [0]:
# with open('vocab_full.pickle', 'wb') as handle:
#     pickle.dump(word2index, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('vocab.pickle', 'wb') as handle:
#     pickle.dump(word2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('vocab_count.pickle', 'wb') as handle:
#     pickle.dump(word2c, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # Label to answer text
# docid_tokenized = collection2[['docid', 'tokenized_ans']]

# docid_tokenized.head(5)

# label_to_ans = {}

# for index, row in docid_tokenized.iterrows():
#     label_to_ans[row['docid']] = row['tokenized_ans']

# print(take(5, label_to_ans.items()))

# with open('label_ans.pickle', 'wb') as handle:
#     pickle.dump(label_to_ans, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# # Question to question text
# q_tokenized = queries[['qid', 'tokenized_q']]

# qid_to_text = {}

# for index, row in q_tokenized.iterrows():
#     qid_to_text[row['qid']] = row['tokenized_q']

# print(take(5, qid_to_text.items()))

# with open('qid_text.pickle', 'wb') as handle:
#     pickle.dump(qid_to_text, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# Answer ranking for each question.
# Build a dict from the ranked-candidates file:
#   key:   query id
#   value: list of NUM_CANDIDATES doc ids ordered by rank, where index 0 holds
#          the rank-1 candidate; slots that are never filled stay 0.
NUM_CANDIDATES = 500  # list length per query (the input file is cands_train_500.tsv)

qid_ranked_docs = {}

with open(path + "data-bert/cands_train_500.tsv", 'r') as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline) instead of failing on int('').
            continue

        # Columns: qid, doc_id, rank (rank is 1-based, see the index below)
        qid, doc_id, rank = (int(field) for field in line.split('\t')[:3])

        # rank 0 would silently overwrite the last slot via index -1, and a
        # rank above the list length would raise a bare IndexError.
        if not 1 <= rank <= NUM_CANDIDATES:
            raise ValueError(
                "line {}: rank {} for qid {} is outside 1..{}".format(
                    line_no, rank, qid, NUM_CANDIDATES
                )
            )

        if qid not in qid_ranked_docs:
            # Pre-size the candidate list so doc ids can be placed by rank.
            qid_ranked_docs[qid] = [0] * NUM_CANDIDATES
        qid_ranked_docs[qid][rank - 1] = doc_id

print(take(1, qid_ranked_docs.items()))

# NOTE(review): the file name says "_100" while the candidates come from the
# 500-candidate file, and it is written to the working directory rather than
# under `path`. Left unchanged in case other code loads this name; confirm.
save_pickle('qid_ranked_docs_100.pickle', qid_ranked_docs)

In [0]:
# qid_rel = {}

# for index, row in qid_docid.iterrows():
    
#     if row['qid'] not in qid_rel:
#         qid_rel[row['qid']] = []
#     qid_rel[row['qid']].append(row['docid'])
    
# with open('qid_rel.pickle', 'wb') as handle:
#     pickle.dump(qid_rel, handle, protocol=pickle.HIGHEST_PROTOCOL)