In [3]:
from google.colab import drive
drive.mount('/content/drive')

path = "drive/My Drive/FiQA/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pickle
from collections import Counter
from tqdm import tqdm
import itertools
import pandas as pd
from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

def remove_empty(test_set):
    for index, row in enumerate(test_set):
        for doc in row[1]:
            if doc in empty_docs:
                del test_set[index]
    return test_set

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(path, data):
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
train_set = load_pickle(path + 'data-bert/data_train_50.pickle')
empty_docs = load_pickle(path+'empty_docs.pickle')
train_set = [x for x in train_set if x[1] not in empty_docs]

In [7]:
!pip install transformers

import torch
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)



Loading BERT tokenizer...


In [0]:
collection = pd.read_csv(path+"data-bert/collection_new.tsv", sep="\t", header=None)
collection = collection.rename(columns={0: 'docid', 1: 'doc'})

def load_questions(path):
    """
    Returns a dataframe of cols: qid, question
    """
    # Question ID and Question text
    query_df = pd.read_csv(path, sep="\t")
    queries = query_df[['qid', 'question']]

    return queries

queries = load_questions(path + "FiQA_train_question_final.tsv")

In [0]:
queries['tokenized_q'] = queries['question'].apply(lambda x: tokenizer.tokenize(x))
queries['len'] = queries['tokenized_q'].apply(lambda x: len(x))

In [10]:
queries.len.max()

40

In [0]:
collection['tokenized_a'] = collection['doc'].apply(lambda x: tokenizer.tokenize(x))
collection['len'] = collection['tokenized_a'].apply(lambda x: len(x))

In [12]:
collection.head(5)

Unnamed: 0,docid,doc,tokenized_a,len
0,3,I'm not saying I don't like the idea of on-the...,"[i, ', m, not, saying, i, don, ', t, like, the...",100
1,31,So nothing preventing false ratings besides ad...,"[so, nothing, preventing, false, ratings, besi...",95
2,56,You can never use a health FSA for individual ...,"[you, can, never, use, a, health, f, ##sa, for...",100
3,59,Samsung created the LCD and other flat screen ...,"[samsung, created, the, lcd, and, other, flat,...",62
4,63,Here are the SEC requirements: The federal sec...,"[here, are, the, sec, requirements, :, the, fe...",254


In [13]:
queries.head(5)

Unnamed: 0,qid,question,tokenized_q,len
0,0,What is considered a business expense on a bus...,"[what, is, considered, a, business, expense, o...",11
1,1,Claiming business expenses for a business with...,"[claiming, business, expenses, for, a, busines...",9
2,2,Transferring money from One business checking ...,"[transferring, money, from, one, business, che...",10
3,3,Having a separate bank account for business/in...,"[having, a, separate, bank, account, for, busi...",18
4,4,Business Expense - Car Insurance Deductible Fo...,"[business, expense, -, car, insurance, de, ##d...",17


In [0]:
label_to_ans = {}

for index, row in collection.iterrows():
    label_to_ans[row['docid']] = row['tokenized_a']

# Question to question text
qid_to_text = {}

for index, row in queries.iterrows():
    qid_to_text[row['qid']] = row['tokenized_q']

In [0]:
save_pickle(path+"data-bert/label_to_ans.pickle", label_to_ans)
save_pickle(path+"data-bert/qid_to_text.pickle", qid_to_text)

In [0]:
def add_question_token(q_tokens):
    c = "[CLS]"
    s = "[SEP]"
    q_tokens = [c] + q_tokens
    q_tokens = q_tokens + [s]

    return q_tokens

def add_ans_token(a_tokens):
    s = "[SEP]"
    a_tokens = a_tokens + [s]

    return a_tokens

In [17]:
train_df = pd.DataFrame(train_set)

train_df = train_df.rename(columns={0: 'qid', 1: 'pos', 2:'neg'})

train_pos = train_df[['qid', 'pos']]
train_pos = train_pos.rename(columns={'pos': 'docid'})
train_pos['label'] = train_pos.apply(lambda x: 1, axis=1)
train_pos = train_pos.drop_duplicates()

train_neg = train_df[['qid', 'neg']]
train_neg = train_neg.rename(columns={'neg': 'docid'})
train_neg['label'] = train_neg.apply(lambda x: 0, axis=1)

train = pd.concat([train_pos, train_neg]).sort_values(by=['qid'])

train.head(5)

Unnamed: 0,qid,docid,label
0,0,18850,1
20,0,408058,0
21,0,343708,0
22,0,311069,0
23,0,139094,0


In [18]:
train['question'] = train['qid'].apply(lambda x: qid_to_text[x])
train['ans_cand'] = train['docid'].apply(lambda x: label_to_ans[x])
train['ques_token'] = train['question'].apply(lambda x: add_question_token(x))
train['ans_cand'] = train['ans_cand'].apply(lambda x: add_ans_token(x))

# train['seq'] = train[['question', 'ans_cand']].apply(lambda x: ' '.join(x), axis=1)

train.head(5)

Unnamed: 0,qid,docid,label,question,ans_cand,ques_token
0,0,18850,1,"[what, is, considered, a, business, expense, o...","[the, irs, guidance, pertaining, to, the, subj...","[[CLS], what, is, considered, a, business, exp..."
20,0,408058,0,"[what, is, considered, a, business, expense, o...","[the, advice, you, were, given, in, the, other...","[[CLS], what, is, considered, a, business, exp..."
21,0,343708,0,"[what, is, considered, a, business, expense, o...","[the, us, with, ##holding, tax, applies, to, s...","[[CLS], what, is, considered, a, business, exp..."
22,0,311069,0,"[what, is, considered, a, business, expense, o...","[&, gt, ;, you, also, have, to, think, about, ...","[[CLS], what, is, considered, a, business, exp..."
23,0,139094,0,"[what, is, considered, a, business, expense, o...","[they, are, similar, in, the, sense, that, the...","[[CLS], what, is, considered, a, business, exp..."


In [20]:
train.at[0, "ques_token"]

array([list(['[CLS]', 'what', 'is', 'considered', 'a', 'business', 'expense', 'on', 'a', 'business', 'trip', '?', '[SEP]']),
       list(['[CLS]', 'what', 'is', 'considered', 'a', 'business', 'expense', 'on', 'a', 'business', 'trip', '?', '[SEP]'])],
      dtype=object)

In [0]:
train.to_csv(path+"data-bert/train.csv", index=False, header=False)

In [101]:
train.question.str.len().max()
train.ans_cand.str.len().max()

16990

In [89]:
train = train[['qid', 'docid', 'label', 'seq']]
train.head(5)

Unnamed: 0,qid,docid,label,seq
0,0,18850,1,[CLS] What is considered a business expense on...
20,0,408058,0,[CLS] What is considered a business expense on...
21,0,343708,0,[CLS] What is considered a business expense on...
22,0,311069,0,[CLS] What is considered a business expense on...
23,0,139094,0,[CLS] What is considered a business expense on...


In [0]:
# Get the lists of sentences and their labels.
sequences = train.seq.values
labels = train.label.values

In [95]:
# Print the original sentence.
print(' Original: ', sequences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sequences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequences[0])))

 Original:  [CLS] What is considered a business expense on a business trip? [SEP] The IRS Guidance pertaining to the subject.  In general the best I can say is your business expense may be deductible.  But it depends on the circumstances and what it is you want to deduct. Travel Taxpayers who travel away from home on business may deduct related   expenses, including the cost of reaching their destination, the cost   of lodging and meals and other ordinary and necessary expenses.   Taxpayers are considered “traveling away from home” if their duties   require them to be away from home substantially longer than an   ordinary day’s work and they need to sleep or rest to meet the demands   of their work. The actual cost of meals and incidental expenses may be   deducted or the taxpayer may use a standard meal allowance and reduced   record keeping requirements. Regardless of the method used, meal   deductions are generally limited to 50 percent as stated earlier.    Only actual costs for lo

In [0]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

for seq in sequences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Map tokens to their IDs.
    encoded_seq = tokenizer.encode(
                        seq,                      # Sentence to encode.
                        add_special_tokens = False, # Add '[CLS]' and '[SEP]'
                   )
    
    # Add the encoded sentence to the list.
    input_ids.append(encoded_seq)

# Print sentence 0, now as a list of IDs.
print('Original: ', sequences[0])
print('Token IDs:', input_ids[0])