# 1. Import relevant libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 2. Data Processing

In [2]:
# Load the three raw FiQA training tables (tab-separated files whose first
# column is used as the row index): questions, question->document pairs, documents.
question_df, map_df, doc_df = [
    pd.read_csv(tsv_path, sep='\t', index_col=0)
    for tsv_path in (
        '../0.Datasets/raw/FiQA_train_question_final.tsv',
        '../0.Datasets/raw/FiQA_train_question_doc_final.tsv',
        '../0.Datasets/raw/FiQA_train_doc_final.tsv',
    )
]

In [3]:
# Preview the first rows of the questions table (qid, question, timestamp).
question_df.head()

Unnamed: 0,qid,question,timestamp
0,0,What is considered a business expense on a bus...,Nov 8 '11 at 15:14
1,1,Claiming business expenses for a business with...,May 13 '14 at 13:17
2,2,Transferring money from One business checking ...,Jan 20 '16 at 20:31
3,3,Having a separate bank account for business/in...,Mar 1 at 0:24
4,4,Business Expense - Car Insurance Deductible Fo...,Mar 4 at 0:26


In [4]:
# Summarise the questions table: row count, column dtypes and non-null counts.
question_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6648 entries, 0 to 6647
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   qid        6648 non-null   int64 
 1   question   6648 non-null   object
 2   timestamp  6648 non-null   object
dtypes: int64(1), object(2)
memory usage: 207.8+ KB


In [5]:
# Preview the question -> document pairs; a qid may appear on several rows.
map_df.head()

Unnamed: 0,qid,docid
0,0,18850
1,1,14255
2,2,308938
3,3,296717
4,3,100764


In [6]:
# Summarise the pairs table: row count, column dtypes and non-null counts.
map_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17110 entries, 0 to 17109
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   qid     17110 non-null  int64
 1   docid   17110 non-null  int64
dtypes: int64(2)
memory usage: 401.0 KB


In [7]:
# Preview the first rows of the documents table (docid, doc, timestamp).
doc_df.head()

Unnamed: 0,docid,doc,timestamp
0,3,I'm not saying I don't like the idea of on-the...,Oct 03 '12 at 14:56
1,31,So nothing preventing false ratings besides ad...,Sep 01 '17 at 13:36
2,56,You can never use a health FSA for individual ...,Jun 9 '14 at 17:37
3,59,Samsung created the LCD and other flat screen ...,Dec 27 at 01:37
4,63,Here are the SEC requirements: The federal sec...,Jul 14 '14 at 8:10


# Create mapped dataframe

In [8]:
# Build id -> text lookup tables directly from the columns instead of looping
# over DataFrame.iterrows(), which constructs a Series per row and is slow.
# Keys are plain Python ints (as with the previous int(...) conversion); when
# an id is repeated, the last occurrence wins, exactly as in the loop version.
question_mapping_dict = dict(
    zip(
        question_df["qid"].astype("int64").tolist(),
        question_df["question"].tolist(),
    )
)

answer_mapping_dict = dict(
    zip(
        doc_df["docid"].astype("int64").tolist(),
        doc_df["doc"].tolist(),
    )
)

In [9]:
def get_question(index):
    """Look up the question text for a question id.

    Args:
        index: Question id; coerced with ``int()`` before the lookup.

    Returns:
        The value stored in ``question_mapping_dict`` under that id.

    Raises:
        KeyError: If the id is not a key of ``question_mapping_dict``.
    """
    qid = int(index)
    return question_mapping_dict[qid]

def get_answer(index):
    """Look up the document (answer) text for a document id.

    Args:
        index: Document id; coerced with ``int()`` before the lookup.

    Returns:
        The value stored in ``answer_mapping_dict`` under that id.

    Raises:
        KeyError: If the id is not a key of ``answer_mapping_dict``.
    """
    docid = int(index)
    return answer_mapping_dict[docid]

In [10]:
# Attach the question and answer text to every (qid, docid) pair by resolving
# each id through the lookup helpers.
map_df["question"] = map_df["qid"].map(get_question)
map_df["answer"] = map_df["docid"].map(get_answer)

In [11]:
# Display the pairs table with the question and answer text columns attached.
map_df

Unnamed: 0,qid,docid,question,answer
0,0,18850,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
2,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...
3,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...
4,3,100764,Having a separate bank account for business/in...,"You don't specify which country you are in, so..."
...,...,...,...,...
17105,11096,407726,"Pensions, annuities, and “retirement”","An annuity is a product. In simple terms, you ..."
17106,11097,131224,What does it mean to a life insurance policy h...,A stock insurance company is structured like a...
17107,11099,424427,Can capital loss in traditional IRA and Roth I...,Edited in response to JoeTaxpayer's comment an...
17108,11099,150878,Can capital loss in traditional IRA and Roth I...,"No, you cannot. If you withdraw everything fro..."


In [12]:
# Discard every pair that has a missing value in any column; from here on
# map_df contains complete rows only.
map_df = map_df.dropna(axis=0, how="any")

In [13]:
# Display the pairs table again after rows with missing values were dropped.
map_df

Unnamed: 0,qid,docid,question,answer
0,0,18850,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
2,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...
3,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...
4,3,100764,Having a separate bank account for business/in...,"You don't specify which country you are in, so..."
...,...,...,...,...
17105,11096,407726,"Pensions, annuities, and “retirement”","An annuity is a product. In simple terms, you ..."
17106,11097,131224,What does it mean to a life insurance policy h...,A stock insurance company is structured like a...
17107,11099,424427,Can capital loss in traditional IRA and Roth I...,Edited in response to JoeTaxpayer's comment an...
17108,11099,150878,Can capital loss in traditional IRA and Roth I...,"No, you cannot. If you withdraw everything fro..."


In [14]:
# Some question/document pairs have no question or answer text. We save them
# in a separate dataframe for future use.
#
# map_df has already had its incomplete rows removed by the earlier dropna(),
# so searching map_df for NaNs would always produce an empty frame. Rebuild the
# full, un-dropped mapping from the raw file and the lookup dicts instead
# (ids missing from a dict map to NaN), then keep only the incomplete rows.
all_pairs_df = pd.read_csv('../0.Datasets/raw/FiQA_train_question_doc_final.tsv', sep = '\t', index_col = 0)
all_pairs_df["question"] = all_pairs_df["qid"].map(question_mapping_dict)
all_pairs_df["answer"] = all_pairs_df["docid"].map(answer_mapping_dict)

is_NaN = all_pairs_df.isnull()
row_has_NaN = is_NaN.any(axis=1)
no_answers_df = all_pairs_df[row_has_NaN]

In [15]:
# Independent (deep) copy of the cleaned pairs, used for the QnA.csv export.
map_df_2 = map_df.copy()

# Save to new csv

In [16]:
# Write the incomplete pairs to disk without the row index column.
no_answers_df.to_csv(path_or_buf="../0.Datasets/no_answers.csv", index=False)

In [17]:
# Write the full cleaned question/answer table; the row index is kept as the first column.
map_df_2.to_csv(path_or_buf="../0.Datasets/QnA.csv", index=True)

# Split into train and test sets (20% test)

In [18]:
# Hold out 20% of the cleaned question/answer pairs as the test set; the
# random_state is fixed so the same split is produced on every run.
train, test = train_test_split(
    map_df,
    test_size=0.2,
    random_state=425,
)

In [19]:
# Persist both splits; the original row index is kept as the first column of each file.
train.to_csv(path_or_buf="../0.Datasets/train_test_split/train.csv", index=True)
test.to_csv(path_or_buf="../0.Datasets/train_test_split/test.csv", index=True)