# Generate training dataset for SAP Conversational AI (CAI)

In [20]:
import pandas as pd

# Load the Q&A CSV and drop only its first column; the qid and docid
# columns are kept (presumably the dropped column is a saved row index
# -- confirm against the CSV).
data = pd.read_csv('../QnA.csv', encoding='utf-8').iloc[:, 1:]
data

Unnamed: 0,qid,docid,question,answer
0,0,18850,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
2,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...
3,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...
4,3,100764,Having a separate bank account for business/in...,"You don't specify which country you are in, so..."
...,...,...,...,...
17067,11096,407726,"Pensions, annuities, and “retirement”","An annuity is a product. In simple terms, you ..."
17068,11097,131224,What does it mean to a life insurance policy h...,A stock insurance company is structured like a...
17069,11099,424427,Can capital loss in traditional IRA and Roth I...,Edited in response to JoeTaxpayer's comment an...
17070,11099,150878,Can capital loss in traditional IRA and Roth I...,"No, you cannot. If you withdraw everything fro..."


In [21]:
def check_2500_ch(row):
    """Report whether a value is at least 2500 characters long.

    Args:
        row: Any sized value; a string when applied to a text column.

    Returns:
        True if ``len(row)`` is 2500 or more, otherwise False.
    """
    return len(row) >= 2500

def check_500_ch(row):
    """Report whether a value is at least 500 characters long.

    Args:
        row: Any sized value; a string when applied to a text column.

    Returns:
        True if ``len(row)`` is 500 or more, otherwise False.
    """
    return len(row) >= 500

In [22]:
# Flag questions of 500+ characters and answers of 2500+ characters.
data['long_question'] = data['question'].apply(check_500_ch)
data['long_answer'] = data['answer'].apply(check_2500_ch)
data

Unnamed: 0,qid,docid,question,answer,long_question,long_answer
0,0,18850,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...,False,True
1,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...,False,False
2,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...,False,False
3,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...,False,False
4,3,100764,Having a separate bank account for business/in...,"You don't specify which country you are in, so...",False,False
...,...,...,...,...,...,...
17067,11096,407726,"Pensions, annuities, and “retirement”","An annuity is a product. In simple terms, you ...",False,True
17068,11097,131224,What does it mean to a life insurance policy h...,A stock insurance company is structured like a...,False,True
17069,11099,424427,Can capital loss in traditional IRA and Roth I...,Edited in response to JoeTaxpayer's comment an...,False,True
17070,11099,150878,Can capital loss in traditional IRA and Roth I...,"No, you cannot. If you withdraw everything fro...",False,False


In [23]:
# Keep only the rows where neither the question nor the answer was flagged
# as too long.
data = data[~(data['long_question'].eq(True) | data['long_answer'].eq(True))]
# Renumber the surviving rows from 0 and keep just the first four columns,
# which leaves out the long_question / long_answer flag columns.
data = data.reset_index(drop=True).iloc[:, :4]
data

Unnamed: 0,qid,docid,question,answer
0,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
1,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...
2,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...
3,3,100764,Having a separate bank account for business/in...,"You don't specify which country you are in, so..."
4,3,314352,Having a separate bank account for business/in...,"If it makes your finances easier, why not? My ..."
...,...,...,...,...
15970,11096,132601,"Pensions, annuities, and “retirement”",There are broadly two kinds of pension: final ...
15971,11096,147730,"Pensions, annuities, and “retirement”",Pension in this instance seems to mean pension...
15972,11096,522438,"Pensions, annuities, and “retirement”","With an annuity, you invest directly into an a..."
15973,11099,150878,Can capital loss in traditional IRA and Roth I...,"No, you cannot. If you withdraw everything fro..."


In [24]:
# Clean for SAP Conversational AI: keep only the first row for each distinct
# question text, then renumber the remaining rows from 0.
data = data.drop_duplicates(subset='question').reset_index(drop=True)
data

Unnamed: 0,qid,docid,question,answer
0,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
1,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...
2,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...
3,4,196463,Business Expense - Car Insurance Deductible Fo...,"As a general rule, you must choose between a m..."
4,5,69306,Starting a new online business,Most US states have rules that go something li...
...,...,...,...,...
6464,11090,69696,Incorporating real-world parameters into simul...,You said the decision will be made by EOD. If ...
6465,11092,374410,Is real (physical) money traded during online ...,With Forex trading - physical currency is not ...
6466,11096,132601,"Pensions, annuities, and “retirement”",There are broadly two kinds of pension: final ...
6467,11099,150878,Can capital loss in traditional IRA and Roth I...,"No, you cannot. If you withdraw everything fro..."


In [25]:
# Training set: every column after the first two (qid and docid are left out).
data_short = data.loc[:, data.columns[2:]]
data_short

Unnamed: 0,question,answer
0,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
1,Transferring money from One business checking ...,You should have separate files for each of the...
2,Having a separate bank account for business/in...,Having a separate checking account for the bus...
3,Business Expense - Car Insurance Deductible Fo...,"As a general rule, you must choose between a m..."
4,Starting a new online business,Most US states have rules that go something li...
...,...,...
6464,Incorporating real-world parameters into simul...,You said the decision will be made by EOD. If ...
6465,Is real (physical) money traded during online ...,With Forex trading - physical currency is not ...
6466,"Pensions, annuities, and “retirement”",There are broadly two kinds of pension: final ...
6467,Can capital loss in traditional IRA and Roth I...,"No, you cannot. If you withdraw everything fro..."


In [27]:
# export cleaned dataset for evaluation
# data.to_csv('sap-qna-full-v2.csv', index=False, encoding='utf-8')

In [49]:
# export cleaned dataset for training
# data_short.to_csv('sap-qna-v2.csv', index=False, encoding='utf-8')