In [100]:
import numpy as np
import pandas as pd
import string
import nltk
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification

In [101]:
# Load the MSDialog-Intent dump: every column is one dialog (the column label
# is used as the dialog ID later on) and every row is one dialog-level field.
data=pd.read_json('data/MSDialog-Intent.json')
# Keep the original row labels (field names) before they are replaced by
# positions, so the rows can be addressed with iloc further down.
index=data.index
data=data.reset_index(drop=True)

In [102]:
# Intent tag -> position of that tag in the multi-hot label vector.
# The order of the tuple defines the column order of the label columns.
labelTable = {
    tag: position
    for position, tag in enumerate(
        ('OQ', 'RQ', 'CQ', 'FD', 'FQ', 'IR', 'PA', 'PF', 'NF', 'GG', 'JK', 'O')
    )
}

In [103]:
# Flatten the nested dialog table into parallel per-utterance lists.
utterances = []        # utterance text, one entry per utterance
label = []             # multi-hot intent vector, one entry per utterance
otherFeatures = {}     # field name -> list of values, one entry per utterance
# Original row labels minus the last one; the loop below uses the first four
# as the names of the dialog-level fields stored in rows 0-3 of `data`.
diaglogKey = index[:-1]
# Utterance-level field names, read from the first utterance of the first
# dialog. The text and the tags are handled separately, so they are removed.
utterancesKey = list(data.iloc[4, 0][0].keys())
utterancesKey.remove('utterance')
utterancesKey.remove('tags')
maxLen = 0             # character length of the longest utterance seen
IDs = []               # dialog ID (column label of `data`), one entry per utterance

for i in range(len(data.columns)):
    temp = data.iloc[4, i]      # row 4 holds the dialog's list of utterance dicts
    ID = data.columns[i]
    for turn in temp:

        # ID of dialog
        IDs.append(ID)

        # Utterance
        text = turn['utterance']
        maxLen = max(maxLen, len(text))
        utterances.append(text)

        # Other features: the dialog-level values are repeated for each of the
        # dialog's utterances, followed by the utterance-level values.
        for k in range(4):
            otherFeatures.setdefault(diaglogKey[k], []).append(data.iloc[k, i])
        for key in utterancesKey:
            otherFeatures.setdefault(key, []).append(turn[key])

        # Label
        # split() without an argument ignores leading, trailing and repeated
        # whitespace, so an empty tag can never reach the labelTable lookup
        # (split(' ') + a single remove('') left one behind in those cases).
        labels = turn['tags'].split()
        # GG, O and JK are dropped, in this order, as long as at least one
        # other tag remains.
        for weakTag in ('GG', 'O', 'JK'):
            if len(labels) > 1 and weakTag in labels:
                labels.remove(weakTag)
        # Two specific four-tag combinations are collapsed to the single tag CQ.
        if ({'RQ', 'CQ', 'FD', 'IR'}.issubset(labels)
                or {'PF', 'CQ', 'FD', 'FQ'}.issubset(labels)):
            labels = ['CQ']

        # Multi-hot encoding; the width follows labelTable instead of a literal 12.
        tempLabel = [0] * len(labelTable)
        for tag in labels:
            tempLabel[labelTable[tag]] = 1

        label.append(tempLabel)

In [104]:
# Assemble the per-utterance table: label columns first, then the text, the
# dialog ID, the utterance-level fields and finally the dialog-level fields.
Data = pd.DataFrame(label, columns=list(labelTable))
Data['utterance'] = utterances
Data['diaglogID'] = IDs
# The dialog IDs come from the column labels of the raw table; store them as int64.
Data['diaglogID'] = Data['diaglogID'].astype('int64')
for fieldName in list(utterancesKey) + list(diaglogKey):
    Data[fieldName] = otherFeatures[fieldName]

In [105]:
# Preview the first rows of the flattened per-utterance table.
Data.head()

Unnamed: 0,OQ,RQ,CQ,FD,FQ,IR,PA,PF,NF,GG,...,actor_type,user_id,vote,utterance_time,affiliation,is_answer,title,category,dialog_time,frequency
0,1,0,0,0,0,0,0,0,0,0,...,User,Sandra,Freq_0,2017-10-02T11:07:29,Common User,0,backgroundTaskHost.exe stopped working,Windows_10,2017-10-02T11:07:29,0
1,0,0,0,0,0,0,1,0,0,0,...,Agent,Cheryl,1,2017-10-02T11:12:52,MVP Community Moderator | Article Author,1,backgroundTaskHost.exe stopped working,Windows_10,2017-10-02T11:07:29,0
2,0,0,0,0,0,0,0,1,0,0,...,User,Sandra,0,2017-10-02T11:49:16,,0,backgroundTaskHost.exe stopped working,Windows_10,2017-10-02T11:07:29,0
3,1,0,0,0,0,1,0,0,1,0,...,User,James,Freq_3,2015-11-05T03:50:07,Common User,0,Windows 10 Microsoft Edge is slow - System Per...,Windows_10,2015-11-05T03:50:07,3
4,0,0,0,0,0,0,1,1,0,0,...,Agent,Faith,0,2015-11-05T09:37:10,Microsoft,1,Windows 10 Microsoft Edge is slow - System Per...,Windows_10,2015-11-05T03:50:07,3


## Data Cleaning

In [106]:
class Preprocesser():
    """Rule-based cleaner that works on whitespace-separated tokens.

    Every ``remove_*`` method takes a list of tokens and returns a new list
    without modifying its argument. ``preprocess`` chains the filters on a raw
    string and joins the remaining tokens with a space.
    """

    def __init__(self):
        # Translation table that deletes every character of string.punctuation.
        self.table = str.maketrans('', '', string.punctuation)

    def remove_hyperlinks(self, text_list):
        """Drop URL-like tokens: those with both ':' and '/', or containing 'http'."""
        return [
            word for word in text_list
            if not (':' in word and '/' in word) and 'http' not in word
        ]

    def remove_punctuations(self, text_lst):
        """Strip punctuation from tokens that carry no '.', '?', '!' or ','.

        Tokens containing one of those four marks are returned untouched so
        that sentence boundaries stay visible; all other tokens lose every
        punctuation character (and may become empty strings).
        """
        kept_marks = '.?!,'
        return [
            word if any(mark in word for mark in kept_marks)
            else word.translate(self.table)
            for word in text_lst
        ]

    def remove_htmlStructure(self, text_lst):
        """Drop tokens containing '/>', '<' or '·', or at least two periods."""
        return [
            word for word in text_lst
            if not ('/>' in word or '<' in word or '·' in word
                    or word.count('.') >= 2)
        ]

    def remove_otherInf(self, text_lst):
        """Drop tokens containing '@' and tokens containing both '(' and ')'."""
        return [
            word for word in text_lst
            if '@' not in word and not ('(' in word and ')' in word)
        ]

    def remove_number(self, text_lst):
        """Drop tokens that contain two or more *distinct* digits.

        '12' is dropped, while '11' (a single distinct digit) is kept.
        """
        return [
            word for word in text_lst
            if sum(digit in word for digit in string.digits) < 2
        ]

    def remove_errorPunctuation(self, text_lst):
        """Split tokens in which a sentence-ending period lacks the following space.

        A period directly between a lowercase and an uppercase letter
        ('working.Then') is treated as a missing space: the token is split
        after the period into 'working.' and 'Then'. All other tokens
        ('end.', '3.5', 'file.txt') are returned unchanged.

        NOTE(review): the previous body was an unfinished stub that split on
        '.' but never appended anything, so it always returned an empty list.
        This completion is the most conservative reading of that intent —
        confirm before adding the step to ``preprocess`` (it is not called
        there, so the pipeline output is unaffected).
        """
        repaired = []
        for word in text_lst:
            start = 0
            # Only interior periods can have a letter on both sides.
            for pos in range(1, len(word) - 1):
                if (word[pos] == '.' and word[pos - 1].islower()
                        and word[pos + 1].isupper()):
                    repaired.append(word[start:pos + 1])
                    start = pos + 1
            repaired.append(word[start:])
        return repaired

    def preprocess(self, text):
        """Clean one utterance and return it as a single space-joined string.

        The order matters: URL, e-mail/parenthesis, markup and number filters
        inspect the raw tokens, so they run before punctuation is stripped.
        """
        text_lst = text.split()
        for step in (
            self.remove_hyperlinks,
            self.remove_otherInf,
            self.remove_htmlStructure,
            self.remove_number,
            self.remove_punctuations,
        ):
            text_lst = step(text_lst)
        return " ".join(text_lst)

In [107]:
# Single cleaner instance; its preprocess() is applied to every utterance.
preprocesser=Preprocesser()

In [108]:
# Discard rows without utterance text and renumber the rows 0..n-1.
Data = Data.dropna(subset=['utterance']).reset_index(drop=True)
# newData is the cleaned twin of Data: same rows and columns, but with the
# utterance column passed through the preprocessing pipeline.
newData = Data.copy()
newData['utterance'] = Data['utterance'].map(preprocesser.preprocess)

In [109]:
# Tokenizer of the cased BERT base model; lower-casing is switched off to match it.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
# NLTK punkt sentence splitter for English (the punkt resource must be available locally).
sentence_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

In [110]:
# Spot-check: cleaned utterances of the dialog with ID 15287.
newData.loc[newData['diaglogID'] == 15287, 'utterance']

5542    Hi. I want to play City Bus Simulator 2 Munich...
5543    JD Your system does not meet the minimum reqs ...
5544    Thank you. Ill buy a another computer. Or Ill ...
5545                                             windows8
Name: utterance, dtype: object

In [111]:
# Drop every utterance that contains a sentence longer than the token budget.
# NOTE(review): 510 is presumably BERT's 512-position limit minus the [CLS]
# and [SEP] tokens — confirm.
maxSentenceTokens = 510
# Dedicated names are used on purpose: `index` still holds the row labels of
# the raw table read by the flattening cell, and `lens` is the dialog count
# in the split section.
tooLongRows = []
# Iterate over (row label, text) pairs instead of range(len(...)), so the cell
# keeps working when the row labels are no longer 0..n-1 (e.g. when re-run
# after rows have been dropped).
for rowLabel, text in newData['utterance'].items():
    for sentence in sentence_tokenizer.tokenize(text):
        token = tokenizer.tokenize(sentence)
        if len(token) > maxSentenceTokens:
            print('Drop the index of ', rowLabel, len(token))
            tooLongRows.append(rowLabel)
            # One over-long sentence is enough to discard the utterance.
            break
newData = newData.drop(tooLongRows)

Drop the index of  6963 910
Drop the index of  7088 698


In [120]:
# Utterances that are empty after cleaning are replaced by a single period
# (only exact '' values in the 'utterance' column are affected).
# NOTE(review): presumably so that downstream tokenization never receives an
# empty string — confirm.
newData = newData.replace(
    to_replace={'utterance': ''},
    value={'utterance': '.'},
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
newData.head()

## Split

In [112]:
# Shuffle the dialog IDs reproducibly; the split is made per dialog so that
# all utterances of one dialog end up in the same subset.
np.random.seed(1)
uniqueIndex = Data['diaglogID'].unique()
lens = len(uniqueIndex)                  # number of distinct dialogs
index = np.random.permutation(lens)      # shuffled positions into uniqueIndex
uniqueIndex = uniqueIndex[index]

# Containers for the raw subsets and for their preprocessed (P) counterparts.
Train, Valid, Test = [], [], []
TrainP, ValidP, TestP = [], [], []

In [113]:
# Training subset: the first 81% of the shuffled dialogs.
# The frames are collected in cell-local lists and concatenated once, so the
# cell can be re-run safely (previously `Train` was a list on the first run
# and a DataFrame on the second, which broke `Train += [...]`).
# Rows stay grouped per dialog, in shuffled dialog order.
trainIDs = uniqueIndex[:int(np.floor(lens*0.81))]
Train = pd.concat(
    [Data.loc[Data['diaglogID'] == dialogID, :] for dialogID in trainIDs]
)
TrainP = pd.concat(
    [newData.loc[newData['diaglogID'] == dialogID, :] for dialogID in trainIDs]
)

In [114]:
# Validation subset: the shuffled dialogs between the 81% and 91% marks.
# The frames are collected in cell-local lists and concatenated once, so the
# cell can be re-run safely (previously `Valid` was a list on the first run
# and a DataFrame on the second, which broke `Valid += [...]`).
# Rows stay grouped per dialog, in shuffled dialog order.
validIDs = uniqueIndex[int(np.floor(lens*0.81)):int(np.floor(lens*0.91))]
Valid = pd.concat(
    [Data.loc[Data['diaglogID'] == dialogID, :] for dialogID in validIDs]
)
ValidP = pd.concat(
    [newData.loc[newData['diaglogID'] == dialogID, :] for dialogID in validIDs]
)

In [115]:
# Test subset: the shuffled dialogs from the 91% mark to the end.
# The frames are collected in cell-local lists and concatenated once, so the
# cell can be re-run safely (previously `Test` was a list on the first run
# and a DataFrame on the second, which broke `Test += [...]`).
# Rows stay grouped per dialog, in shuffled dialog order.
testIDs = uniqueIndex[int(np.floor(lens*0.91)):int(lens)]
Test = pd.concat(
    [Data.loc[Data['diaglogID'] == dialogID, :] for dialogID in testIDs]
)
TestP = pd.concat(
    [newData.loc[newData['diaglogID'] == dialogID, :] for dialogID in testIDs]
)

## Save the Data

In [116]:
# Write the full table, the three raw subsets and the three preprocessed
# subsets as CSV files without the row index.
for frame, path in (
    (Data, 'data/Data.csv'),
    (Train, 'data/Train.csv'),
    (Valid, 'data/Valid.csv'),
    (Test, 'data/Test.csv'),
    (TrainP, 'data/Train_Preprocessing.csv'),
    (ValidP, 'data/Valid_Preprocessing.csv'),
    (TestP, 'data/Test_Preprocessing.csv'),
):
    frame.to_csv(path, index=False)
