In [81]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import re
import string

def clean_stopword(text):
    """Return *text* with English stop words removed.

    The text is split on whitespace, every token that exactly matches an
    entry of NLTK's English stop-word list is dropped, and the remaining
    tokens are re-joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The filtered string (empty if every token was a stop word).
    """
    # Fetch the stop-word list once per call instead of once per token, and
    # keep it in a set so each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))
    kept_words = [word for word in text.split() if word not in english_stopwords]
    return ' '.join(kept_words)


# # Remove Punctuation
def remove_punc(text):
    """Replace most punctuation in *text* with spaces.

    Rules, applied character by character:

    * ``?``, ``!``, ``"`` and ``,`` are kept but prefixed with one space;
    * ``.`` and the apostrophe ``'`` are copied through unchanged;
    * every other character of ``string.punctuation`` becomes one space;
    * all remaining characters are copied through unchanged.

    Args:
        text: The string to process.

    Returns:
        The processed string. Runs of spaces are not collapsed here.
    """
    spaced = {'?', '!', '"', ','}
    untouched = {'.', "'"}
    # Compare by set membership (equality) rather than ``is``: identity of
    # string objects is an implementation detail and must not drive logic.
    replaced_by_space = set(string.punctuation) - spaced - untouched

    out = []
    for char in text:
        if char in spaced:
            out.append(" " + char)
        elif char in replaced_by_space:
            out.append(" ")
        else:
            out.append(char)
    return "".join(out)

# # Split contraction (include 'em 'til)
def split_contraction(phrase):
    """Expand or detach English contractions in *phrase*.

    Specific forms are rewritten in full (``won't`` -> ``will not``,
    ``can't`` -> ``can not``, ``'em `` -> ``them ``, ``'til`` -> ``until``,
    a word-initial ``y'`` -> `` you `` and a word-final ``in'`` -> ``ing``).
    The general suffixes ``n't``, ``'re``, ``'s``, ``'d``, ``'ll``, ``'ve``
    and ``'m`` are separated from the preceding word by a space.

    Args:
        phrase: The text to process; the typographic apostrophe is accepted.

    Returns:
        The rewritten text. Extra spaces may be introduced and are not
        collapsed here.
    """
    # Normalise the typographic apostrophe to the ASCII one.
    phrase = phrase.replace("’", "'")

    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can't", "can not", phrase)

    phrase = re.sub(r"'em ", "them ", phrase)
    phrase = re.sub(r"'til", "until", phrase)

    # Only a word-initial "y'" (as in "y'all") stands for "you". Without the
    # boundary, "lady's" became "lad you s" and "they're" became "the you re".
    phrase = re.sub(r"\by'", " you ", phrase)

    # Only a word-final "in'" (as in "goin'") is a dropped "g". Without the
    # lookahead, "ain't" became "aingt".
    phrase = re.sub(r"in'(?!\w)", "ing", phrase)

    # general
    phrase = re.sub(r"n 't", " n't", phrase)

    phrase = re.sub(r"n't", " n't", phrase)
    phrase = re.sub(r"'re", " 're", phrase)
    phrase = re.sub(r"'s", " 's", phrase)
    phrase = re.sub(r"'d", " 'd", phrase)
    phrase = re.sub(r"'ll", " 'll", phrase)
    phrase = re.sub(r"'ve", " 've", phrase)
    phrase = re.sub(r"'m", " 'm", phrase)

    return phrase


# # Group up contiguous white space
def group_space(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace disappears because the text is split
    into whitespace-separated tokens before being re-joined.
    """
    tokens = text.split()
    return " ".join(tokens)

# # ' Expand the quotation mark '
def expand_qoute(text):
    """Separate single-quote marks from the words they enclose.

    A quote preceded by a space (or at the very start of the text) is
    treated as an opening quote and emitted as ``"' "``; a quote followed by
    a space (or at the very end of the text) is treated as a closing quote
    and emitted as ``" '"``.

    Args:
        text: The string to process (punctuation is expected to have been
            removed beforehand).

    Returns:
        The string with quote marks padded as described.
    """
    out = []
    last = len(text) - 1
    for i, char in enumerate(text):
        if char != "'":
            out.append(char)
            continue
        # Treat the string boundaries as whitespace so that a quote at index 0
        # no longer wraps around to text[-1] and a trailing quote no longer
        # raises IndexError on text[i + 1].
        prev_char = text[i - 1] if i > 0 else ' '
        next_char = text[i + 1] if i < last else ' '
        if prev_char == ' ':  # start quote
            out.append("' ")
        elif next_char == ' ':  # end quote
            out.append(" '")
        # NOTE(review): an apostrophe with non-space characters on both sides
        # (e.g. inside "don't") is dropped, as before -- confirm this is wanted.
    return "".join(out)


# # Pad < end >
def find_dot(text):
    """Locate the run of dots that ends *text*.

    Returns:
        The 1-based position of the first dot of the trailing run, i.e. the
        number of characters before the run plus one; ``-1`` when *text*
        does not end with a dot (including the empty string).
    """
    without_trailing_dots = text.rstrip('.')
    if len(without_trailing_dots) == len(text):
        return -1
    return len(without_trailing_dots) + 1

def pad_end_of_sentence(sentences):
    """Join *sentences* into one string, terminating each with ``<end>``.

    Any run of dots at the end of a sentence is removed first. The sentences
    are then joined with ``" <end> "`` and a final ``" <end>"`` is appended,
    so an empty input yields ``" <end>"``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        The joined, ``<end>``-terminated string.
    """
    # Stripping the trailing dots directly is the same as slicing the
    # sentence just before its trailing run of dots.
    trimmed = [sentence.rstrip('.') for sentence in sentences]
    return " <end> ".join(trimmed) + ' <end>'
 
# ## Capital check
def capital_clean(sentence):
    """Return *sentence* converted to lower case."""
    lowered = sentence.lower()
    return lowered

# ## Check symbols
def have_alphabet(sentence):
    """Return True if *sentence* contains at least one alphabetic character."""
    return any(char.isalpha() for char in sentence)

# # Clean Function
def clean_data_main(line):
    """Normalise a line of raw text into ``<end>``-terminated sentences.

    The line is split into sentences with ``sent_tokenize``. Sentences with
    no alphabetic character are discarded; each remaining one is lower-cased,
    has its contractions split, its punctuation reduced and its whitespace
    collapsed, in that order. The results are joined by
    ``pad_end_of_sentence``.

    Args:
        line: The raw text.

    Returns:
        The cleaned text with an ``<end>`` marker after every sentence.
    """
    cleaned = []
    for raw_sentence in sent_tokenize(line):
        if have_alphabet(raw_sentence):
            lowered = capital_clean(raw_sentence)
            expanded = split_contraction(lowered)
            depunctuated = remove_punc(expanded)
            cleaned.append(group_space(depunctuated))
    return pad_end_of_sentence(cleaned)


# # Reverse Function
def recontraction(phrase):
    """Re-attach the contractions that were split during cleaning.

    ``will not`` / ``can not`` / ``them`` / ``until`` are turned back into
    ``won't`` / ``can't`` / ``'em`` / ``'til``, and the detached suffixes
    `` n't``, `` 're``, `` 's``, `` 'd``, `` 'll``, `` 've`` and `` 'm`` are
    glued back onto the preceding word.

    Args:
        phrase: Cleaned text; the typographic apostrophe is accepted.

    Returns:
        The text with contractions restored.
    """
    phrase = phrase.replace("’", "'")

    # specific -- anchored on word boundaries so that words which merely
    # contain these letters ("themselves", "anthem") are left intact.
    phrase = re.sub(r"\bwill not\b", "won't", phrase)
    phrase = re.sub(r"\bcan not\b", "can't", phrase)

    phrase = re.sub(r"\bthem\b", "'em", phrase)
    phrase = re.sub(r"\buntil\b", "'til", phrase)

    # general
    phrase = re.sub(r" n't", "n't", phrase)
    phrase = re.sub(r" 're", "'re", phrase)
    phrase = re.sub(r" 's", "'s", phrase)
    phrase = re.sub(r" 'd", "'d", phrase)
    phrase = re.sub(r" 'll", "'ll", phrase)
    phrase = re.sub(r" 've", "'ve", phrase)
    phrase = re.sub(r" 'm", "'m", phrase)

    return phrase

import pickle
# Load the word lookup consulted by apply_capital(); presumably it maps
# lower-cased words to their capitalised spelling -- TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# when it comes from a trusted source.
with open('lowerToCappital.pkl', 'rb') as handle:
    lowerToCapital = pickle.load(handle)

# Force the pronoun "i" to be restored as "I", whatever the pickle contains.
lowerToCapital['i'] = 'I'

def apply_capital(line):
    """Restore capitalisation in a lower-cased *line*.

    The first whitespace-separated word is capitalised with
    ``str.capitalize``. Every later word is replaced by its entry in the
    module-level ``lowerToCapital`` lookup when one exists and is otherwise
    kept as is. Words are re-joined with single spaces.

    Args:
        line: The text to process.

    Returns:
        The re-capitalised text (empty when *line* has no words).
    """
    words = line.split()
    if not words:
        return ""
    head, *tail = words
    restored = [head.capitalize()]
    restored.extend(
        lowerToCapital[word] if word in lowerToCapital else word
        for word in tail
    )
    return " ".join(restored)

def remove_space_between_punctuation(line):
    """Delete every space that sits directly between two punctuation marks.

    A space is removed only when both of its neighbours exist and both are
    members of ``string.punctuation``. Neighbours are looked up in the
    original *line*, not in the partially built result.

    Args:
        line: The text to process.

    Returns:
        The text without the spaces that separated adjacent punctuation.
    """
    punctuation = set(string.punctuation)
    last = len(line) - 1
    kept = []
    for i, char in enumerate(line):
        # The explicit bounds check replaces the former bare ``except``: a
        # leading space no longer looks at line[-1], and a trailing space is
        # no longer dropped as a side effect of an IndexError.
        between_punctuation = (
            char == ' '
            and 0 < i < last
            and line[i - 1] in punctuation
            and line[i + 1] in punctuation
        )
        if not between_punctuation:
            kept.append(char)
    return "".join(kept)

def reverse_clean(data):
    """Turn ``<end>``-delimited cleaned text back into readable sentences.

    Each non-blank segment between ``<end>`` markers has its contractions
    restored, spaces between punctuation removed and capitalisation applied.
    A ``?`` is appended when the first word (ignoring anything after an
    apostrophe, case-insensitively) is one of the question openers below;
    otherwise a ``.`` is appended.

    Args:
        data: Cleaned text with ``<end>`` markers.

    Returns:
        The rebuilt sentences joined by single spaces (empty when *data*
        holds no non-blank segment).
    """
    question_openers = {
        'who', 'what', 'where', 'when', 'why', 'how', 'do', 'does', 'is',
        'are', 'am', 'did', 'have', 'has', 'had', 'can', 'could', 'may',
        'might', 'would', 'want',
    }
    out = []
    for sentence in data.split('<end>'):
        # Skip blank segments too, not just empty ones: a whitespace-only
        # segment (e.g. after a trailing "<end> ") used to reach
        # ``x.split()[0]`` and raise IndexError.
        if not sentence.strip():
            continue
        x = recontraction(sentence)
        x = remove_space_between_punctuation(x)
        x = apply_capital(x)
        x = x.strip()
        first_word = x.split()[0].split("'")[0].lower()
        x += '?' if first_word in question_openers else '.'
        out.append(x)
    return " ".join(out)

# print(reverse_clean("i do n't want to know how to say that though <end> i want to know useful things <end> like where the good stores are <end> how much does champagne cost <end> stuff like chat <end> i have never in my life had to point out my head to someone <end>"))
# print(clean_data_main("What are you doing? My name is Jump. I like you."))

# Load the dataset; each entry is indexed below at positions 0, 1 and 2.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('data_QA.pickle', 'rb') as handle:
    data = pickle.load(handle)

# Maps a 1-based item id to a (first text, second text) pair.
res = dict()

import random, json, re
# Fixed seed so the shuffled order, and therefore the item ids, are reproducible.
random.seed(129)
random.shuffle(data)

# For each of the first 500 shuffled entries, emit two items that share the
# same first text (data[i][0]): the odd id pairs it with data[i][1], the even
# id with data[i][2].
# NOTE(review): raises IndexError if data holds fewer than 500 entries.
for i in range(500):
    res[2*i + 1] = (reverse_clean(data[i][0]), reverse_clean(data[i][1]))
    res[2*i + 2] = (reverse_clean(data[i][0]), reverse_clean(data[i][2]))

In [82]:
# Display the generated id -> (text, text) mapping; the cell output follows.
res

{1: ('Would you excuse me for a moment?', 'Sure.'),
 2: ('Would you excuse me for a moment?', "I'd like to know."),
 3: ("It's so cruel.", 'No. He had the choice.'),
 4: ("It's so cruel.", "And you're a good man."),
 5: ('Thank you.', 'Want another one?'),
 6: ('Thank you.', "I'll be in the back."),
 7: ("Honey you'll wake the whole neighborhood.", 'Mom.'),
 8: ("Honey you'll wake the whole neighborhood.", 'What is it?'),
 9: ('You said he was dead.', 'He is. But he was here.'),
 10: ('You said he was dead.', "He's a girl."),
 11: ('You gonna dig up that lad you s husband.', "I'll call the coroner."),
 12: ('You gonna dig up that lad you s husband.',
  "It's a nice time. I'm sorry."),
 13: ('I better keep looking.', 'Where how?'),
 14: ('I better keep looking.', "You're right."),
 15: ('Jesus.', 'The business has changed.'),
 16: ('Jesus.', "You're not going to do it."),
 17: ('Hello.', 'Are you still coming?'),
 18: ('Hello.', 'Hello.'),
 19: ("Try 'flintstone '.", 'What?'),
 20: ("Tr

In [91]:
from collections import defaultdict
# Tally per key, created on first access as [0, 0]. The loops that fill it
# increment slot 0 for rows labelled 'bot' and slot 1 for any other label.
ans = defaultdict(lambda: [0, 0])

In [92]:
import csv

# Read every row of response.csv except the header row.
# newline='' is what the csv module documents for file objects, so that
# newlines embedded in quoted fields are parsed correctly. The ``with`` block
# closes the file, so no explicit close() is needed.
with open('response.csv', 'r', newline='') as csvFile:
    reader = csv.reader(csvFile)
    next(reader, None)  # discard the header row (no-op on an empty file)
    data = list(reader)

In [93]:
# Tally the answers by parity of the item id in column 2: slot 0 counts rows
# whose column 3 is 'bot', slot 1 counts every other value.
for row in data:
    ans[int(row[2]) % 2][0 if row[3] == 'bot' else 1] += 1

In [94]:
# Display the tallies keyed by id parity; the cell output follows.
ans

defaultdict(<function __main__.<lambda>()>, {1: [276, 291], 0: [289, 285]})

In [95]:
# Start a fresh tally (each key is created on first access as [0, 0]) so the
# next loop can count per item id rather than per parity.
ans = defaultdict(lambda: [0, 0])

In [96]:
# Tally the answers per item id (column 2): slot 0 counts rows whose column 3
# is 'bot', slot 1 counts every other value.
for row in data:
    ans[int(row[2])][0 if row[3] == 'bot' else 1] += 1

In [97]:
# Print items 1..100 with their label (even ids are printed as 'bot', odd ids
# as 'human'), their text pair and their [bot, other] tally.
# The former ``if idx <= 100`` guard was always true for this range, and the
# ``cond and a or b`` construct is replaced by a conditional expression.
for idx in range(1, 101):
    print('bot' if idx % 2 == 0 else 'human', res[idx], ans[idx])

human ('Would you excuse me for a moment?', 'Sure.') [4, 5]
bot ('Would you excuse me for a moment?', "I'd like to know.") [2, 5]
human ("It's so cruel.", 'No. He had the choice.') [3, 7]
bot ("It's so cruel.", "And you're a good man.") [10, 0]
human ('Thank you.', 'Want another one?') [9, 3]
bot ('Thank you.', "I'll be in the back.") [1, 6]
human ("Honey you'll wake the whole neighborhood.", 'Mom.') [10, 2]
bot ("Honey you'll wake the whole neighborhood.", 'What is it?') [8, 1]
human ('You said he was dead.', 'He is. But he was here.') [4, 3]
bot ('You said he was dead.', "He's a girl.") [6, 10]
human ('You gonna dig up that lad you s husband.', "I'll call the coroner.") [6, 6]
bot ('You gonna dig up that lad you s husband.', "It's a nice time. I'm sorry.") [5, 4]
human ('I better keep looking.', 'Where how?') [9, 5]
bot ('I better keep looking.', "You're right.") [14, 2]
human ('Jesus.', 'The business has changed.') [4, 2]
bot ('Jesus.', "You're not going to do it.") [7, 4]
human ('H