## Cornell movie dataset preprocessing

In [17]:
import json
import ast
# Collect, for every conversation in movie_conversations.txt, the ordered list of
# line ids it is made of. Field index 3 of each " +++$+++ "-separated row holds a
# Python-literal list such as "['L194', 'L195']".
# Start from an empty list so the cell works on a fresh kernel and re-running it
# does not append the same conversations twice.
lines = []
with open("cornell-movie-dialogs-corpus/movie_conversations.txt", buffering=1000, encoding='utf8', errors='ignore') as f:
    for row in f:
        fields = row.split(" +++$+++ ")
        # literal_eval parses the list literal without executing arbitrary code.
        lines.append(ast.literal_eval(fields[3].replace("\n","")))
lines[:10]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366']]

In [23]:
import re
def preprocess_sentence(w):
    """Normalise one utterance and wrap it in <start>/<end> markers.

    The text is rewritten by an ordered list of regex substitutions:
    runs of dots collapse to one dot, the punctuation marks ? . ! , ¿ are
    padded with spaces (eg: "he is a boy." => "he is a boy ."), and every
    character outside a-z, A-Z and those punctuation marks becomes a space.
    Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation

    Returns the cleaned text with surrounding whitespace removed, prefixed
    with '<start> ' and suffixed with ' <end>' so that the model knows when
    to start and stop predicting.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'\.\.+', "."),              # "..." -> "."
        (r"([?.!,¿])", r" \1 "),      # pad punctuation with spaces
        (r'[" "]+', " "),             # collapse runs of quotes/spaces
        (r"[^a-zA-Z?.!,¿]+", " "),    # drop everything else
    )
    cleaned = w
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    cleaned = cleaned.strip()
    return '<start> ' + cleaned + ' <end>'

In [26]:
# Map every line id (field 0, e.g. "L1045") of movie_lines.txt to its cleaned,
# <start>/<end>-wrapped text (field 4).
data = {}
with open("cornell-movie-dialogs-corpus/movie_lines.txt", "r", encoding='utf-8', errors='ignore') as f:
    for row in f:
        fields = row.split(" +++$+++ ")
        try:
            data[fields[0]] = preprocess_sentence(fields[4].replace("\n",""))
        except IndexError:
            # The row has fewer than five fields, so there is no text to keep.
            # Print it for inspection and carry on; any other error is
            # unexpected and is allowed to surface.
            print(row)
# Show a small sample (first entries in file order) instead of the whole mapping.
dict(list(data.items())[:20])

{'L1045': '<start> They do not ! <end>',
 'L1044': '<start> They do to ! <end>',
 'L985': '<start> I hope so . <end>',
 'L984': '<start> She okay ? <end>',
 'L925': '<start> Let s go . <end>',
 'L924': '<start> Wow <end>',
 'L872': '<start> Okay you re gonna need to learn how to lie . <end>',
 'L871': '<start> No <end>',
 'L870': '<start> I m kidding . You know how sometimes you just become this persona ? And you don t know how to quit ? <end>',
 'L869': '<start> Like my fear of wearing pastels ? <end>',
 'L868': '<start> The real you . <end>',
 'L867': '<start> What good stuff ? <end>',
 'L866': '<start> I figured you d get to the good stuff eventually . <end>',
 'L865': '<start> Thank God ! If I had to hear one more story about your coiffure . <end>',
 'L864': '<start> Me . This endless . blonde babble . I m like , boring myself . <end>',
 'L863': '<start> What crap ? <end>',
 'L862': '<start> do you listen to this crap ? <end>',
 'L861': '<start> No . <end>',
 'L860': '<start> Then 

In [31]:
# Turn every conversation into (utterance, reply) pairs: each line acts as the
# question for the line that directly follows it in the same conversation.
question, answer = [], []
for conversation in lines:
    for asked_id, replied_id in zip(conversation, conversation[1:]):
        question.append(data[asked_id])
        answer.append(data[replied_id])
len(question), len(answer)

(221616, 221616)

In [1]:
import pandas as pd
# Let pandas print up to 600 characters per cell so long utterances are not
# cut off when a DataFrame is displayed.
pd.options.display.max_colwidth =600
# Disabled: would collect the question/answer lists into a DataFrame and show it.
# movie_df = pd.DataFrame(list(zip(question, answer)),columns=['question','answer'])
# movie_df

### Wikipedia dataset preprocessing

In [126]:
# Read the whole corpus and split it on blank lines; each resulting chunk is one
# conversation made of "\n"-separated utterance rows.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open("wikipedia_conversations_corpus_v1.01/wikipedia.talkpages.conversations.txt", encoding="utf-8") as f:
    x = f.read().split("\n\n")
print(len(x))
x[:10]


125293


['524288 +++$+++ Frightner +++$+++ AnonEMouse +++$+++ 524288 +++$+++ initial_post +++$+++ 2007-09-07 11:49:00 +++$+++ 1.189190940E09 +++$+++ You should look at all of the point on the template not just the last one, the template also says the image belonging to the republic of macedonia is in the public domain if it being used for \\"information purposes\\".  +++$+++ You should look at all of the point on the template not just the last one, the template also says the image belonging to the republic of macedonia is in the public domain if it being used for "information purposes". [[User:Frightner|Frightner]] 11:49, 7 September 2007 (UTC)\n524289 +++$+++ Revizionist +++$+++ AnonEMouse +++$+++ 524288 +++$+++ -1 +++$+++ 2007-09-07 15:41:00 +++$+++ 1.189204860E09 +++$+++ Yes I agree. The law permits usage of documents, photographs and other materials for educational and informational purposes. There was a normative act issued by the government of the Republic of Macedonia that even allowed 

In [127]:
def create_id_to_conv_pair(convs):
    """Index the utterances of one conversation by their id.

    `convs` is a conversation chunk: one utterance per line, each line made of
    " +++$+++ "-separated fields. Field 0 is used as the key and field 7 as the
    value (in the sample rows field 7 looks like the cleaned utterance text).

    Returns a dict {field 0: field 7}; when an id repeats, the last row wins.
    Raises IndexError if any line has fewer than eight fields.
    """
    rows = (utterance.split(" +++$+++ ") for utterance in convs.split("\n"))
    return {fields[0]: fields[7] for fields in rows}

# Build (parent utterance, reply) pairs from the Wikipedia talk-page corpus.
# Field 4 of each row is "initial_post", "-1", or the id of the utterance being
# replied to; only the last case yields a pair.
question = []
answer = []
count = 0            # conversations skipped because of malformed rows
conv_exception = []  # the skipped conversations themselves, kept for inspection
for convs in x:
    try:
        pairs = create_id_to_conv_pair(convs)
        for utterance in convs.split("\n"):
            y = utterance.split(" +++$+++ ")
            reply_to = y[4]
            if reply_to in ("initial_post", "-1"):
                # No parent utterance to pair with.
                continue
            question.append(pairs[reply_to])
            answer.append(pairs[y[0]])
    except (IndexError, KeyError):
        # IndexError: a row has too few fields. KeyError: the parent id is not
        # part of this conversation. The rest of the conversation is skipped;
        # pairs already collected from it are kept. Anything else is a real
        # bug and is allowed to surface instead of being silently counted.
        count += 1
        conv_exception.append(convs)
len(question), len(answer)


(240252, 240252)

In [128]:
# Keep one row per distinct question, then keep only exchanges where both the
# question and the answer are shorter than MAX_CHARS characters.
MAX_CHARS = 200
df = pd.DataFrame(list(zip(question, answer)), columns=['question','answer'])
# Reassign instead of inplace=True so the statement reads as a plain transform.
df = df.drop_duplicates(subset='question')
# Build both conditions from the same frame; the previous version filtered `x`
# with a mask computed on `df`, which only worked through index alignment.
is_short = (df['question'].str.len() < MAX_CHARS) & (df['answer'].str.len() < MAX_CHARS)
x = df.loc[is_short]
len(df), len(x)

(193136, 53466)

In [129]:
# Preview the first rows of the length-filtered question/answer pairs.
x.head()

Unnamed: 0,question,answer
0,"Yes, that's good. Revathy's page looked very reliable, that's why we used that as a source.","Nagma'a site, at least that filmography page, looks like taken from Wikipedia itself. I didn't remove it from time being, but I don't think it's true."
4,"No, no, I was the nominator! Just wanted to make sure it wasn't done in error. --","I was a bit surprised because i know you were the nominator ;) No, it wasn't done in error. -"
8,"Hi. Just writing that you closed out the [[WP:CFD]] as a '''merge and delete''', but category appears open still at this moment. Yours,","Yes, I know. It's on the list at [[Wikipedia:Categories_for_discussion\/Working]], and the bot should get to it shortly."
12,"I agree i copied, even he mi8 know it very well. I dont know 2 design and all but some thing inside, i got my own work also. If u dont mind can i copy abit of urs. Even ur page looks good!!",Ok! Ty for the info and links. Will create new design asap.
14,"Thanks. I understand that there is a lot of controversy here, even between respected scholars. We need to document that disagreement in our articles, hopefully without attacking editors. --","Capricornis, stop insinuating. And please refrain from various [[Ad hominem]] arguments, as they are falacies. Comment on contributions please."


### Custom dataset preprocessing

In [131]:
# Read the hand-written dialogues; conversations are separated by two blank
# lines and each turn sits on its own line.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open("/text/rudra/chatbot/data/data", encoding="utf-8") as f:
    x = f.read().split("\n\n\n")
# Show a few conversations rather than the whole list.
x[:5]


['Person A: "Hi, my name is Steve. It\'s nice to meet you."\nPerson B: "I\'m Jack. It\'s a pleasure to meet you, Steve."\nPerson A: "What do you do for a living Jack?"\nPerson B: "I work at the bank."',
 'Person A: "What is your name?"\nPerson B: "Jackson."\nPerson A: "What was that again?"',
 'Person A: "Hey John, how have you been?"\nPerson B: "What a surprise. I haven\'t seen you in a long time. How have you been?"\nPerson A: "I\'m doing very well. How about you?"\nPerson B: "I finally have some free time. I just finished taking a big examination, and I\'m so relieved that I\'m done with it."',
 'Person A: "Hi Nancy, what have you been up to?"\nPerson B: "The same ole same ole." \nPerson A: "I\'m pretty busy at work these days, but otherwise, everything is great."',
 'Person A: "Andy, it\'s been a long time, how are you man?"\nPerson B: "What a surprise. I haven\'t seen you in a long time. How have you been?"\nPerson A: "Do you come to this restaurant often?"\nPerson B: "I\'ve been 

In [145]:
# Sanity check: first line of the second conversation with the double quotes removed.
x[1].split("\n")[0].replace('"','')

'Person A: What is your name?'

In [146]:
# Build (turn, next turn) pairs from the custom dialogues.
question = []
answer = []
for conv in x:
    # Clean each turn once per conversation: remove double quotes, then drop the
    # speaker label. Splitting on the FIRST colon only keeps utterances that
    # contain a colon themselves intact (taking the text after the last colon
    # would truncate them). A line without any colon is kept whole.
    turns = [turn.replace('"', '').split(":", 1)[-1].strip() for turn in conv.split("\n")]
    for asked, replied in zip(turns, turns[1:]):
        question.append(asked)
        answer.append(replied)
len(question), len(answer)
    

(309, 309)

In [147]:
# Inspect the extracted question utterances (the whole list is displayed).
question

["Hi, my name is Steve. It's nice to meet you.",
 "I'm Jack. It's a pleasure to meet you, Steve.",
 'What do you do for a living Jack?',
 'What is your name?',
 'Jackson.',
 'Hey John, how have you been?',
 "What a surprise. I haven't seen you in a long time. How have you been?",
 "I'm doing very well. How about you?",
 'Hi Nancy, what have you been up to?',
 'The same ole same ole.',
 "Andy, it's been a long time, how are you man?",
 "What a surprise. I haven't seen you in a long time. How have you been?",
 'Do you come to this restaurant often?',
 'Hi, how are you doing?',
 "I'm doing great. How about you?",
 'Not too bad.',
 'Do you come to this restaurant often?',
 "I've been here a couple of times, but I don't come on a regular basis. What have you been up to?",
 "I'm pretty busy at work these days, but otherwise, everything is great.",
 'Well, have a good evening.',
 "It's nice to meet you. My name is Jack.",
 "I'm Steve. It's a pleasure to meet you.",
 'What was your name again?

In [115]:
# Interpret backslash escape sequences that appear literally in the questions.
for i, text in enumerate(question):
    try:
        # unicode_escape decodes its input bytes as Latin-1, so encoding the text
        # as UTF-8 first would turn every non-ASCII character into mojibake.
        # Encoding as Latin-1 with backslashreplace round-trips them instead:
        # characters up to U+00FF map to a single byte, and anything above
        # becomes a \u/\U escape that the decode step restores.
        question[i] = text.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except UnicodeDecodeError as e:
        # Malformed escape sequence: leave the entry untouched and report it.
        print(i, text, e)


In [116]:
# Inspect `question` again.
# NOTE(review): the saved output of this cell still shows double-quoted strings and
# carries an earlier execution count than the cell that builds `question`, so it
# looks stale — re-run the notebook top to bottom to confirm.
question

['"Hi, my name is Steve. It\'s nice to meet you."',
 '"I\'m Jack. It\'s a pleasure to meet you, Steve."',
 '"What do you do for a living Jack?"',
 '"What is your name?"',
 '"Jackson."',
 '"Hey John, how have you been?"',
 '"What a surprise. I haven\'t seen you in a long time. How have you been?"',
 '"I\'m doing very well. How about you?"',
 '"Hi Nancy, what have you been up to?"',
 '"The same ole same ole."',
 '"Andy, it\'s been a long time, how are you man?"',
 '"What a surprise. I haven\'t seen you in a long time. How have you been?"',
 '"Do you come to this restaurant often?"',
 '"Hi, how are you doing?"',
 '"I\'m doing great. How about you?"',
 '"Not too bad."',
 '"Do you come to this restaurant often?"',
 '"I\'ve been here a couple of times, but I don\'t come on a regular basis. What have you been up to?"',
 '"I\'m pretty busy at work these days, but otherwise, everything is great."',
 '"Well, have a good evening."',
 '"It\'s nice to meet you. My name is Jack."',
 '"I\'m Steve. It

In [150]:
# One row per exchange: the utterance and the reply that follows it.
qa_rows = list(zip(question, answer))
df = pd.DataFrame(qa_rows, columns=['question', 'answer'])

In [151]:
# Disabled experiment: `repl` is not defined anywhere in the visible part of the notebook.
# df.apply(lambda x: repl(x.question,x.answer), axis=1)
# Preview the first question/answer rows.
df.head()

Unnamed: 0,question,answer
0,"Hi, my name is Steve. It's nice to meet you.","I'm Jack. It's a pleasure to meet you, Steve."
1,"I'm Jack. It's a pleasure to meet you, Steve.",What do you do for a living Jack?
2,What do you do for a living Jack?,I work at the bank.
3,What is your name?,Jackson.
4,Jackson.,What was that again?


### Supreme court dataset preprocessing

'testjhgd'