### Importing the libraries

In [1]:
import numpy as np
import tensorflow as tf
import re
import time

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])


### Importing the dataset

In [2]:
lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')


In [3]:
lines

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?',
 'L868 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ The "real you".',
 'L867 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ What good stuff?',
 "L866 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ I figured yo

In [4]:
conversations

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L367', 'L368']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L401', 'L402', 'L403']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L404', 'L405', 'L406', 'L407']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L575', 'L576']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L577', 'L578']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L662', 'L663']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L693', 'L69

#### Creating the dictionary that maps each line and its id

In [5]:
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [6]:
id2line

{'L402115': "I'm a licensed cosmetologist, I worked for two years, at the &quot;Dino Raphael&quot; Salon, all my customers cried when I told them I was leaving.",
 'L601446': "I was afraid you weren't gonna come. We haven't got much time.",
 'L142318': 'John Hinkley.  The guy who shot Reagan.  He only had two names.',
 'L644412': 'I think we asked for "special programs" and they gave us "special prisons."',
 'L36433': "That's the idea.",
 'L489860': 'You tell me, then... you walk into an apartment, and a man has beaten his wife to death, or the wife murdered the husband, and you have to wash the blood off their children.  You put the killer in jail.  Who won?',
 'L434139': "That's Kenneth, my brother.  He's talking on the phone.",
 'L298516': 'You have to leave the White House.',
 'L632498': "Now that's too bad.  I sure hope you know how to wash dishes or shovel shit 'cause you're gonna have to work this one off.",
 'L62767': 'The name of the show?',
 'L376872': 'MEMENTO Pink Revisions

##### Create a list of all the conversation

In [7]:
conversations_ids = []
for conversation in conversations[:-1]:
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(','))
    
conversations_ids
    
    

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366'],
 ['L367', 'L368'],
 ['L401', 'L402', 'L403'],
 ['L404', 'L405', 'L406', 'L407'],
 ['L575', 'L576'],
 ['L577', 'L578'],
 ['L662', 'L663'],
 ['L693', 'L694', 'L695'],
 ['L696', 'L697', 'L698', 'L699'],
 ['L860', 'L861'],
 ['L862', 'L863', 'L864', 'L865'],
 ['L866', 'L867', 'L868', 'L869'],
 ['L870', 'L871', 'L872'],
 ['L924', 'L925'],
 ['L984', 'L985'],
 ['L1044', 'L1045'],
 ['L49', 'L50', 'L51'],
 ['L571', 'L572', 'L573'],
 ['L579', 'L580'],
 ['L595', 'L596', 'L597'],
 ['L598', 'L599', 'L600'],
 ['L659', 'L660'],
 ['L952', 'L953'],
 ['L394', 'L395'],
 ['L396', 'L397'],
 ['L589', 'L590', 'L591'],
 ['L592', 'L593'],
 ['L756', 'L757', 'L758'],
 ['L759', 'L760'],
 ['L164', 'L165'],
 ['L319', 'L320'],
 ['L441', 'L442', 'L443', 'L444', 'L445']

#### Get separately the questions and the answers

In [8]:
questions = []
answers = []

for conversation in conversations_ids:
    for i in range(len(conversation) - 1):
        questions.append(id2line[conversation[i]])
        answers.append(id2line[conversation[i+1]])
#     x = conversation[::2]
#     y = conversation[1::2]



In [9]:
questions

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "You're asking me out.  That's so cute. What's your name again?",
 "No, no, it's my fault -- we didn't have a proper introduction ---",
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Why?',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 'Gosh, if only we could find Kat a boyfriend...',
 "C'esc ma tete. This is my head",
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have n

In [10]:
answers

["Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?",
 'Forget it.',
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Seems like she could get a date easy enough...',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 "That's a shame.",
 'Let me see what I can do.',
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have never in my life had to point out my head to someone.",
 "That's because it's such a nice one.",
 'Forget French.',
 "Well, there's someone I think might be --",
 'Where?',
 "I 

#### Cleaning the texts

In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text) # remove apostrophe
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    return text

#### Cleaning the questions and answers



In [12]:
clean_questions = []
clean_answers = []

for question in questions:
    clean_questions.append(clean_text(question))
    

for answer in answers:
    clean_answers.append(clean_text(answer))
    
    

In [13]:
clean_questions

['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again',
 'well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 'you are asking me out  that is so cute what is your name again',
 "no no it's my fault  we didn't have a proper introduction ",
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'why',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'gosh if only we could find kat a boyfriend',
 "c'esc ma tete this is my head",
 'right  see  you are ready for the quiz',
 "i don't want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out

In [14]:
clean_answers

['well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 "okay then how 'bout we try out some french cuisine  saturday  night",
 'forget it',
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'seems like she could get a date easy enough',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'that is a shame',
 'let me see what i can do',
 'right  see  you are ready for the quiz',
 "i don't want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out my head to someone",
 "that is because it's such a nice one",
 'forget french',
 "well there's someone i think might be ",
 'where',
 "i counted on you to help my cause 

### Remove non frequent words

#### Creating a dictionary that maps each word to its number of occurences

In [15]:
word2count={} #it maps each word to how many time it appears

for question in clean_questions:
    for word in question.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1
for answer in clean_answers:
    for word in answer.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [16]:
word2count

{'surely': 179,
 'inflexible': 4,
 "decency'": 2,
 'ostrey': 2,
 'sanest': 2,
 "'s'okay": 2,
 'abilene': 6,
 'gonzalez': 1,
 'pruned': 1,
 'pore': 3,
 'ufreshutherefore': 1,
 'twisted': 80,
 'less': 521,
 'sunbed': 1,
 'duckbill': 2,
 'shoulld': 1,
 'restraints': 16,
 'whoever': 203,
 'eisenhower': 6,
 'buzzer': 7,
 'logger': 2,
 'labs': 10,
 'itsomething': 2,
 "emperor's": 8,
 'monkeybone': 14,
 'stolen': 135,
 'boq': 1,
 'chucho': 1,
 'lickety': 1,
 'planned': 152,
 'revolves': 5,
 "ernhart's": 1,
 'rearm': 1,
 'fonzie': 2,
 'unsupported': 6,
 'kiev': 4,
 "dis'": 2,
 'audience!': 6,
 'fink!': 1,
 'costume!': 1,
 'cheeseburger': 10,
 '145': 4,
 'fallen': 78,
 'picturephone': 2,
 'doofus!': 4,
 'submittingbeing': 1,
 'crashi': 1,
 'undeserving': 1,
 "pettin'": 4,
 "recruitin'": 1,
 "millionaire's": 1,
 "up'": 12,
 'beam': 80,
 "education'": 1,
 'amorphous': 2,
 'humpback': 11,
 'fumitsu': 4,
 'sirloin!': 2,
 'bachelor': 66,
 'egyptians': 4,
 'terminus': 2,
 'gat[11]': 2,
 'flyfly': 1,


In [17]:
# qq = ' Hi How are you? Hi Hi '
# pq = {}
# for word in qq.split():
# #     print(word)
#     if word not in pq:
#         pq[word] = 1
#     else:
#         pq[word] += 1

In [18]:
word2count.get('if')

18952

#### Creating two dictionaries that map the questions words and the answers words to a unique integer

`We perform tokenization here`

* remove 5% least frequent words

In [None]:
threshold_questions = 20 
questionswords2int = {}
word_number = 0

"""Include 95% of most frequent word of question and at the same time 
associate them to a uniue integer."""
for word,count in word2count.items():
    if count >= threshold_questions:
        questionswords2int[word] = word_number # include the word in dictionary
        word_number += 1
        
# questionsword2int


threshold_answers = 20 
answerswords2int = {}
word_number = 0

"""Include 95% of most frequent word of answers and at the same time 
associate them to a uniue integer."""
for word,count in word2count.items():
    if count >= threshold_answers:
        answerswords2int[word] = word_number # include the word in dictionary
        word_number += 1
        
# answersword2int

In [None]:
print(len(answerswords2int))
answerswords2int['avoiding']

In [None]:
print(len(questionswords2int))
# print(questionswords2int['Roxanne'])
questionswords2int

In [None]:
print(len(answerswords2int))
answerswords2int

### Adding the list of tokens to these two dictionaries, that is sos and eos

In [23]:
tokens = ['<PAD>','<EOS>','<OUT>','<SOS>'] # OUT means all the words that were filtered out by all the previous dictionries

for token in tokens:
    questionswords2int[token] = len(questionswords2int) + 1
    
for token in tokens:
    answerswords2int[token] = len(answerswords2int) + 1

In [24]:
len(questionswords2int)

8825

In [25]:
len(answerswords2int)

8825

### Creating the inverse dictionary of the answerswords2int dictionary

In [26]:
answersints2word = {w_i: w for w, w_i in answerswords2int.items()}
answersints2word

{0: 'surely',
 1: 'twisted',
 2: 'less',
 3: 'whoever',
 4: 'stolen',
 5: 'planned',
 6: 'fallen',
 7: 'beam',
 8: 'bachelor',
 9: 'brandy',
 10: 'butts',
 11: 'service',
 12: 'owl',
 13: 'means',
 14: 'racist',
 15: 'gentlemen',
 16: 'nigga',
 17: 'perhaps',
 18: 'merger',
 19: 'roger',
 20: 'indians',
 21: 'diane',
 22: 'pilots',
 23: "tellin'",
 24: 'family',
 25: 'gather',
 26: 'curtain',
 27: 'sandwiches',
 28: 'seduce',
 29: 'squid',
 30: 'hates',
 31: 'disappeared',
 32: 'aside',
 33: 'keep',
 34: 'fuckedup',
 35: "money's",
 36: 'wednesday',
 37: 'flip',
 38: 'dumped',
 39: 'tune',
 40: 'collecting',
 41: "anythin'",
 42: 'oral',
 43: 'pd',
 44: 'ned',
 45: 'baked',
 46: 'except',
 47: 'morals',
 48: 'boat',
 49: 'evolved',
 50: 'refuses',
 51: 'ridge',
 52: 'payday',
 53: 'association',
 54: 'inject',
 55: 'sixty',
 56: 'exposure',
 57: 'sworn',
 58: 'brady',
 59: 'reach',
 60: 'newspapers',
 61: 'everett',
 62: 'henry',
 63: 'villain',
 64: 'ahm',
 65: 'could',
 66: 'sue',
 6

### Adding the End Of String token to the end of every answer

In [27]:
for i in range(len(clean_answers)):
    clean_answers[i] += '<EOS>'
    
clean_answers

['well i thought we would start with pronunciation if that is okay with you<EOS>',
 'not the hacking and gagging and spitting part  please<EOS>',
 "okay then how 'bout we try out some french cuisine  saturday  night<EOS>",
 'forget it<EOS>',
 'cameron<EOS>',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does<EOS>',
 'seems like she could get a date easy enough<EOS>',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something<EOS>',
 'that is a shame<EOS>',
 'let me see what i can do<EOS>',
 'right  see  you are ready for the quiz<EOS>',
 "i don't want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out my head to someone<EOS>",
 "that is because it's such a nice one<EOS>",
 'forget french<EOS>',
 "well there's some

#### Translating all the questions and the answers into integers
#### and Replacing all the words that were filtered out by \<OUT>

`That is remove all the non frequent word(5% word) from questions list and answers list`

In [28]:
questions_into_int = []
for question in clean_questions:
    ints = []
    for word in question.split():
        if word not in questionswords2int:
            ints.append(questionswords2int['<OUT>'])
        else:
            ints.append(questionswords2int[word])
    questions_into_int.append(ints)
answers_into_int = []
for answer in clean_answers:
    ints = []
    for word in answer.split():
        if word not in answerswords2int:
            ints.append(answerswords2int['<OUT>'])
        else:
            ints.append(answerswords2int[word])
    answers_into_int.append(ints)

In [29]:
print(questionswords2int)




In [30]:
questions_into_int

[[6276,
  8424,
  3287,
  1141,
  8653,
  8824,
  8824,
  748,
  6693,
  8824,
  4167,
  6499,
  696,
  6608,
  8824,
  2131,
  4687,
  4647,
  2103,
  3463,
  8824,
  2994],
 [7211,
  5736,
  2419,
  8424,
  8303,
  5175,
  4443,
  8824,
  2872,
  4631,
  6644,
  8342,
  4443,
  2523],
 [7341, 3463, 8824, 748, 8824, 748, 8824, 693, 1109],
 [2523,
  4167,
  6279,
  3869,
  8123,
  4631,
  6644,
  5892,
  1108,
  6461,
  6644,
  6054,
  405,
  2994],
 [4757, 4757, 7158, 8011, 1355, 8424, 4476, 5389, 2029, 8225, 8824],
 [634],
 [3463,
  7079,
  6644,
  634,
  5736,
  3065,
  6703,
  3463,
  2280,
  6068,
  2029,
  8489,
  8824,
  7489,
  6068,
  3763,
  8011,
  5223,
  5736,
  3847,
  3057,
  7527,
  1555,
  7968],
 [7379],
 [8824,
  6900,
  1555,
  4215,
  6690,
  4508,
  5374,
  1264,
  3102,
  1555,
  1097,
  1709,
  8370,
  1876,
  3417,
  7092,
  1368,
  4931,
  1555,
  3646,
  1014,
  6068,
  3417,
  2690,
  8385],
 [5344, 2872, 5424, 8424, 65, 5128, 6419, 2029, 1292],
 [8824, 8405

In [31]:
print("indxx of removed words\n",ints)

indxx of removed words
 [5892, 4162, 5424, 6395, 6912, 6211, 8424, 5389, 5174, 2243, 6068, 2029, 7230, 8824, 1347, 1797, 5579, 7994, 8823]


In [32]:
print(answerswords2int)




In [33]:
answers_into_int

[[7211,
  5736,
  2419,
  8424,
  8303,
  5175,
  4443,
  8824,
  2872,
  4631,
  6644,
  8342,
  4443,
  8824],
 [7341, 3463, 8824, 748, 8824, 748, 8824, 693, 8824],
 [8342, 1876, 7826, 2597, 8424, 67, 8123, 2498, 5625, 8824, 7694, 8824],
 [4525, 8824],
 [8824],
 [3463,
  7079,
  6644,
  634,
  5736,
  3065,
  6703,
  3463,
  2280,
  6068,
  2029,
  8489,
  8824,
  7489,
  6068,
  3763,
  8011,
  5223,
  5736,
  3847,
  3057,
  7527,
  1555,
  8824],
 [6994, 4931, 1555, 65, 2467, 2029, 3057, 154, 8824],
 [8824,
  6900,
  1555,
  4215,
  6690,
  4508,
  5374,
  1264,
  3102,
  1555,
  1097,
  1709,
  8370,
  1876,
  3417,
  7092,
  1368,
  4931,
  1555,
  3646,
  1014,
  6068,
  3417,
  2690,
  8824],
 [4631, 6644, 2029, 8824],
 [3115, 3869, 1803, 6461, 5736, 6276, 8824],
 [7101, 1803, 2523, 4167, 7100, 5242, 3463, 8824],
 [5736,
  5245,
  3098,
  6690,
  5291,
  7826,
  6690,
  5002,
  4631,
  8335,
  5736,
  3098,
  6690,
  5291,
  5414,
  3088,
  4931,
  639,
  3463,
  6993,
  7276,

### Sorting questions and answers by the length of questions

In [34]:
sorted_clean_questions = []
sorted_clean_answers = []

for length in range(1,25+1):
    for i in enumerate(questions_into_int): # give index of the question and question that is converted into number
        if len(i[1]) == length:
            sorted_clean_questions.append(questions_into_int[i[0]])
            sorted_clean_answers.append(answers_into_int[i[0]])

In [35]:
sorted_clean_questions

[[634],
 [7379],
 [7994],
 [5249],
 [6211],
 [4757],
 [7468],
 [4757],
 [308],
 [2420],
 [5362],
 [6461],
 [7379],
 [6211],
 [7379],
 [2415],
 [6612],
 [6641],
 [3722],
 [4428],
 [6461],
 [4656],
 [308],
 [8342],
 [7379],
 [308],
 [8824],
 [3758],
 [6325],
 [308],
 [1757],
 [8824],
 [8824],
 [7566],
 [7826],
 [8824],
 [8824],
 [6461],
 [4757],
 [6461],
 [8824],
 [5249],
 [7101],
 [6461],
 [8767],
 [8824],
 [880],
 [4757],
 [4757],
 [5957],
 [8824],
 [1040],
 [4757],
 [4307],
 [3102],
 [4428],
 [4757],
 [4645],
 [2405],
 [4428],
 [3222],
 [3222],
 [3222],
 [3222],
 [4428],
 [5297],
 [7981],
 [8342],
 [7100],
 [7566],
 [6461],
 [1304],
 [7253],
 [8824],
 [8824],
 [3989],
 [4428],
 [1782],
 [6461],
 [6461],
 [4428],
 [461],
 [461],
 [461],
 [461],
 [461],
 [461],
 [8342],
 [7566],
 [5374],
 [7566],
 [4307],
 [7826],
 [717],
 [461],
 [461],
 [461],
 [8824],
 [461],
 [461],
 [5374],
 [4359],
 [2766],
 [19],
 [4428],
 [405],
 [4428],
 [6167],
 [4428],
 [7753],
 [4428],
 [7211],
 [7994],
 [64

In [36]:
sorted_clean_answers

[[3463,
  7079,
  6644,
  634,
  5736,
  3065,
  6703,
  3463,
  2280,
  6068,
  2029,
  8489,
  8824,
  7489,
  6068,
  3763,
  8011,
  5223,
  5736,
  3847,
  3057,
  7527,
  1555,
  8824],
 [8824,
  6900,
  1555,
  4215,
  6690,
  4508,
  5374,
  1264,
  3102,
  1555,
  1097,
  1709,
  8370,
  1876,
  3417,
  7092,
  1368,
  4931,
  1555,
  3646,
  1014,
  6068,
  3417,
  2690,
  8824],
 [8824],
 [5786, 4931, 3088, 7457, 8123, 7981, 8824],
 [2523, 4021, 863, 1141, 8824],
 [8342, 2523, 4167, 5819, 4275, 6690, 6908, 7826, 6690, 8824],
 [2168, 8824],
 [2523, 261, 1487, 3900, 7316, 8824],
 [8824],
 [8303, 2523, 7299, 2803, 3869, 2029, 6907, 8824],
 [8011,
  1133,
  1964,
  5736,
  5389,
  3646,
  2029,
  6993,
  6889,
  6703,
  7532,
  3463,
  8824,
  5370,
  2106,
  8824],
 [4531, 8824, 5242, 2029, 8824],
 [5612, 7092, 4931, 2029, 4220, 8824],
 [4406,
  4631,
  5736,
  4552,
  5736,
  8303,
  2415,
  3782,
  5876,
  1368,
  3219,
  3337,
  587,
  7092,
  6523,
  3417,
  748,
  5736,
  

# ` Building Seq to Seq model `
### Creating placeholders for the inputs and the targets

In [37]:
def model_inputs():
    inputs = tf.placeholder(tf.int32, [None,None], name = 'input')
    targets = tf.placeholder(tf.int32, [None,None], name = 'target') # [None,None] represents targets variable is of two dimension
    lr = tf.placeholder(tf.float32, name = 'learning_rate')
    keep_prob = tf.placeholder(tf.float32,name = 'keep_prob')
    
    return inputs,targets,lr,keep_prob

### Preprocessing the target

In [38]:
def preprocess_targets(targets,word2int,batch_size):
    left_side = tf.fill([batch_size,1],word2int['<SOS>'])
    right_side = tf.strided_slice(targets, [0,0], [batch_size,-1],[1,1]) #Extract everything except the last column
    preprocessed_targets = tf.concat([left_side, right_side], 1)
    return preprocessed_targets

### Creating the Encoder RNN

In [39]:
def encoder_rnn(rnn_inputs,rnn_size,num_layers,keep_prob,seq_length):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm,input_keep_prob=keep_prob)
    encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    encoder_output,encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = encoder_cell,
                                                                  cell_bw = encoder_cell,
                                                                  sequence_length = sequence_length,
                                                                  inputs = rnn_inputs,
                                                                  dtype = tf.float32)
    
    return encoder_state

### Decoding the training set


In [40]:
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_states, attention_option = "bahdanau", num_units = decoder_cell.output_size)
    training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              name = "attn_dec_train")
    decoder_output, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                                              training_decoder_function,
                                                                                                              decoder_embedded_input,
                                                                                                              sequence_length,
                                                                                                              scope = decoding_scope)
    decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
    return output_function(decoder_output_dropout)

### Decoding the validation set

In [41]:
def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, maximum_length, num_words, decoding_scope, output_function, keep_prob, batch_size):
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_states, attention_option = "bahdanau", num_units = decoder_cell.output_size)
    test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(output_function,
                                                                              encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              decoder_embeddings_matrix,
                                                                              sos_id,
                                                                              eos_id,
                                                                              maximum_length,
                                                                              num_words,
                                                                              name = "attn_dec_inf")
    test_predictions, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                                                test_decoder_function,
                                                                                                                scope = decoding_scope)
    return test_predictions

### Creating the Decoder RNN

In [42]:
def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size):
    with tf.variable_scope("decoding") as decoding_scope:
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
        decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
        weights = tf.truncated_normal_initializer(stddev = 0.1)
        biases = tf.zeros_initializer()
        output_function = lambda x: tf.contrib.layers.fully_connected(x,
                                                                      num_words,
                                                                      None,
                                                                      scope = decoding_scope,
                                                                      weights_initializer = weights,
                                                                      biases_initializer = biases)
        training_predictions = decode_training_set(encoder_state,
                                                   decoder_cell,
                                                   decoder_embedded_input,
                                                   sequence_length,
                                                   decoding_scope,
                                                   output_function,
                                                   keep_prob,
                                                   batch_size)
        decoding_scope.reuse_variables()
        test_predictions = decode_test_set(encoder_state,
                                           decoder_cell,
                                           decoder_embeddings_matrix,
                                           word2int['<SOS>'],
                                           word2int['<EOS>'],
                                           sequence_length - 1,
                                           num_words,
                                           decoding_scope,
                                           output_function,
                                           keep_prob,
                                           batch_size)
    return training_predictions, test_predictions


### Building the seq2seq model

In [43]:
def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionswords2int):
    encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs,
                                                              answers_num_words + 1,
                                                              encoder_embedding_size,
                                                              initializer = tf.random_uniform_initializer(0, 1))
    encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
    preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
    decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
    decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
    training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
                                                         decoder_embeddings_matrix,
                                                         encoder_state,
                                                         questions_num_words,
                                                         sequence_length,
                                                         rnn_size,
                                                         num_layers,
                                                         questionswords2int,
                                                         keep_prob,
                                                         batch_size)
    return training_predictions, test_predictions

# Training the Seq2Seq Model

### <font color = 'green'> Setting the hyperparmeter</font>

In [44]:
epochs = 100
batch_size = 64
rnn_size = 512
num_layers = 3 # How many layers in encoder nn and decoder nn
encoding_embedding_size = 512 # no of columns for embdedding value.
decoding_embedding_size = 512
learning_rate = 0.01
learning_rate_decay = 0.9 # By which percentage lr is reduced
min_learning_rate = 0.0001 
keep_probability = 0.5 # it controls the dropout rate. It is basically 1-dropout_rate

In [45]:
# Defining a session
tf.reset_default_graph()
session = tf.InteractiveSession()

In [46]:
# Loading the model inputs
inputs,targets,lr,keep_prob = model_inputs()
# Setting the sequence length
sequence_length = tf.placeholder_with_default(25,None,name='sequence_length')
# Getting the shape of input tensor
input_shape = tf.shape(inputs)

### Getting the training and test predictions


In [47]:
training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]),
                                                       targets,
                                                       keep_prob,
                                                       batch_size,
                                                       sequence_length,
                                                       len(answerswords2int),
                                                       len(questionswords2int),
                                                       encoding_embedding_size,
                                                       decoding_embedding_size,
                                                       rnn_size,
                                                       num_layers,
                                                       questionswords2int)

### Setting up the Loss Error, the optimizer and Gradient Clipping

In [48]:

with tf.name_scope("optimization"):
    loss_error = tf.contrib.seq2seq.sequence_loss(training_predictions,targets,tf.ones([input_shape[0],sequence_length]))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(loss_error)
    clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable) for grad_tensor, grad_variable in gradients if grad_tensor is not None]
    optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

### Padding the sequences with the <PAD> token

In [49]:

def apply_padding(batch_of_sequences, word2int):
    max_sequence_length = max([len(sequence) for sequence in batch_of_sequences])
    return [sequence + [word2int['<PAD>']] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

### Splitting the data into batches of questions and answers

In [50]:

def split_into_batches(questions, answers, batch_size):
    for batch_index in range(0, len(questions) // batch_size):
        start_index = batch_index * batch_size
        questions_in_batch = questions[start_index : start_index + batch_size]
        answers_in_batch = answers[start_index : start_index + batch_size]
        padded_questions_in_batch = np.array(apply_padding(questions_in_batch, questionswords2int))
        padded_answers_in_batch = np.array(apply_padding(answers_in_batch, answerswords2int))
        yield padded_questions_in_batch, padded_answers_in_batch

### Splitting the questions and answers into training and validation sets

In [51]:

training_validation_split = int(len(sorted_clean_questions) * 0.15)
training_questions = sorted_clean_questions[training_validation_split:]
training_answers = sorted_clean_answers[training_validation_split:]
validation_questions = sorted_clean_questions[:training_validation_split]
validation_answers = sorted_clean_answers[:training_validation_split]

### Training

In [52]:
# batch_index_check_training_loss = 100
# batch_index_check_validation_loss = ((len(training_questions)) // batch_size // 2) - 1
# total_training_loss_error = 0
# list_validation_loss_error = []
# early_stopping_check = 0
# early_stopping_stop = 1000
# checkpoint = "./chatbot_weights.ckpt"
# session.run(tf.global_variables_initializer())

In [53]:
# for epoch in range(1, epochs + 1):
#     for batch_index, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(training_questions, training_answers, batch_size)):
#         starting_time = time.time()
#         _, batch_training_loss_error = session.run([optimizer_gradient_clipping, loss_error], {inputs: padded_questions_in_batch,
#                                                                                                targets: padded_answers_in_batch,
#                                                                                                lr: learning_rate,
#                                                                                                sequence_length: padded_answers_in_batch.shape[1],
#                                                                                                keep_prob: keep_probability})
#         total_training_loss_error += batch_training_loss_error
#         ending_time = time.time()
#         batch_time = ending_time - starting_time
#         if batch_index % batch_index_check_training_loss == 0:
#             print('Epoch: {:>3}/{}, Batch: {:>4}/{}, Training Loss Error: {:>6.3f}, Training Time on 100 Batches: {:d} seconds'.format(epoch,
#                                                                                                                                        epochs,
#                                                                                                                                        batch_index,
#                                                                                                                                        len(training_questions) // batch_size,
#                                                                                                                                        total_training_loss_error / batch_index_check_training_loss,
#                                                                                                                                        int(batch_time * batch_index_check_training_loss)))
#             total_training_loss_error = 0
#         if batch_index % batch_index_check_validation_loss == 0 and batch_index > 0:
#             total_validation_loss_error = 0
#             starting_time = time.time()
#             for batch_index_validation, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(validation_questions, validation_answers, batch_size)):
#                 batch_validation_loss_error = session.run(loss_error, {inputs: padded_questions_in_batch,
#                                                                        targets: padded_answers_in_batch,
#                                                                        lr: learning_rate,
#                                                                        sequence_length: padded_answers_in_batch.shape[1],
#                                                                        keep_prob: 1})
#                 total_validation_loss_error += batch_validation_loss_error
#             ending_time = time.time()
#             batch_time = ending_time - starting_time
#             average_validation_loss_error = total_validation_loss_error / (len(validation_questions) / batch_size)
#             print('Validation Loss Error: {:>6.3f}, Batch Validation Time: {:d} seconds'.format(average_validation_loss_error, int(batch_time)))
#             learning_rate *= learning_rate_decay
#             if learning_rate < min_learning_rate:
#                 learning_rate = min_learning_rate
#             list_validation_loss_error.append(average_validation_loss_error)
#             if average_validation_loss_error <= min(list_validation_loss_error):
#                 print('I speak better now!!')
#                 early_stopping_check = 0
#                 saver = tf.train.Saver()
#                 saver.save(session, checkpoint)
#             else:
#                 print("Sorry I do not speak better, I need to practice more.")
#                 early_stopping_check += 1
#                 if early_stopping_check == early_stopping_stop:
#                     break
#     if early_stopping_check == early_stopping_stop:
#         print("My apologies, I cannot speak better anymore. This is the best I can do.")
#         break
# print("Game Over")

# Testing the Seq2Seq model

### Loading the weights and Running the session

In [75]:
import os
arr = os.listdir('./')
print(arr)

['.ipynb_checkpoints', 'best_weights_training.ckpt.data-00000-of-00001', 'best_weights_training.ckpt.index', 'best_weights_training.ckpt.meta', 'Chatbot.ipynb', 'movie_conversations.txt', 'movie_lines.txt']


In [None]:
checkpoint = "./best_weights_training.ckpt"
# checkpoint = "./chatbot_weights.ckpt"
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(session, checkpoint)

### Converting the questions from strings to lists of encoding integers

In [None]:
def convert_string2int(question, word2int):
    question = clean_text(question)
    return [word2int.get(word, word2int['<OUT>']) for word in question.split()]

### Setting up the chat

In [None]:
while(True):
    question = input("You: ")
    
    if question == 'Goodbye':
        break
    question = convert_string2int(question, questionswords2int)
    question = question + [questionswords2int['<PAD>']] * (25 - len(question))
    fake_batch = np.zeros((batch_size, 25))
    fake_batch[0] = question
    predicted_answer = session.run(test_predictions, {inputs: fake_batch, keep_prob: 0.5})[0]
    answer = ''
    for i in np.argmax(predicted_answer, 1):
        if answersints2word[i] == 'i':
            token = ' I'
        elif answersints2word[i] == '<EOS>':
            token = '.'
        elif answersints2word[i] == '<OUT>':
            token = 'out'
        else:
            token = ' ' + answersints2word[i]
        answer += token
        if token == '.':
            break
    print('ChatBot: ' + answer) 

In [None]:
! git add Chatbot.ipynb
! git commit -m "Final Commit"
! git push origin main