<a href="https://colab.research.google.com/github/vigneshdurairaj/Chandler_Bot/blob/master/chandler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import re

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.utils.data_utils import get_file
from keras.callbacks import EarlyStopping
from __future__ import print_function

import random
import sys

In [0]:
def split_data(df, train_perc=0.8, seed=None):
    """Randomly partition the rows of ``df`` into train and test subsets.

    Each row is independently assigned to the training set with probability
    ``train_perc``, so the realised split is only approximately that fraction.

    Args:
        df: pandas DataFrame to split. It is left unmodified.
        train_perc: Probability that a given row lands in the training set.
        seed: Optional integer seed. When given, a private ``RandomState`` is
            used so the split is reproducible; when ``None`` numpy's global
            random state is used.

    Returns:
        dict with keys ``'train'`` and ``'test'``. Both frames carry a boolean
        ``train`` column recording the assignment.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    is_train = rng.rand(len(df)) < train_perc
    # assign() builds a new frame, so the caller's DataFrame is not mutated
    # (inserting a column into a slice in place raises SettingWithCopyWarning).
    marked = df.assign(train=is_train)
    return {'train': marked[is_train], 'test': marked[~is_train]}

def cleanstr(somestring):
    """Replace every run of non-word characters in ``somestring`` with one space.

    Leading and trailing whitespace is stripped from the result.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\W+', ' ', somestring).strip()


def sample(preds, temperature=1.0):
    """Draw one class index from ``preds`` after temperature rescaling.

    The probabilities are raised to the power ``1 / temperature`` and
    renormalised, then a single index is drawn from the result.

    Args:
        preds: 1-D sequence of probabilities.
        temperature: Values below 1 sharpen the distribution, values above 1
            flatten it. ``0`` selects the most probable index
            deterministically.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the rescaling as temperature -> 0; avoids dividing by zero.
        return np.argmax(preds)
    # log(0) = -inf is the intended result for zero-probability entries, so
    # suppress the divide-by-zero warning numpy would otherwise emit.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shifting by the maximum leaves the normalised result unchanged but keeps
    # exp() from underflowing every entry to 0 (-> NaN) at small temperatures.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)
    


In [7]:
# Read the tab-separated transcript corpus into a DataFrame.
df = pd.read_csv('friends-transcripts corpus.txt', sep='\t')
# Preview the first three rows.
df.head(3)

Unnamed: 0,Season,Episode,Season & Episode,Title,Author,Quote
0,d,d,d,d,d,string
1,meta,meta,meta,meta,meta,include=True
2,01,01,0101,The One Where Monica Gets A Roommate,Monica,There's nothing to tell! He's just some guyI ...


In [8]:
# Skip the first two rows (in the preview above they hold "d"/"string" and
# "meta"/"include=True" markers rather than dialogue) and remove the combined
# "Season & Episode" column.
# A non-inplace drop() returns a new frame, so this and later column
# assignments do not act on a slice of the raw frame (SettingWithCopyWarning).
df = df.iloc[2:].drop("Season & Episode", axis=1)
df.head(3)


Unnamed: 0,Season,Episode,Title,Author,Quote
2,1,1,The One Where Monica Gets A Roommate,Monica,There's nothing to tell! He's just some guyI ...
3,1,1,The One Where Monica Gets A Roommate,Joey,"C'mon, you're going out with the guy! There's..."
4,1,1,The One Where Monica Gets A Roommate,Chandler,"All right Joey, benice. So does he have a hu..."


In [9]:
# Season must parse as a number everywhere; any bad value raises.
df['Season'] = pd.to_numeric(df['Season'], errors='raise')
# Episode values that fail to parse become NaN and are then replaced by 17.
# NOTE(review): the reason for choosing 17 is not visible here; confirm.
df['Episode'] = pd.to_numeric(df['Episode'], errors='coerce').replace(np.nan, 17)
# Cast the free-text columns to str.
for text_column in ('Title', 'Quote', 'Author'):
    df[text_column] = df[text_column].astype(str)
df.head(2)

Unnamed: 0,Season,Episode,Title,Author,Quote
2,1,1.0,The One Where Monica Gets A Roommate,Monica,There's nothing to tell! He's just some guyI ...
3,1,1.0,The One Where Monica Gets A Roommate,Joey,"C'mon, you're going out with the guy! There's..."


In [0]:
# split_data draws from numpy's global random state; fix the seed so the
# train/test assignment, and therefore the training text, is the same on
# every Restart & Run All.
np.random.seed(42)
Dataset = split_data(df, train_perc=0.8)

In [64]:
# Join all of Chandler's training-set quotes into one lowercase string.
train_df = Dataset['train']
chandler_quotes = train_df.loc[train_df['Author'] == 'Chandler', 'Quote']
text = ' '.join(chandler_quotes.tolist()).lower()
# len(text) counts characters, not dialogue lines, so label it accordingly.
print('Total characters:', len(text))
# Sort the vocabulary: the iteration order of a set of strings can differ from
# one Python process to the next, which would change the char <-> index mapping
# between runs and make a saved model unusable with a rebuilt vocabulary.
chars = sorted(set(text))
print(text[:40])

Total Dialogues 372838
 all right joey, benice.  so does he hav


In [0]:
# Two-way lookup between each vocabulary character and its integer id.
char_indices = {char: idx for idx, char in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [36]:
maxlen = 40  # number of characters in each input window
step = 3     # stride between the starts of consecutive windows

# Slide a maxlen-character window over the text; the character immediately
# after each window is the prediction target for that window.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

print('Vectorization...')
# The np.bool alias was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin bool selects the same dtype on every NumPy version.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
# One-hot encode every window (X) and its following character (y).
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

nb sequences: 124266
Vectorization...


In [0]:

# Two stacked 512-unit LSTM layers, each followed by 20% dropout, feeding a
# softmax over the character vocabulary.
model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(len(chars)),
    Activation('softmax'),
])

In [0]:
# Categorical cross-entropy over the next-character targets, optimised with RMSprop.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [61]:
# Stop as soon as the training loss fails to improve for one epoch (patience=0).
early_stop = EarlyStopping(
    monitor='loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
)
# `nb_epoch` is the Keras 1 spelling of this argument; Keras 2 renamed it to
# `epochs`, accepting the old name only with a warning, and newer releases
# reject it.
model.fit(X, y, batch_size=128, epochs=5, callbacks=[early_stop])


  """Entry point for launching an IPython kernel.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fcb3fdbaeb8>

In [0]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='chandler_F.h5')

In [71]:
for iteration in range(1, 10):
    print()
    print('-' * 50)
    print('Iteration', iteration)

    # Pick one random maxlen-character seed window per iteration; the same
    # seed is reused for every diversity (sampling temperature) value.
    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)
        sentence = text[start_index: start_index + maxlen]
        generated = sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
        # Generate 50 characters, sliding the input window forward by one
        # character after each prediction.
        for _ in range(50):
            x = np.zeros((1, maxlen, len(chars)))
            # One-hot encode the current window in a single indexed assignment.
            positions = np.arange(len(sentence))
            char_ids = [char_indices[char] for char in sentence]
            x[0, positions, char_ids] = 1.
            preds = model.predict(x, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]
            generated += next_char
            sentence = sentence[1:] + next_char
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()



--------------------------------------------------
Iteration 1

----- diversity: 0.2
----- Generating with seed: "over again.  i mean synchronized swimmin"
over again.  i mean synchronized swimming to go out there.   oh, that's the world be a lit

----- diversity: 0.5
----- Generating with seed: "over again.  i mean synchronized swimmin"
over again.  i mean synchronized swimming to still see of that.   you think it was been a 

----- diversity: 1.0
----- Generating with seed: "over again.  i mean synchronized swimmin"
over again.  i mean synchronized swimming for as us the beendar.  and you thought i can to

----- diversity: 1.2
----- Generating with seed: "over again.  i mean synchronized swimmin"
over again.  i mean synchronized swimming wrolaz.  for one dadn. and we really. you can dn

--------------------------------------------------
Iteration 2

----- diversity: 0.2
----- Generating with seed: "hat youre crazy.  heres another plann"
hat youre crazy.  heres another plannow 