## FRIENDS TV SCRIPT GENERATION

### Import libraries

In [1]:
from collections import Counter
import numpy as np
import torch
import os
import pickle

### Explore Data

In [5]:
#load the dataset
data_dir = './data/friends1.txt'
input_file = os.path.join(data_dir)
with open(input_file, 'r', encoding='utf8') as file:
    text= file.read()


In [6]:
### Dataset statistics

w = []
for word in text.split():
    w.append(None)
print("Number of unique words: {}".format(len(w)))

lines = text.split('\n')
print('Number of lines: {}'.format(len(lines)))

word_count_per_line = []
for line in lines:
    word_count_per_line.append(len(line.split()))
print('Average number of words in each line: {}'.format(np.average(word_count_per_line)))

print("Sample text from index {} to index {}".format(0,500))
print(text[0:500])

Number of unique words: 180984
Number of lines: 30251
Average number of words in each line: 5.982744372086874
Sample text from index 0 to index 500
[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]

Monica: There's nothing to tell! He's just some guy I work with!

Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!

Chandler: All right Joey, be nice.  So does he have a hump? A hump and a hairpiece?

Phoebe: Wait, does he eat chalk?

(They all stare, bemused.)

Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!

Monica: Okay, everybody relax. This is n


### preprocessing

1. create lookup table - which will include creating two dictionaries. One to map the 
2. tokenize punctuation

In [7]:
def create_lookup_table(text):
    word_count = Counter(text)
    sorted_word_count = sorted(word_count, key= word_count.get, reverse=True)
    
    int_to_vocab = {num: word for num, word in enumerate(sorted_word_count)}
    vocab_to_int = {word: num for num, word in int_to_vocab.items()}
    
    return (vocab_to_int, int_to_vocab)

In [8]:
def token_lookup():
    tokens = dict()
    tokens['.'] = '<PERIOD>'
    tokens[','] = '<COMMA>'
    tokens['"'] = '<QUOTATION_MARK>'
    tokens[';'] = '<SEMICOLON>'
    tokens['!'] = '<EXCLAMATION_MARK>'
    tokens['?'] = '<QUESTION_MARK>'
    tokens['('] = '<LEFT_PAREN>'
    tokens[')'] = '<RIGHT_PAREN>'
    tokens['?'] = '<QUESTION_MARK>'
    tokens['-'] = '<DASH>'
    tokens['\n'] = '<NEW_LINE>'
    return tokens   

In [7]:
#preprocess the data and save it
def preprocess_data():
    SPECIAL_WORDS = {'PADDING': '<PAD>'}
    
    text = load_data(data_dir)
    token_dictionary = token_lookup()
    
    for key,token in token_dictionary.items():
        text = text.replace(key, ' {}'.format(token))
        
    text = text.lower()
    text = text.split()
    
    vocab_to_int, int_to_vocab = create_lookup_table(text + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in text]
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dictionary), open('preprocess.p', 'wb'))

In [8]:
preprocess_data()

In [9]:
def load_preprocess():
    """
    Load the Preprocessed Training data and return them in batches of <batch_size> or less
    """
    return pickle.load(open('preprocess.p', mode='rb'))

In [10]:
int_text, vocab_to_int, int_to_vocab, token_dict = load_preprocess()

### Build the neural network

In [11]:
gpu_available = torch.cuda.is_available()
if(not gpu_available):
    print("Error, no GPU available")
else:
    print("Success, GPU available")

Success, GPU available


#### Input

In [12]:
def batch_data(words, sequence_length, batch_size):
    nwords = len(words)//batch_size
    words = words[:len(words)]
    y_length = len(words)-sequence_length
    x = []
    y = []
    for index in range(0,y_length):
        index_end = index + sequence_length
        #features would be from the current index until the end of sequence
        x_batch = words[index:index_end]
        x.append(x_batch)
        #target/predicted would be the next word in the sequence- index_end in this case
        y_batch = words[index_end]
        y.append(y_batch)
        
    #create Tensor datasets from both the x and y lists
    data = torch.utils.data.TensorDataset(torch.from_numpy(np.asarray(x)), torch.from_numpy(np.asarray(y)))
    data_loader = torch.utils.data.DataLoader(data, batch_size=batch_size)
    return data_loader
    

### Build neural network

In [13]:
import torch.nn as nn

#implementing a class that inherits from Pytorch's base class

class RNN(nn.Module):
    def __init__(self, input_size, output_size, embedding_dim, hidden_dim, nlayers, dropout=0.5,learning_rate= 0.001):
        super(RNN, self).__init__()
        
        self.embedding = nn.Embedding(input_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,dropout = dropout, batch_first= True)
        
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        #final fully connected output layer
        self.fc = nn.Linear(hidden_dim,output_size)
    
    def forward(self, nn_input, hidden):
        batch_size = nn_input.size(0)
        
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        # stack up lstm outputs - Reshaping the lstm outputs such that it can be fit into the fully connected layer
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.fc(lstm_out)
        
        # reshape into (batch_size, seq_length, output_size)
        out = out.view(batch_size, -1, self.output_size)
        
        # get last batch
        out = out[:, -1]

        return out, hidden
    
    def init_hidden(self,batch_size):
        weight = next(self.parameters()).data
        
        if (gpu_available):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [14]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Forward and backward propagation on the neural network
    :param decoder: The PyTorch Module that holds the neural network
    :param decoder_optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param inp: A batch of input to the neural network
    :param target: The target output for the batch of input
    :return: The loss and the latest hidden state Tensor
    """
    # move model to GPU, if available
    if(gpu_available):
        rnn.cuda()
        
#     # Creating new variables for the hidden state, otherwise
#     # we'd backprop through the entire training history
    h = tuple([each.data for each in hidden])

    # zero accumulated gradients
    rnn.zero_grad()
    
    if(gpu_available):
        inputs, target = inp.cuda(), target.cuda()
    
    # get predicted outputs
    output, h = rnn(inputs, h)
    
    # calculate loss
    loss = criterion(output, target)
    
#     optimizer.zero_grad()
    loss.backward()
    # 'clip_grad_norm' helps prevent the exploding gradient problem in RNNs / LSTMs
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)

    optimizer.step()
    return loss.item(), h

In [15]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    batch_losses = []
    
    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):
        
        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)
        
        for batch_i, (inputs, labels) in enumerate(train_loader, 1):
            
            # make sure you iterate over completely full batches, only
            n_batches = len(train_loader.dataset)//batch_size
            if(batch_i > n_batches):
                break
            
            # forward, back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)          
            # record loss
            batch_losses.append(loss)

            # printing loss stats
            if (batch_i % show_every_n_batches) == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    # returns a trained rnn
    return rnn

### Setting the hyperparameters

In [17]:
# Data params
# Sequence Length
sequence_length =  10 # of words in a sequence
# Batch Size
batch_size = 64

# data loader - do not change
train_loader = batch_data(int_text, sequence_length, batch_size)

# Training parameters
# Number of Epochs
num_epochs = 20
# Learning Rate
learning_rate = 0.001

# Model parameters
# Vocab size
vocab_size = len(vocab_to_int)
# Output size
output_size = vocab_size
# Embedding Dimension
embedding_dim = 200
# Hidden Dimension
hidden_dim = 250
# Number of RNN Layers
n_layers = 2

# Show stats for every n number of batches
show_every_n_batches = 1500

print(len(vocab_to_int))

11158


### Train

In [19]:
def save_model(filename, decoder):
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'
    torch.save(decoder, save_filename)

In [20]:
# instantiate the model with hyperparameters and move to gpu if available
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if gpu_available:
    rnn.cuda()

In [21]:
# defining loss and optimization functions for training
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# saving the trained model
save_model('./save/trained_rnn', trained_rnn)
print('Model Trained and Saved')

Training for 20 epoch(s)...
Epoch:    1/20    Loss: 5.460625616232554

Epoch:    1/20    Loss: 4.9654794948895775

Epoch:    2/20    Loss: 4.585750030706057

Epoch:    2/20    Loss: 4.2597872899373375

Epoch:    3/20    Loss: 4.138378633035196

Epoch:    3/20    Loss: 3.9340926497777304

Epoch:    4/20    Loss: 3.8575706123500257

Epoch:    4/20    Loss: 3.6998205739657086

Epoch:    5/20    Loss: 3.665892713879411

Epoch:    5/20    Loss: 3.5308276023864744

Epoch:    6/20    Loss: 3.5188357561222605

Epoch:    6/20    Loss: 3.3979625005722047

Epoch:    7/20    Loss: 3.4019740711192825

Epoch:    7/20    Loss: 3.2914987393220265

Epoch:    8/20    Loss: 3.3005733174850813

Epoch:    8/20    Loss: 3.2019443883101144

Epoch:    9/20    Loss: 3.2224047394922457

Epoch:    9/20    Loss: 3.127979487816493

Epoch:   10/20    Loss: 3.1504560357614144

Epoch:   10/20    Loss: 3.059274341185888

Epoch:   11/20    Loss: 3.0907044703694613

Epoch:   11/20    Loss: 3.005137904246648

Epoch:   12

  "type " + obj.__name__ + ". It won't be checked "


### Checkpoint

In [22]:
def load_model(filename):
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'
    return torch.load(save_filename)

In [23]:
_, vocab_to_int, int_to_vocab, token_dict = load_preprocess()
trained_rnn = load_model('./save/trained_rnn')

### Generate Text

In [24]:
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100):
    """
    Generate text using the neural network
    :param decoder: The PyTorch Module that holds the trained neural network
    :param prime_id: The word id to start the first prediction
    :param int_to_vocab: Dict of word id keys to word values
    :param token_dict: Dict of puncuation tokens keys to puncuation values
    :param pad_value: The value used to pad a sequence
    :param predict_len: The length of text to generate
    :return: The generated text
    """
    rnn.eval()
    
    # create a sequence (batch_size=1) with the prime_id
    current_seq = np.full((1, sequence_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]
    
    for _ in range(predict_len):
        if gpu_available:
            current_seq = torch.LongTensor(current_seq).cuda()
        else:
            current_seq = torch.LongTensor(current_seq)
        
        # initialize the hidden state
        hidden = rnn.init_hidden(current_seq.size(0))
        
        # get the output of the rnn
        output, _ = rnn(current_seq, hidden)
        
        # get the next word probabilities
        p = F.softmax(output, dim=1).data
        if(gpu_available):
            p = p.cpu() # move to cpu
         
        # use top_k sampling to get the index of the next word
        top_k = 5
        p, top_i = p.topk(top_k)
        top_i = top_i.numpy().squeeze()
        
        # select the likely next word index with some element of randomness
        p = p.numpy().squeeze()
        word_i = np.random.choice(top_i, p=p/p.sum())
        
        # retrieve that word from the dictionary
        word = int_to_vocab[word_i]
        predicted.append(word)     
        
        current_seq = current_seq.cpu()
        # the generated word becomes the next "current sequence" and the cycle can continue
        current_seq = np.roll(current_seq, -1, 1)
        current_seq[-1][-1] = word_i
    
#     print("predicred", predicted)
    gen_sentences = ' '.join(predicted)
#     print("before", gen_sentences)
    
    # Replace punctuation tokens
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')
    
    # return all the sentences
    return gen_sentences

In [32]:
# run the cell multiple times to get different results!
gen_length = 400 # modify the length to your preference
prime_word = 'ross' # name for starting the script
SPECIAL_WORDS = {'PADDING': '<PAD>'}
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
pad_word = SPECIAL_WORDS['PADDING']
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

ross: special. you know what? i mean if i can get the door with one side, and i was able to mix for him, but i guess i have to be a good idea if i got a difference or something?

phoebe: yeah, well i guess you do not know anything anymore too.

rachel: well, yeah, i mean.

phoebe: well, i know, i just had one more thing, i-i-i didn’t know that i could be with you for a while, but if you want to go down to work with him.(hands him a twin box)

phoebe: okay. okay, fine, i have to get to work with a song, and the way he had....

phoebe: oh honey, honey, i guess it’s not a good idea. i mean it’ll like this girl would be like bad, or fall.

ross: yeah.

ross: yeah, yeah!

monica: well, i think i was thinking i saw him homemade boyfriend's brand room with you guys! oh my god trusted that? buzz the channel!

joey: whoa!!

joey: whoa! are it out!

phoebe: oh, okay, yeah, well then, so, you ju and then i realized you can be able to go into one.

rachel: okay, well then i guess i have no idea ho

In [33]:
# save script to a text file
f =  open("generated_script_3.txt","w")
f.write(generated_script)
f.close()