## FRIENDS TV SCRIPT GENERATION

### Import libraries

In [1]:
from collections import Counter
import numpy as np
import torch
import os
import pickle

### Explore Data

In [2]:
#load the dataset
def load_data():
    data_dir = './data/friends.txt'
    input_file = os.path.join(data_dir)
    with open(input_file, 'r', encoding='utf8') as file:
        text= file.read()
    return text
text = load_data()

In [3]:
### Dataset statistics
w = []
for word in text.split():
    w.append(None)
print("Number of unique words: {}".format(len(w)))

lines = text.split('\n')
print('Number of lines: {}'.format(len(lines)))

word_count_per_line = []
for line in lines:
    word_count_per_line.append(len(line.split()))
print('Average number of words in each line: {}'.format(np.average(word_count_per_line)))

print("Sample text from index {} to index {}".format(0,500))
print(text[0:500])

Number of unique words: 625513
Number of lines: 100218
Average number of words in each line: 6.24152347881618
Sample text from index 0 to index 500
[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]

Monica: There's nothing to tell! He's just some guy I work with!

Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!

Chandler: All right Joey, be nice.  So does he have a hump? A hump and a hairpiece?

Phoebe: Wait, does he eat chalk?

(They all stare, bemused.)

Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!

Monica: Okay, everybody relax. This is n


### Preprocessing

1. create lookup table - which will include creating two dictionaries. One to map the words to integers and other to map integers to words
2. tokenize punctuation - The words are split by a space delimiter, when punctuations are a part of the word it will create multiple ids for the same word. The token_lookup will create a dictionary which has symbols as keys, the tokens as values. This dictionary can be used to tokenize the symbols/punctuations thus separating the words from tokens and making it easier for the neural network to predict next word.

In [4]:
def create_lookup_table(text):
    word_count = Counter(text)
    sorted_word_count = sorted(word_count, key= word_count.get, reverse=True)
    
    int_to_vocab = {num: word for num, word in enumerate(sorted_word_count)}
    vocab_to_int = {word: num for num, word in int_to_vocab.items()}
    
    return (vocab_to_int, int_to_vocab)

In [5]:
def token_lookup():
    tokens = dict()
    tokens['.'] = '||PERIOD||'
    tokens[','] = '||COMMA||'
    tokens['"'] = '||QUOTATION_MARK||'
    tokens[';'] = '||SEMICOLON||'
    tokens['!'] = '||EXCLAMATION_MARK||'
    tokens['?'] = '||QUESTION_MARK||'
    tokens['('] = '||LEFT_PAREN||'
    tokens[')'] = '||RIGHT_PAREN||'
    tokens['?'] = '||QUESTION_MARK||'
    tokens['-'] = '||DASH||'
    tokens['\n'] = '||NEW_LINE||'
    return tokens   

### Preprocess the data and store it

In [6]:
#preprocess the data and save it
def preprocess_data():
    SPECIAL_WORDS = {'PADDING': '||PAD||'}
    
    text = load_data()
    token_dictionary = token_lookup()
    
    for key,token in token_dictionary.items():
        text = text.replace(key, ' {}'.format(token))
        
    text = text.lower()
    text = text.split()
    
    vocab_to_int, int_to_vocab = create_lookup_table(text + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in text]
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dictionary), open('preprocess.p', 'wb'))

In [7]:
preprocess_data()

In [8]:
def load_preprocess():
    """
    Load the Preprocessed Training data and return them in batches of <batch_size> or less
    """
    return pickle.load(open('preprocess.p', mode='rb'))

In [9]:
int_text, vocab_to_int, int_to_vocab, token_dict = load_preprocess()

### Build the neural network

check if GPU is available

In [10]:
gpu_available = torch.cuda.is_available()
if(not gpu_available):
    print("Error, no GPU available")
else:
    print("Success, GPU available")

Success, GPU available


#### Input - separate the input into chunks of batches and create a data from the features and target tensors. 

In [11]:
def batch_data(words, sequence_length, batch_size):
    nwords = len(words)//batch_size
    words = words[:len(words)]
    y_length = len(words)-sequence_length
    x = []
    y = []
    for index in range(0,y_length):
        index_end = index + sequence_length
        #features would be from the current index until the end of sequence
        x_batch = words[index:index_end]
        x.append(x_batch)
        #target/predicted would be the next word in the sequence- index_end in this case
        y_batch = words[index_end]
        y.append(y_batch)
        
    #create Tensor datasets from both the x and y lists
    data = torch.utils.data.TensorDataset(torch.from_numpy(np.asarray(x)), torch.from_numpy(np.asarray(y)))
    data_loader = torch.utils.data.DataLoader(data, batch_size=batch_size)
    return data_loader
    

The function forward_back_prop will take care of the forward and backward propogation on the neural network. The paramtes that the function takes in are : neural network module, optimizer, Loss function, Target output for the current batch of input.
This function returns the latest hidden state tensor and loss

In [13]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    # move model to GPU, if available
    if(gpu_available):
        rnn.cuda()
    h = tuple([each.data for each in hidden])

    # zero accumulated gradients
    rnn.zero_grad()
    
    if(gpu_available):
        inputs, target = inp.cuda(), target.cuda()    
    # getting predicted outputs
    output, h = rnn(inputs, h)   
    #calculate loss
    loss = criterion(output, target)
    loss.backward()
    # 'clip_grad_norm' helps prevent the exploding gradient problem in RNNs / LSTMs
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()
    return loss.item(), h

### Training the Neural Network

The function train_rnn will train the Neural network over all the batches for the number of epochs passed to it. It returns the trained rnn

In [14]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches):
    batch_losses = []   
    rnn.train()
    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):
        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)        
        for batch_i, (inputs, labels) in enumerate(train_loader, 1):            
            n_batches = len(train_loader.dataset)//batch_size
            if(batch_i > n_batches):
                break           
            #forward and back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)          
            # record loss
            batch_losses.append(loss)
            # printing the current loss and status
            if (batch_i % show_every_n_batches) == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []
    # returns a trained rnn
    return rnn

### Setting the hyperparameters

In [15]:
sequence_length =  10 # of words in a sequence
batch_size = 128

train_loader = batch_data(int_text, sequence_length, batch_size)

# Training parameters
num_epochs = 20
learning_rate = 0.001

# Model parameters
vocab_size = len(vocab_to_int)
output_size = vocab_size
embedding_dim = 200
hidden_dim = 250
n_layers = 2
show_every_n_batches = 2000

print(len(vocab_to_int))

20919


### Actual Training

In [16]:
def save_model(filename, decoder):
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'
    torch.save(decoder, save_filename)

In [17]:
# instantiate the model with hyperparameters and move to gpu if available
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if gpu_available:
    rnn.cuda()

In [18]:
# defining loss and optimization functions for training
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# saving the trained model
save_model('./save/trained_rnn', trained_rnn)
print('Model Trained and Saved')

Training for 20 epoch(s)...
Epoch:    1/20    Loss: 5.326299237966538

Epoch:    1/20    Loss: 4.714436972737312

Epoch:    1/20    Loss: 4.532453446269035

Epoch:    2/20    Loss: 4.316002941682848

Epoch:    2/20    Loss: 4.089063893079758

Epoch:    2/20    Loss: 4.064766646385193

Epoch:    3/20    Loss: 3.9990834399684547

Epoch:    3/20    Loss: 3.8613693948984147

Epoch:    3/20    Loss: 3.853468917131424

Epoch:    5/20    Loss: 3.6865270157999617

Epoch:    5/20    Loss: 3.617316173553467

Epoch:    5/20    Loss: 3.628477858185768

Epoch:    6/20    Loss: 3.598852265087855

Epoch:    6/20    Loss: 3.5425473648309707

Epoch:    6/20    Loss: 3.5511225508451463

Epoch:    7/20    Loss: 3.5229055145572374

Epoch:    7/20    Loss: 3.4750159566402434

Epoch:    7/20    Loss: 3.485599455356598

Epoch:    8/20    Loss: 3.4678699115797276

Epoch:    8/20    Loss: 3.4274679460525515

Epoch:    8/20    Loss: 3.4416489213705064

Epoch:    9/20    Loss: 3.416726037302918

Epoch:    9/20  

  "type " + obj.__name__ + ". It won't be checked "


### Checkpoint

In [19]:
def load_model(filename):
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'
    return torch.load(save_filename)

In [20]:
_, vocab_to_int, int_to_vocab, token_dict = load_preprocess()
trained_rnn = load_model('./save/trained_rnn')

### Generate Text 

The network will take in the prime word and generates/predicts the next set of words of size predict_len with the prime word as the key. It gets the top 5 probable next words and generates the next word with some randomness among the top 5. After it generates each predicted sequence, the function does post-processing to replace the tokens with the original punctuation.

In [22]:
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100):
    rnn.eval()
    
    # create a sequence (batch_size=1) with the prime_id
    current_seq = np.full((1, sequence_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]
    
    for _ in range(predict_len):
        if gpu_available:
            current_seq = torch.LongTensor(current_seq).cuda()
        else:
            current_seq = torch.LongTensor(current_seq)        
        # initialize the hidden state
        hidden = rnn.init_hidden(current_seq.size(0))        
        # get the output of the rnn
        output, _ = rnn(current_seq, hidden)        
        # get the next word probabilities
        p = F.softmax(output, dim=1).data
        if(gpu_available):
            p = p.cpu() 
         
        # use top_k sampling to get the index of the next word
        top_k = 5
        p, top_i = p.topk(top_k)
        top_i = top_i.numpy().squeeze()
        
        p = p.numpy().squeeze()
        word_i = np.random.choice(top_i, p=p/p.sum())
        
        word = int_to_vocab[word_i]
        predicted.append(word)     
        
        current_seq = current_seq.cpu()
        # the generated word becomes the next "current sequence" and the cycle can continue
        current_seq = np.roll(current_seq, -1, 1)
        current_seq[-1][-1] = word_i
    
    gen_sentences = ' '.join(predicted)
    
    # Post-processing
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')
    return gen_sentences

### Test Output

In [25]:
# run the cell multiple times to get different results!
gen_length = 400 
prime_word = 'ross' 
SPECIAL_WORDS = {'PADDING': '||PAD||'}
pad_word = SPECIAL_WORDS['PADDING']
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

ross: advice, i don’t know what to say.

chandler: well, i was going to do that, but i think we need it, okay? you-you-you can be a little bit!

ross: i know! i mean you didn’t know anything?!

joey: yeah. but then, i don’t know if you didn’t want to tell me that i could get married.

joey: well, i’m-i’m gonna go find you and we get the same suite to be alone and we have sex with you. and then we-we don’t think i was just practicing e aunt you and-and we were gonna get married to the airport.

phoebe: yeah, i’m sorry. i know.

monica: what?! i mean i can’t believe it! i mean you know what? i don’t know, i’m not gonna go get a little bit. i mean it was a little bit. and if you want to be able to get wet a little bit!

rachel: oh no, you can’t tell you that i am.

joey: oh, i know i know! i am a terrible person, i’m gonna be alone in my apartment! okay? i mean you don’t know what i’m gonna go to the street?

monica: well, i think i’m gonna be with a couple of you.

ross: oh, yeah-yeah, i

In [26]:
# save script to a text file
f =  open("generated_script_6.txt","w")
f.write(generated_script)
f.close()

### Final output

Generate 100 scenes individually by varying the prime words and process them to generate one episode

In [26]:
#save script to file 
def save_file(num, generated_script):
    f = open("final_script_{}.txt".format(num), "w")
    f.write(generated_script)
    f.close()

In [27]:
#final script- generate 100 scenes and put together
gen_length = 500
prime_words = ['ross', 'rachel','monica','phoebe','chandler','joey']
SPECIAL_WORDS = {'PADDING': '<PAD>'}

pad_word = SPECIAL_WORDS['PADDING']

#generate the first scene as prime word
prime_word = '[scene'
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
save_file(0,generated_script)


j = 0
#loop through for dialogues starting with each characters
for p in range(100):
    if (j>len(prime_words)-1):
        j = 0
    prime_word = prime_words[j]
    generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
    #write to output script
    save_file(p+1,generated_script)

#### Process and generate the episode

Generated the final episode text file by writing the 100 generated files into it. I used the following command in the terminal

`cat *.txt > Final/The Fake Script.txt`

In [7]:
def line_prepender(filename, line):
    with open(filename, 'r+') as f:
        content = f.read()
        f.seek(0, 0)
        f.write(line.rstrip('\r\n') + '\n\n' + content)
        e = "End"
        f.write('\n\n' + e.rstrip('\r\n'))


In [9]:
line_prepender('The Fake Script.txt', "                                    The One with the Fake Script")

In [80]:
def clean_file(file):
    s = open(file).read()
    s = s.replace('joey', 'Joey')
    s = s.replace('chandler','Chandler')
    s = s.replace('monica','Monica')
    s = s.replace('rachel', 'Rachel')
    s = s.replace('ross','Ross')
    s = s.replace('phoebe','Phoebe')
    f = open(file, 'w')
    f.write(s)
    f.close()

In [81]:
clean_file('The Fake Script.txt')