In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import re
import pickle
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

import os

# Load Dataset

In [3]:
# _PROJECT_PATH = '/content/drive/MyDrive/Colab Notebooks/nlg'
_PROJECT_PATH = '/home/taindp/VINBRAIN_INTERNSHIP/nlg_lstm'

In [4]:
# %pwd

In [5]:
# read pickle file
pickle_in = open(os.path.join(_PROJECT_PATH,"data/plots_text.pickle"),"rb")
movie_plots = pickle.load(pickle_in)

# count of movie plot summaries
len(movie_plots)

500

# Data Preparation

In [6]:
# clean text
movie_plots = [re.sub("[^a-z' ]", "", i) for i in movie_plots]

In [21]:
movie_plots

["barry is a private with the st airborne division of the united states army stationed at fort campbell kentucky calpernia works as a showgirl at a transgender revue in nashville tennessee when the two met in  barry's roommate justin fisher  brings barry to the club where she performs when barry and calpernia begin seeing each other regularly fisher begins spreading rumors on base about their relationship which appeared to be a violation of the military's don't ask don't tell policy about discussing the sexual orientation of military personnel barry faces increasing harassment and pressure which explode into violence over fourth of july weekend while calpernia performs in a pageant in nashville barry is beaten to death in his sleep with a baseball bat by calvin glover who had been goaded by fisher into committing the crime the film ends with a discussion of the aftermath",
 'chinese exorcist oneeyebrow priest  leads a peaceful life with two disciples ah hao  and ah fang  in a small tow

In [22]:
len(movie_plots)

500

In [7]:
def create_seq(text, seq_len = 5):
    
    sequences = []

    # if the number of tokens in 'text' is greater than 5
    if len(text.split()) > seq_len:
        for i in range(seq_len, len(text.split())):
            # select sequence of tokens
            seq = text.split()[i-seq_len:i+1]
            # add to the list
            sequences.append(" ".join(seq))

        return sequences

    # if the number of tokens in 'text' is less than or equal to 5
    else:
      
      return [text]

In [8]:
seqs = [create_seq(i) for i in movie_plots]

# merge list-of-lists into a single list
seqs = sum(seqs, [])

# count of sequences
len(seqs)

152644

In [23]:
seqs

['barry is a private with the',
 'is a private with the st',
 'a private with the st airborne',
 'private with the st airborne division',
 'with the st airborne division of',
 'the st airborne division of the',
 'st airborne division of the united',
 'airborne division of the united states',
 'division of the united states army',
 'of the united states army stationed',
 'the united states army stationed at',
 'united states army stationed at fort',
 'states army stationed at fort campbell',
 'army stationed at fort campbell kentucky',
 'stationed at fort campbell kentucky calpernia',
 'at fort campbell kentucky calpernia works',
 'fort campbell kentucky calpernia works as',
 'campbell kentucky calpernia works as a',
 'kentucky calpernia works as a showgirl',
 'calpernia works as a showgirl at',
 'works as a showgirl at a',
 'as a showgirl at a transgender',
 'a showgirl at a transgender revue',
 'showgirl at a transgender revue in',
 'at a transgender revue in nashville',
 'a transgend

In [24]:
# movie_plots[0]

In [10]:
# create inputs and targets (x and y)
x = []
y = []

for s in seqs:
    x.append(" ".join(s.split()[:-1]))
    y.append(" ".join(s.split()[1:]))

In [11]:
x[1]

'is a private with the'

In [12]:
y[1]

'a private with the st'

In [13]:
# create integer-to-token mapping
int2token = {}
cnt = 0

for w in set(" ".join(movie_plots).split()):
    int2token[cnt] = w
    cnt+= 1

# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}

token2int["the"], int2token[14271]

(9406, 'dupe')

In [19]:
# torch.save(token2int,'/home/taindp/VINBRAIN_INTERNSHIP/nlg_lstm/resource/token2int.h5')
# torch.save(int2token,'/home/taindp/VINBRAIN_INTERNSHIP/nlg_lstm/resource/int2token.h5')

In [14]:
# set vocabulary size
vocab_size = len(int2token)
vocab_size

16592

In [18]:
def get_integer_seq(seq):
    return [token2int[w] for w in seq.split()]

# convert text sequences to integer sequences
x_int = [get_integer_seq(i) for i in x]
y_int = [get_integer_seq(i) for i in y]

# convert lists to numpy arrays
x_int = np.array(x_int)
y_int = np.array(y_int)

# Model Building

In [19]:
def get_batches(arr_x, arr_y, batch_size):
         
    # iterate through the arrays
    prv = 0
    for n in range(batch_size, arr_x.shape[0], batch_size):
        x = arr_x[prv:n,:]
        y = arr_y[prv:n,:]
        prv = n
        yield x, y

In [25]:
class WordLSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, 200)

        ## define the LSTM
        self.lstm = nn.LSTM(200, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        # if GPU is available
        if (torch.cuda.is_available()):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        # if GPU is not available
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [26]:
net = WordLSTM()

print(net)

WordLSTM(
  (emb_layer): Embedding(16592, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=16592, bias=True)
)


In [27]:
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    
    # loss
    criterion = nn.CrossEntropyLoss()
    
    # push model to GPU
    net.cuda()
    
    counter = 0

    net.train()

    for e in range(epochs):

        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(x_int, y_int, batch_size):
            counter+= 1
            
            # convert numpy arrays to PyTorch arrays
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            # push tensors to GPU
            inputs, targets = inputs.cuda(), targets.cuda()

            # detach hidden states
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1))

            # back-propagate error
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            # update weigths
            opt.step()            
            
            if counter % print_every == 0:
            
              print("Epoch: {}/{}...".format(e+1, epochs),
                    "Step: {}...".format(counter))

In [28]:
# train the model
train(net, batch_size = 32, epochs=20, print_every=256)

Epoch: 1/20... Step: 256...
Epoch: 1/20... Step: 512...
Epoch: 1/20... Step: 768...
Epoch: 1/20... Step: 1024...
Epoch: 1/20... Step: 1280...
Epoch: 1/20... Step: 1536...
Epoch: 1/20... Step: 1792...
Epoch: 1/20... Step: 2048...
Epoch: 1/20... Step: 2304...
Epoch: 1/20... Step: 2560...
Epoch: 1/20... Step: 2816...
Epoch: 1/20... Step: 3072...
Epoch: 1/20... Step: 3328...
Epoch: 1/20... Step: 3584...
Epoch: 1/20... Step: 3840...
Epoch: 1/20... Step: 4096...
Epoch: 1/20... Step: 4352...
Epoch: 1/20... Step: 4608...
Epoch: 2/20... Step: 4864...
Epoch: 2/20... Step: 5120...
Epoch: 2/20... Step: 5376...
Epoch: 2/20... Step: 5632...
Epoch: 2/20... Step: 5888...
Epoch: 2/20... Step: 6144...
Epoch: 2/20... Step: 6400...
Epoch: 2/20... Step: 6656...
Epoch: 2/20... Step: 6912...
Epoch: 2/20... Step: 7168...
Epoch: 2/20... Step: 7424...
Epoch: 2/20... Step: 7680...
Epoch: 2/20... Step: 7936...
Epoch: 2/20... Step: 8192...
Epoch: 2/20... Step: 8448...
Epoch: 2/20... Step: 8704...
Epoch: 2/20... St

In [29]:
_MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/nlg/model'

In [32]:
checkpoint = {'model': net,
      'state_dict': net.state_dict()}

torch.save(checkpoint, os.path.join(_MODEL_PATH,'checkpoint.pth'))