# Project - Text Generation by LSTM

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

## Data preparation

In [2]:
# First, build the word dictionary
def get_dict(file):
    words = ['<bos>','<eos>']
    i = 0
    with open(file, encoding ='UTF8') as f:
        for line in f:
            for word in line.split():
                words.append(word)

    words_dict = list(set(words))
    return words_dict

In [3]:
# Encode sentences
def sentences_to_tensor(file):
    words_dict = get_dict(file)
    sens = []
    
    with open(file, encoding ='UTF8') as f:
        for line in f:
            line = "<bos>" + line + "<eos>"
            line = line.split()
            sen = np.zeros(len(line))
            for index,word in enumerate(line):
                sen[index] = words_dict.index(word)
                
            sens.append(torch.LongTensor(sen))
    
    return sens


In [4]:
train_data = sentences_to_tensor('./data/ptb.train.txt')
dev_data = sentences_to_tensor('./data/ptb.valid.txt')
test_data = sentences_to_tensor('./data/ptb.test.txt')

In [5]:
print("Number of train dataset:", len(train_data), '\n', 
     "Number of valid dataset:", len(dev_data), '\n',
     "Number of test dataset:", len(test_data))

Number of train dataset: 42068 
 Number of valid dataset: 3370 
 Number of test dataset: 3761


## Model Architecture

### Model 1 - Simple LSTMcell 

In [19]:
class LSTMCell(nn.Module):
    def __init__(self, inputsize, hiddensize):
        super(LSTMCell,self).__init__()
        self.inputsize=inputsize
        self.hiddensize=hiddensize
        
        #input
        
        Wi=torch.zeros(inputsize, hiddensize).cuda()
        self.Wi=nn.Parameter(nn.init.xavier_normal_(Wi))
        Ui= torch.zeros(hiddensize,hiddensize).cuda()
        self.Ui=nn.Parameter(nn.init.xavier_normal_(Ui))
        self.bi=nn.Parameter(torch.ones(hiddensize).cuda())
        
        #forget
        
        Wf=torch.zeros(inputsize, hiddensize).cuda()
        self.Wf=nn.Parameter(nn.init.xavier_normal_(Wf))
        Uf= torch.zeros(hiddensize,hiddensize).cuda()
        self.Uf=nn.Parameter(nn.init.xavier_normal_(Uf))
        self.bf=nn.Parameter(torch.ones(hiddensize).cuda())
        
        #output
        
        Wo=torch.zeros(inputsize, hiddensize).cuda()
        self.Wo=nn.Parameter(nn.init.xavier_normal_(Wo))
        Uo= torch.zeros(hiddensize,hiddensize).cuda()
        self.Uo=nn.Parameter(nn.init.xavier_normal_(Uo))
        self.bo=nn.Parameter(torch.ones(hiddensize).cuda())
        
        
        #g
        
        Wg=torch.zeros(inputsize, hiddensize).cuda()
        self.Wg=nn.Parameter(nn.init.xavier_normal_(Wg))
        Ug= torch.zeros(hiddensize,hiddensize).cuda()
        self.Ug=nn.Parameter(nn.init.xavier_normal_(Ug))
        self.bg=nn.Parameter(torch.ones(hiddensize).cuda())
        
    #Implement on sentence level
    def forward(self,inp,initialize=None):
        seq_len,input_size=inp.size()
        hidden_sequence=torch.zeros((seq_len, self.hiddensize)).cuda()
        
        # Initialize h,c
        if initialize is None:
            h= torch.zeros(self.hiddensize).cuda()
            c= torch.zeros(self.hiddensize).cuda()
        else:
            h,c=initialize
            
        #updating gates
        for t in range(seq_len-1):    
            i=torch.sigmoid(inp[t,:]@self.Wi+h@self.Ui+self.bi)
            f=torch.sigmoid(inp[t,:]@self.Wf+h@self.Uf+self.bf)
            o=torch.sigmoid(inp[t,:]@self.Wo+h@self.Uo+self.bo)
            g=torch.tanh(inp[t,:]@self.Wg+h@self.Ug+self.bg)
            c=f*c+i*g
            h=o*torch.tanh(c)
            
            # len(hidden_sequence) = seq_len-1, we don't need (h,c) of <eos> ???
            hidden_sequence[t,:] = h
            
        return hidden_sequence,(h,c)

In [20]:
class Simple_LSTM(nn.Module):
    def __init__(self,number_words,emb_dim=100):
        super().__init__()
        
        self.embedding=nn.Embedding(number_words,emb_dim)
        self.lstm=LSTMCell(emb_dim,200)
        self.output_proj=nn.Linear(200,number_words)
            
    def forward(self,inp):
        input_tensor=self.embedding(inp)
        hidden, (h,c)=self.lstm(input_tensor)
        output=self.output_proj(hidden)
        
        return output.squeeze(0)

In [45]:
def train(model, train_data, dev_data,epochs=10, lr=0.001, gpu=True):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
        gpu: specify training device
    
    '''
    
    if gpu:
        model.cuda()
        
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = F.cross_entropy
    
    #loss list
    train_loss = torch.zeros(epochs).cuda()
    dev_loss = torch.zeros(epochs).cuda()
    
    for epoch in range(epochs):
        random.shuffle(data)
        epoch_loss=0
        valid_loss=0
        model.train()
        
        for sentence in train_data:
            
            if gpu:
                sentence = sentence.cuda()

            # zero accumulated gradients
            optimizer.zero_grad()
            
            # get the output from the model
            output = model(sentence[:-1])
            
            # calculate the loss and perform backprop
            sentence_loss = criterion(output, sentence[1:], reduction="sum")
            loss = sentence_loss / len(sentence-1)
            loss.backward()
            
            epoch_loss += loss.item()
            
            optimizer.step()
            
        with torch.no_grad():
            for sentence in dev_data:
                sentence = sentence.cuda()
                output = model(sentence[:-1])
                sentence_loss = criterion(output, sentence[1:], reduction="sum")
                loss = sentence_loss / len(sentence-1)
                valid_loss += loss.item()
                
        print("Epoch: {}/{} ".format(epoch+1, epochs),
                "avg_train_loss: {:.4f} ".format(epoch_loss/len(train_data)),
             "avg_dev_loss: {:.4f} ".format(valid_loss/len(dev_data)))
        
        train_loss[epoch] = epoch_loss/len(train_data)
        dev_loss[epoch] = valid_loss/len(dev_data)
        
    return train_loss, dev_loss

In [73]:
word_dict = get_dict('./data/ptb.train.txt')
model = Simple_LSTM(len(word_dict))
print(model)

Simple_LSTM(
  (embedding): Embedding(10001, 100)
  (lstm): LSTMCell()
  (output_proj): Linear(in_features=200, out_features=10001, bias=True)
)


In [47]:
list1, list2 = train(model, train_data, dev_data)

Epoch: 1/10  avg_train_loss: 5.1293  avg_dev_loss: 12.1270 
Epoch: 2/10  avg_train_loss: 4.6731  avg_dev_loss: 12.7206 
Epoch: 3/10  avg_train_loss: 4.5357  avg_dev_loss: 13.0890 
Epoch: 4/10  avg_train_loss: 4.4533  avg_dev_loss: 13.4295 
Epoch: 5/10  avg_train_loss: 4.3917  avg_dev_loss: 13.5033 
Epoch: 6/10  avg_train_loss: 4.3484  avg_dev_loss: 13.8617 
Epoch: 7/10  avg_train_loss: 4.3212  avg_dev_loss: 14.1961 
Epoch: 8/10  avg_train_loss: 4.2995  avg_dev_loss: 14.7355 
Epoch: 9/10  avg_train_loss: 4.2812  avg_dev_loss: 15.4136 
Epoch: 10/10  avg_train_loss: 4.2687  avg_dev_loss: 15.4750 


In [86]:
def text_generation(num, file, data, dictionary, model):
    ''' Word predition
    
        Arguments
        ---------
        num: number of sentences to generate
        file: 
    '''
    
    with open(file, encoding ='UTF8') as f:
            for i in range(num):
                line = f.readline()
                
                logits = model(data[i].cuda())
                pred = logits.argmax(dim=1).cpu()
                sentence = ''
            
                for index in pred:
                    word = dictionary[index]
                    sentence = sentence + word + ' '
                
                print("Sentence {}:".format(i), '\t', line,
                     "Prediction:", '\t', sentence)