<a href="https://colab.research.google.com/github/sum-coderepo/DeepLearning-Pytorch/blob/master/PytorchTutorials/NN_languageModel1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import torch
import os
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import numpy as np
import random
import math
import time
import string
import pandas
from sklearn.model_selection import train_test_split
from spacy.lang.en import English
!python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
#nlp.add_pipe('sentencizer')
nlp.add_pipe(nlp.create_pipe('sentencizer'))
nlp.max_length = 15000000

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
#pip install --ignore-installed --upgrade tensorflow-gpu==2.4.0

In [23]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)


class Corpus(object):
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        # Add words to the dictionary
        split_sent = split_in_sentences1(path)
        tokens = {1:0,2:0,3:0}
        
        train, test = train_test_split(split_sent, train_size = 0.7)
        test , val = train_test_split(test, train_size = 0.66)        
        print(len(train), len(test), len(val))
        
        lst = [train, test, val]
        seq = 0
        for data in lst:
          seq = seq + 1
          for line in data:
            words = line.split() + ['<eos>']
            tokens[seq] += len(words)
            for word in words:
                self.dictionary.add_word(word)
                
        
        print(tokens)
        train_ids = torch.LongTensor(tokens[1])
        test_ids = torch.LongTensor(tokens[2])
        val_ids = torch.LongTensor(tokens[3])
             
        train_tensor = self.create_tensors(train_ids, train)
        test_tensor = self.create_tensors(test_ids, test)
        val_tensor = self.create_tensors(val_ids, val)
        
        return train_tensor, test_tensor, val_tensor
      
      
    def create_tensors( self, tensor, data):
        token = 0
        for line in data:
                words = line.split() + ['<eos>']
                for word in words:
                    tensor[token] = self.dictionary.word2idx[word]
                    #print(word, self.dictionary.word2idx[word])
                    token += 1
        num_batches = tensor.size(0) // batch_size
        tensor = tensor[:num_batches*batch_size]
        return tensor.view(batch_size, -1)  
      
       
def split_in_sentences1(path): 
    split_sent = []
    with open(path, 'r') as f:
      data = f.read()
      data = data.split('\n')
      data = " ".join(data) 
      data = data.split('.')
      for sent in data:
        if len(str(sent).strip())> 0:
            split_sent.append(str(sent).strip())
    return split_sent  
    

In [27]:
import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
batch_size = 20 
seq_length = 30 
learning_rate = 0.001

In [25]:
corpus = Corpus()
train_tensors, test_tensors, val_tensors = corpus.get_data('brown.txt', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = train_tensors.size(1) // seq_length

34771 9835 5067
{1: 744508, 2: 211643, 3: 109601}


In [28]:
train_tensors.size(), test_tensors.size(), val_tensors.size()

(torch.Size([20, 37225]), torch.Size([20, 10582]), torch.Size([20, 5480]))

In [29]:
num_batches

1240

In [30]:
# RNN based language model
class LSTMLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x, h):
        
        x = self.embed(x)     
        out, (h, c) = self.lstm(x, h)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        out = self.linear(out)
        return out, (h, c)     
    
    def init_hidden(self,batch_size):
        self.hidden = Variable(T.zeros(self.n_layers, batch_size, self.hidden_size).cuda())


model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device)

In [31]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Truncated backpropagation
def detach(states):
    return [state.detach() for state in states] 

In [None]:
# Train the model
for epoch in range(num_epochs):
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
    
    for i in range(0, train_tensors.size(1) - seq_length, seq_length): 
        # Get mini-batch inputs and targets
        inputs = train_tensors[:, i:i+seq_length].to(device)
        targets = train_tensors[:, (i+1):(i+1)+seq_length].to(device)
        states = detach(states)
        outputs, states = model(inputs, states)
        
        loss = criterion(outputs, targets.reshape(-1))
        
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        step = (i+1) // seq_length
        if step % 25 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                   .format(epoch+1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item())))

Epoch [1/5], Step[0/1240], Loss: 11.3973, Perplexity: 89082.56
Epoch [1/5], Step[25/1240], Loss: 8.1134, Perplexity: 3338.84
Epoch [1/5], Step[50/1240], Loss: 7.9097, Perplexity: 2723.53
Epoch [1/5], Step[75/1240], Loss: 7.6686, Perplexity: 2140.14
Epoch [1/5], Step[100/1240], Loss: 7.4877, Perplexity: 1786.02
Epoch [1/5], Step[125/1240], Loss: 7.4912, Perplexity: 1792.26
Epoch [1/5], Step[150/1240], Loss: 7.4028, Perplexity: 1640.56
Epoch [1/5], Step[175/1240], Loss: 7.5551, Perplexity: 1910.40
Epoch [1/5], Step[200/1240], Loss: 7.2774, Perplexity: 1447.21
Epoch [1/5], Step[225/1240], Loss: 7.2473, Perplexity: 1404.25
Epoch [1/5], Step[250/1240], Loss: 7.9048, Perplexity: 2710.13
Epoch [1/5], Step[275/1240], Loss: 6.9946, Perplexity: 1090.76
Epoch [1/5], Step[300/1240], Loss: 6.9798, Perplexity: 1074.71
Epoch [1/5], Step[325/1240], Loss: 7.3791, Perplexity: 1602.07
Epoch [1/5], Step[350/1240], Loss: 6.8993, Perplexity: 991.58
Epoch [1/5], Step[375/1240], Loss: 7.1031, Perplexity: 1215

In [16]:
model.parameters, model.eval

(<bound method Module.parameters of RNNLM(
   (embed): Embedding(16923, 128)
   (lstm): LSTM(128, 1024, batch_first=True)
   (linear): Linear(in_features=1024, out_features=16923, bias=True)
 )>, <bound method Module.eval of RNNLM(
   (embed): Embedding(16923, 128)
   (lstm): LSTM(128, 1024, batch_first=True)
   (linear): Linear(in_features=1024, out_features=16923, bias=True)
 )>)

In [17]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = state = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
                 torch.zeros(num_layers, batch_size, hidden_size).to(device))
    for i in range(0, data_source.size(0) - 1, seq_length):
        data = data_source[:, i:i+seq_length].to(device)
        targets = data_source[:, (i+1):(i+1)+seq_length].to(device)
        output, hidden = model(data, hidden)
        print(output.size())
        output_flat = output.view(-1, ntokens)
        print(output_flat.size(), targets.size())
        total_loss += len(data) * criterion(output_flat, targets.reshape(-1)).data
        print(total_loss)
        hidden = detach(states)
        
    return total_loss.item()/ len(data_source)

In [18]:
# Run on test data.
test_loss = evaluate(test_tensors)
print('=' * 89)
print('loss {:5.2f} | perplexity {:8.2f}'.format(test_loss, np.exp(test_loss)))
print('=' * 89)
#loss.item(), ))

torch.Size([600, 16923])
torch.Size([600, 16923]) torch.Size([20, 30])
tensor(191.7822)
loss  9.59 | perplexity 14604.86


In [1]:
# Run on test data.
test_loss = evaluate(val_tensors)
print('=' * 89)
print('loss {:5.2f} | perplexity {:8.2f}'.format(test_loss, np.exp(test_loss)))
print('=' * 89)
#loss.item(), ))

NameError: ignored