In [35]:
#importing libraries
print("Importing libraries ... ", end="")
from collections import Counter
from keras.preprocessing.text import text_to_word_sequence
import datetime
import json

import numpy as np
import pandas as pd

from tqdm import tqdm

from keras.preprocessing.text import text_to_word_sequence
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

import matplotlib.pyplot as plt
print("done.")

Importing libraries ... done.


In [36]:
#run this notebook on google colab upload dataset on google drive and use from there
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
with open('/content/drive/My Drive/datasets/brown.txt', 'r') as f: 
    data = f.read()
data = data.split('.')

In [38]:
import re
def clean(string):
    string = string.encode('ascii', 'ignore').decode('ascii')
    string = re.sub('^\[.*\]','',string)
    string = re.sub('^.*:','',string)
    string = string.strip()
    string = string.strip('\"')
    string = re.sub('\?+','?',string)
    string = re.sub('!+','!',string)
    string = re.sub('\.+','.',string)
    string = re.sub(' +',' ',string)
    string = re.sub('\(.*\)','',string)
    string = string.replace(':)','')
    string = string.replace(';)','')
    string = string.replace('*','')
    string = string.strip()
    return string

In [39]:
for i in range(len(data)):
  data[i] = data[i].replace('\n',' ')
  data[i] = clean(data[i])
  data[i] = data[i] + '.'
cleaned = []
for i in range(len(data)):
  if len(data[i])>=10:
    cleaned.append(data[i])

In [40]:
len(cleaned)

48920

In [41]:
# from random import shuffle
# shuffle(data)
train_data = cleaned[:35000]
validation_data = cleaned[35000:40000]
test_data = cleaned[40000:]

In [42]:
print("Initialising Hyperparameters ... ", end="")

EMBEDDING_DIM = 256

SEQUENCE_LENGTH = 2

EPOCHS = 2
BATCH_SIZE = 64
LR = 3e-3

PADDING_IDX = 0
LSTM_CELLS = 192
VANILLA_CELLS = 96
LSTM_LAYERS = 2
LSTM_DROPOUT = 0.4
BI_LSTM = False

PREDICTION_SIZE = 20

print("done.")

Initialising Hyperparameters ... done.


In [43]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)

Using device: cuda


In [44]:
print("cudnn enabled:", torch.backends.cudnn.enabled)

cudnn enabled: True


In [45]:
print("GPU Name: ",torch.cuda.get_device_name(0))

GPU Name:  Tesla K80


In [46]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, seq_len):

        # length of sequence in consideration
        self.seq_len = seq_len
        # a list of all the tokens
        self.words = self.load_words()
        # a sorted list of all the tokens
        self.uniq_words = self.get_uniq_words()
        # storing the vocublary size
        self.vocab_size = len(self.uniq_words)

        # dictionaries to convert to and from words
        # used for embeddings and the generated text 
        # Note the index+1 0 is kept for padding 
        self.index_to_word = {
            index: word for index, word in enumerate(self.uniq_words)
        }
        self.word_to_index = {
            word: index for index, word in enumerate(self.uniq_words)
        }

        # a grand sequence of the words, each translated 
        # into its respective indices
        self.words_indexes = [self.word_to_index[w] for w in self.words]


    def load_words(self):
        text = ' '.join(train_data)
        return text_to_word_sequence(text) 

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    # returns length of the dataset. required by torch
    def __len__(self):
        return len(self.words_indexes) - self.seq_len

    # returns an item, as if accessed from the dictionary as
    # dataset[i]. required by torch
    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.seq_len]),
            torch.tensor(self.words_indexes[index+1:index+self.seq_len+1]),
        )

In [47]:
class Model(nn.Module):
    def __init__(
            self,
            vocab_size,
            embedding_dim,
            lstm_cells,
            lstm_layers,
            lstm_dropout,
            is_bidirectional,
            vanilla_cells,
            seq_len,
            batch_size
        ):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_cells = lstm_cells
        self.lstm_layers = lstm_layers
        self.lstm_dropout = lstm_dropout
        self.is_bidirectional = is_bidirectional
        self.vanilla_cells = vanilla_cells
        self.seq_len = seq_len
        self.num_directions = 2 if is_bidirectional else 1
        self.batch_size = batch_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_cells,
            num_layers=lstm_layers,
            dropout=lstm_dropout,
            bidirectional=is_bidirectional
        )

        self.fc1 = nn.Linear(
            lstm_cells*self.num_directions,
            vanilla_cells
        )

        self.fc2 = nn.Linear(
            vanilla_cells,
            vocab_size
        )

    def forward(self, x, prev_state):
        embed_output = self.embedding(x)
        lstm_output, new_state = self.lstm(embed_output, prev_state)
        fc1_output = self.fc1(lstm_output)
        fc2_output = self.fc2(fc1_output)

        return fc2_output, new_state

    def init_lstm(self):
        return (
            torch.zeros(
                self.lstm_layers*self.num_directions, 
                self.seq_len,
                self.lstm_cells
            ),
            torch.zeros(
                self.lstm_layers*self.num_directions, 
                self.seq_len,
                self.lstm_cells
            )
        )

In [48]:
print("Processing Dataset ... ", end="")
dataset = Dataset(SEQUENCE_LENGTH)
VOCAB_SIZE = dataset.vocab_size
print("done.")

Processing Dataset ... done.


In [49]:
print("Creating Model ... ", end="")
model = Model(
    VOCAB_SIZE,
    EMBEDDING_DIM,
    LSTM_CELLS,
    LSTM_LAYERS,
    LSTM_DROPOUT,
    BI_LSTM,
    VANILLA_CELLS,
    SEQUENCE_LENGTH,
    BATCH_SIZE
)
print("done.")
print("="*80)
print("MODEL:")
print(model)
print("="*80)
print("Shifting model to the GPU ... ", end="")
model.to(DEVICE)
print("done.")

Creating Model ... done.
MODEL:
Model(
  (embedding): Embedding(38835, 256)
  (lstm): LSTM(256, 192, num_layers=2, dropout=0.4)
  (fc1): Linear(in_features=192, out_features=96, bias=True)
  (fc2): Linear(in_features=96, out_features=38835, bias=True)
)
Shifting model to the GPU ... done.


In [50]:
print("Setting up loss and optimizers ... ", end="")

CRITERION = nn.CrossEntropyLoss()
OPTIMIZER = optim.Adam(model.parameters(), lr=LR)
print("done.")

Setting up loss and optimizers ... done.


In [51]:
def train(dataset, model):
    # getting the model ready for training ...
    model.train()
    # preparing the data to be iterated over
    data_generator = DataLoader(dataset, batch_size=BATCH_SIZE, drop_last=True)
    
    # stores losses for all the epochs
    loss_values = []
    # stores a list of the epoch number
    epoch_numbers = []
    
    # going over all the epochs
    for epoch in range(EPOCHS):
        # initialising the model for each epoch
        h_state, c_state = model.init_lstm()
        
        # sending the inits to the GPU
        h_state, c_state = h_state.to(DEVICE), c_state.to(DEVICE)

        # iterating over the dataset
        for batch, (x, y) in enumerate(tqdm(data_generator)):
            # transferring data to the GPU
            x, y = x.to(DEVICE), y.to(DEVICE)
            
            # resetting the gradients for each batch 
            # for proper training
            OPTIMIZER.zero_grad()

            # predictions from the model
            y_pred, (h_state, c_state) = model(x, (h_state, c_state))

            loss = CRITERION(y_pred.transpose(1, 2), y)
            h_state = h_state.detach()
            c_state = c_state.detach()
            loss.backward()
            OPTIMIZER.step()

        loss_values.append(loss.item())
        epoch_numbers.append(epoch+1)
        print(f"Epoch: {epoch}, loss: {loss.item()}")

In [52]:
train(dataset, model)

100%|██████████| 12030/12030 [07:49<00:00, 25.60it/s]
  0%|          | 4/12030 [00:00<06:20, 31.58it/s]

Epoch: 0, loss: 5.82609224319458


100%|██████████| 12030/12030 [07:49<00:00, 25.62it/s]


Epoch: 1, loss: 5.690529823303223


In [162]:
perplexity  = torch.exp(torch.Tensor([5.690529823303223]))
print(perplexity)

tensor([296.0504])


In [54]:
x = datetime.datetime.now()
current_time = str(x.year)+"-"+str(x.month)+"-"+str(x.day)+"-"+str(x.hour)+"-"+str(x.minute)

In [55]:
torch.save(model.state_dict(), f'{current_time}.model')

In [56]:
HYPERPARAMETERS = {
    "EMBEDDING_DIM": EMBEDDING_DIM,
    
    "SEQUENCE_LENGTH": SEQUENCE_LENGTH,

    "EPOCHS": EPOCHS,
    "BATCH_SIZE": BATCH_SIZE,
    "LR": LR,

    "LSTM_CELLS": LSTM_CELLS,
    "VANILLA_CELLS": VANILLA_CELLS,
    "LSTM_LAYERS": LSTM_LAYERS,
    "LSTM_DROPOUT": LSTM_DROPOUT,
    "BI_LSTM": BI_LSTM,

    "PREDICTION_SIZE": PREDICTION_SIZE,
}

In [57]:
with open(f'{current_time}.json', 'w+') as fp:
    json.dump(HYPERPARAMETERS, fp)

In [136]:
# shifting model back to the CPU
model.to('cpu')
# getting the model ready for testing ...
model.eval()

Model(
  (embedding): Embedding(38835, 256)
  (lstm): LSTM(256, 192, num_layers=2, dropout=0.4)
  (fc1): Linear(in_features=192, out_features=96, bias=True)
  (fc2): Linear(in_features=96, out_features=38835, bias=True)
)

In [153]:
def generate(dataset, model, seed):
    # # shifting model back to the CPU
    # model.to('cpu')
    # # getting the model ready for testing ...
    # model.eval()
    # a list of words
    words = text_to_word_sequence(seed)
    # # initialising the model parameters    
    h_state, c_state = model.init_lstm()
    # going over, and predicting
    x = torch.tensor([[dataset.word_to_index[w] for w in words[-2:]]])
    
    y_pred, (h_state, c_state) = model(x, (h_state, c_state))
    loss = CRITERION(y_pred.transpose(1, 2), x)
    perplexity  = torch.exp(torch.Tensor([loss.item()]))
    perp = int(perplexity[0])
    #print(perp)
    return perp/len(words)
    

In [163]:
seeds = validation_data
lm = []
lines = []
tot, num = 0, 0
for i in range(len(seeds)):
    # seed_list = seeds[i].split()
    # seed = ""
    # for word in seed_list:
    #     seed += word +" "
    # seed = seed[:-1]
    if i%3000==0:
      print(i)
    try:
      perplexity = generate(dataset, model, seed=seeds[i])
      tot += perplexity
      num += 1
      sent = f'{seeds[i]}\t{perplexity}'
      lines.append(sent)
      
      
    except:
      pass

0
3000


In [164]:
lines[:5]

['Greatest thing that ever happened.\t419.0',
 "Now Eileen really would have to settle down to love honor and obey, and she'd have to quit drinking.\t1828.6315789473683",
 "He'd come East for the christening, by God he would.\t41.2",
 'Before he left town Pat saw to it that I was fixed up with a job.\t2195.9375',
 'Pat had contacts all over the labor movement.\t512.75']

In [165]:
with open('validation_lstm.txt', 'w') as f:
    for item in lines:
        f.write("%s\n" % item)

In [166]:
tot/num

21781.48651231239