<a href="https://colab.research.google.com/github/wickedWOLF123/DRP/blob/main/wordModellingPytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Word Modelling Using LSTM**

This is a PyTorch re-implementation of the TensorFlow text-generation tutorial (https://www.tensorflow.org/text/tutorials/text_generation), which is itself based on the blog post http://karpathy.github.io/2015/05/21/rnn-effectiveness/ by Andrej Karpathy.

We are building a language model, i.e. next-token prediction, where each token is a single character. We train it to generate Shakespeare-like text in order to understand the usefulness of LSTMs for Natural Language Processing tasks.

In [None]:
#Imports for this project

import torch
import torch.nn as nn
import numpy as np
import os
import time

In [None]:
# Download the Shakespeare corpus as a plain-text file from googleapis
import requests

url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
# Fail fast: the timeout keeps the cell from hanging forever on a dead
# connection, and raise_for_status() stops an HTTP error page from silently
# becoming the training corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

print(f'Length of text: {len(text)} characters')
print(text[:250])

# Every character is later mapped to an integer id, so first collect the
# distinct characters; together they make up our vocabulary.
vocabulary = sorted(set(text))
vocab_size = len(vocabulary)
print(f'{len(vocabulary)} unique characters')

Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

65 unique characters


In [None]:
# Two lookup tables over the vocabulary: integer id -> character, and its
# inverse character -> integer id.
index_to_letter = dict(enumerate(vocabulary))
letter_to_index = {symbol: position for position, symbol in index_to_letter.items()}

In [None]:
# The 1,000,000+ character corpus is cut into pieces of this many characters
# further down, which makes batch processing possible.
sequence_length = 100

# Encode the whole corpus as int64 vocabulary ids, one id per character.
encoded_text = np.fromiter(
    (letter_to_index[symbol] for symbol in text),
    dtype=np.int64,
)

In [None]:
# Slice the encoded text into training pairs of length `sequence_length`.
# Input window i is encoded_text[i : i + sequence_length]; its target is the
# same window shifted one character to the right,
# encoded_text[i + 1 : i + sequence_length + 1].
# NOTE: `input` shadows the Python built-in of the same name; the name is kept
# because later cells read `input` and `output`.
if len(encoded_text) <= sequence_length:
    raise ValueError(
        f'Need more than {sequence_length} encoded characters to build a '
        f'sequence, got {len(encoded_text)}'
    )

# One vectorized pass replaces a Python loop that created ~1M small arrays.
# Row i of `windows` is encoded_text[i : i + sequence_length].
windows = np.lib.stride_tricks.sliding_window_view(encoded_text, sequence_length)

# The last window has no following character to predict, so it can only serve
# as a target. ascontiguousarray materialises ordinary, writable 2-D arrays
# (the sliding-window view itself is read-only and overlapping).
input = np.ascontiguousarray(windows[:-1])
output = np.ascontiguousarray(windows[1:])

print(f'Sequences = {len(input)}')

Sequences = 1115294


In [None]:
# PARAMETERS

# Model architecture: embedding width, LSTM hidden width, stacked LSTM layers.
EMBED_SIZE = 128
HIDDEN_SIZE = 256
NUM_LAYERS = 2

# Training schedule: passes over the data and sequences per batch.
NUM_EPOCHS = 10
BATCH_SIZE = 64
# Number of full batches available; the training loop uses it as an upper
# bound on the batches processed per epoch.
NUM_BATCHES = len(input) // BATCH_SIZE


In [None]:
# Creating the Dataset in Pytorch and change them to tensors

class shakespeareDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing each input sequence with its target sequence.

  Both arguments are converted to ``torch.long`` tensors once, at construction
  time; indexing afterwards only reads from those tensors.
  """

  def __init__(self, input, output):
    """Store ``input`` and ``output`` as ``torch.long`` tensors.

    Args:
      input: array-like of token ids.
      output: array-like of target token ids; item ``idx`` of ``output`` is
        returned together with item ``idx`` of ``input``.
    """
    as_long = dict(dtype=torch.long)
    self.input = torch.tensor(input, **as_long)
    self.output = torch.tensor(output, **as_long)

  def __len__(self):
    """Return the number of items, i.e. the length of the stored input tensor."""
    return len(self.input)

  def __getitem__(self, idx):
    """Return the ``(input, target)`` tensor pair stored at position ``idx``."""
    sample, target = self.input[idx], self.output[idx]
    return sample, target

dataset = shakespeareDataset(input, output)

# Reshuffle the sequences every epoch and drop the final partial batch, so
# every batch holds exactly BATCH_SIZE sequences.
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)

In [None]:
# Creating the model
# We start with an embedding layer that maps each token id to a dense vector,
# then a stacked torch.nn.LSTM processes the embedded sequence,
# and finally a fully connected layer produces an output whose size equals
# the vocabulary size (one score per possible next token).

class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear projection onto the vocabulary."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        """Build the three layers.

        Args:
            vocab_size: number of distinct tokens; sets both the embedding
                table size and the width of the output layer.
            embed_size: dimensionality of each token embedding.
            hidden_size: number of features in the LSTM hidden state.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """Run one forward pass.

        Args:
            x: integer token ids; the LSTM is ``batch_first``, so batched
                input is laid out as (batch, seq_len).
            hidden: ``(h, c)`` tuple as returned by ``init_hidden`` or by a
                previous call. When ``None``, ``nn.LSTM`` starts from
                all-zero states.

        Returns:
            ``(logits, hidden)``: ``logits`` is flattened to two dimensions
            with ``vocab_size`` columns (one row per input position), ready
            for ``nn.CrossEntropyLoss``; ``hidden`` is the updated LSTM state.
        """
        embedded = self.embedding(x)
        out, hidden = self.lstm(embedded, hidden)
        # Merge every leading dimension so each time step becomes one row.
        out = out.reshape(-1, out.size(-1))
        return self.fc(out), hidden

    def init_hidden(self, batch_size):
        """Return all-zero ``(h, c)`` states for ``batch_size`` sequences.

        The states are created with the dtype and device of the model's first
        parameter, so they follow the model wherever it has been moved.
        """
        reference = next(self.parameters())
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        # new_zeros replaces the legacy `.data` / `Tensor.new(...).zero_()` idiom.
        return reference.new_zeros(shape), reference.new_zeros(shape)


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTMModel(
    vocab_size,
    EMBED_SIZE,
    HIDDEN_SIZE,
    NUM_LAYERS,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Kept as the last expression so the notebook displays the model summary.
model.to(device)

LSTMModel(
  (embedding): Embedding(65, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=65, bias=True)
)

In [None]:
# Train for NUM_EPOCHS passes over the dataloader, printing the running loss
# every 250 batches and the average loss at the end of each epoch.
for epoch in range(1, NUM_EPOCHS + 1):
    # The generation cell switches the model to eval mode; make sure every
    # epoch trains in train mode even when cells are re-run out of order.
    model.train()

    epoch_loss = 0.0
    batches_seen = 0
    for batch_idx, (x, y) in enumerate(dataloader):
        if batch_idx >= NUM_BATCHES:
            break
        x, y = x.to(device), y.to(device)

        # The dataloader shuffles, so consecutive batches hold unrelated text
        # windows. Start every batch from a fresh zero state instead of
        # carrying over the state left behind by the previous batch.
        hidden = tuple(h.to(device) for h in model.init_hidden(x.size(0)))

        optimizer.zero_grad()

        # Named `logits` (not `output`) so the notebook-level target array
        # `output` is not overwritten by the training loop.
        logits, _ = model(x, hidden)
        loss = criterion(logits, y.reshape(-1))
        loss.backward()

        # Gradient clipping to prevent exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), 5)

        optimizer.step()

        epoch_loss += loss.item()
        batches_seen += 1

        if (batch_idx + 1) % 250 == 0:
            print(f'Epoch [{epoch}/{NUM_EPOCHS}], Batch [{batch_idx+1}/{len(dataloader)}], Loss: {loss.item():.4f}')

    # Average over the batches actually processed, which also avoids a
    # division by zero when the dataloader is empty.
    avg_loss = epoch_loss / max(batches_seen, 1)
    print(f'Epoch [{epoch}/{NUM_EPOCHS}] completed with Average Loss: {avg_loss:.4f}')


Epoch [1/10], Batch [250/17426], Loss: 1.8550
Epoch [1/10], Batch [500/17426], Loss: 1.6276
Epoch [1/10], Batch [750/17426], Loss: 1.4772
Epoch [1/10], Batch [1000/17426], Loss: 1.4709
Epoch [1/10], Batch [1250/17426], Loss: 1.4107
Epoch [1/10], Batch [1500/17426], Loss: 1.3745
Epoch [1/10], Batch [1750/17426], Loss: 1.3502
Epoch [1/10], Batch [2000/17426], Loss: 1.3560
Epoch [1/10], Batch [2250/17426], Loss: 1.3606
Epoch [1/10], Batch [2500/17426], Loss: 1.3346
Epoch [1/10], Batch [2750/17426], Loss: 1.2859
Epoch [1/10], Batch [3000/17426], Loss: 1.3213
Epoch [1/10], Batch [3250/17426], Loss: 1.2832
Epoch [1/10], Batch [3500/17426], Loss: 1.2872
Epoch [1/10], Batch [3750/17426], Loss: 1.2979
Epoch [1/10], Batch [4000/17426], Loss: 1.2338
Epoch [1/10], Batch [4250/17426], Loss: 1.2444
Epoch [1/10], Batch [4500/17426], Loss: 1.2648
Epoch [1/10], Batch [4750/17426], Loss: 1.2114
Epoch [1/10], Batch [5000/17426], Loss: 1.2387
Epoch [1/10], Batch [5250/17426], Loss: 1.1981
Epoch [1/10], Ba

In [None]:
# Now actually testing the model to see its output

def generate(model, start_string, num_words, temperature=1.0):
  """Sample text from the model one character at a time.

  Args:
    model: module exposing ``init_hidden(batch_size)`` and
      ``forward(x, hidden) -> (logits, hidden)``.
    start_string: non-empty seed text; every character must be a key of
      ``letter_to_index``.
    num_words: number of characters to append to the seed (despite the name,
      one sample is one entry of ``index_to_letter``, i.e. one character).
    temperature: positive scale applied to the logits before sampling; 1.0
      samples from the unmodified distribution, lower values are greedier.

  Returns:
    ``start_string`` followed by ``num_words`` sampled characters.

  Raises:
    ValueError: if ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: if ``start_string`` contains a character outside the vocabulary.
  """
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Follow the model's own device instead of relying on a notebook global.
  model_device = next(model.parameters()).device
  input_seq = torch.tensor(
      [[letter_to_index[ch] for ch in start_string]],
      dtype=torch.long,
      device=model_device,
  )

  sampled = []
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      hidden = tuple(h.to(model_device) for h in model.init_hidden(1))

      # Warm up the hidden state on every seed character except the last one,
      # in a single call; the last character is the first sampling input.
      if input_seq.size(1) > 1:
        _, hidden = model(input_seq[:, :-1], hidden)

      last_char = input_seq[:, -1]
      for _ in range(num_words):
        logits, hidden = model(last_char.unsqueeze(1), hidden)
        # One time step for one sequence: the final row holds the vocabulary scores.
        probabilities = nn.functional.softmax(logits[-1] / temperature, dim=0)
        # multinomial already returns a length-1 long tensor on the right
        # device, so it is fed straight back in as the next input.
        last_char = torch.multinomial(probabilities, 1)
        sampled.append(index_to_letter[last_char.item()])
  finally:
    # Put the model back in whatever mode the caller had it in.
    model.train(was_training)

  return start_string + ''.join(sampled)



In [None]:
# Seed text for the sampler -- change it to whatever you like.
start_text = "How you doing"
generated_text = generate(
    model,
    start_string=start_text,
    num_words=500,
)
print(generated_text)

How you doing,
To right of her opposited king, our jest,
To prove it, and let me do: and art thou hither's use
Only Honess commends the sounding:
I have forgot, methinks I see many lay
A precious boyt with a full rou so? If
thy weapons like a servant's drum.
Who speak, I do bear my covertors: as marry, Henry, daughter?

WARWICK:
Is it even now at obe,
I would be contented me.

GRUMIO:
I called my husband's king with his power;
Our artied arm Lein to bite this most accursed: underness never
Unopempose, the li
