In [6]:
import warnings
from IPython.display import display

warnings.filterwarnings("ignore")

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

In [9]:
with open("data/wizard_of_oz.txt", "r", encoding = "utf-8") as f:
    text = f.read()

print(text[:200])

chars = sorted(set(text))
print(f"\nNumber of unique chars: {len(chars)}")

﻿The Project Gutenberg eBook of Dorothy and the Wizard in Oz
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no rest

Number of unique chars: 82


In [10]:
char_to_int = {c:i for i, c in enumerate(chars)}
int_to_char = {i:c for i, c in enumerate(chars)}
encoder = lambda s: [char_to_int[char] for char in s]
decoder = lambda i: "".join([int_to_char[n] for n in i])

In [21]:
encoder("hello !")

[62, 59, 66, 66, 69, 1, 2]

In [32]:
decoder([62, 59, 66, 66, 69, 1, 2])

'hello !'

In [30]:
data = torch.tensor(encoder("This is the Text !"), dtype = torch.long)

In [31]:
data

tensor([45, 62, 63, 73,  1, 63, 73,  1, 74, 62, 59,  1, 45, 59, 78, 74,  1,  2])

In [35]:
import re

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Split text into words
    words = text.split()
    return words

# Example usage:
text = "Hello! How are you doing?"
processed_text = preprocess_text(text)
print(processed_text)


['hello', 'how', 'are', 'you', 'doing']


In [36]:
def generate_ngram_sequences(words, n):
    sequences = []
    for i in range(len(words) - n + 1):
        context = tuple(words[i:i+n-1])
        next_word = words[i+n-1]
        sequences.append((context, next_word))
    return sequences

# Example usage:
processed_text = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
n = 4
ngram_sequences = generate_ngram_sequences(processed_text, n)
print(f"{n}-gram sequences:", ngram_sequences)

4-gram sequences: [(('the', 'quick', 'brown'), 'fox'), (('quick', 'brown', 'fox'), 'jumps'), (('brown', 'fox', 'jumps'), 'over'), (('fox', 'jumps', 'over'), 'the'), (('jumps', 'over', 'the'), 'lazy'), (('over', 'the', 'lazy'), 'dog')]


In [16]:
torch.random.seed()
torch.randint(5, (4, ))

tensor([2, 2, 0, 2])

In [25]:
from collections import defaultdict

# Step 1: Gather Text Data
text_data = "The quick brown fox jumps over the lazy dog."

# Step 2: Preprocess Text Data
def preprocess_text(text):
    text = text.lower()
    text = text.replace(".", "")  # Remove punctuation
    return text

processed_text = preprocess_text(text_data)

# Step 3: Convert Text to Numerical Representation
def tokenize_words(text):
    # tokens = text.split()
    tokens = sorted(set(text))
    word_to_id = {}
    id_to_word = {}
    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token
    return word_to_id, id_to_word

word_to_id, id_to_word = tokenize_words(processed_text)

# Step 4: Generate Input-Output Pairs
def generate_bigram_sequences(tokens):
    sequences = []
    for i in range(len(tokens) - 1):
        context = tokens[i]
        next_word = tokens[i + 1]
        sequences.append((context, next_word))
    return sequences

sequences = generate_bigram_sequences(processed_text.split())

# Print the generated sequences
print("Generated Bigram Sequences:")
for context, next_word in sequences:
    print(f"Context: {context}, Next Word: {next_word}")


Generated Bigram Sequences:
Context: the, Next Word: quick
Context: quick, Next Word: brown
Context: brown, Next Word: fox
Context: fox, Next Word: jumps
Context: jumps, Next Word: over
Context: over, Next Word: the
Context: the, Next Word: lazy
Context: lazy, Next Word: dog


In [18]:
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class
class BigramDataset(Dataset):
    def __init__(self, sequences, word_to_id):
        self.sequences = sequences
        self.word_to_id = word_to_id

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        context, next_word = self.sequences[idx]
        context_id = self.word_to_id[context]
        next_word_id = self.word_to_id[next_word]
        return torch.tensor(context_id), torch.tensor(next_word_id)

# Create an instance of the custom dataset
dataset = BigramDataset(sequences, word_to_id)

# Define batch size
batch_size = 4

# Create a DataLoader for training
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Print the first batch of data
for batch_idx, (inputs, labels) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print("Inputs:", inputs)
    print("Labels:", labels)
    print()
    if batch_idx == 0:
        break


Batch 1:
Inputs: tensor([4, 5, 1, 3])
Labels: tensor([2, 7, 3, 4])



In [23]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(4, 5)  # Fully connected layer with input size 4 and output size 5
        self.fc2 = nn.Linear(5, 4)   # Fully connected layer with input size 5 and output size 4

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # Apply ReLU activation function to the output of the first layer
        x = self.fc2(x)              # Pass the output through the second layer
        return x

# Create an instance of the neural network
model = SimpleNN()
print(model)


SimpleNN(
  (fc1): Linear(in_features=4, out_features=5, bias=True)
  (fc2): Linear(in_features=5, out_features=4, bias=True)
)


In [None]:
import torch.optim as optim

num_epochs = 4
# Define loss function
criterion = nn.CrossEntropyLoss()

# Choose optimization algorithm and set hyperparameters
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs, labels)

        # Backpropagation
        loss.backward()

        # Update the parameters
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

# Evaluation
# with torch.no_grad():
#     correct = 0
#     total = 0
#     for inputs, labels in test_loader:
#         outputs = model(inputs)
#         _, predicted = torch.max(outputs.data, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()
#     accuracy = correct / total
#     print(f"Accuracy on test set: {100 * accuracy:.2f}%")


In [29]:
model.fc1.bias

Parameter containing:
tensor([-0.0456,  0.2101, -0.3432, -0.4100,  0.2428], requires_grad=True)

#### Each word/subword or more correctly token in the vocabulary will have it's own embedding vector of n-dimensions
#### The vocabulary size is typically decided based on the size and diversity of the text corpus used for training.
#### Hence, before start of the training we will have a large matrix of size (vocab_size, embeddings) from where each embedding is picked based on it's occurance during training

##### nn.Embedding is a PyTorch module that is used to create word embeddings in neural network models for natural language processing (NLP) tasks. It maps discrete tokens (e.g., words or characters) to continuous vectors of fixed size, allowing the model to learn representations of words in a dense embedding space.

#### How nn.Embedding Works:

##### nn.Embedding initializes a learnable embedding matrix where each row corresponds to the embedding vector of a token in the vocabulary. During forward pass, nn.Embedding takes an input tensor containing token indices and returns the corresponding embedding vectors. The embedding vectors are looked up from the embedding matrix based on the input token indices. The embedding vectors are then fed into the neural network model for further processing.