# Encoder-Decoder Modelling

## Observations and Analysis

- Initially I tried the question by giving all the outputs of the Encoder to the Decoder, but I later realised that it was a form of Pesudo Attention, and we weren't supposed to use it, However with that model I got a score of 2 Marks.
- Later I tried Three Different approaches, by using a Regular Biderectional Encoder (With Num Layers = 1) and by using a BiDirectional Encoder (With Num Layers = 2) and using a Regular Encoder with 2 layers.
- In all of the Above I concatenated the outputs and sent it to the Decoder
- I realised that given the simple enough implementation that we really did not need a very fancy solution (Because the other models did not perform that much better, and required more time to train)
- The decision between GRU, LSTM and RNN, was made for me by the Checker Script, I got 0.5, 0.5 with the GRU and LSTM and 1 mark with the RNN
- My Hypothesis is that the LSTM and the GRU learn information that is not really necessary for the given task, that is they overfit the loss

In [1]:
# Import the Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import wandb

import torch
import torch.nn as nn
import torch.optim as optim

# Initialize the wandb
wandb.login()

input_size = 26
embedding_size = 50
enc_hidden_size = 50
encoding_size = 200
dec_hidden_size = 100
output_size = 26
batch_size = 256
epochs = 1000
learning_rate = 0.001

# Random Seed
seed = 42
np.random.seed(seed)

# Initialize the wandb project
wandb.init(project = 'seq2seq')
# Define the hyperparameters
config = wandb.config
config.learning_rate = learning_rate
config.batch_size = batch_size
config.epochs = epochs
config.input_size = input_size
config.embedding_size = embedding_size
config.enc_hidden_size = enc_hidden_size
config.encoding_size = encoding_size
config.dec_hidden_size = dec_hidden_size
config.output_size = output_size

# Import the Dataset
train_dataset = pd.read_csv('train_data.csv')
eval_dataset = pd.read_csv('eval_data.csv')

# Convert the Dataset to Numpy Array
X_train = train_dataset.iloc[1:, :].values
X_eval = eval_dataset.iloc[1:, :].values

# Each Row, is comprised of 2 columns, the first column is the input, and the second column is the output
X_col = X_train[:, 0]
Y_col = X_train[:, 1]

def numericize(X, Y):
    # Create a dictionary of the alphabet
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    numeric_dict = {}
    for i in range(len(alphabet)):
        numeric_dict[alphabet[i]] = i

    # Now, each sentence is a 8 letter word, so we need to convert each letter to a number
    X_numeric = []
    Y_numeric = []

    for i in range(len(X)):
        X_numeric.append([numeric_dict[j] for j in X[i]])
        Y_numeric.append([numeric_dict[j] for j in Y[i]])

    return X_numeric, Y_numeric

# Now, for each element in X_Col and Y_Col create embeddings -> each letter is a number in the alphabet
X_numeric, Y_numeric = numericize(X_col, Y_col)

# Convert the X_numeric and Y_numeric to numpy arrays
X_numeric = np.array(X_numeric)
Y_numeric = np.array(Y_numeric)

# Now, we need to convert the X_numeric and Y_numeric to tensors
X_numeric = torch.from_numpy(X_numeric)
Y_numeric = torch.from_numpy(Y_numeric)

# Test Dataset
X_eval_col = X_eval[:, 0]
Y_eval_col = X_eval[:, 1]

# Convert the X_eval_col and Y_eval_col to numeric
X_eval_numeric, Y_eval_numeric = numericize(X_eval_col, Y_eval_col)

# Convert the X_eval_numeric and Y_eval_numeric to numpy arrays
X_eval_numeric = np.array(X_eval_numeric)
Y_eval_numeric = np.array(Y_eval_numeric)

# Convert the X_eval_numeric and Y_eval_numeric to tensors
X_eval_numeric = torch.from_numpy(X_eval_numeric)
Y_eval_numeric = torch.from_numpy(Y_eval_numeric)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvenkata-kesav[0m ([33miiit_hyd[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
def Model_Output(X, model):
    # Numerize the X
    X, _ = numericize(X, X)
    # Convert the X to a numpy array
    X = np.array(X)
    # Convert the X to a tensor
    X = torch.from_numpy(X)
    # Get the output from the model
    output = model(X)
    # Now, we need to get the output take the argmax of the output along the first dimension and convert it to a set of letters
    output = torch.argmax(output, dim = 2)
    # Now, we need to convert the output to a list of letters
    output = output.tolist()
    # Now, we need to convert the output to a list of letters
    output = [[chr(i + 97) for i in j] for j in output]
    # Join the letters in each list
    output = [''.join(i) for i in output]
    return output


# Write a Custom Loss Function
def Loss_Function(X_eval_col, Y_eval_col, model):
    # Get the output from the model
    output = Model_Output(X_eval_col, model)
    # Now, we need to get the Y_eval_col
    Y_eval_col, _ = numericize(Y_eval_col, Y_eval_col)
    # Now, we need to calculate the loss
    loss = 0
    # For each element in the output, we need to calculate the loss and add it to the loss
    for i in range(len(output)):
        for j in range(len(output[i])):
            if output[i][j] != Y_eval_col[i][j]:
                loss += 0.01

    return loss

In [3]:
# Now, define the Encoder and Decoder Class
class EncDec(nn.Module):
    def __init__(self, input_size, embedding_size ,enc_hidden_size, encoding_size, dec_hidden_size,output_size, batch_size):
        super(EncDec, self).__init__()
        self.enc_hidden_size = enc_hidden_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dec_hidden_size = dec_hidden_size
        self.batch_size = batch_size
        # Define the Encoder-1 LSTM
        self.encoder = nn.RNN(embedding_size, enc_hidden_size, batch_first = True, bidirectional = True)
        # Define the Decoder-1 LSTM
        self.decoder = nn.RNN(100, dec_hidden_size, batch_first = True)
        # Define the Linear Layer
        self.linear = nn.Linear(100, output_size)
        # Define the Dropout Layer
        self.dropout = nn.Dropout(0.2)

    def forward(self, input):
        # First, we need to convert the input to embeddings
        embedded = self.embedding(input)
        # print("Embedded Shape", embedded.shape)
        enc_hidden1_0 = torch.zeros(2, input.shape[0], self.enc_hidden_size)

        # Now, we need to pass the embeddings through the encoder
        enc_out1, hidden_out1 = self.encoder(embedded, enc_hidden1_0)
        # Get the final hidden state from the encoder
        dec_in = torch.cat((hidden_out1[0], hidden_out1[1]), dim=1)
        dec_in = dec_in.unsqueeze(1)
        # print("Decoder Input Shape", dec_in.shape) # Of Size 256*1*100
        final_in = [] # Of Size 256*1*100
        final_in.append(dec_in) # Of Size 256*1*100
        # print("Final In Shape", final_in[0].shape) # Of Size 256*1*100
        hidden_state = torch.zeros(1, input.shape[0], self.dec_hidden_size) 
        # print("Hidden State Shape", hidden_state.shape) # Of Size 1*256*200
        for i in range(7):
            f_ten = torch.cat(final_in, dim = 1) # Of Size Finally -> 256*8*100
            dec_out, hidden_state = self.decoder(f_ten, hidden_state)
            dec_out = dec_out[:, -1, :]
            final_in.append(dec_out.unsqueeze(1))
        f_ten = torch.cat(final_in, dim = 1)
        final_out = self.linear(f_ten)
        return final_out

In [4]:
# Now, we need to define the model
model = EncDec(input_size, embedding_size, enc_hidden_size, encoding_size, dec_hidden_size, output_size, batch_size)

# Now, we need to define the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0

    # Mini-batch training
    for i in range(0, len(X_numeric), batch_size):
        optimizer.zero_grad()
        
        # Extract the current batch
        input_batch = X_numeric[i:i+batch_size]
        target_batch = Y_numeric[i:i+batch_size]

        # Forward pass
        output = model(input_batch)

        # Calculate the loss
        loss = criterion(output.view(-1, output_size), target_batch.view(-1))
        total_loss += loss.item()

        # Calculate the accuracy and print
        _, predicted = torch.max(output, dim=2)
        correct = (predicted == target_batch).sum().item()
        accuracy = correct / (batch_size * 8)

        # Backpropagation
        loss.backward()
        optimizer.step()

        print('Epoch: {}, Batch: {}, Loss: {:.6f}, Accuracy: {:.2f}%'.format(epoch, i//batch_size, loss.item(), accuracy * 100))

    if epoch % 10 == 0:
        # Get the Loss
        loss_val = Loss_Function(X_eval_col, Y_eval_col, model)
        # Print the loss
        print('Epoch: ', epoch, ' Validation Loss: ', loss_val)

    # Calculate the loss for the evaluation dataset -> Like the Train Dataset
    eval_loss = criterion(model(X_eval_numeric).view(-1, output_size), Y_eval_numeric.view(-1))

    # Print the average loss for this epoch
    print('Epoch: {}, Loss: {:.6f}'.format(epoch, total_loss / (len(X_numeric) // batch_size)))
    wandb.log({'Epoch': epoch, 'Loss': total_loss / (len(X_numeric) // batch_size), 'Eval Loss': eval_loss.item()})

# Run the Model on a single Evaluation Example

Epoch: 0, Batch: 0, Loss: 3.273032, Accuracy: 4.20%
Epoch: 0, Batch: 1, Loss: 3.271163, Accuracy: 3.52%
Epoch: 0, Batch: 2, Loss: 3.263216, Accuracy: 4.35%
Epoch: 0, Batch: 3, Loss: 3.261011, Accuracy: 4.00%
Epoch: 0, Batch: 4, Loss: 3.260625, Accuracy: 4.10%
Epoch: 0, Batch: 5, Loss: 3.255671, Accuracy: 4.35%
Epoch: 0, Batch: 6, Loss: 3.261335, Accuracy: 4.49%
Epoch: 0, Batch: 7, Loss: 3.254319, Accuracy: 3.81%
Epoch: 0, Batch: 8, Loss: 3.255884, Accuracy: 4.98%
Epoch: 0, Batch: 9, Loss: 3.252102, Accuracy: 5.03%
Epoch: 0, Batch: 10, Loss: 3.256083, Accuracy: 4.88%
Epoch: 0, Batch: 11, Loss: 3.251460, Accuracy: 5.18%
Epoch: 0, Batch: 12, Loss: 3.245968, Accuracy: 4.64%
Epoch: 0, Batch: 13, Loss: 3.255392, Accuracy: 4.25%
Epoch: 0, Batch: 14, Loss: 3.248942, Accuracy: 4.20%
Epoch: 0, Batch: 15, Loss: 3.247919, Accuracy: 5.03%
Epoch: 0, Batch: 16, Loss: 3.243202, Accuracy: 5.32%
Epoch: 0, Batch: 17, Loss: 3.246107, Accuracy: 5.42%
Epoch: 0, Batch: 18, Loss: 3.242838, Accuracy: 5.27%
Epo

# Output Function

In [5]:
# Save the Model
torch.save(model.state_dict(), 'model_final.pth')

In [6]:
# Load the Model
model = EncDec(input_size, embedding_size, enc_hidden_size, encoding_size, dec_hidden_size, output_size, batch_size)
model.load_state_dict(torch.load('model_final.pth'))

<All keys matched successfully>

In [7]:
# Now, we need to run the model on the X_eval_numeric
# output = model(X_e[8].unsqueeze(0))
# output = output.squeeze(0)

# print(Y_eval_numeric[8])

# # Now, we need to get the output take the argmax of the output along the first dimension and convert it to a set of letters
# output = torch.argmax(output, dim = 1)
# print(output)

X_eval_numeric = []
X_eval_col = X_eval[:, 0]
Y_eval_col = X_eval[:, 1]
# Now, we need to convert the X_eval_col to numeric
X_eval_numeric, Y_eval_numeric = numericize(X_eval_col, Y_eval_col)
# Now, we need to convert the X_eval_numeric to a numpy array
X_eval_numeric = np.array(X_eval_numeric)
Y_eval_numeric = np.array(Y_eval_numeric)
# Now, we need to convert the X_eval_numeric to a tensor
X_eval_numeric = torch.from_numpy(X_eval_numeric)
Y_eval_numeric = torch.from_numpy(Y_eval_numeric)


output = model(X_numeric[7].unsqueeze(0))
output = output.squeeze(0)

print(Y_numeric[7])

output = torch.argmax(output, dim = 1)
print(output)

tensor([ 4, 21, 23, 21,  3, 22,  5, 20])
tensor([17, 21, 23, 21,  3, 22,  5, 20])


In [8]:
def Model_Output(X):
    # Numerize the X
    X, _ = numericize(X, X)
    # Convert the X to a numpy array
    X = np.array(X)
    # Convert the X to a tensor
    X = torch.from_numpy(X)
    # Get the output from the model
    output = model(X)
    # Now, we need to get the output take the argmax of the output along the first dimension and convert it to a set of letters
    output = torch.argmax(output, dim = 2)
    # Now, we need to convert the output to a list of letters
    output = output.tolist()
    # Now, we need to convert the output to a list of letters
    output = [[chr(i + 97) for i in j] for j in output]
    # Join the letters in each list
    output = [''.join(i) for i in output]
    return output

def Model_Output_Single(x):
    # Numerize the X -> To get a list of numbers
    X, _ = numericize(x, x)
    # Convert the X to a numpy array
    X = np.array(X)
    # Convert the X to a tensor
    X = torch.from_numpy(X)
    X = X.squeeze(1)
    X = X.unsqueeze(0)
    # Get the output from the model
    output = model(X)
    # Now, we need to get the output take the argmax of the output along the first dimension and convert it to a set of letters
    output = torch.argmax(output, dim = 2)
    # Now, we need to convert the output to a list of letters
    output = output.tolist()
    # Now, we need to convert the output to a list of letters
    output = [[chr(i + 97) for i in j] for j in output]
    # Join the letters in each list
    output = [''.join(i) for i in output]
    return output

# Import the Test Dataset
test_dataset = pd.read_csv('eval_data.csv')
X_test = test_dataset.iloc[1:, :].values
X_test_col = X_test[:, 0]

output = Model_Output(X_test_col)
print(output)
print(X_test[:, 1].tolist())
# y_val = [chr(i + 97) for i in Y_eval_numeric]

# Find the Loss, between the output and the Y_eval_numeric
# loss = criterion(output, X_test[:, 1])

['mfsoyhwl', 'xwjoqnkq', 'ndxjkzgb', 'qloqbgvx', 'swkibfor', 'hyasngvh', 'oatvtqyo', 'eccejgbt', 'fdtmtwhp', 'fdpdndyx', 'ojfyyzak', 'zdtfkduk', 'kvjghjlw', 'wksjvnky', 'pxqzjrmk', 'iynjeklv', 'efoarevr', 'btoovjhv', 'yebmjdjh', 'gnhyhjzi', 'ksytgpgt', 'gypzsfds', 'kwaimswj', 'eblbufds', 'rysqdjkb', 'prtmzsap', 'hrminzzp', 'vgcsnpxc', 'gcksonxm', 'xywzmpap', 'godhxdsj', 'drpdmglt', 'lwavfesu', 'hfjbjkge', 'awqtiryv', 'ebacftdx', 'zoprqthd', 'xrwgabkm', 'cvslxzfi', 'pwnbbarv', 'xlaoxqob', 'bzkshbvw', 'bffwlpyf', 'sthjqsdx', 'mxelgpsa', 'ktyjqequ', 'hyqvamgi', 'tbzgnaak', 'nlkghnmt', 'ochtchza', 'rhivnvai', 'aizyhosd', 'dmvuouzr', 'jaahbhjh', 'wgaxvvgc', 'vtuzflds', 'ngoitulc', 'hroqyanz', 'rjjardwf', 'hfxuapnb', 'pouxlhuh', 'zzsmvwjb', 'tetcxcmg', 'igdzsuyh', 'mgeywkck', 'jachnzeg', 'jmfirkmk', 'mxzcuebs', 'dpyljwfj', 'wcjtuhml', 'htbylrdl', 'hdofyxto', 'gwhrzhcm', 'gzoaomxd', 'xrxgepxg', 'hlyiasqt', 'vrlmirfm', 'efuomypc', 'ejfrjnqd', 'odsihvxy', 'cqwbtfke', 'fceckdvp', 'mdwkachv', 'bv

## Checker Script

In [9]:
import pandas as pd
import numpy as np

# Function to check how many characters match in the two strings
def check(pred: str, true: str):
    correct = 0
    for a, b in zip(pred, true):
        if a == b:
            correct += 1

    # Prediction is more than 8 letters, so penalize for every extra letter.
    correct -= max(0, len(pred) - len(true))
    correct = max(0, correct)
    return correct

# Function to score the model's performance
def evaluate(model):
    # Train data
    print("Obtaining results for training data:")
    train_data = pd.read_csv("train_data.csv").to_numpy()
    results = {
        "pred": [],
        "true": [],
        "score": [],
    }
    count = 0
    correct = [0 for _ in range(9)]
    for x, y in train_data:
        pred = Model_Output_Single(x)
        pred = ''.join(pred)
        score = check(pred, y)
        results["pred"].append(pred)
        results["true"].append(y)
        results["score"].append(score)

        correct[score] += 1
        count += 1
    print("Train dataset results:")
    for num_chr in range(9):
        print(
            f"Number of predictions with {num_chr} correct predictions: {correct[num_chr]}"
        )
    points = sum(correct[4:6]) * 0.5 + sum(correct[6:])
    print(f"Points: {points}")
    # Save predicitons and true sentences to inspect manually if required.
    pd.DataFrame.from_dict(results).to_csv("results_train.csv", index=False)

    #----------------------------------------------------------------------------------

    print("Obtaining metrics for eval data:")
    count = 0
    eval_data = pd.read_csv("eval_data.csv").to_numpy()
    results = {
        "pred": [],
        "true": [],
        "score": [],
    }
    correct = [0 for _ in range(9)]
    for x, y in eval_data:
        pred = Model_Output_Single(x)
        pred = ''.join(pred)
        score = check(pred, y)
        results["pred"].append(pred)
        results["true"].append(y)
        results["score"].append(score)
        correct[score] += 1
    print("Eval dataset results:")
    for num_chr in range(9):
        print(
            f"Number of predictions with {num_chr} correct predictions: {correct[num_chr]}"
        )
    points = sum(correct[4:6]) * 0.5 + sum(correct[6:])
    marks = round(min(2, points / 1400 * 2) * 2) / 2  # Rounds to the nearest 0.5
    print(f"Points: {points}")
    print(f"Marks: {marks}")
    # Save predicitons and true sentences to inspect manually if required.
    pd.DataFrame.from_dict(results).to_csv("results_eval.csv", index=False)

evaluate(model)

Obtaining results for training data:
Train dataset results:
Number of predictions with 0 correct predictions: 0
Number of predictions with 1 correct predictions: 0
Number of predictions with 2 correct predictions: 0
Number of predictions with 3 correct predictions: 0
Number of predictions with 4 correct predictions: 17
Number of predictions with 5 correct predictions: 232
Number of predictions with 6 correct predictions: 1304
Number of predictions with 7 correct predictions: 2973
Number of predictions with 8 correct predictions: 2474
Points: 6875.5
Obtaining metrics for eval data:
Eval dataset results:
Number of predictions with 0 correct predictions: 15
Number of predictions with 1 correct predictions: 99
Number of predictions with 2 correct predictions: 273
Number of predictions with 3 correct predictions: 499
Number of predictions with 4 correct predictions: 529
Number of predictions with 5 correct predictions: 397
Number of predictions with 6 correct predictions: 149
Number of pred

wandb: Network error (ConnectionError), entering retry loop.
