In [257]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device


device(type='cuda', index=0)

In [258]:
# Importing the training dataset
dataset_train = pd.read_csv('train_data.csv')
dataset_train

#dataset_train = dataset_train[:200]

Unnamed: 0,Sentence,Transformed sentence
0,udaxihhe,fmvmfthn
1,xdvxrcsn,suiaveib
2,bacghqta,zgvwmloh
3,rgwuwrnh,lmhdulik
4,osizayzf,wfysmuhe
...,...,...
6995,jnozsubc,mnhmecqd
6996,vpzkyffp,qqpmoheo
6997,syuftisv,ibuldqib
6998,ynkfgztm,zetqlfbh


In [259]:
class Lang:
  def __init__(self, name):
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = {0: "SOS", 1: "EOS"}
    self.n_words = 2  # Count SOS and EOS
  def addSentence(self, sentence):
    for word in sentence:
      self.addWord(word)
  def addWord(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.word2count[word] = 1
      self.index2word[self.n_words] = word
      self.n_words += 1
    else:
      self.word2count[word] += 1
  

# create an object of the Lang class for each language
input_lang = Lang('input')
output_lang = Lang('output')

# create a list of all the sentences in the training data
input_sentences = dataset_train['Sentence'].values.tolist()
output_sentences = dataset_train['Transformed sentence'].values.tolist()

# add each sentence to the corresponding language
for i in range(len(input_sentences)):
  input_lang.addSentence(input_sentences[i])
  output_lang.addSentence(output_sentences[i])
  

In [260]:
input_lang.index2word

{0: 'SOS',
 1: 'EOS',
 2: 'u',
 3: 'd',
 4: 'a',
 5: 'x',
 6: 'i',
 7: 'h',
 8: 'e',
 9: 'v',
 10: 'r',
 11: 'c',
 12: 's',
 13: 'n',
 14: 'b',
 15: 'g',
 16: 'q',
 17: 't',
 18: 'w',
 19: 'o',
 20: 'z',
 21: 'y',
 22: 'f',
 23: 'k',
 24: 'm',
 25: 'l',
 26: 'j',
 27: 'p'}

In [261]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_size,hidden_size, num_layers=1, dropout_p=0.2):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)
        
        self.lstm = nn.GRU(embedding_size, hidden_size, num_layers,batch_first= True)
        
    def forward(self, x):
        
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        output, hidden = self.lstm(embedded)
        return output, hidden

    # Predict function to take word as input and output a tensor
    def predict(self, word):
        with torch.no_grad():
            for i in range(len(word)):
                x = torch.tensor([[input_lang.word2index[word[i]]]])
                x = x.to(device)
                output, hidden = self.forward(x)

In [262]:
# Decoder that takes the hidden state of the encoder and outputs a word

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size,num_layers=1, dropout_p=0.2):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size,num_layers, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_hidden_state, target=None, teacher_forcing_ratio=0.9):
        # Initialize decoder hidden state as encoder hidden state
        decoder_hidden_state = encoder_hidden_state
        # Initialize decoder input as SOS_token
        decoder_input = torch.tensor([[0]], device=device)
        # Initialize output list
        output = []
        output_words = []

        for i in range(9):
            #print(decoder_input.shape,decoder_hidden_state.shape)
            decoder_output, decoder_hidden_state = self.forward_step(decoder_input, decoder_hidden_state)
            
            # Choose top word from decoder's output
            top_word = decoder_output.argmax(2)
            output_words.append(top_word.item())
            # Append the probabiities to the output list
            output.append(decoder_output)
            # Next input is previous output
            decoder_input = top_word
            if(target is not None and np.random.random() < teacher_forcing_ratio):
                decoder_input = target[:,i].unsqueeze(1)
            decoder_hidden_state = decoder_hidden_state.detach()
            # Stop decoding when EOS_token is reached
            #if top_word.item() == 1:
            #    break
        # Convert output list to tensor
        output = torch.cat(output, dim=1)
        return output, output_words
        
    
    def forward_step(self, input, hidden):
        #print(input.shape,hidden.shape)
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        #output = self.dropout(output)
        return output, hidden
    
    # Predict function to take hidden state of encoder and output a word
    def predict(self, hidden):
        with torch.no_grad():
            input = torch.tensor([[0]], device=device)
            output, hidden = self.forward_step(input, hidden)
            return output, hidden

In [263]:
# pass data thru encoder
def encoder_pass(input_tensor, encoder):
  encoder_hidden = encoder(input_tensor)[1]
  return encoder_hidden

# pass data thru decoder
def decoder_pass(decoder_input, decoder_hidden, decoder):
  decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
  return decoder_output, decoder_hidden

# pass data thru encoder and decoder
def encoder_decoder_pass(input_tensor, decoder_input, encoder, decoder):
  encoder_hidden = encoder_pass(input_tensor, encoder)
  decoder_output, decoder_hidden = decoder_pass(decoder_input, encoder_hidden, decoder)
  return decoder_output, decoder_hidden

# convert a sentence to a tensor\
def sentence_to_tensor(lang, sentence):
  indexes = [lang.word2index[word] for word in sentence]
  EOS_token = 1
  indexes.append(EOS_token)
  return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

# convert a tensor to a sentence
def tensor_to_sentence(lang, tensor):
  sentence = []
  for i in tensor:
    word = lang.index2word[i.item()]
    sentence.append(word)
  return sentence

# convert a list of sentences to a list of tensors
def sentences_to_tensors(lang, sentences):
  tensors = []
  for sentence in sentences:
    tensors.append(sentence_to_tensor(lang, sentence))
  return tensors

# convert a list of tensors to a list of sentences
def tensors_to_sentences(lang, tensors):
  sentences = []
  for tensor in tensors:
    sentences.append(tensor_to_sentence(lang, tensor))
  return sentences


In [264]:
# create tensors for the input and output sentences
input_tensors = sentences_to_tensors(input_lang, input_sentences)
output_tensors = sentences_to_tensors(output_lang, output_sentences)

In [265]:
encoder = EncoderRNN(input_lang.n_words, 256, 256, 2).to(device)
decoder = DecoderRNN(256, output_lang.n_words,2).to(device)

test_X = input_tensors[0].view(1,-1)
test_Y = output_tensors[0]

print(test_X.shape)
encoder_output, encoder_hidden = encoder(test_X)
print(encoder_output.shape, encoder_hidden.shape)

#encoder_hidden = encoder_hidden.view(1,1,-1)
print("decoder")
decoder_output, decoder_word = decoder(encoder_hidden)
# convert list to tensor
#decoder_output = torch.tensor(decoder_output, dtype=torch.long, device=device).view(-1, 1)
#print(decoder_output)
print(decoder_output.shape)
print(decoder_word)
print(test_X)
print(test_X.shape)

torch.Size([1, 9])
torch.Size([1, 9, 256]) torch.Size([2, 1, 256])
decoder
torch.Size([1, 9, 28])
[13, 13, 13, 13, 25, 25, 25, 17, 17]
tensor([[2, 3, 4, 5, 6, 7, 7, 8, 1]], device='cuda:0')
torch.Size([1, 9])


In [266]:
class EncoderDecoder(nn.Module):
  def __init__(self, input_lang, output_lang,hidden_size, num_layers=1, dropout_p=0.1):
    super(EncoderDecoder, self).__init__()
    self.encoder = EncoderRNN(input_lang.n_words, hidden_size, hidden_size, num_layers).to(device)
    self.decoder = DecoderRNN(hidden_size, output_lang.n_words,num_layers).to(device)

  def forward(self, input_tensor, output_tensor=None):
    encoder_hidden = self.encoder(input_tensor)[1]
    decoder_output,decoder_word = self.decoder(encoder_hidden, output_tensor)
    #print(decoder_output.shape, input_tensor.shape)
    return decoder_output, decoder_word

In [267]:
def train(input_tensors, output_tensors, model, criterion, optimizer, n_epochs=1000):
  losses = []
  for epoch in range(n_epochs):
    epoch_loss = 0
    for i in range(len(input_tensors)):
      input_tensor = input_tensors[i].view(1,-1)
      output_tensor = output_tensors[i].view(1,-1)
      optimizer.zero_grad()
      output,_ = model(input_tensor, output_tensor)
      loss = criterion(output.view(9,-1), output_tensor.view(9))
      loss.backward()
      optimizer.step()
      epoch_loss += loss.item()
      if(i%1000==0):
        print("Epoch: {}, Iteration: {}, Loss: {:.5f}".format(epoch, i, loss.item()))
    losses.append(epoch_loss)
    print('Epoch: {}, Loss: {:.5f}'.format(epoch, epoch_loss))
  return losses

In [268]:
print(input_tensors[0].shape)
input_tensors = torch.cat(input_tensors, dim=0)
output_tensors = torch.cat(output_tensors, dim=0)
input_tensors= input_tensors.view(-1,9,1)
output_tensors = output_tensors.view(-1,9,1)
print(input_tensors.shape)

torch.Size([9, 1])
torch.Size([7000, 9, 1])


In [269]:
def train_from_to(input_tensors, output_tensors, seq2seq, criterion, optimizer, n_epochs, start_id, end_id):
  return train(input_tensors[start_id:end_id], output_tensors[start_id:end_id], seq2seq, criterion, optimizer, n_epochs)

In [270]:
lr = 0.0001
n_epochs = 25
hidden_size = 512
num_layers = 2

seq2seq = EncoderDecoder(input_lang, output_lang,hidden_size,num_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(seq2seq.parameters(), lr)

In [311]:
# Training Loop
n_epochs = 1
lr=0.00001
start = 0
end = 7000

optimizer = torch.optim.AdamW(seq2seq.parameters(), lr, weight_decay=0.0000001, amsgrad=True)
losses = train_from_to(input_tensors, output_tensors, seq2seq, criterion, optimizer, n_epochs, start, end)

Epoch: 0, Iteration: 0, Loss: 0.01552
Epoch: 0, Iteration: 1000, Loss: 0.04176
Epoch: 0, Iteration: 2000, Loss: 0.15018
Epoch: 0, Iteration: 3000, Loss: 0.04640
Epoch: 0, Iteration: 4000, Loss: 0.17135
Epoch: 0, Iteration: 5000, Loss: 0.08056
Epoch: 0, Iteration: 6000, Loss: 0.17101
Epoch: 0, Loss: 910.35358


In [313]:
# Save the model
#torch.save(seq2seq.state_dict(), 'seq2seq_1_5k.pth')

# load the model
seq2seq = EncoderDecoder(input_lang, output_lang,hidden_size,num_layers).to(device)
seq2seq.load_state_dict(torch.load('seq2seq.pth'))


<All keys matched successfully>

In [305]:
# Evaluate the model

#xrblhucy,ugqqlqvw



test_X = sentence_to_tensor(input_lang, 'xrblhucy')
test_Y = sentence_to_tensor(output_lang, 'ugqqlqvw')

_, output_words = seq2seq(test_X.view(1,-1), test_Y.view(1,-1))
output_words = tensor_to_sentence(output_lang, torch.tensor(output_words, dtype=torch.long, device=device))
test_X = tensor_to_sentence(input_lang, test_X)
test_Y = tensor_to_sentence(output_lang, test_Y)

print('Input sentence: {}'.format(test_X))
print('Predicted sentence: {}'.format(output_words))
print('Actual sentence: {}'.format(test_Y))


Input sentence: ['x', 'r', 'b', 'l', 'h', 'u', 'c', 'y', 'EOS']
Predicted sentence: ['u', 'g', 'q', 'q', 'l', 'q', 'v', 'w', 'EOS']
Actual sentence: ['u', 'g', 'q', 'q', 'l', 'q', 'v', 'w', 'EOS']


In [297]:
# fn to print output for a given input given the encoder and decoder
def evaluate_enc(encoder, decoder, sentence):
    with torch.no_grad():
        input_tensor = sentence_to_tensor(input_lang, sentence)
        input_tensor = input_tensor.view(1,-1)
        encoder_hidden = encoder(input_tensor)[1]
        decoder_output, decoder_word = decoder(encoder_hidden)
        word = tensor_to_sentence(output_lang, torch.tensor(decoder_word, dtype=torch.long, device=device))
        # remove EOS token
        word = word[:-1]
        return word

In [298]:
evaluate_enc(seq2seq.encoder, seq2seq.decoder, 'xrblhucy')

['u', 'g', 'q', 'q', 'l', 'q', 'v', 'w']

In [274]:
# Function to check number of correct predictions
def check_accuracy(input_tensors, output_tensors, seq2seq):
  correct = 0
  for i in range(len(input_tensors)):
    input_tensor = input_tensors[i].view(1,-1)
    output_tensor = output_tensors[i].view(1,-1)
    with torch.no_grad():
      _, output_words = seq2seq(input_tensor)

      output_words = tensor_to_sentence(output_lang, torch.tensor(output_words, dtype=torch.long, device=device))
      output_words = ''.join(output_words)
      output_tensor = output_tensor.view(-1)
      output_tensor = tensor_to_sentence(output_lang, output_tensor)
      output_tensor = ''.join(output_tensor)
      if(output_words == output_tensor):
        correct += 1

  return correct
  

In [310]:
# check accuracy on the training data
correct = check_accuracy(input_tensors[:1000], output_tensors[:1000], seq2seq)
correct

857

In [299]:
# Function to check how many characters match in the two strings
def check(pred: str, true: str):
    correct = 0
    for a, b in zip(pred, true):
        if a == b:
            correct += 1

    # Prediction is more than 8 letters, so penalize for every extra letter.
    correct -= max(0, len(pred) - len(true))
    correct = max(0, correct)
    return correct

# Function to score the model's performance
def evaluate(encoder, decoder):

    # Train data
    print("Obtaining results for training data:")
    train_data = pd.read_csv("train_data.csv").to_numpy()
    results = {
        "pred": [],
        "true": [],
        "score": [],
    }
    correct = [0 for _ in range(9)]
    for x, y in train_data:
        pred = evaluate_enc(encoder, decoder, x)
        #print(pred, y) 
        score = check(pred, y)
        results["pred"].append(pred)
        results["true"].append(y)
        results["score"].append(score)

        correct[score] += 1
    print("Train dataset results:")
    for num_chr in range(9):
        print(
            f"Number of predictions with {num_chr} correct predictions: {correct[num_chr]}"
        )
    points = sum(correct[4:6]) * 0.5 + sum(correct[6:])
    print(f"Points: {points}")
    # Save predicitons and true sentences to inspect manually if required.
    pd.DataFrame.from_dict(results).to_csv("results_train.csv", index=False)

    #----------------------------------------------------------------------------------

    print("Obtaining metrics for eval data:")
    eval_data = pd.read_csv("eval_data.csv").to_numpy()
    results = {
        "pred": [],
        "true": [],
        "score": [],
    }
    correct = [0 for _ in range(9)]
    for x, y in eval_data:
        pred = evaluate_enc(encoder, decoder, x)
        score = check(pred, y)
        results["pred"].append(pred)
        results["true"].append(y)
        results["score"].append(score)

        correct[score] += 1
    print("Eval dataset results:")
    for num_chr in range(9):
        print(
            f"Number of predictions with {num_chr} correct predictions: {correct[num_chr]}"
        )
    points = sum(correct[4:6]) * 0.5 + sum(correct[6:])
    marks = round(min(2, points / 1400 * 2) * 2) / 2  # Rounds to the nearest 0.5
    print(f"Points: {points}")
    print(f"Marks: {marks}")
    # Save predicitons and true sentences to inspect manually if required.
    pd.DataFrame.from_dict(results).to_csv("results_eval.csv", index=False)

In [314]:
# Use function check
evaluate(seq2seq.encoder, seq2seq.decoder)


Obtaining results for training data:
Train dataset results:
Number of predictions with 0 correct predictions: 0
Number of predictions with 1 correct predictions: 591
Number of predictions with 2 correct predictions: 227
Number of predictions with 3 correct predictions: 52
Number of predictions with 4 correct predictions: 9
Number of predictions with 5 correct predictions: 5
Number of predictions with 6 correct predictions: 4
Number of predictions with 7 correct predictions: 7
Number of predictions with 8 correct predictions: 6105
Points: 6123.0
Obtaining metrics for eval data:
Eval dataset results:
Number of predictions with 0 correct predictions: 0
Number of predictions with 1 correct predictions: 1476
Number of predictions with 2 correct predictions: 445
Number of predictions with 3 correct predictions: 72
Number of predictions with 4 correct predictions: 7
Number of predictions with 5 correct predictions: 0
Number of predictions with 6 correct predictions: 0
Number of predictions wi