In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import pandas as pd

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Encoder, self).__init__()  # Initialize the parent class.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)  # Create an embedding layer.
        self.dropout = nn.Dropout(dropout)  # Create a dropout layer.
        self.hidden_size = hidden_size  # Store the hidden size.
        self.num_layers = num_layers  # Store the number of layers.
        self.bidirectional = bidirectional  # Store whether the RNN is bidirectional.
        
        rnn_hidden_size = hidden_size // 2 if bidirectional else hidden_size  # Adjust hidden size for bidirectional RNN.
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)  # Create an LSTM layer.
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)  # Create a GRU layer.
        else:
            self.rnn = nn.RNN(embedding_size, rnn_hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout), bidirectional=bidirectional)  # Create an RNN layer.
    
    def forward(self, x):
        embedded = self.embedding(x)  # Embed the input sequences.
        embedded = self.dropout(embedded)  # Apply dropout to the embeddings.
        outputs, hidden = self.rnn(embedded)  # Pass the embeddings through the RNN.

        if self.bidirectional:  # If the RNN is bidirectional.
            if isinstance(hidden, tuple):  # If the hidden state is a tuple (LSTM case).
                h_n, c_n = hidden  # Unpack the hidden states (hidden and cell states for LSTM).
#                 print('enc h bef dir',h_n.shape)  
#                 print('enc c bef dir',c_n.shape)  
                h_n = torch.cat((h_n[0::2], h_n[1::2]), dim=2)  # Concatenate the forward and backward hidden states.
                c_n = torch.cat((c_n[0::2], c_n[1::2]), dim=2)  # Concatenate the forward and backward cell states.
#                 print('enc h af dir',h_n.shape) 
#                 print('enc c af dir',c_n.shape)  
                hidden = (h_n, c_n)  # Pack the adjusted hidden states back into a tuple.
            else:  # If the hidden state is not a tuple (GRU/RNN case).
#                 print('enc hidd bef dir',hidden.shape) 
                hidden = torch.cat((hidden[0::2], hidden[1::2]), dim=2)  # Concatenate the forward and backward hidden states.
#                 print('after dir enc:',hidden.shape) 

        return hidden  # Return the RNN hidden states.

In [None]:
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, encoder_num_layers, rnn_cell='lstm', dropout=0.5, bidirectional=True):
        super(Decoder, self).__init__()  # Initialize the parent class.
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_size)  # Create an embedding layer.
        self.dropout = nn.Dropout(dropout)  # Create a dropout layer.
        self.output_size = output_size  # Store the output size.
        self.hidden_size = hidden_size * encoder_num_layers if bidirectional else hidden_size  # Adjust hidden size for bidirectional encoder.
        self.num_layers = num_layers  # Store the number of layers.
        
        if rnn_cell.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))  # Create an LSTM layer.
        elif rnn_cell.lower() == 'gru':
            self.rnn = nn.GRU(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))  # Create a GRU layer.
        else:
            self.rnn = nn.RNN(embedding_size, self.hidden_size, num_layers, batch_first=True, dropout=(0 if num_layers == 1 else dropout))  # Create an RNN layer.
        
        self.fc = nn.Linear(self.hidden_size, output_size)  # Create a fully connected layer for output.

    def forward(self, x, hidden):
        x = x.unsqueeze(1)  # Add a singleton dimension to the input tensor.
        embedded = self.dropout(self.embedding(x))  # Embed the input sequences and apply dropout.
        output, hidden = self.rnn(embedded, hidden)  # Pass the embedded input through the RNN.
        output = self.fc(self.dropout(output.squeeze(1)))  # Apply dropout and pass through the fully connected layer.
        return output, hidden  # Return the output and hidden states.

In [None]:
class Seq_to_Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq_to_Seq, self).__init__()  # Initialize the parent class.
        self.encoder = encoder  # Store the encoder module.
        self.decoder = decoder  # Store the decoder module.
        
    def forward(self, source, target, teaching_force_ratio=0.5):
        batch_size = source.size(0)  # Get the batch size.
        target_len = target.size(1)  # Get the target sequence length.
        target_vocab_size = self.decoder.output_size  # Get the output vocabulary size.
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(source.device)  # Initialize output tensor.
        
        encoder_hidden = self.encoder(source)  # Get encoder hidden states.
        
        if isinstance(encoder_hidden, tuple):  # If encoder hidden states is a tuple (LSTM case).
            h_n, c_n = encoder_hidden  # Unpack hidden states.
            if self.encoder.bidirectional:  # If encoder is bidirectional.
                h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)  # Concatenate forward and backward hidden states.
                c_n = torch.cat([c_n[i:i+1] for i in range(0, c_n.shape[0], 2)] + [c_n[i:i+1] for i in range(1, c_n.shape[0], 2)], dim=2)  # Concatenate forward and backward cell states.
            
            if h_n.size(0) < self.decoder.num_layers:  # If decoder has more layers than encoder.
                # Create zero tensors with the SAME hidden dimension as h_n
                hidden_dim = h_n.size(2)  # Get the actual hidden dimension from h_n
                zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, hidden_dim, device=h_n.device)  # Create zero tensor for hidden states.
                zero_c = torch.zeros(self.decoder.num_layers - c_n.size(0), batch_size, hidden_dim, device=c_n.device)  # Create zero tensor for cell states.
                h_n = torch.cat([h_n, zero_h], dim=0)  # Concatenate zero tensor to adjust hidden states shape.
                c_n = torch.cat([c_n, zero_c], dim=0)  # Concatenate zero tensor to adjust cell states shape.
            encoder_hidden = (h_n[:self.decoder.num_layers], c_n[:self.decoder.num_layers])  # Update encoder hidden states.
        else:  # If encoder hidden states is not a tuple (GRU/RNN case).
            h_n = encoder_hidden  # Use hidden states directly.
            if self.encoder.bidirectional:  # If encoder is bidirectional.
                h_n = torch.cat([h_n[i:i+1] for i in range(0, h_n.shape[0], 2)] + [h_n[i:i+1] for i in range(1, h_n.shape[0], 2)], dim=2)  # Concatenate forward and backward hidden states.
            if h_n.size(0) < self.decoder.num_layers:  # If decoder has more layers than encoder.
                # Create zero tensor with the SAME hidden dimension as h_n
                hidden_dim = h_n.size(2)  # Get the actual hidden dimension from h_n
                zero_h = torch.zeros(self.decoder.num_layers - h_n.size(0), batch_size, hidden_dim, device=h_n.device)  # Create zero tensor for hidden states.
                h_n = torch.cat([h_n, zero_h], dim=0)  # Concatenate zero tensor to adjust hidden states shape.
            encoder_hidden = h_n[:self.decoder.num_layers]  # Update encoder hidden states.
        
        decoder_input = target[:, 0]  # Get the decoder input for the first time step.
                    
        for t in range(1, target_len):  # Iterate over target sequence.
            decoder_output, encoder_hidden = self.decoder(decoder_input, encoder_hidden)  # Get decoder output and update hidden states.
            outputs[:, t] = decoder_output  # Store decoder output.
            teacher_force = torch.rand(1) < teaching_force_ratio  # Determine whether to use teacher forcing.
            top1 = decoder_output.argmax(1)  # Get the predicted token.
            decoder_input = target[:, t] if teacher_force else top1  # Update decoder input based on teacher forcing.
        return outputs  # Return the final output tensor.

In [None]:
INPUT_DIM = 100  # Set the size of the input vocabulary.
OUTPUT_DIM = 100  # Set the size of the output vocabulary.
ENC_EMB_DIM = 256  # Set the dimension of the input embeddings.
DEC_EMB_DIM = 256  # Set the dimension of the output embeddings.
HID_DIM = 512  # Set the dimension of the hidden states.
ENC_LAYERS = 1  # Set the number of layers in the encoder.
DEC_LAYERS = 3  # Set the number of layers in the decoder.
ENC_RNN_CELL = 'gru'  # Specify the RNN cell type for the encoder.
DEC_RNN_CELL = 'gru'  # Specify the RNN cell type for the decoder.

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout=0.3, bidirectional=True)  # Initialize the encoder module.
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout=0.3, bidirectional=True)  # Initialize the decoder module.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Choose the appropriate device for computation.
print(f"Using device: {device}")  # Print the chosen device.
model = Seq_to_Seq(encoder, decoder).to(device)  # Initialize the sequence-to-sequence model and move it to the selected device.
print(model)  # Print the model summary.

In [None]:
# # Define a function to create a vocabulary set from a given text
# def create_vocab(text):
#     # Create a set of unique characters found in the text
#     # Each word in the text is processed to extract its characters
#     vocab = set(char for word in text for char in word)
#     # Add a padding token to the vocabulary
#     vocab.add('<pad>')
#     # Add a start-of-sequence token to the vocabulary
#     vocab.add('<sos>')  # Start of sequence token
#     # Add an end-of-sequence token to the vocabulary
#     vocab.add('<eos>')  # End of sequence token
#     # Return the complete set of vocabulary items
#     return vocab

In [None]:
# First, let's modify the load_data function to ensure correct column ordering
def load_data(path):
    # Make sure we're correctly identifying which column is which
    df = pd.read_csv(path, header=None, names=['telugu', 'transliteration', 'score'], sep='\t')
    
    # Check a few samples to verify column ordering
    #print("Sample data:")
    #print(df.head(3))
    
    # Return the transliteration and Telugu columns in the right order
    return df['transliteration'], df['telugu']  # Note the order - transliteration first, Telugu second

In [None]:
# Load Telugu and transliteration training data from specified path
trans_train, tel_train = load_data('/kaggle/input/dakshina-dataset/te.translit.sampled.train.tsv')

In [None]:
print(trans_train)

In [None]:
print(tel_train)

In [None]:
# Define a function to create a vocabulary set from a given text
def create_vocab(text):
    # Create a set of unique characters found in the text
    # First convert all items to strings and skip any NaN values
    vocab = set()
    for word in text:
        if pd.notna(word):  # Skip NaN values
            for char in str(word):  # Convert to string to handle any non-string values
                vocab.add(char)
    
    # Add special tokens to the vocabulary
    vocab.add('<pad>')
    vocab.add('<sos>')  # Start of sequence token
    vocab.add('<eos>')  # End of sequence token
    
    # Return the complete set of vocabulary items
    return vocab

# Create a vocabulary from the transliteration training data
trans_vocab = create_vocab(trans_train)
# Create a vocabulary from the Telugu training data
tel_vocab = create_vocab(tel_train)

In [None]:
print(len(trans_vocab))
print(trans_vocab)

In [None]:
print(len(tel_vocab))
print(tel_vocab)

In [None]:
# Map each token in the trans vocabulary to a unique index
trans_token_to_index = {token: index for index, token in enumerate(sorted(trans_vocab))}
# Map each token in the tel vocabulary to a unique index
tel_token_to_index = {token: index for index, token in enumerate(sorted(tel_vocab))}

In [None]:
# Print the dictionary mapping trans tokens to indices
print(trans_token_to_index)

In [None]:
# Print the dictionary mapping tel tokens to indices
print(tel_token_to_index)

In [None]:
# Define a Dataset class for handling translation and telugu word pairs
class DakshinaDataset(Dataset):
    def __init__(self, trans_words, tel_words, trans_token_to_index, tel_token_to_index):
        # Store the lists of trans and tel words
        self.trans_words = trans_words
        self.tel_words = tel_words
        # Store the dictionaries that map characters to indices for both languages
        self.trans_token_to_index = trans_token_to_index
        self.tel_token_to_index = tel_token_to_index

    def __len__(self):
        # Return the number of word pairs in the dataset
        return len(self.trans_words)

    def __getitem__(self, index):
        # Fetching the trans and tel words at the specified index
        trans_word = self.trans_words.iloc[index]
#         print(trans_word)
        tel_word = self.tel_words.iloc[index]
#         print(teel_word)
        # Convert the trans word into indices using the trans_token_to_index mapping
        trans_indices = [trans_token_to_index[char] for char in trans_word]
#         print(trans_indices)
        # Convert the telugu word into indices, adding <sos> and <eos> tokens
        tel_indices = [tel_token_to_index['<sos>']] + [tel_token_to_index[char] for char in tel_word] + [tel_token_to_index['<eos>']]
#         print(tel_indices)
        # Return the indices as tensor objects
        return torch.tensor(trans_indices, dtype=torch.long), torch.tensor(tel_indices, dtype=torch.long)

In [None]:
# Define a function for padding sequences and packing batches
# packet_fn specifies a function to control how batches are created from the individual data items
def packet_fn(batch):
    # Unzip the batch to separate trans and telugu indices
    trans, tel = zip(*batch)
#     print(trans, tel)
    # Pad the sequences of trans indices
    trans_padded = pad_sequence(trans, batch_first=True, padding_value=trans_token_to_index['<pad>'])
#     print(trans_padded)
    # Pad the sequences of tel indices
    tel_padded = pad_sequence(tel, batch_first=True, padding_value=tel_token_to_index['<pad>'])
#     print(tel_padded)
    # Return the padded batches
    return trans_padded, tel_padded

In [None]:
# Load training data into the DakshinaDataset
train_dataset = DakshinaDataset(trans_train, tel_train, trans_token_to_index, tel_token_to_index)
# Create a DataLoader to batch and shuffle the dataset
# packet_fn specifies a function to control how batches are created from the individual data items
train_loader = DataLoader(train_dataset, batch_size = 64, collate_fn=packet_fn, shuffle=True)

In [None]:
print(len(train_dataset))

In [None]:
print(train_dataset[4000])

In [None]:
# Define a word accuracy function for word-level accuracy
def word_accuracy(outputs, targets, ignore_index):
    # Assuming outputs and targets are batched sequences of token indices
    # Ignoring <pad> tokens as specified by `ignore_index`
    correct = 0  # Initialize the count of correct predictions.
    total = 0  # Initialize the total number of sequences.
    for out, tar in zip(outputs, targets):  # Iterate over each output and target pair.
        # Ignoring padding in accuracy calculation
#         print('out bef pad:',out)  # Uncomment to print the output before removing padding.
#         print('tar:',tar)  # Uncomment to print the target.
        out = out[out != ignore_index]  # Remove padding tokens from the output.
        tar = tar[tar != ignore_index]  # Remove padding tokens from the target.
        ignore_index_eos = 0  # Define an ignore index for end of sequence.
        out = out[out != ignore_index_eos]  # Remove end of sequence tokens from the output.
        tar = tar[tar != ignore_index_eos]  # Remove end of sequence tokens from the target.
#         print('out aft pad:',out)  # Uncomment to print the output after removing padding.
#         print('tar:',tar)  # Uncomment to print the target after removing padding.
        if torch.equal(out, tar):  # Check if the processed output and target are identical.
            correct += 1  # Increment the correct count.
#             print('correct:',correct)  # Uncomment to print the correct count.
        total += 1  # Increment the total count.
#         print('total:',total)  # Uncomment to print the total count.
    return correct / total if total > 0 else 0  # Calculate and return the accuracy.

In [None]:
def train(model, iterator, optimizer, criterion, clip, device, ignore_index):
    # Set the model to training mode
    model.train()
    # Initialize epoch loss and accuracy
    epoch_loss = 0
    epoch_acc = 0
    
    # Iterate through the data iterator
    for source, target in iterator:
        # Move source and target tensors to the specified device
        source = source.to(device)
        target = target.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass: compute model predictions
        output = model(source, target)
        
        output_dim = output.shape[-1]
        # Slice the output and target tensors to remove <sos> token and maintain sequence structure
        output = output[:, 1:, :]
        target = target[:, 1:]
        
        # Flatten all dimensions except for the batch dimension for loss calculation
        output_flat = output.reshape(-1, output_dim)
        target_flat = target.reshape(-1)
        #print(output_flat.shape)
        #print(target_flat.shape)
        
        # Compute the loss
        loss = criterion(output_flat, target_flat)
        # Calculate word-by-word accuracy
        acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
        
        # Backpropagation
        loss.backward()
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Update model parameters
        optimizer.step()
        
        # Accumulate epoch loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc
    
    # Return average epoch loss and accuracy
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion, device, ignore_index):
    # Set the model to evaluation mode
    model.eval()
    # Initialize epoch loss and accuracy
    epoch_loss = 0
    epoch_acc = 0
    
    # Iterate through the data iterator
    with torch.no_grad():
        for source, target in iterator:
            # Move source and target tensors to the specified device
            source = source.to(device)
            target = target.to(device)
            
            # Forward pass: compute model predictions without teacher forcing
            output = model(source, target, 0)
            output_dim = output.shape[-1]
            # Slice the output and target tensors to remove <sos> token and maintain sequence structure
            output = output[:, 1:, :]
            target = target[:, 1:]
            
            # Flatten all dimensions except for the batch dimension for loss calculation
            output_flat = output.reshape(-1, output_dim)
            target_flat = target.reshape(-1)
            
            # Compute the loss
            loss = criterion(output_flat, target_flat)
            # Calculate word-by-word accuracy
            acc = word_accuracy(output.argmax(dim=2), target, ignore_index)
            
            # Accumulate epoch loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc
            
    # Return average epoch loss and accuracy
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
# Load validation data by reading a TSV file
trans_valid, tel_valid = load_data('/kaggle/input/dakshina-dataset/te.translit.sampled.dev.tsv')

# Create a validation dataset using the DakshinaDataset class.
valid_dataset = DakshinaDataset(trans_valid, tel_valid, trans_token_to_index, tel_token_to_index)

# Create a DataLoader to batch and shuffle the dataset
# 'collate_fn=packet_fn' specifies a function to control how batches are created from the individual data items.
# 'shuffle=True' ensures that the data is shuffled at every epoch which helps to reduce model overfitting
valid_loader = DataLoader(valid_dataset, batch_size=64, collate_fn=packet_fn, shuffle=True)

In [None]:
print(len(trans_valid))

In [None]:
# -embed_size-64-layers_enc-3-layers_dec-3-hid_size-512-cell_type-lstm-bidirectional-True-dropout-0.2
# Define the dimensions and configurations for the encoder and decoder
INPUT_DIM = 100
OUTPUT_DIM = 100
ENC_EMB_DIM = 64
DEC_EMB_DIM = 64
HID_DIM = 512
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_RNN_CELL = 'lstm'
DEC_RNN_CELL = 'lstm'

# Initialize the encoder with the specified parameters
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout=0.2, bidirectional=True)
# Initialize the decoder with the specified parameters, using the number of encoder layers
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout=0.2, bidirectional=True)
# Determine the device for model training (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Initialize the sequence-to-sequence model with the encoder and decoder
model = Seq_to_Seq(encoder, decoder).to(device)
print(model)

In [None]:
# Original method (causing the error)
def __getitem__(self, index):
    # Get the transliteration and Telugu words for the current index
    trans_word = self.trans_data.iloc[index]
    tel_word = self.tel_data.iloc[index]
    
    # Convert the trans word into indices using the trans_token_to_index mapping
    trans_indices = [trans_token_to_index[char] for char in trans_word]  # Error line
    
    # Convert the telugu word into indices, adding <sos> and <eos> tokens
    tel_indices = [tel_token_to_index['<sos>']] + [tel_token_to_index[char] for char in tel_word] + [tel_token_to_index['<eos>']]
    
    # Convert lists to tensors
    return torch.tensor(trans_indices), torch.tensor(tel_indices)

In [None]:
# Redefine your Dataset class
class TransliterationDataset(torch.utils.data.Dataset):
    def __init__(self, trans_data, tel_data):
        self.trans_data = trans_data
        self.tel_data = tel_data
    
    def __len__(self):
        return len(self.trans_data)
    
    def __getitem__(self, index):
        # Get the transliteration and Telugu words for the current index
        trans_word = self.trans_data.iloc[index]
        tel_word = self.tel_data.iloc[index]
        
        # Handle NaN or float values by converting to string
        if pd.isna(trans_word) or not isinstance(trans_word, str):
            trans_word = str(trans_word) if not pd.isna(trans_word) else ""
            
        if pd.isna(tel_word) or not isinstance(tel_word, str):
            tel_word = str(tel_word) if not pd.isna(tel_word) else ""
        
        # Convert the transliteration word into indices
        trans_indices = []
        for char in trans_word:
            if char in trans_token_to_index:
                trans_indices.append(trans_token_to_index[char])
            else:
                # Skip unknown characters
                continue
        
        # If no valid characters were found, use a minimal valid sequence
        if not trans_indices:
            trans_indices = [trans_token_to_index.get('<pad>', 0)]
        
        # Convert the Telugu word into indices, adding <sos> and <eos> tokens
        tel_indices = [tel_token_to_index['<sos>']]
        for char in tel_word:
            if char in tel_token_to_index:
                tel_indices.append(tel_token_to_index[char])
            else:
                # Skip unknown characters
                continue
        tel_indices.append(tel_token_to_index['<eos>'])
        
        # Convert lists to tensors
        return torch.tensor(trans_indices), torch.tensor(tel_indices)

In [None]:
# Load the test data from the specified TSV file location
trans_test, tel_test = load_data('/kaggle/input/dakshina-dataset/te.translit.sampled.test.tsv')

# Create test_dataset using the AksharantarDataset class, initializing it with test data
# and corresponding token-to-index mappings for both Latin and Bangla scripts
test_dataset = DakshinaDataset(trans_test, tel_test, trans_token_to_index, tel_token_to_index)

# A DataLoader for the test dataset. Here, the batch size is set to 1, indicates
# that the model will process one item at a time. This is for testing to make
# detailed predictions per sample without batching effects.
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=packet_fn, shuffle=False)
# print(test_dataset[0])

In [None]:
# Define batch size
BATCH_SIZE = 32  # You can adjust this as needed

# Define a padding collate function to handle variable length sequences
def pad_collate_fn(batch):
    # Sort the batch in descending order by source length
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    
    # Get source and target sequences
    src_seqs, trg_seqs = zip(*batch)
    
    # Find length of longest sequence in batch
    src_lengths = [len(s) for s in src_seqs]
    trg_lengths = [len(t) for t in trg_seqs]
    max_src_len = max(src_lengths)
    max_trg_len = max(trg_lengths)
    
    # Pad source sequences
    src_padded = torch.zeros(len(src_seqs), max_src_len).long()
    for i, seq in enumerate(src_seqs):
        end = src_lengths[i]
        src_padded[i, :end] = seq
    
    # Pad target sequences
    trg_padded = torch.zeros(len(trg_seqs), max_trg_len).long()
    for i, seq in enumerate(trg_seqs):
        end = trg_lengths[i]
        trg_padded[i, :end] = seq
    
    # Return padded tensors and lengths
    return src_padded, trg_padded

# Create datasets
train_dataset = TransliterationDataset(trans_train, tel_train)
valid_dataset = TransliterationDataset(trans_valid, tel_valid)
test_dataset = TransliterationDataset(trans_test, tel_test)

# Create data loaders
train_loader = torch.utils.data.DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    collate_fn=pad_collate_fn
)

valid_loader = torch.utils.data.DataLoader(
    valid_dataset, 
    batch_size=BATCH_SIZE,
    collate_fn=pad_collate_fn
)

test_loader = torch.utils.data.DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE,
    collate_fn=pad_collate_fn
)

In [None]:
# Setting the number of epochs the training process should run
NUM_EPOCHS = 1
# Set the maximum norm of the gradients to 1 to prevent exploding gradients
CLIP = 1
# Create Adam optimizer with default parameters
optimizer = torch.optim.Adam(model.parameters())
# Padding token index should be ignored in loss calculation
ignore_index = tel_token_to_index['<pad>']
# Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

# Start the training process for the defined number of epochs
for epoch in range(NUM_EPOCHS):
    # Doing training on the train dataset and return average loss and accuracy
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
    # Evaluating the model on the validation dataset and return average loss and accuracy
    val_loss, val_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)
    
    # Print the loss and accuracy for each epoch
    print(f'Epoch: {epoch+1}')
    print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
    print(f'\tVal_Loss: {val_loss:.3f},  Val_Accuracy: {val_accuracy*100:.2f}%')

In [None]:
print(len(trans_test))

In [None]:
def decode_indices(indices, index_to_token):
    # Filter out indices for padding, start-of-sequence, and end-of-sequence tokens to ensure only valid character indices are decoded
    valid_indices = [index for index in indices if index in index_to_token and index not in (tel_token_to_index['<pad>'], tel_token_to_index['<sos>'], tel_token_to_index['<eos>'])]
    # Convert each index to its corresponding character and join them to form the decoded string
    return ''.join([index_to_token[index] for index in valid_indices])

In [None]:
def predict(model, iterator, device):
    # Set the model to evaluation mode to disable dropout or batch normalization effects during inference
    model.eval()
    predictions = []
    # Disables gradient calculations for performance improvement since they are not needed in inference
    with torch.no_grad():
        for source, target in iterator:
            # Ensure the source and target tensors are on the correct device (GPU or CPU)
            source = source.to(device)
            target = target.to(device)
            # Obtain model output without teacher forcing (i.e., the model relies entirely on its predictions)
            output = model(source, target, 0)
            # Get the index with the highest probability from output predictions
            output = output.argmax(2)
            # Convert tensors to CPU numpy arrays for easier manipulation and extraction
            source = source.cpu().numpy()
            output = output.cpu().numpy()
            target = target.cpu().numpy()
            # Store the tuple of source and decoded output predictions
            predictions.append((source, target, output))
    # Return all predictions made over the iterator
    return predictions

In [None]:
# Create dictionaries to map indices back to characters, observing the interpretation of prediction outputs
trans_index_to_token = {index: char for char, index in trans_token_to_index.items()}
tel_index_to_token = {index: char for char, index in tel_token_to_index.items()}

In [None]:
print(trans_index_to_token)

In [None]:
print(tel_index_to_token)

In [None]:
# Taking the prediction function to generate outputs for all samples in the test_loader
test_predictions = predict(model, test_loader, device)
# print(len(test_predictions[0]))
# Loop through the list of tuples containing source and output indices from the test predictions
for source_indices, target_indices, output_indices in test_predictions:
    # Iterate through each example in the batch. This is necessary as batches may contain multiple examples
    for i in range(source_indices.shape[0]):
        # Decode the source indices to their corresponding text using the mapping dictionary for trans script
        input_text = decode_indices(source_indices[i], trans_index_to_token)
        
        target_text = decode_indices(target_indices[i], tel_index_to_token)

        # Decode the output indices to their corresponding text using the mapping dictionary for telugu script
        predicted_text = decode_indices(output_indices[i], tel_index_to_token)
        # Print the original input text and its corresponding predicted transliteration
        #print(f'Input Text: {input_text} -> Actual Text: {target_text} -> Predicted Text: {predicted_text}')

In [40]:
import pandas as pd

# Create lists to store the data
input_texts = []
actual_texts = []
predicted_texts = []
test_predictions = predict(model, test_loader, device)

# Loop through the list of tuples containing source and output indices from the test predictions
for source_indices, target_indices, output_indices in test_predictions:
    # Iterate through each example in the batch. This is necessary as batches may contain multiple examples
    for i in range(source_indices.shape[0]):
        # Decode the source indices to their corresponding text using the mapping dictionary for trans script
        input_text = decode_indices(source_indices[i], trans_index_to_token)
        target_text = decode_indices(target_indices[i], tel_index_to_token)
        # Decode the output indices to their corresponding text using the mapping dictionary for telugu script
        predicted_text = decode_indices(output_indices[i], tel_index_to_token)
        # Append the texts to the lists
        input_texts.append(input_text)
        actual_texts.append(target_text)
        predicted_texts.append(predicted_text)

# Create a DataFrame from the lists
df = pd.DataFrame({
    'Input Text': input_texts,
    'Actual Text': actual_texts,
    'Predicted Text': predicted_texts
})

# Save the DataFrame to a CSV file
df.to_csv('predictions_without_attn.csv', index=False, encoding='utf-8')

In [41]:
import wandb
import numpy as np
from types import SimpleNamespace
import random

key = input('Enter your API:')
wandb.login(key=key) #key="7836c2516a56fb9071600184ea90a0f6357f99dc"

Enter your API: 7836c2516a56fb9071600184ea90a0f6357f99dc


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtentuvenkatesh2[0m ([33mtentuvenkatesh2-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [42]:
sweep_config = {
    'method': 'bayes',
    'name' : 'sweep all final new lr 5',
    'metric': {
        'name': 'Val_Accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'input_embed_size': {
            'values': [16,32,64,256,512]
        },
        'num_enc_layers':{
            'values': [1,2,3]
        },
        'num_dec_layers':{
            'values': [1,2,3]
        },
        'hid_layer_size': {
            'values': [16,32,64,256,512]
        },
        'cell_type': {
            'values': ['rnn','gru','lstm']
        },
        'bidirectional':{
            'values': [True, False]
        },
        'dropout': {
            'values': [0.2, 0.3]
        },
        'new_learning_rate':{
            'values': [0.001,0.01,0.1]
        }
#       'beam search in decoder with different beam sizes': 
    }
}

sweep_id = wandb.sweep(sweep = sweep_config, project="DA6401_A3_P_1")

Create sweep with ID: lrdb1qkr
Sweep URL: https://wandb.ai/tentuvenkatesh2-indian-institute-of-technology-madras/DA6401_A3_P_1/sweeps/lrdb1qkr


In [43]:
import wandb

def main():
    # Initialize a new wandb run
    with wandb.init() as run:
        # Construct run name from configuration
        run_name = "-embed_size-"+str(wandb.config.input_embed_size)+"-layers_enc-"+str(wandb.config.num_enc_layers)+"-layers_dec-"+str(wandb.config.num_dec_layers)+"-hid_size-"+str(wandb.config.hid_layer_size)+"-cell_type-"+wandb.config.cell_type+"-bidirectional-"+str(wandb.config.bidirectional)+"-dropout-"+str(wandb.config.dropout)+"-lr-"+str(wandb.config.new_learning_rate)
        wandb.run.name = run_name

        # Constants defining the dimensions of the input and output character sets
        INPUT_DIM = 100  # size of the trans character set
        OUTPUT_DIM = 100  # size of the telugu character set

        # Constants defining the dimensions of the embeddings for encoder and decoder
        ENC_EMB_DIM = wandb.config.input_embed_size  # Encoder embedding dimension
        DEC_EMB_DIM = wandb.config.input_embed_size  # Decoder embedding dimension

        # Constants defining the dimension of the hidden layers for encoder and decoder
        HID_DIM = wandb.config.hid_layer_size  # Hidden dimension size

        # Constants defining the number of layers for encoder and decoder
        ENC_LAYERS = wandb.config.num_enc_layers  # Number of layers in the encoder
        DEC_LAYERS = wandb.config.num_dec_layers  # Number of layers in the decoder
        

        # Constants defining the type of RNN cell to use for encoder and decoder
        ENC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the encoder
        DEC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the decoder

        # Instantiate the encoder with specified configurations
        encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)
        # Instantiate the decoder with specified configurations
        decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)

        # Determine the computing device (CUDA if available, otherwise CPU)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Print the device will be used
        print(f"Using device: {device}")

        # Instantiate the Seq_to_Seq model and move it to the chosen computing device
        model = Seq_to_Seq(encoder, decoder).to(device)
        print(model)
        
        
        # Setting the number of epochs the training process should run
        NUM_EPOCHS = 5
        # Set the maximum norm of the gradients to 1 to prevent exploding gradients
        CLIP = 1
        # Initialize the optimizer, Adam
        optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.new_learning_rate)  # Set the learning rate to 0.001


        # Padding token index should be ignored in loss calculation
        ignore_index = tel_token_to_index['<pad>']
        # Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

        # Start the training process for the defined number of epochs
        for epoch in range(NUM_EPOCHS):
            # Doing training on the train dataset and return average loss and accuracy
            train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
            # Evaluating the model on the validation dataset and return average loss and accuracy
            val_loss, val_accuracy = evaluate(model, valid_loader, criterion, device, ignore_index)

            # Print the loss and accuracy for each epoch
            print(f'Epoch: {epoch+1}')
            print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
            print(f'\tVal_Loss: {val_loss:.3f},  Val_Accuracy: {val_accuracy*100:.2f}%')
            wandb.log({"Epoch": {epoch+1}})
            wandb.log({"train_accuracy": train_accuracy * 100, "training_loss": train_loss})
            wandb.log({"Val_Accuracy": val_accuracy * 100, "Val_Loss": val_loss})


wandb.agent(sweep_id, function=main, count=50)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: ddvoksm8 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(16, 8, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(16, 48, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=48, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.814, Train_Accuracy: 0.01%
	Val_Loss: 2.055,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 1.606, Train_Accuracy: 0.02%
	Val_Loss: 1.961,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 1.548, Train_Accuracy: 0.02%
	Val_Loss: 1.862,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 1.505, Train_Accuracy: 0.02%
	Val_Loss: 1.812,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 1.473, Train_Accuracy: 0.03%
	Val_Loss: 1.771,  Val_Accuracy: 0.02%


0,1
Val_Accuracy,▁▁▁▁█
Val_Loss,█▆▃▂▁
train_accuracy,▁▅▅▆█
training_loss,█▄▃▂▁

0,1
Val_Accuracy,0.01756
Val_Loss,1.77061
train_accuracy,0.02561
training_loss,1.47277


[34m[1mwandb[0m: Agent Starting Run: ew477ivf with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 32
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(16, 32, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(16, 32, batch_first=True)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.881, Train_Accuracy: 0.00%
	Val_Loss: 2.063,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 1.598, Train_Accuracy: 0.02%
	Val_Loss: 1.891,  Val_Accuracy: 0.02%
Epoch: 3
	Train_Loss: 1.492, Train_Accuracy: 0.06%
	Val_Loss: 1.773,  Val_Accuracy: 0.11%
Epoch: 4
	Train_Loss: 1.408, Train_Accuracy: 0.12%
	Val_Loss: 1.668,  Val_Accuracy: 0.21%
Epoch: 5
	Train_Loss: 1.332, Train_Accuracy: 0.19%
	Val_Loss: 1.564,  Val_Accuracy: 0.53%


0,1
Val_Accuracy,▁▁▂▄█
Val_Loss,█▆▄▂▁
train_accuracy,▁▂▃▅█
training_loss,█▄▃▂▁

0,1
Val_Accuracy,0.52669
Val_Loss,1.56381
train_accuracy,0.19296
training_loss,1.3316


[34m[1mwandb[0m: Agent Starting Run: bplc7jdu with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 32
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 16, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 32, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=32, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.914, Train_Accuracy: 0.00%
	Val_Loss: 2.001,  Val_Accuracy: 0.02%
Epoch: 2
	Train_Loss: 1.603, Train_Accuracy: 0.00%
	Val_Loss: 1.844,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 1.451, Train_Accuracy: 0.03%
	Val_Loss: 1.651,  Val_Accuracy: 0.05%
Epoch: 4
	Train_Loss: 1.333, Train_Accuracy: 0.05%
	Val_Loss: 1.535,  Val_Accuracy: 0.16%
Epoch: 5
	Train_Loss: 1.252, Train_Accuracy: 0.15%
	Val_Loss: 1.408,  Val_Accuracy: 0.58%


0,1
Val_Accuracy,▁▁▂▃█
Val_Loss,█▆▄▂▁
train_accuracy,▁▁▂▃█
training_loss,█▅▃▂▁

0,1
Val_Accuracy,0.57935
Val_Loss,1.40839
train_accuracy,0.15369
training_loss,1.25223


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6qwzd8kc with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 64, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(32, 64, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.744, Train_Accuracy: 0.01%
	Val_Loss: 1.866,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 1.370, Train_Accuracy: 0.08%
	Val_Loss: 1.529,  Val_Accuracy: 0.42%
Epoch: 3
	Train_Loss: 1.118, Train_Accuracy: 0.73%
	Val_Loss: 1.236,  Val_Accuracy: 3.42%
Epoch: 4
	Train_Loss: 0.951, Train_Accuracy: 2.18%
	Val_Loss: 1.090,  Val_Accuracy: 7.30%
Epoch: 5
	Train_Loss: 0.842, Train_Accuracy: 3.98%
	Val_Loss: 0.957,  Val_Accuracy: 10.34%


0,1
Val_Accuracy,▁▁▃▆█
Val_Loss,█▅▃▂▁
train_accuracy,▁▁▂▅█
training_loss,█▅▃▂▁

0,1
Val_Accuracy,10.34059
Val_Loss,0.95735
train_accuracy,3.97619
training_loss,0.84163


[34m[1mwandb[0m: Agent Starting Run: z9epp9zm with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.1
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(512, 8, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): RNN(512, 48, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=48, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 2.456, Train_Accuracy: 0.00%
	Val_Loss: 3.573,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 2.520, Train_Accuracy: 0.00%
	Val_Loss: 3.284,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 2.532, Train_Accuracy: 0.00%
	Val_Loss: 2.771,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 2.505, Train_Accuracy: 0.00%
	Val_Loss: 2.635,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 2.506, Train_Accuracy: 0.00%
	Val_Loss: 2.703,  Val_Accuracy: 0.00%


0,1
Val_Accuracy,▁▁▁▁▁
Val_Loss,█▆▂▁▂
train_accuracy,▁▁▁▁▁
training_loss,▁▇█▅▆

0,1
Val_Accuracy,0.0
Val_Loss,2.70294
train_accuracy,0.0
training_loss,2.50635


[34m[1mwandb[0m: Agent Starting Run: j0b9vzrc with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 16
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(16, 256, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 16)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(16, 256, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.779, Train_Accuracy: 0.01%
	Val_Loss: 2.074,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 1.638, Train_Accuracy: 0.01%
	Val_Loss: 2.087,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 1.613, Train_Accuracy: 0.01%
	Val_Loss: 2.077,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 1.623, Train_Accuracy: 0.01%
	Val_Loss: 2.078,  Val_Accuracy: 0.02%
Epoch: 5
	Train_Loss: 1.626, Train_Accuracy: 0.02%
	Val_Loss: 1.996,  Val_Accuracy: 0.00%


0,1
Val_Accuracy,▁▁▁█▁
Val_Loss,▇█▇▇▁
train_accuracy,▄▇▁▄█
training_loss,█▂▁▁▂

0,1
Val_Accuracy,0.0
Val_Loss,1.99573
train_accuracy,0.01537
training_loss,1.62578


[34m[1mwandb[0m: Agent Starting Run: du6j9dl8 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 32, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=128, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.307, Train_Accuracy: 0.36%
	Val_Loss: 1.346,  Val_Accuracy: 2.21%
Epoch: 2
	Train_Loss: 1.069, Train_Accuracy: 1.20%
	Val_Loss: 1.176,  Val_Accuracy: 4.11%
Epoch: 3
	Train_Loss: 1.009, Train_Accuracy: 1.86%
	Val_Loss: 1.190,  Val_Accuracy: 4.48%
Epoch: 4
	Train_Loss: 0.977, Train_Accuracy: 2.29%
	Val_Loss: 1.115,  Val_Accuracy: 6.29%
Epoch: 5
	Train_Loss: 0.949, Train_Accuracy: 2.62%
	Val_Loss: 1.049,  Val_Accuracy: 7.51%


0,1
Val_Accuracy,▁▄▄▆█
Val_Loss,█▄▄▃▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▂▁

0,1
Val_Accuracy,7.51404
Val_Loss,1.04859
train_accuracy,2.6169
training_loss,0.94863


[34m[1mwandb[0m: Agent Starting Run: sg91e5lz with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(256, 32, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): RNN(256, 128, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=128, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.648, Train_Accuracy: 0.02%
	Val_Loss: 1.939,  Val_Accuracy: 0.11%
Epoch: 2
	Train_Loss: 1.454, Train_Accuracy: 0.05%
	Val_Loss: 1.868,  Val_Accuracy: 0.04%
Epoch: 3
	Train_Loss: 1.394, Train_Accuracy: 0.11%
	Val_Loss: 1.709,  Val_Accuracy: 0.18%
Epoch: 4
	Train_Loss: 1.341, Train_Accuracy: 0.13%
	Val_Loss: 1.707,  Val_Accuracy: 0.23%
Epoch: 5
	Train_Loss: 1.306, Train_Accuracy: 0.15%
	Val_Loss: 1.680,  Val_Accuracy: 0.18%


0,1
Val_Accuracy,▄▁▆█▆
Val_Loss,█▆▂▂▁
train_accuracy,▁▃▆▇█
training_loss,█▄▃▂▁

0,1
Val_Accuracy,0.17556
Val_Loss,1.68001
train_accuracy,0.15369
training_loss,1.30647


[34m[1mwandb[0m: Agent Starting Run: ol140j3j with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 16
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 16, num_layers=3, batch_first=True, dropout=0.2)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 16, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=16, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.981, Train_Accuracy: 0.00%
	Val_Loss: 2.107,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 1.697, Train_Accuracy: 0.00%
	Val_Loss: 2.057,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 1.650, Train_Accuracy: 0.01%
	Val_Loss: 2.005,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 1.599, Train_Accuracy: 0.01%
	Val_Loss: 1.901,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 1.550, Train_Accuracy: 0.02%
	Val_Loss: 1.846,  Val_Accuracy: 0.00%


0,1
Val_Accuracy,▁▁▁▁▁
Val_Loss,█▇▅▂▁
train_accuracy,▁▂▄▅█
training_loss,█▃▃▂▁

0,1
Val_Accuracy,0.0
Val_Loss,1.84635
train_accuracy,0.01878
training_loss,1.55041


[34m[1mwandb[0m: Agent Starting Run: g1fxl09g with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 64
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.1
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 64, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 64, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=64, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.920, Train_Accuracy: 0.00%
	Val_Loss: 2.340,  Val_Accuracy: 0.00%
Epoch: 2
	Train_Loss: 1.893, Train_Accuracy: 0.00%
	Val_Loss: 2.344,  Val_Accuracy: 0.00%
Epoch: 3
	Train_Loss: 1.885, Train_Accuracy: 0.00%
	Val_Loss: 2.503,  Val_Accuracy: 0.00%
Epoch: 4
	Train_Loss: 1.887, Train_Accuracy: 0.00%
	Val_Loss: 2.419,  Val_Accuracy: 0.00%
Epoch: 5
	Train_Loss: 1.888, Train_Accuracy: 0.00%
	Val_Loss: 2.357,  Val_Accuracy: 0.00%


0,1
Val_Accuracy,▁▁▁▁▁
Val_Loss,▁▁█▄▂
train_accuracy,▁██▁▁
training_loss,█▃▁▁▂

0,1
Val_Accuracy,0.0
Val_Loss,2.35726
train_accuracy,0.0
training_loss,1.88838


[34m[1mwandb[0m: Agent Starting Run: 1ce0hm0l with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.055, Train_Accuracy: 8.82%
	Val_Loss: 0.644,  Val_Accuracy: 32.34%
Epoch: 2
	Train_Loss: 0.306, Train_Accuracy: 42.26%
	Val_Loss: 0.507,  Val_Accuracy: 46.55%
Epoch: 3
	Train_Loss: 0.204, Train_Accuracy: 56.37%
	Val_Loss: 0.507,  Val_Accuracy: 50.09%
Epoch: 4
	Train_Loss: 0.155, Train_Accuracy: 63.97%
	Val_Loss: 0.495,  Val_Accuracy: 52.31%
Epoch: 5
	Train_Loss: 0.127, Train_Accuracy: 68.99%
	Val_Loss: 0.501,  Val_Accuracy: 52.94%


0,1
Val_Accuracy,▁▆▇██
Val_Loss,█▂▂▁▁
train_accuracy,▁▅▇▇█
training_loss,█▂▂▁▁

0,1
Val_Accuracy,52.93927
Val_Loss,0.5014
train_accuracy,68.98783
training_loss,0.12734


[34m[1mwandb[0m: Agent Starting Run: ayifkw7s with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.897, Train_Accuracy: 11.81%
	Val_Loss: 0.668,  Val_Accuracy: 31.52%
Epoch: 2
	Train_Loss: 0.338, Train_Accuracy: 38.89%
	Val_Loss: 0.568,  Val_Accuracy: 41.95%
Epoch: 3
	Train_Loss: 0.245, Train_Accuracy: 50.74%
	Val_Loss: 0.541,  Val_Accuracy: 47.43%
Epoch: 4
	Train_Loss: 0.197, Train_Accuracy: 58.00%
	Val_Loss: 0.544,  Val_Accuracy: 48.11%
Epoch: 5
	Train_Loss: 0.165, Train_Accuracy: 62.77%
	Val_Loss: 0.539,  Val_Accuracy: 49.17%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▃▁▁▁
train_accuracy,▁▅▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,49.17024
Val_Loss,0.53921
train_accuracy,62.774
training_loss,0.1654


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3xqrlwf7 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.872, Train_Accuracy: 5.69%
	Val_Loss: 1.025,  Val_Accuracy: 12.69%
Epoch: 2
	Train_Loss: 0.688, Train_Accuracy: 10.85%
	Val_Loss: 0.907,  Val_Accuracy: 15.39%
Epoch: 3
	Train_Loss: 0.659, Train_Accuracy: 12.61%
	Val_Loss: 0.921,  Val_Accuracy: 17.17%
Epoch: 4
	Train_Loss: 0.635, Train_Accuracy: 13.62%
	Val_Loss: 0.920,  Val_Accuracy: 17.01%
Epoch: 5
	Train_Loss: 0.637, Train_Accuracy: 13.53%
	Val_Loss: 0.898,  Val_Accuracy: 16.12%


0,1
Val_Accuracy,▁▅██▆
Val_Loss,█▁▂▂▁
train_accuracy,▁▆▇██
training_loss,█▃▂▁▁

0,1
Val_Accuracy,16.11657
Val_Loss,0.89796
train_accuracy,13.53111
training_loss,0.63699


[34m[1mwandb[0m: Agent Starting Run: rvgbtaop with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.01
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.455, Train_Accuracy: 0.11%
	Val_Loss: 1.514,  Val_Accuracy: 0.44%
Epoch: 2
	Train_Loss: 1.188, Train_Accuracy: 0.60%
	Val_Loss: 1.397,  Val_Accuracy: 1.63%
Epoch: 3
	Train_Loss: 1.105, Train_Accuracy: 1.17%
	Val_Loss: 1.321,  Val_Accuracy: 2.88%
Epoch: 4
	Train_Loss: 1.056, Train_Accuracy: 1.59%
	Val_Loss: 1.253,  Val_Accuracy: 3.97%
Epoch: 5
	Train_Loss: 1.018, Train_Accuracy: 2.16%
	Val_Loss: 1.223,  Val_Accuracy: 4.69%


0,1
Val_Accuracy,▁▃▅▇█
Val_Loss,█▅▃▂▁
train_accuracy,▁▃▅▆█
training_loss,█▄▂▂▁

0,1
Val_Accuracy,4.6875
Val_Loss,1.22278
train_accuracy,2.15847
training_loss,1.01797


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: bi3k251m with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.684, Train_Accuracy: 17.12%
	Val_Loss: 0.603,  Val_Accuracy: 35.59%
Epoch: 2
	Train_Loss: 0.328, Train_Accuracy: 39.21%
	Val_Loss: 0.540,  Val_Accuracy: 44.96%
Epoch: 3
	Train_Loss: 0.264, Train_Accuracy: 47.48%
	Val_Loss: 0.502,  Val_Accuracy: 47.77%
Epoch: 4
	Train_Loss: 0.228, Train_Accuracy: 52.90%
	Val_Loss: 0.489,  Val_Accuracy: 51.57%
Epoch: 5
	Train_Loss: 0.201, Train_Accuracy: 56.52%
	Val_Loss: 0.473,  Val_Accuracy: 52.23%


0,1
Val_Accuracy,▁▅▆██
Val_Loss,█▅▃▂▁
train_accuracy,▁▅▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,52.23148
Val_Loss,0.47322
train_accuracy,56.52152
training_loss,0.2013


[34m[1mwandb[0m: Agent Starting Run: 5rmin3bw with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.934, Train_Accuracy: 9.35%
	Val_Loss: 0.721,  Val_Accuracy: 29.70%
Epoch: 2
	Train_Loss: 0.387, Train_Accuracy: 32.19%
	Val_Loss: 0.554,  Val_Accuracy: 41.42%
Epoch: 3
	Train_Loss: 0.299, Train_Accuracy: 42.71%
	Val_Loss: 0.535,  Val_Accuracy: 46.07%
Epoch: 4
	Train_Loss: 0.250, Train_Accuracy: 48.57%
	Val_Loss: 0.518,  Val_Accuracy: 48.06%
Epoch: 5
	Train_Loss: 0.221, Train_Accuracy: 53.09%
	Val_Loss: 0.506,  Val_Accuracy: 50.26%


0,1
Val_Accuracy,▁▅▇▇█
Val_Loss,█▃▂▁▁
train_accuracy,▁▅▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,50.26427
Val_Loss,0.50604
train_accuracy,53.08728
training_loss,0.2213


[34m[1mwandb[0m: Agent Starting Run: kzm2bkc5 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.287, Train_Accuracy: 1.28%
	Val_Loss: 1.078,  Val_Accuracy: 9.90%
Epoch: 2
	Train_Loss: 0.626, Train_Accuracy: 14.12%
	Val_Loss: 0.772,  Val_Accuracy: 25.68%
Epoch: 3
	Train_Loss: 0.451, Train_Accuracy: 26.88%
	Val_Loss: 0.686,  Val_Accuracy: 32.72%
Epoch: 4
	Train_Loss: 0.365, Train_Accuracy: 35.06%
	Val_Loss: 0.627,  Val_Accuracy: 38.17%
Epoch: 5
	Train_Loss: 0.315, Train_Accuracy: 40.90%
	Val_Loss: 0.614,  Val_Accuracy: 40.82%


0,1
Val_Accuracy,▁▅▆▇█
Val_Loss,█▃▂▁▁
train_accuracy,▁▃▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,40.82459
Val_Loss,0.61379
train_accuracy,40.9035
training_loss,0.31464


[34m[1mwandb[0m: Agent Starting Run: utin8xn1 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.986, Train_Accuracy: 8.40%
	Val_Loss: 0.703,  Val_Accuracy: 30.45%
Epoch: 2
	Train_Loss: 0.385, Train_Accuracy: 32.81%
	Val_Loss: 0.566,  Val_Accuracy: 42.21%
Epoch: 3
	Train_Loss: 0.292, Train_Accuracy: 43.56%
	Val_Loss: 0.532,  Val_Accuracy: 46.78%
Epoch: 4
	Train_Loss: 0.242, Train_Accuracy: 49.98%
	Val_Loss: 0.517,  Val_Accuracy: 49.37%
Epoch: 5
	Train_Loss: 0.215, Train_Accuracy: 54.13%
	Val_Loss: 0.523,  Val_Accuracy: 49.81%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▃▂▁▁
train_accuracy,▁▅▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,49.81335
Val_Loss,0.52345
train_accuracy,54.12972
training_loss,0.21476


[34m[1mwandb[0m: Agent Starting Run: 0uo3uw19 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.968, Train_Accuracy: 7.19%
	Val_Loss: 0.792,  Val_Accuracy: 23.58%
Epoch: 2
	Train_Loss: 0.454, Train_Accuracy: 26.45%
	Val_Loss: 0.643,  Val_Accuracy: 35.39%
Epoch: 3
	Train_Loss: 0.348, Train_Accuracy: 36.96%
	Val_Loss: 0.598,  Val_Accuracy: 41.52%
Epoch: 4
	Train_Loss: 0.298, Train_Accuracy: 43.53%
	Val_Loss: 0.567,  Val_Accuracy: 44.73%
Epoch: 5
	Train_Loss: 0.259, Train_Accuracy: 48.38%
	Val_Loss: 0.546,  Val_Accuracy: 46.32%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▄▂▂▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,46.31967
Val_Loss,0.54603
train_accuracy,48.38006
training_loss,0.25944


[34m[1mwandb[0m: Agent Starting Run: 6p6b90gl with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.150, Train_Accuracy: 5.64%
	Val_Loss: 0.697,  Val_Accuracy: 27.28%
Epoch: 2
	Train_Loss: 0.354, Train_Accuracy: 36.02%
	Val_Loss: 0.512,  Val_Accuracy: 44.96%
Epoch: 3
	Train_Loss: 0.239, Train_Accuracy: 50.06%
	Val_Loss: 0.478,  Val_Accuracy: 50.62%
Epoch: 4
	Train_Loss: 0.191, Train_Accuracy: 57.63%
	Val_Loss: 0.478,  Val_Accuracy: 51.79%
Epoch: 5
	Train_Loss: 0.160, Train_Accuracy: 62.70%
	Val_Loss: 0.492,  Val_Accuracy: 54.70%


0,1
Val_Accuracy,▁▆▇▇█
Val_Loss,█▂▁▁▁
train_accuracy,▁▅▆▇█
training_loss,█▂▂▁▁

0,1
Val_Accuracy,54.70044
Val_Loss,0.49196
train_accuracy,62.70368
training_loss,0.16024


[34m[1mwandb[0m: Agent Starting Run: yphrje7z with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.890, Train_Accuracy: 13.64%
	Val_Loss: 0.597,  Val_Accuracy: 36.89%
Epoch: 2
	Train_Loss: 0.289, Train_Accuracy: 44.22%
	Val_Loss: 0.512,  Val_Accuracy: 46.92%
Epoch: 3
	Train_Loss: 0.208, Train_Accuracy: 55.44%
	Val_Loss: 0.502,  Val_Accuracy: 51.44%
Epoch: 4
	Train_Loss: 0.169, Train_Accuracy: 61.73%
	Val_Loss: 0.483,  Val_Accuracy: 53.55%
Epoch: 5
	Train_Loss: 0.143, Train_Accuracy: 66.23%
	Val_Loss: 0.486,  Val_Accuracy: 55.02%


0,1
Val_Accuracy,▁▅▇▇█
Val_Loss,█▃▂▁▁
train_accuracy,▁▅▇▇█
training_loss,█▂▂▁▁

0,1
Val_Accuracy,55.01645
Val_Loss,0.48566
train_accuracy,66.22904
training_loss,0.14284


[34m[1mwandb[0m: Agent Starting Run: j2yeocrl with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.655, Train_Accuracy: 22.11%
	Val_Loss: 0.568,  Val_Accuracy: 42.35%
Epoch: 2
	Train_Loss: 0.261, Train_Accuracy: 48.68%
	Val_Loss: 0.484,  Val_Accuracy: 48.62%
Epoch: 3
	Train_Loss: 0.193, Train_Accuracy: 57.99%
	Val_Loss: 0.511,  Val_Accuracy: 51.93%
Epoch: 4
	Train_Loss: 0.158, Train_Accuracy: 64.06%
	Val_Loss: 0.480,  Val_Accuracy: 53.66%
Epoch: 5
	Train_Loss: 0.139, Train_Accuracy: 67.51%
	Val_Loss: 0.502,  Val_Accuracy: 54.33%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▁▃▁▃
train_accuracy,▁▅▇▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,54.33176
Val_Loss,0.50186
train_accuracy,67.50574
training_loss,0.13894


[34m[1mwandb[0m: Agent Starting Run: 53vzj76s with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 256
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 256, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=256, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.467, Train_Accuracy: 0.18%
	Val_Loss: 1.308,  Val_Accuracy: 2.56%
Epoch: 2
	Train_Loss: 0.778, Train_Accuracy: 7.04%
	Val_Loss: 0.803,  Val_Accuracy: 22.04%
Epoch: 3
	Train_Loss: 0.537, Train_Accuracy: 18.99%
	Val_Loss: 0.656,  Val_Accuracy: 30.70%
Epoch: 4
	Train_Loss: 0.433, Train_Accuracy: 27.14%
	Val_Loss: 0.613,  Val_Accuracy: 37.12%
Epoch: 5
	Train_Loss: 0.374, Train_Accuracy: 33.22%
	Val_Loss: 0.560,  Val_Accuracy: 41.65%


0,1
Val_Accuracy,▁▄▆▇█
Val_Loss,█▃▂▂▁
train_accuracy,▁▂▅▇█
training_loss,█▄▂▁▁

0,1
Val_Accuracy,41.65065
Val_Loss,0.55985
train_accuracy,33.22078
training_loss,0.37355


[34m[1mwandb[0m: Agent Starting Run: 42qzy0ed with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.680, Train_Accuracy: 19.22%
	Val_Loss: 0.579,  Val_Accuracy: 41.66%
Epoch: 2
	Train_Loss: 0.290, Train_Accuracy: 44.21%
	Val_Loss: 0.502,  Val_Accuracy: 48.84%
Epoch: 3
	Train_Loss: 0.223, Train_Accuracy: 53.54%
	Val_Loss: 0.490,  Val_Accuracy: 51.06%
Epoch: 4
	Train_Loss: 0.189, Train_Accuracy: 58.25%
	Val_Loss: 0.483,  Val_Accuracy: 52.20%
Epoch: 5
	Train_Loss: 0.165, Train_Accuracy: 62.23%
	Val_Loss: 0.483,  Val_Accuracy: 55.12%


0,1
Val_Accuracy,▁▅▆▆█
Val_Loss,█▂▁▁▁
train_accuracy,▁▅▇▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,55.11532
Val_Loss,0.48303
train_accuracy,62.23407
training_loss,0.16471


[34m[1mwandb[0m: Agent Starting Run: qn5l1dkw with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.926, Train_Accuracy: 7.79%
	Val_Loss: 0.789,  Val_Accuracy: 24.70%
Epoch: 2
	Train_Loss: 0.454, Train_Accuracy: 27.41%
	Val_Loss: 0.634,  Val_Accuracy: 35.58%
Epoch: 3
	Train_Loss: 0.355, Train_Accuracy: 37.16%
	Val_Loss: 0.591,  Val_Accuracy: 40.09%
Epoch: 4
	Train_Loss: 0.305, Train_Accuracy: 43.05%
	Val_Loss: 0.549,  Val_Accuracy: 43.76%
Epoch: 5
	Train_Loss: 0.270, Train_Accuracy: 47.31%
	Val_Loss: 0.547,  Val_Accuracy: 44.90%


0,1
Val_Accuracy,▁▅▆██
Val_Loss,█▄▂▁▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,44.89762
Val_Loss,0.54681
train_accuracy,47.30533
training_loss,0.26986


[34m[1mwandb[0m: Agent Starting Run: ztb1ntf9 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.693, Train_Accuracy: 17.68%
	Val_Loss: 0.534,  Val_Accuracy: 40.20%
Epoch: 2
	Train_Loss: 0.288, Train_Accuracy: 43.49%
	Val_Loss: 0.491,  Val_Accuracy: 48.39%
Epoch: 3
	Train_Loss: 0.218, Train_Accuracy: 53.52%
	Val_Loss: 0.452,  Val_Accuracy: 52.39%
Epoch: 4
	Train_Loss: 0.182, Train_Accuracy: 59.29%
	Val_Loss: 0.459,  Val_Accuracy: 53.06%
Epoch: 5
	Train_Loss: 0.153, Train_Accuracy: 63.78%
	Val_Loss: 0.456,  Val_Accuracy: 54.99%


0,1
Val_Accuracy,▁▅▇▇█
Val_Loss,█▄▁▂▁
train_accuracy,▁▅▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,54.98688
Val_Loss,0.45618
train_accuracy,63.77608
training_loss,0.15335


[34m[1mwandb[0m: Agent Starting Run: l0vg90er with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.175, Train_Accuracy: 4.33%
	Val_Loss: 0.718,  Val_Accuracy: 25.04%
Epoch: 2
	Train_Loss: 0.396, Train_Accuracy: 31.25%
	Val_Loss: 0.503,  Val_Accuracy: 44.86%
Epoch: 3
	Train_Loss: 0.262, Train_Accuracy: 46.94%
	Val_Loss: 0.458,  Val_Accuracy: 50.71%
Epoch: 4
	Train_Loss: 0.209, Train_Accuracy: 54.76%
	Val_Loss: 0.458,  Val_Accuracy: 53.50%
Epoch: 5
	Train_Loss: 0.177, Train_Accuracy: 59.75%
	Val_Loss: 0.452,  Val_Accuracy: 53.42%


0,1
Val_Accuracy,▁▆▇██
Val_Loss,█▂▁▁▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,53.41884
Val_Loss,0.45228
train_accuracy,59.75472
training_loss,0.17697


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8i0oeyl3 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.616, Train_Accuracy: 24.11%
	Val_Loss: 0.548,  Val_Accuracy: 42.83%
Epoch: 2
	Train_Loss: 0.249, Train_Accuracy: 49.90%
	Val_Loss: 0.487,  Val_Accuracy: 49.74%
Epoch: 3
	Train_Loss: 0.187, Train_Accuracy: 59.43%
	Val_Loss: 0.497,  Val_Accuracy: 52.83%
Epoch: 4
	Train_Loss: 0.154, Train_Accuracy: 64.88%
	Val_Loss: 0.472,  Val_Accuracy: 54.62%
Epoch: 5
	Train_Loss: 0.133, Train_Accuracy: 68.77%
	Val_Loss: 0.509,  Val_Accuracy: 55.28%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▂▃▁▄
train_accuracy,▁▅▇▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,55.27979
Val_Loss,0.50926
train_accuracy,68.77282
training_loss,0.13275


[34m[1mwandb[0m: Agent Starting Run: 2yiow80q with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.098, Train_Accuracy: 3.95%
	Val_Loss: 0.917,  Val_Accuracy: 17.20%
Epoch: 2
	Train_Loss: 0.524, Train_Accuracy: 20.52%
	Val_Loss: 0.678,  Val_Accuracy: 31.26%
Epoch: 3
	Train_Loss: 0.392, Train_Accuracy: 32.06%
	Val_Loss: 0.609,  Val_Accuracy: 37.82%
Epoch: 4
	Train_Loss: 0.328, Train_Accuracy: 39.49%
	Val_Loss: 0.589,  Val_Accuracy: 42.01%
Epoch: 5
	Train_Loss: 0.286, Train_Accuracy: 44.43%
	Val_Loss: 0.570,  Val_Accuracy: 43.53%


0,1
Val_Accuracy,▁▅▆██
Val_Loss,█▃▂▁▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,43.53471
Val_Loss,0.57039
train_accuracy,44.43213
training_loss,0.28596


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: k2gixiuc with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.632, Train_Accuracy: 23.27%
	Val_Loss: 0.583,  Val_Accuracy: 41.09%
Epoch: 2
	Train_Loss: 0.256, Train_Accuracy: 49.19%
	Val_Loss: 0.521,  Val_Accuracy: 49.33%
Epoch: 3
	Train_Loss: 0.191, Train_Accuracy: 58.75%
	Val_Loss: 0.486,  Val_Accuracy: 52.27%
Epoch: 4
	Train_Loss: 0.159, Train_Accuracy: 64.27%
	Val_Loss: 0.495,  Val_Accuracy: 52.25%
Epoch: 5
	Train_Loss: 0.137, Train_Accuracy: 68.05%
	Val_Loss: 0.496,  Val_Accuracy: 52.68%


0,1
Val_Accuracy,▁▆███
Val_Loss,█▄▁▂▂
train_accuracy,▁▅▇▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,52.68148
Val_Loss,0.49556
train_accuracy,68.048
training_loss,0.13702


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ya329q69 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.697, Train_Accuracy: 17.19%
	Val_Loss: 0.623,  Val_Accuracy: 35.86%
Epoch: 2
	Train_Loss: 0.335, Train_Accuracy: 38.61%
	Val_Loss: 0.563,  Val_Accuracy: 44.07%
Epoch: 3
	Train_Loss: 0.263, Train_Accuracy: 47.46%
	Val_Loss: 0.535,  Val_Accuracy: 47.33%
Epoch: 4
	Train_Loss: 0.231, Train_Accuracy: 51.97%
	Val_Loss: 0.517,  Val_Accuracy: 49.53%
Epoch: 5
	Train_Loss: 0.205, Train_Accuracy: 56.00%
	Val_Loss: 0.510,  Val_Accuracy: 50.90%


0,1
Val_Accuracy,▁▅▆▇█
Val_Loss,█▄▃▁▁
train_accuracy,▁▅▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,50.90183
Val_Loss,0.51022
train_accuracy,55.99603
training_loss,0.20497


[34m[1mwandb[0m: Agent Starting Run: r591uh4y with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.988, Train_Accuracy: 9.25%
	Val_Loss: 0.668,  Val_Accuracy: 31.97%
Epoch: 2
	Train_Loss: 0.348, Train_Accuracy: 37.11%
	Val_Loss: 0.536,  Val_Accuracy: 44.44%
Epoch: 3
	Train_Loss: 0.253, Train_Accuracy: 48.88%
	Val_Loss: 0.489,  Val_Accuracy: 50.39%
Epoch: 4
	Train_Loss: 0.210, Train_Accuracy: 55.37%
	Val_Loss: 0.462,  Val_Accuracy: 53.30%
Epoch: 5
	Train_Loss: 0.181, Train_Accuracy: 59.57%
	Val_Loss: 0.449,  Val_Accuracy: 53.85%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▄▂▁▁
train_accuracy,▁▅▇▇█
training_loss,█▂▂▁▁

0,1
Val_Accuracy,53.85127
Val_Loss,0.44862
train_accuracy,59.57448
training_loss,0.18145


[34m[1mwandb[0m: Agent Starting Run: 4sblytun with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.999, Train_Accuracy: 7.35%
	Val_Loss: 0.728,  Val_Accuracy: 26.49%
Epoch: 2
	Train_Loss: 0.420, Train_Accuracy: 29.71%
	Val_Loss: 0.577,  Val_Accuracy: 39.43%
Epoch: 3
	Train_Loss: 0.311, Train_Accuracy: 41.16%
	Val_Loss: 0.554,  Val_Accuracy: 45.74%
Epoch: 4
	Train_Loss: 0.261, Train_Accuracy: 47.47%
	Val_Loss: 0.524,  Val_Accuracy: 47.95%
Epoch: 5
	Train_Loss: 0.226, Train_Accuracy: 52.23%
	Val_Loss: 0.526,  Val_Accuracy: 48.52%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▃▂▁▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,48.52066
Val_Loss,0.52624
train_accuracy,52.23097
training_loss,0.22613


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ju02shit with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.865, Train_Accuracy: 13.16%
	Val_Loss: 0.629,  Val_Accuracy: 35.56%
Epoch: 2
	Train_Loss: 0.311, Train_Accuracy: 41.55%
	Val_Loss: 0.514,  Val_Accuracy: 46.40%
Epoch: 3
	Train_Loss: 0.232, Train_Accuracy: 52.33%
	Val_Loss: 0.480,  Val_Accuracy: 50.36%
Epoch: 4
	Train_Loss: 0.186, Train_Accuracy: 59.26%
	Val_Loss: 0.503,  Val_Accuracy: 53.63%
Epoch: 5
	Train_Loss: 0.161, Train_Accuracy: 63.62%
	Val_Loss: 0.457,  Val_Accuracy: 55.13%


0,1
Val_Accuracy,▁▅▆▇█
Val_Loss,█▃▂▃▁
train_accuracy,▁▅▆▇█
training_loss,█▂▂▁▁

0,1
Val_Accuracy,55.12733
Val_Loss,0.45732
train_accuracy,63.61649
training_loss,0.16148


[34m[1mwandb[0m: Agent Starting Run: jd6j3910 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.734, Train_Accuracy: 15.36%
	Val_Loss: 0.655,  Val_Accuracy: 34.82%
Epoch: 2
	Train_Loss: 0.347, Train_Accuracy: 37.02%
	Val_Loss: 0.566,  Val_Accuracy: 44.48%
Epoch: 3
	Train_Loss: 0.276, Train_Accuracy: 46.36%
	Val_Loss: 0.528,  Val_Accuracy: 47.81%
Epoch: 4
	Train_Loss: 0.234, Train_Accuracy: 51.51%
	Val_Loss: 0.517,  Val_Accuracy: 49.68%
Epoch: 5
	Train_Loss: 0.207, Train_Accuracy: 55.69%
	Val_Loss: 0.502,  Val_Accuracy: 51.28%


0,1
Val_Accuracy,▁▅▇▇█
Val_Loss,█▄▂▂▁
train_accuracy,▁▅▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,51.28253
Val_Loss,0.50174
train_accuracy,55.68989
training_loss,0.20732


[34m[1mwandb[0m: Agent Starting Run: rjm77j7f with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.984, Train_Accuracy: 8.26%
	Val_Loss: 0.709,  Val_Accuracy: 29.30%
Epoch: 2
	Train_Loss: 0.381, Train_Accuracy: 33.00%
	Val_Loss: 0.561,  Val_Accuracy: 42.17%
Epoch: 3
	Train_Loss: 0.280, Train_Accuracy: 45.45%
	Val_Loss: 0.528,  Val_Accuracy: 47.33%
Epoch: 4
	Train_Loss: 0.230, Train_Accuracy: 52.16%
	Val_Loss: 0.506,  Val_Accuracy: 50.05%
Epoch: 5
	Train_Loss: 0.198, Train_Accuracy: 56.84%
	Val_Loss: 0.507,  Val_Accuracy: 51.17%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▃▂▁▁
train_accuracy,▁▅▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,51.17164
Val_Loss,0.50743
train_accuracy,56.8365
training_loss,0.19834


[34m[1mwandb[0m: Agent Starting Run: mopjeglj with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.958, Train_Accuracy: 10.18%
	Val_Loss: 0.623,  Val_Accuracy: 34.49%
Epoch: 2
	Train_Loss: 0.330, Train_Accuracy: 39.52%
	Val_Loss: 0.512,  Val_Accuracy: 46.59%
Epoch: 3
	Train_Loss: 0.233, Train_Accuracy: 51.34%
	Val_Loss: 0.482,  Val_Accuracy: 50.86%
Epoch: 4
	Train_Loss: 0.193, Train_Accuracy: 57.93%
	Val_Loss: 0.475,  Val_Accuracy: 54.03%
Epoch: 5
	Train_Loss: 0.165, Train_Accuracy: 62.77%
	Val_Loss: 0.460,  Val_Accuracy: 54.24%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▃▂▂▁
train_accuracy,▁▅▆▇█
training_loss,█▂▂▁▁

0,1
Val_Accuracy,54.24398
Val_Loss,0.46023
train_accuracy,62.76888
training_loss,0.16503


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vwm4saof with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.646, Train_Accuracy: 20.49%
	Val_Loss: 0.593,  Val_Accuracy: 37.28%
Epoch: 2
	Train_Loss: 0.294, Train_Accuracy: 43.90%
	Val_Loss: 0.561,  Val_Accuracy: 44.66%
Epoch: 3
	Train_Loss: 0.229, Train_Accuracy: 53.15%
	Val_Loss: 0.545,  Val_Accuracy: 47.13%
Epoch: 4
	Train_Loss: 0.192, Train_Accuracy: 58.35%
	Val_Loss: 0.540,  Val_Accuracy: 48.48%
Epoch: 5
	Train_Loss: 0.171, Train_Accuracy: 62.01%
	Val_Loss: 0.553,  Val_Accuracy: 49.64%


0,1
Val_Accuracy,▁▅▇▇█
Val_Loss,█▄▂▁▃
train_accuracy,▁▅▇▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,49.63964
Val_Loss,0.55324
train_accuracy,62.00618
training_loss,0.17091


[34m[1mwandb[0m: Agent Starting Run: rmprqymi with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.018, Train_Accuracy: 8.32%
	Val_Loss: 0.660,  Val_Accuracy: 30.10%
Epoch: 2
	Train_Loss: 0.362, Train_Accuracy: 35.54%
	Val_Loss: 0.538,  Val_Accuracy: 44.39%
Epoch: 3
	Train_Loss: 0.264, Train_Accuracy: 47.59%
	Val_Loss: 0.481,  Val_Accuracy: 49.61%
Epoch: 4
	Train_Loss: 0.214, Train_Accuracy: 54.30%
	Val_Loss: 0.484,  Val_Accuracy: 51.48%
Epoch: 5
	Train_Loss: 0.186, Train_Accuracy: 58.72%
	Val_Loss: 0.482,  Val_Accuracy: 52.56%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▃▁▁▁
train_accuracy,▁▅▆▇█
training_loss,█▂▂▁▁

0,1
Val_Accuracy,52.56413
Val_Loss,0.48167
train_accuracy,58.71647
training_loss,0.18553


[34m[1mwandb[0m: Agent Starting Run: h4qspryj with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 1
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.934, Train_Accuracy: 7.77%
	Val_Loss: 0.842,  Val_Accuracy: 22.84%
Epoch: 2
	Train_Loss: 0.458, Train_Accuracy: 27.55%
	Val_Loss: 0.668,  Val_Accuracy: 34.95%
Epoch: 3
	Train_Loss: 0.356, Train_Accuracy: 37.47%
	Val_Loss: 0.589,  Val_Accuracy: 41.09%
Epoch: 4
	Train_Loss: 0.305, Train_Accuracy: 43.57%
	Val_Loss: 0.584,  Val_Accuracy: 43.07%
Epoch: 5
	Train_Loss: 0.270, Train_Accuracy: 47.85%
	Val_Loss: 0.556,  Val_Accuracy: 45.28%


0,1
Val_Accuracy,▁▅▇▇█
Val_Loss,█▄▂▂▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,45.27739
Val_Loss,0.55551
train_accuracy,47.8541
training_loss,0.26983


[34m[1mwandb[0m: Agent Starting Run: ezd4xr52 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 1024, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=1024, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.564, Train_Accuracy: 25.90%
	Val_Loss: 0.585,  Val_Accuracy: 40.88%
Epoch: 2
	Train_Loss: 0.247, Train_Accuracy: 50.08%
	Val_Loss: 0.522,  Val_Accuracy: 47.24%
Epoch: 3
	Train_Loss: 0.196, Train_Accuracy: 58.09%
	Val_Loss: 0.522,  Val_Accuracy: 50.47%
Epoch: 4
	Train_Loss: 0.165, Train_Accuracy: 62.44%
	Val_Loss: 0.508,  Val_Accuracy: 52.66%
Epoch: 5
	Train_Loss: 0.149, Train_Accuracy: 65.63%
	Val_Loss: 0.518,  Val_Accuracy: 52.96%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▂▂▁▂
train_accuracy,▁▅▇▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,52.96237
Val_Loss,0.51759
train_accuracy,65.63323
training_loss,0.14895


[34m[1mwandb[0m: Agent Starting Run: vhc54e98 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 256, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 1536, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=1536, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.710, Train_Accuracy: 21.71%
	Val_Loss: 0.575,  Val_Accuracy: 42.81%
Epoch: 2
	Train_Loss: 0.232, Train_Accuracy: 52.44%
	Val_Loss: 0.521,  Val_Accuracy: 49.11%
Epoch: 3
	Train_Loss: 0.169, Train_Accuracy: 62.09%
	Val_Loss: 0.480,  Val_Accuracy: 52.35%
Epoch: 4
	Train_Loss: 0.142, Train_Accuracy: 66.82%
	Val_Loss: 0.500,  Val_Accuracy: 54.15%
Epoch: 5
	Train_Loss: 0.123, Train_Accuracy: 69.80%
	Val_Loss: 0.503,  Val_Accuracy: 54.31%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▄▁▂▃
train_accuracy,▁▅▇██
training_loss,█▂▂▁▁

0,1
Val_Accuracy,54.30866
Val_Loss,0.50341
train_accuracy,69.80471
training_loss,0.1231


[34m[1mwandb[0m: Agent Starting Run: 5ssjgqq1 with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.776, Train_Accuracy: 16.03%
	Val_Loss: 0.578,  Val_Accuracy: 40.84%
Epoch: 2
	Train_Loss: 0.287, Train_Accuracy: 43.59%
	Val_Loss: 0.500,  Val_Accuracy: 49.63%
Epoch: 3
	Train_Loss: 0.211, Train_Accuracy: 54.59%
	Val_Loss: 0.478,  Val_Accuracy: 53.35%
Epoch: 4
	Train_Loss: 0.172, Train_Accuracy: 60.62%
	Val_Loss: 0.481,  Val_Accuracy: 54.80%
Epoch: 5
	Train_Loss: 0.149, Train_Accuracy: 64.66%
	Val_Loss: 0.452,  Val_Accuracy: 55.60%


0,1
Val_Accuracy,▁▅▇██
Val_Loss,█▄▂▃▁
train_accuracy,▁▅▇▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,55.60135
Val_Loss,0.45191
train_accuracy,64.66173
training_loss,0.14859


[34m[1mwandb[0m: Agent Starting Run: d2vzqbeg with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 512, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.112, Train_Accuracy: 6.49%
	Val_Loss: 0.662,  Val_Accuracy: 29.35%
Epoch: 2
	Train_Loss: 0.343, Train_Accuracy: 37.46%
	Val_Loss: 0.496,  Val_Accuracy: 46.86%
Epoch: 3
	Train_Loss: 0.230, Train_Accuracy: 52.14%
	Val_Loss: 0.477,  Val_Accuracy: 51.21%
Epoch: 4
	Train_Loss: 0.177, Train_Accuracy: 60.17%
	Val_Loss: 0.485,  Val_Accuracy: 52.99%
Epoch: 5
	Train_Loss: 0.144, Train_Accuracy: 65.46%
	Val_Loss: 0.496,  Val_Accuracy: 53.88%


0,1
Val_Accuracy,▁▆▇██
Val_Loss,█▂▁▁▂
train_accuracy,▁▅▆▇█
training_loss,█▂▂▁▁

0,1
Val_Accuracy,53.88084
Val_Loss,0.49576
train_accuracy,65.45936
training_loss,0.14429


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jsw5fy99 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.025, Train_Accuracy: 10.37%
	Val_Loss: 0.605,  Val_Accuracy: 36.20%
Epoch: 2
	Train_Loss: 0.292, Train_Accuracy: 43.96%
	Val_Loss: 0.490,  Val_Accuracy: 49.93%
Epoch: 3
	Train_Loss: 0.203, Train_Accuracy: 56.70%
	Val_Loss: 0.452,  Val_Accuracy: 53.81%
Epoch: 4
	Train_Loss: 0.157, Train_Accuracy: 64.19%
	Val_Loss: 0.455,  Val_Accuracy: 56.06%
Epoch: 5
	Train_Loss: 0.129, Train_Accuracy: 69.02%
	Val_Loss: 0.460,  Val_Accuracy: 56.80%


0,1
Val_Accuracy,▁▆▇██
Val_Loss,█▃▁▁▁
train_accuracy,▁▅▇▇█
training_loss,█▂▂▁▁

0,1
Val_Accuracy,56.80163
Val_Loss,0.46006
train_accuracy,69.01779
training_loss,0.12889


[34m[1mwandb[0m: Agent Starting Run: r0iqc0d3 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 32
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 32)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(32, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.538, Train_Accuracy: 0.26%
	Val_Loss: 1.181,  Val_Accuracy: 5.42%
Epoch: 2
	Train_Loss: 0.516, Train_Accuracy: 22.04%
	Val_Loss: 0.548,  Val_Accuracy: 41.10%
Epoch: 3
	Train_Loss: 0.282, Train_Accuracy: 44.71%
	Val_Loss: 0.490,  Val_Accuracy: 47.50%
Epoch: 4
	Train_Loss: 0.211, Train_Accuracy: 54.64%
	Val_Loss: 0.463,  Val_Accuracy: 51.90%
Epoch: 5
	Train_Loss: 0.171, Train_Accuracy: 60.59%
	Val_Loss: 0.472,  Val_Accuracy: 53.20%


0,1
Val_Accuracy,▁▆▇██
Val_Loss,█▂▁▁▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,53.20169
Val_Loss,0.4719
train_accuracy,60.5851
training_loss,0.17147


[34m[1mwandb[0m: Agent Starting Run: yqcvv3rs with config:
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 1024, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=1024, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.675, Train_Accuracy: 21.42%
	Val_Loss: 0.506,  Val_Accuracy: 44.82%
Epoch: 2
	Train_Loss: 0.232, Train_Accuracy: 51.92%
	Val_Loss: 0.492,  Val_Accuracy: 50.46%
Epoch: 3
	Train_Loss: 0.163, Train_Accuracy: 62.16%
	Val_Loss: 0.478,  Val_Accuracy: 54.84%
Epoch: 4
	Train_Loss: 0.129, Train_Accuracy: 68.51%
	Val_Loss: 0.496,  Val_Accuracy: 54.62%
Epoch: 5
	Train_Loss: 0.107, Train_Accuracy: 72.49%
	Val_Loss: 0.504,  Val_Accuracy: 55.17%


0,1
Val_Accuracy,▁▅███
Val_Loss,█▄▁▅█
train_accuracy,▁▅▇▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,55.17445
Val_Loss,0.50394
train_accuracy,72.49457
training_loss,0.10732


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: r42c6rn1 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 256
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 256)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(256, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.176, Train_Accuracy: 3.31%
	Val_Loss: 0.898,  Val_Accuracy: 17.76%
Epoch: 2
	Train_Loss: 0.499, Train_Accuracy: 22.68%
	Val_Loss: 0.629,  Val_Accuracy: 36.14%
Epoch: 3
	Train_Loss: 0.349, Train_Accuracy: 36.70%
	Val_Loss: 0.584,  Val_Accuracy: 41.02%
Epoch: 4
	Train_Loss: 0.280, Train_Accuracy: 45.08%
	Val_Loss: 0.539,  Val_Accuracy: 45.51%
Epoch: 5
	Train_Loss: 0.239, Train_Accuracy: 50.64%
	Val_Loss: 0.538,  Val_Accuracy: 47.63%


0,1
Val_Accuracy,▁▅▆██
Val_Loss,█▃▂▁▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,47.6253
Val_Loss,0.53805
train_accuracy,50.64472
training_loss,0.23869


[34m[1mwandb[0m: Agent Starting Run: t5rv2dnr with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.036, Train_Accuracy: 5.12%
	Val_Loss: 0.841,  Val_Accuracy: 19.62%
Epoch: 2
	Train_Loss: 0.497, Train_Accuracy: 23.00%
	Val_Loss: 0.711,  Val_Accuracy: 33.38%
Epoch: 3
	Train_Loss: 0.379, Train_Accuracy: 33.58%
	Val_Loss: 0.622,  Val_Accuracy: 38.72%
Epoch: 4
	Train_Loss: 0.314, Train_Accuracy: 41.14%
	Val_Loss: 0.611,  Val_Accuracy: 41.72%
Epoch: 5
	Train_Loss: 0.278, Train_Accuracy: 45.67%
	Val_Loss: 0.565,  Val_Accuracy: 43.77%


0,1
Val_Accuracy,▁▅▇▇█
Val_Loss,█▅▂▂▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,43.76848
Val_Loss,0.56502
train_accuracy,45.66878
training_loss,0.27811


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ekphg063 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 512
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 1


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 512)
    (dropout): Dropout(p=0.2, inplace=False)
    (rnn): LSTM(512, 512, num_layers=3, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.178, Train_Accuracy: 2.64%
	Val_Loss: 0.997,  Val_Accuracy: 13.81%
Epoch: 2
	Train_Loss: 0.523, Train_Accuracy: 21.05%
	Val_Loss: 0.704,  Val_Accuracy: 31.05%
Epoch: 3
	Train_Loss: 0.366, Train_Accuracy: 35.17%
	Val_Loss: 0.627,  Val_Accuracy: 37.68%
Epoch: 4
	Train_Loss: 0.291, Train_Accuracy: 43.94%
	Val_Loss: 0.615,  Val_Accuracy: 40.20%
Epoch: 5
	Train_Loss: 0.246, Train_Accuracy: 50.09%
	Val_Loss: 0.624,  Val_Accuracy: 42.67%


0,1
Val_Accuracy,▁▅▇▇█
Val_Loss,█▃▁▁▁
train_accuracy,▁▄▆▇█
training_loss,█▃▂▁▁

0,1
Val_Accuracy,42.67445
Val_Loss,0.62396
train_accuracy,50.09221
training_loss,0.24595


In [44]:
import pandas as pd
df = pd.read_csv("predictions_without_attn.csv")
df.head()

Unnamed: 0,Input Text,Actual Text,Predicted Text
0,amgiikarimchaadu,అంగీకరించాడు,అంగికరించాడు
1,angeekarinchaadu,అంగీకరించాడు,అంగీకరించాడు
2,angiikarinchaadu,అంగీకరించాడు,అంగికరించాడు
3,angeekarinchaali,అంగీకరించాలి,అంగీకరించాలి
4,angeekarinchaka,అంగీకరించక,అంజీకరించక


In [None]:
# -embed_size-64-layers_enc-2-layers_dec-2-hid_size-512-cell_type-lstm-bidirectional-False-dropout-0.3-lr-0.001	56.801633648728554
# -embed_size-64-layers_enc-1-layers_dec-2-hid_size-512-cell_type-lstm-bidirectional-True-dropout-0.3-lr-0.001	55.60134535777647
# -embed_size-64-layers_enc-2-layers_dec-2-hid_size-512-cell_type-lstm-bidirectional-True-dropout-0.3-lr-0.001	55.17445298639858
# -embed_size-256-layers_enc-3-layers_dec-3-hid_size-512-cell_type-lstm-bidirectional-True-dropout-0.3-lr-0.001	54.30865612063867
# -embed_size-32-layers_enc-3-layers_dec-2-hid_size-512-cell_type-lstm-bidirectional-False-dropout-0.3-lr-0.001	53.88083973979894
# -embed_size-32-layers_enc-1-layers_dec-2-hid_size-512-cell_type-lstm-bidirectional-False-dropout-0.3-lr-0.001	53.20169278533412
# -embed_size-512-layers_enc-2-layers_dec-2-hid_size-512-cell_type-lstm-bidirectional-True-dropout-0.3-lr-0.001	52.96237433471319
# -embed_size-256-layers_enc-1-layers_dec-3-hid_size-512-cell_type-lstm-bidirectional-False-dropout-0.3-lr-0.001	47.625295683027794
# -embed_size-512-layers_enc-1-layers_dec-2-hid_size-512-cell_type-lstm-bidirectional-False-dropout-0.3-lr-0.001	43.76848018923714
# -embed_size-512-layers_enc-1-layers_dec-3-hid_size-512-cell_type-lstm-bidirectional-False-dropout-0.2-lr-0.001	42.674452986398585

In [48]:
# -embed_size-64-layers_enc-2-layers_dec-2-hid_size-512-cell_type-lstm-bidirectional-False-dropout-0.3-lr-0.001	56.801633648728554
sweep_config = {
    'method': 'bayes',
    'name' : 'Best sweep test',
    'metric': {
        'name': 'Test_Accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'input_embed_size': {
            'values': [64]
        },
        'num_enc_layers':{
            'values': [2]
        },
        'num_dec_layers':{
            'values': [2]
        },
        'hid_layer_size': {
            'values': [512]
        },
        'cell_type': {
            'values': ['lstm']
        },
        'bidirectional':{
            'values': [False]
        },
        'dropout': {
            'values': [0.3]
        },
        'new_learning_rate':{
            'values': [0.001]
        }
#       'beam search in decoder with different beam sizes': 
    }
}

sweep_id = wandb.sweep(sweep = sweep_config, project="DA6401_A3_P_1_Best")

Create sweep with ID: oei6a9zj
Sweep URL: https://wandb.ai/tentuvenkatesh2-indian-institute-of-technology-madras/DA6401_A3_P_1_Best/sweeps/oei6a9zj


In [49]:
import wandb

def main():
    # Initialize a new wandb run
    with wandb.init() as run:
        # Construct run name from configuration
        run_name = "-embed_size-"+str(wandb.config.input_embed_size)+"-layers_enc-"+str(wandb.config.num_enc_layers)+"-layers_dec-"+str(wandb.config.num_dec_layers)+"-hid_size-"+str(wandb.config.hid_layer_size)+"-cell_type-"+wandb.config.cell_type+"-bidirectional-"+str(wandb.config.bidirectional)+"-dropout-"+str(wandb.config.dropout)+"-lr-"+str(wandb.config.new_learning_rate)
        wandb.run.name = run_name

        # Constants defining the dimensions of the input and output character sets
        INPUT_DIM = 100  # size of the trans character set
        OUTPUT_DIM = 100  # size of the telugu character set

        # Constants defining the dimensions of the embeddings for encoder and decoder
        ENC_EMB_DIM = wandb.config.input_embed_size  # Encoder embedding dimension
        DEC_EMB_DIM = wandb.config.input_embed_size  # Decoder embedding dimension

        # Constants defining the dimension of the hidden layers for encoder and decoder
        HID_DIM = wandb.config.hid_layer_size  # Hidden dimension size

        # Constants defining the number of layers for encoder and decoder
        ENC_LAYERS = wandb.config.num_enc_layers  # Number of layers in the encoder
        DEC_LAYERS = wandb.config.num_dec_layers  # Number of layers in the decoder
        

        # Constants defining the type of RNN cell to use for encoder and decoder
        ENC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the encoder
        DEC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the decoder

        # Instantiate the encoder with specified configurations
        encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)
        # Instantiate the decoder with specified configurations
        decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)

        # Determine the computing device (CUDA if available, otherwise CPU)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Print the device will be used
        print(f"Using device: {device}")

        # Instantiate the Seq_to_Seq model and move it to the chosen computing device
        model = Seq_to_Seq(encoder, decoder).to(device)
        print(model)
        
        
        # Setting the number of epochs the training process should run
        NUM_EPOCHS = 10
        # Set the maximum norm of the gradients to 1 to prevent exploding gradients
        CLIP = 1
        # Initialize the optimizer, Adam
        optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.new_learning_rate)  # Set the learning rate to 0.001


        # Padding token index should be ignored in loss calculation
        ignore_index = tel_token_to_index['<pad>']
        # Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

        # Start the training process for the defined number of epochs
        for epoch in range(NUM_EPOCHS):
            # Doing training on the train dataset and return average loss and accuracy
            train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
            # Evaluating the model on the validation dataset and return average loss and accuracy
            val_loss, val_accuracy = evaluate(model, test_loader, criterion, device, ignore_index)

            # Print the loss and accuracy for each epoch
            print(f'Epoch: {epoch+1}')
            print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
            print(f'\tTest_Loss: {val_loss:.3f},  Test_Accuracy: {val_accuracy*100:.2f}%')
            wandb.log({"train_accuracy": train_accuracy * 100, "training_loss": train_loss})
            wandb.log({"Test_Accuracy": val_accuracy * 100, "Test_Loss": val_loss})


wandb.agent(sweep_id, function=main, count=1)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: 79jdhrlt with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 2
[34m[1mwandb[0m: 	num_enc_layers: 2


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 0.979, Train_Accuracy: 11.35%
	Test_Loss: 0.592,  Test_Accuracy: 38.90%
Epoch: 2
	Train_Loss: 0.294, Train_Accuracy: 43.49%
	Test_Loss: 0.490,  Test_Accuracy: 48.16%
Epoch: 3
	Train_Loss: 0.206, Train_Accuracy: 56.07%
	Test_Loss: 0.458,  Test_Accuracy: 53.45%
Epoch: 4
	Train_Loss: 0.162, Train_Accuracy: 63.11%
	Test_Loss: 0.479,  Test_Accuracy: 54.48%
Epoch: 5
	Train_Loss: 0.133, Train_Accuracy: 67.56%
	Test_Loss: 0.497,  Test_Accuracy: 54.95%
Epoch: 6
	Train_Loss: 0.115, Train_Accuracy: 71.25%
	Test_Loss: 0.483,  Test_Acc

0,1
Test_Accuracy,▁▅▇▇▇█████
Test_Loss,█▃▁▂▃▂▃▃▄▅
train_accuracy,▁▄▆▆▇▇▇███
training_loss,█▃▂▂▁▁▁▁▁▁

0,1
Test_Accuracy,57.00475
Test_Loss,0.52649
train_accuracy,79.27394
training_loss,0.07534


In [45]:
#-embed_size-64-layers_enc-3-layers_dec-3-hid_size-512-cell_type-lstm-bidirectional-False-dropout-0.3-lr-0.001	56.3507170313424
sweep_config = {
    'method': 'bayes',
    'name' : 'Best sweep test',
    'metric': {
        'name': 'Test_Accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'input_embed_size': {
            'values': [64]
        },
        'num_enc_layers':{
            'values': [3]
        },
        'num_dec_layers':{
            'values': [3]
        },
        'hid_layer_size': {
            'values': [512]
        },
        'cell_type': {
            'values': ['lstm']
        },
        'bidirectional':{
            'values': [False]
        },
        'dropout': {
            'values': [0.3]
        },
        'new_learning_rate':{
            'values': [0.001]
        }
#       'beam search in decoder with different beam sizes': 
    }
}

sweep_id = wandb.sweep(sweep = sweep_config, project="DA6401_A3_P_1_Best")

Create sweep with ID: 70ucjzem
Sweep URL: https://wandb.ai/tentuvenkatesh2-indian-institute-of-technology-madras/DA6401_A3_P_1_Best/sweeps/70ucjzem


In [46]:
import wandb

def main():
    # Initialize a new wandb run
    with wandb.init() as run:
        # Construct run name from configuration
        run_name = "-embed_size-"+str(wandb.config.input_embed_size)+"-layers_enc-"+str(wandb.config.num_enc_layers)+"-layers_dec-"+str(wandb.config.num_dec_layers)+"-hid_size-"+str(wandb.config.hid_layer_size)+"-cell_type-"+wandb.config.cell_type+"-bidirectional-"+str(wandb.config.bidirectional)+"-dropout-"+str(wandb.config.dropout)+"-lr-"+str(wandb.config.new_learning_rate)
        wandb.run.name = run_name

        # Constants defining the dimensions of the input and output character sets
        INPUT_DIM = 100  # size of the trans character set
        OUTPUT_DIM = 100  # size of the telugu character set

        # Constants defining the dimensions of the embeddings for encoder and decoder
        ENC_EMB_DIM = wandb.config.input_embed_size  # Encoder embedding dimension
        DEC_EMB_DIM = wandb.config.input_embed_size  # Decoder embedding dimension

        # Constants defining the dimension of the hidden layers for encoder and decoder
        HID_DIM = wandb.config.hid_layer_size  # Hidden dimension size

        # Constants defining the number of layers for encoder and decoder
        ENC_LAYERS = wandb.config.num_enc_layers  # Number of layers in the encoder
        DEC_LAYERS = wandb.config.num_dec_layers  # Number of layers in the decoder
        

        # Constants defining the type of RNN cell to use for encoder and decoder
        ENC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the encoder
        DEC_RNN_CELL = wandb.config.cell_type  # RNN cell type for the decoder

        # Instantiate the encoder with specified configurations
        encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_LAYERS, ENC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)
        # Instantiate the decoder with specified configurations
        decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_LAYERS, encoder.num_layers, DEC_RNN_CELL, dropout = wandb.config.dropout, bidirectional = wandb.config.bidirectional)

        # Determine the computing device (CUDA if available, otherwise CPU)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Print the device will be used
        print(f"Using device: {device}")

        # Instantiate the Seq_to_Seq model and move it to the chosen computing device
        model = Seq_to_Seq(encoder, decoder).to(device)
        print(model)
        
        
        # Setting the number of epochs the training process should run
        NUM_EPOCHS = 10
        # Set the maximum norm of the gradients to 1 to prevent exploding gradients
        CLIP = 1
        # Initialize the optimizer, Adam
        optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.new_learning_rate)  # Set the learning rate to 0.001


        # Padding token index should be ignored in loss calculation
        ignore_index = tel_token_to_index['<pad>']
        # Define the loss function with 'ignore_index' to avoid affecting loss calculation with padding tokens
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index).to(device)

        # Start the training process for the defined number of epochs
        for epoch in range(NUM_EPOCHS):
            # Doing training on the train dataset and return average loss and accuracy
            train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, CLIP, device, ignore_index)
            # Evaluating the model on the validation dataset and return average loss and accuracy
            val_loss, val_accuracy = evaluate(model, test_loader, criterion, device, ignore_index)

            # Print the loss and accuracy for each epoch
            print(f'Epoch: {epoch+1}')
            print(f'\tTrain_Loss: {train_loss:.3f}, Train_Accuracy: {train_accuracy*100:.2f}%')
            print(f'\tTest_Loss: {val_loss:.3f},  Test_Accuracy: {val_accuracy*100:.2f}%')
            wandb.log({"train_accuracy": train_accuracy * 100, "training_loss": train_loss})
            wandb.log({"Test_Accuracy": val_accuracy * 100, "Test_Loss": val_loss})


wandb.agent(sweep_id, function=main, count=1)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: 3xx8xu84 with config:
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	hid_layer_size: 512
[34m[1mwandb[0m: 	input_embed_size: 64
[34m[1mwandb[0m: 	new_learning_rate: 0.001
[34m[1mwandb[0m: 	num_dec_layers: 3
[34m[1mwandb[0m: 	num_enc_layers: 3


Using device: cuda
Seq_to_Seq(
  (encoder): Encoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, num_layers=3, batch_first=True, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(100, 64)
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): LSTM(64, 512, num_layers=3, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=512, out_features=100, bias=True)
  )
)
Epoch: 1
	Train_Loss: 1.255, Train_Accuracy: 2.45%
	Test_Loss: 0.880,  Test_Accuracy: 17.77%
Epoch: 2
	Train_Loss: 0.430, Train_Accuracy: 29.68%
	Test_Loss: 0.537,  Test_Accuracy: 43.42%
Epoch: 3
	Train_Loss: 0.263, Train_Accuracy: 47.70%
	Test_Loss: 0.523,  Test_Accuracy: 48.89%
Epoch: 4
	Train_Loss: 0.202, Train_Accuracy: 56.62%
	Test_Loss: 0.485,  Test_Accuracy: 52.51%
Epoch: 5
	Train_Loss: 0.165, Train_Accuracy: 62.19%
	Test_Loss: 0.488,  Test_Accuracy: 52.97%
Epoch: 6
	Train_Loss: 0.140, Train_Accuracy: 66.61%
	Test_Loss: 0.466,  Test_Accu

0,1
Test_Accuracy,▁▆▆▇▇█████
Test_Loss,█▂▂▁▁▁▁▁▁▂
train_accuracy,▁▄▅▆▇▇▇███
training_loss,█▃▂▂▁▁▁▁▁▁

0,1
Test_Accuracy,56.67489
Test_Loss,0.535
train_accuracy,76.04974
training_loss,0.09004


In [1]:
import pandas as pd
df = pd.read_csv("predictions_without_attn.csv")
df

FileNotFoundError: [Errno 2] No such file or directory: 'predictions_without_attn.csv'