In [1]:
# Copy dataset from Drive to local content directory
!cp -r /content/drive/MyDrive/dakshina_dataset_v1.0 /content/

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import pandas as pd
import numpy as np
import wandb
import os
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


In [3]:
# Choose your language
LANG = 'hi'  # Hindi
DATA_DIR = f'/content/dakshina_dataset_v1.0/{LANG}/lexicons/'

def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    input_texts, target_texts = [], []
    for line in lines:
        latin, devanagari = line.strip().split('\t')[:2]
        input_texts.append(list(latin.lower()))
        target_texts.append(['<sos>'] + list(devanagari) + ['<eos>'])
    return input_texts, target_texts

train_x, train_y = load_data(os.path.join(DATA_DIR, 'hi.translit.sampled.train.tsv'))
val_x, val_y = load_data(os.path.join(DATA_DIR, 'hi.translit.sampled.dev.tsv'))
test_x, test_y = load_data(os.path.join(DATA_DIR, 'hi.translit.sampled.test.tsv'))

# Create vocab
def build_vocab(seqs):
    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
    for seq in seqs:
        for ch in seq:
            if ch not in vocab:
                vocab[ch] = len(vocab)
    return vocab

src_vocab = build_vocab(train_x)
tgt_vocab = build_vocab(train_y)

inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
# Special token indices
PAD_IDX = tgt_vocab['<pad>']
SOS_IDX = tgt_vocab['<sos>']
EOS_IDX = tgt_vocab['<eos>']


In [4]:
class TransliterationDataset(Dataset):
    def __init__(self, src_data, tgt_data, src_vocab, tgt_vocab):
        self.src_data = src_data
        self.tgt_data = tgt_data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        src_seq = [self.src_vocab[ch] for ch in self.src_data[idx]]
        tgt_seq = [self.tgt_vocab[ch] for ch in self.tgt_data[idx]]
        return torch.tensor(src_seq), torch.tensor(tgt_seq)

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_lens = torch.tensor([len(s) for s in src_batch])
    tgt_lens = torch.tensor([len(t) for t in tgt_batch])
    src_batch = pad_sequence(src_batch, padding_value=0, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=0, batch_first=True)
    return src_batch, tgt_batch, src_lens, tgt_lens

BATCH_SIZE = 64
train_dataset = TransliterationDataset(train_x, train_y, src_vocab, tgt_vocab)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataset = TransliterationDataset(val_x, val_y, src_vocab, tgt_vocab)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
# Prepare test loader
test_dataset = TransliterationDataset(test_x, test_y, src_vocab, tgt_vocab)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [5]:
import torch.nn.functional as F

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))  # [H]

    def forward(self, hidden, encoder_outputs):
        # hidden: [B, H]
        # encoder_outputs: [B, T, H]
        B, T, H = encoder_outputs.shape

        # Repeat hidden across the time steps
        hidden = hidden.unsqueeze(1).repeat(1, T, 1)  # [B, T, H]

        # Concatenate hidden and encoder_outputs → compute energy
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # [B, T, H]

        # Expand `v` for batch matrix multiplication
        v = self.v.unsqueeze(0).unsqueeze(2).expand(B, -1, -1)  # [B, H, 1]

        # Compute attention scores
        scores = torch.bmm(energy, v).squeeze(2)  # [B, T]

        # Softmax over time dimension to get attention weights
        attn_weights = F.softmax(scores, dim=1)  # [B, T]

        # Compute context vector as weighted sum of encoder outputs
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)  # [B, H]

        return context, attn_weights

In [21]:
class Seq2Seq(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder_embed = nn.Embedding(config['src_vocab_size'], config['embedding_size'])
        self.decoder_embed = nn.Embedding(config['tgt_vocab_size'], config['embedding_size'])

        cell_type = config['cell_type']
        cell = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[cell_type]
        self.cell_type = cell_type
        self.config = config
        hidden_size = config['hidden_size']
        embed_size = config['embedding_size']

        self.encoder = cell(embed_size, hidden_size, config['num_encoder_layers'],
                            batch_first=True, dropout=config['dropout'] if config['num_encoder_layers'] > 1 else 0)

        self.decoder = cell(embed_size + hidden_size, hidden_size, config['num_decoder_layers'],
                            batch_first=False, dropout=config['dropout'] if config['num_decoder_layers'] > 1 else 0)

        self.attn = Attention(hidden_size)
        self.fc = nn.Linear(hidden_size * 2, config['tgt_vocab_size'])

    def forward(self, src, tgt, src_len, tgt_len):
        batch_size, tgt_len = tgt.shape
        device = src.device
        hidden_size = self.config['hidden_size']
        trg_vocab_size = self.config['tgt_vocab_size']

        src_embed = self.encoder_embed(src)  # [B, T_src, E]
        tgt_embed = self.decoder_embed(tgt[:, :-1])  # [B, T_tgt-1, E]

        encoder_outputs, h = self.encoder(src_embed)  # [B, T_src, H]

        if self.cell_type == 'LSTM':
            h_t, c_t = h
        else:
            h_t = h

        outputs = torch.zeros(batch_size, tgt_embed.size(1), trg_vocab_size).to(device)
        all_attn_weights = []

        for t in range(tgt_embed.size(1)):
            input_embed = tgt_embed[:, t]  # [B, E]
            h_cur = h_t[-1] if self.cell_type == 'LSTM' else h_t[-1]  # [B, H]
            context, attn_weights = self.attn(h_cur, encoder_outputs)  # [B, H], [B, T_src]
            all_attn_weights.append(attn_weights)

            decoder_input = torch.cat([input_embed, context], dim=1).unsqueeze(0)  # [1, B, E+H]

            if self.cell_type == 'LSTM':
                out, (h_t, c_t) = self.decoder(decoder_input, (h_t, c_t))
            else:
                out, h_t = self.decoder(decoder_input, h_t)

            out = out.squeeze(0)  # [B, H]
            final_out = self.fc(torch.cat((out, context), dim=1))  # [B, V]
            outputs[:, t] = final_out

        attn_tensor = torch.stack(all_attn_weights, dim=0)  # [T_tgt-1, B, T_src]
        return outputs, attn_tensor

    def beam_search_decode(self, src, src_len, beam_size, sos_idx, eos_idx, device, return_attn=False):
        # Encode the source
        encoder_outputs, h = self.encoder(self.encoder_embed(src))  # [B, T_src, H]

        if self.cell_type == 'LSTM':
            h_t, c_t = h
        else:
            h_t = h

        B = src.size(0)
        assert B == 1, "Beam search only supports batch size 1 for simplicity"

        # Initialize beams
        beams = [(torch.tensor([sos_idx], device=device), 0.0, h_t, c_t if self.cell_type == 'LSTM' else None, [])]

        for _ in range(50):  # max length
            new_beams = []
            for seq, score, h_t, c_t, attn_list in beams:
                if seq[-1] == eos_idx:
                    new_beams.append((seq, score, h_t, c_t, attn_list))
                    continue

                input_embed = self.decoder_embed(seq[-1].unsqueeze(0)).unsqueeze(0)  # [1, 1, E]
                h_cur = h_t[-1] if self.cell_type == 'LSTM' else h_t[-1]  # [1, H]

                context, attn_weights = self.attn(h_cur, encoder_outputs)  # [1, H], [1, T_src]

                decoder_input = torch.cat([input_embed.squeeze(0), context], dim=1).unsqueeze(0)

                if self.cell_type == 'LSTM':
                    out, (new_h, new_c) = self.decoder(decoder_input, (h_t, c_t))
                else:
                    out, new_h = self.decoder(decoder_input, h_t)

                out = out.squeeze(0)  # [1, H]
                final_out = self.fc(torch.cat((out, context), dim=1))  # [1, V]
                log_probs = torch.log_softmax(final_out, dim=1)  # [1, V]
                topk_log_probs, topk_indices = torch.topk(log_probs, beam_size, dim=1)

                for i in range(beam_size):
                    new_seq = torch.cat([seq, topk_indices[0][i].unsqueeze(0)], dim=0)
                    new_score = score + topk_log_probs[0][i].item()
                    new_attn_list = attn_list + [attn_weights[0].detach().cpu().tolist()]
                    new_beams.append((new_seq, new_score, new_h, new_c if self.cell_type == 'LSTM' else None, new_attn_list))

            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]

            # Early stop if all sequences ended
            if all(seq[-1] == eos_idx for seq, _, _, _, _ in beams):
                break

        best_seq, best_score, _, _, best_attn = beams[0]
        if return_attn:
            return best_seq.tolist(), best_attn  # return both sequence and attention weights
        else:
            return best_seq.tolist()

In [7]:
def train_model(train_loader, val_loader, config=None):
    with wandb.init(config=config) as run:
        config = wandb.config

        # Skip invalid layer configs
        if config.num_encoder_layers != config.num_decoder_layers:
            print(f"Skipping run due to layer mismatch: enc={config.num_encoder_layers}, dec={config.num_decoder_layers}")
            return

        # Set run name from config
        run.name = (
            f"ed_{config.embedding_size}"
            f"_hs_{config.hidden_size}"
            f"_enc_{config.num_encoder_layers}"
            f"_dec_{config.num_decoder_layers}"
            f"_cell_{config.cell_type}"
            f"_drop_{config.dropout}"
        )
        run.save()

        # Create model
        model = Seq2Seq({
            'src_vocab_size': len(src_vocab),
            'tgt_vocab_size': len(tgt_vocab),
            'embedding_size': config.embedding_size,
            'hidden_size': config.hidden_size,
            'num_encoder_layers': config.num_encoder_layers,
            'num_decoder_layers': config.num_decoder_layers,
            'cell_type': config.cell_type,
            'dropout': config.dropout
        }).to(device)

        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss(ignore_index=0)

        best_val_acc = 0
        for epoch in range(10):
            # === Training ===
            model.train()
            total_train_loss = 0
            correct_train = 0
            total_tokens = 0

            for src, tgt, src_len, tgt_len in train_loader:
                src, tgt = src.to(device), tgt.to(device)
                optimizer.zero_grad()

                output = model(src, tgt, src_len, tgt_len)
                loss = criterion(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
                loss.backward()
                optimizer.step()

                total_train_loss += loss.item()
                pred = output.argmax(-1)
                mask = tgt[:, 1:] != 0
                correct_train += ((pred == tgt[:, 1:]) * mask).sum().item()
                total_tokens += mask.sum().item()

            train_loss = total_train_loss / len(train_loader)
            train_acc = correct_train / total_tokens

            # === Validation ===
            model.eval()
            total_val_loss = 0
            correct_val = 0
            val_tokens = 0

            with torch.no_grad():
                for src, tgt, src_len, tgt_len in val_loader:
                    src, tgt = src.to(device), tgt.to(device)
                    output = model(src, tgt, src_len, tgt_len)
                    loss = criterion(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
                    total_val_loss += loss.item()

                    pred = output.argmax(-1)
                    mask = tgt[:, 1:] != 0
                    correct_val += ((pred == tgt[:, 1:]) * mask).sum().item()
                    val_tokens += mask.sum().item()

            val_loss = total_val_loss / len(val_loader)
            val_acc = correct_val / val_tokens

            # Log to wandb
            wandb.log({
                'epoch': epoch,
                'train_loss': train_loss,
                'val_loss': val_loss,
                'train_accuracy': train_acc,
                'val_accuracy': val_acc
            })

        # Print final metrics
        print(f"\nFinal Run Metrics for {run.name}:")
        print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")
        print(f"Val   Loss: {val_loss:.4f} | Val   Accuracy: {val_acc:.4f}\n")

In [8]:
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': {
        'embedding_size': {'values': [16, 32, 64, 256]},
        'hidden_size': {'values': [16, 32, 64, 256]},
        'num_encoder_layers': {'values': [1]},
        'num_decoder_layers': {'values': [1]},
        'cell_type': {'values': ['RNN', 'GRU', 'LSTM']},
        'dropout': {'values': [0.2, 0.3]},
        # 'beam_size': {'values': [1, 2, 3, 5, ]}
    }
}

In [10]:
sweep_id = wandb.sweep(sweep_config, project="Assignment3_Attempt1")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: n6e7axcm
Sweep URL: https://wandb.ai/vinyk-sd-indian-institute-of-technology-madras/Assignment3_Attempt1/sweeps/n6e7axcm


In [None]:
wandb.agent(sweep_id, function=lambda: train_model(train_loader, val_loader), count=100)# Run 15 trials (you can increase if needed)

[34m[1mwandb[0m: Agent Starting Run: 7cyuj9t3 with config:
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 16
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_32_hs_16_enc_1_dec_1_cell_GRU_drop_0.2:
Train Loss: 0.5470 | Train Accuracy: 0.8441
Val   Loss: 0.5998 | Val   Accuracy: 0.8384



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▅▇▇██████
train_loss,█▄▃▂▂▁▁▁▁▁
val_accuracy,▁▅▇▇▇█████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.84413
train_loss,0.54704
val_accuracy,0.83837
val_loss,0.59975


[34m[1mwandb[0m: Agent Starting Run: 1c8p394d with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_16_hs_256_enc_1_dec_1_cell_RNN_drop_0.2:
Train Loss: 0.3470 | Train Accuracy: 0.8868
Val   Loss: 0.4491 | Val   Accuracy: 0.8685



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▅▆▆▇█▇█▇█
val_loss,█▄▄▃▂▁▂▁▂▁

0,1
epoch,9.0
train_accuracy,0.88678
train_loss,0.34699
val_accuracy,0.86849
val_loss,0.44908


[34m[1mwandb[0m: Agent Starting Run: k88ccrc8 with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_32_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.2953 | Train Accuracy: 0.8983
Val   Loss: 0.4093 | Val   Accuracy: 0.8780



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▂▄▁▇▆█▇▇██
val_loss,█▆█▃▃▁▁▂▁▂

0,1
epoch,9.0
train_accuracy,0.89832
train_loss,0.29528
val_accuracy,0.87795
val_loss,0.40935


[34m[1mwandb[0m: Agent Starting Run: m9tkmr3m with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_16_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.3228 | Train Accuracy: 0.8922
Val   Loss: 0.4302 | Val   Accuracy: 0.8711



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▄▃▇▇██▇█▇
val_loss,█▅▆▂▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.89224
train_loss,0.32277
val_accuracy,0.8711
val_loss,0.43025


[34m[1mwandb[0m: Agent Starting Run: ev9zfezu with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_32_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.3009 | Train Accuracy: 0.8975
Val   Loss: 0.3845 | Val   Accuracy: 0.8846



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▇▇▇▇▇████
train_loss,█▃▂▂▂▂▁▁▁▁
val_accuracy,▁▅▅▆▆▇▇▇██
val_loss,█▄▃▃▂▂▂▂▁▁

0,1
epoch,9.0
train_accuracy,0.89749
train_loss,0.3009
val_accuracy,0.88464
val_loss,0.38449


[34m[1mwandb[0m: Agent Starting Run: 6s0h4xbk with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_64_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.2976 | Train Accuracy: 0.8972
Val   Loss: 0.3791 | Val   Accuracy: 0.8846



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▄▇▆▇▇▇███
val_loss,█▅▂▂▁▂▁▁▂▁

0,1
epoch,9.0
train_accuracy,0.89723
train_loss,0.29764
val_accuracy,0.88455
val_loss,0.37912


[34m[1mwandb[0m: Agent Starting Run: nj7mk5tr with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_32_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.3083 | Train Accuracy: 0.8952
Val   Loss: 0.3825 | Val   Accuracy: 0.8832



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▇▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▅▄▅▇▇▇▇▇█
val_loss,█▄▅▃▂▂▂▂▂▁

0,1
epoch,9.0
train_accuracy,0.89519
train_loss,0.3083
val_accuracy,0.88325
val_loss,0.38246


[34m[1mwandb[0m: Agent Starting Run: w4dp1vg8 with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_64_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.2978 | Train Accuracy: 0.8973
Val   Loss: 0.3682 | Val   Accuracy: 0.8851



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇██████
train_loss,█▃▂▂▁▁▁▁▁▁
val_accuracy,▁▄▅▇▇█▇█▅█
val_loss,█▅▃▂▂▂▂▂▄▁

0,1
epoch,9.0
train_accuracy,0.89731
train_loss,0.29781
val_accuracy,0.88513
val_loss,0.36816


[34m[1mwandb[0m: Agent Starting Run: xi1iyxxs with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.3212 | Train Accuracy: 0.8907
Val   Loss: 0.3886 | Val   Accuracy: 0.8780



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇█▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▄▅▇▇▇█▇█▇
val_loss,█▅▃▂▂▂▂▂▁▂

0,1
epoch,9.0
train_accuracy,0.89071
train_loss,0.32125
val_accuracy,0.87795
val_loss,0.38856


[34m[1mwandb[0m: Agent Starting Run: pciz0dnf with config:
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_GRU_drop_0.3:
Train Loss: 0.2405 | Train Accuracy: 0.9102
Val   Loss: 0.3411 | Val   Accuracy: 0.8916



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇▇████
train_loss,█▃▂▂▂▂▁▁▁▁
val_accuracy,▁▂▇▆▇▆█▇██
val_loss,█▆▂▂▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.91025
train_loss,0.24045
val_accuracy,0.89164
val_loss,0.34108


[34m[1mwandb[0m: Agent Starting Run: 2g4dgpz8 with config:
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_64_hs_256_enc_1_dec_1_cell_GRU_drop_0.3:
Train Loss: 0.2440 | Train Accuracy: 0.9099
Val   Loss: 0.3440 | Val   Accuracy: 0.8893



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▇▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▄▆▇▇▆███▇
val_loss,█▄▃▃▃▃▂▁▁▂

0,1
epoch,9.0
train_accuracy,0.90987
train_loss,0.24404
val_accuracy,0.88933
val_loss,0.34396


[34m[1mwandb[0m: Agent Starting Run: vsmqgobd with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_32_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.3281 | Train Accuracy: 0.8903
Val   Loss: 0.4007 | Val   Accuracy: 0.8783



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▄▅▇▆▇▇███
val_loss,█▅▄▂▂▃▂▁▂▁

0,1
epoch,9.0
train_accuracy,0.89026
train_loss,0.32805
val_accuracy,0.87833
val_loss,0.40071


[34m[1mwandb[0m: Agent Starting Run: tarvjkjn with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_LSTM_drop_0.3:
Train Loss: 0.2332 | Train Accuracy: 0.9121
Val   Loss: 0.3259 | Val   Accuracy: 0.8931



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇▇████
train_loss,█▃▂▂▂▂▁▁▁▁
val_accuracy,▁▃▆▇█▇▇███
val_loss,█▆▃▃▂▂▂▂▁▁

0,1
epoch,9.0
train_accuracy,0.91209
train_loss,0.23321
val_accuracy,0.89315
val_loss,0.32587


[34m[1mwandb[0m: Agent Starting Run: z5n8ee1n with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.3269 | Train Accuracy: 0.8906
Val   Loss: 0.4251 | Val   Accuracy: 0.8718



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▇▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▆▇▆▇▄██▇▇
val_loss,█▃▂▄▂▅▂▁▂▃

0,1
epoch,9.0
train_accuracy,0.89062
train_loss,0.32686
val_accuracy,0.87185
val_loss,0.42512


[34m[1mwandb[0m: Agent Starting Run: h3g3ffi9 with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_16_hs_256_enc_1_dec_1_cell_RNN_drop_0.3:
Train Loss: 0.3240 | Train Accuracy: 0.8920
Val   Loss: 0.4122 | Val   Accuracy: 0.8774



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▄▆▇▇▇████
val_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.89202
train_loss,0.32403
val_accuracy,0.87738
val_loss,0.41224


[34m[1mwandb[0m: Agent Starting Run: uzs1167p with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_32_hs_32_enc_1_dec_1_cell_LSTM_drop_0.3:
Train Loss: 0.3863 | Train Accuracy: 0.8795
Val   Loss: 0.4226 | Val   Accuracy: 0.8743



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇███████
train_loss,█▃▂▁▁▁▁▁▁▁
val_accuracy,▁▆▇▇██████
val_loss,█▃▂▂▂▁▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.87947
train_loss,0.38632
val_accuracy,0.87428
val_loss,0.42263


[34m[1mwandb[0m: Agent Starting Run: pfxc3e5u with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_LSTM_drop_0.3:
Train Loss: 0.2332 | Train Accuracy: 0.9121
Val   Loss: 0.3359 | Val   Accuracy: 0.8921



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇▇████
train_loss,█▃▂▂▂▂▁▁▁▁
val_accuracy,▁▅▆▆▇█▇▇██
val_loss,█▄▃▂▁▁▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.91207
train_loss,0.23318
val_accuracy,0.8921
val_loss,0.33593


[34m[1mwandb[0m: Agent Starting Run: 4f8s1nhl with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_LSTM_drop_0.3:
Train Loss: 0.2346 | Train Accuracy: 0.9116
Val   Loss: 0.3413 | Val   Accuracy: 0.8921



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇█████
train_loss,█▃▂▂▂▂▁▁▁▁
val_accuracy,▁▅▆▇▇▇█▇█▇
val_loss,█▄▃▂▁▁▁▁▁▂

0,1
epoch,9.0
train_accuracy,0.91163
train_loss,0.23464
val_accuracy,0.89213
val_loss,0.34131


[34m[1mwandb[0m: Agent Starting Run: 1ghi93ei with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_LSTM_drop_0.3:
Train Loss: 0.2276 | Train Accuracy: 0.9130
Val   Loss: 0.3341 | Val   Accuracy: 0.8941



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▇▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▅▆▇▇▇▇▇██
val_loss,█▄▃▂▁▁▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.91305
train_loss,0.22758
val_accuracy,0.89413
val_loss,0.33406


[34m[1mwandb[0m: Agent Starting Run: fc9v2kmx with config:
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_GRU_drop_0.3:
Train Loss: 0.2417 | Train Accuracy: 0.9096
Val   Loss: 0.3483 | Val   Accuracy: 0.8929



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇▇████
train_loss,█▃▂▂▂▂▁▁▁▁
val_accuracy,▁▄▂▆▇█▇█▇█
val_loss,█▄▆▂▂▁▁▂▂▂

0,1
epoch,9.0
train_accuracy,0.9096
train_loss,0.24167
val_accuracy,0.89294
val_loss,0.34831


[34m[1mwandb[0m: Agent Starting Run: my7zko1q with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_LSTM_drop_0.3:
Train Loss: 0.2314 | Train Accuracy: 0.9117
Val   Loss: 0.3310 | Val   Accuracy: 0.8936



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇█████
train_loss,█▃▂▂▂▂▁▁▁▁
val_accuracy,▁▄▄▆▇▇▇▇██
val_loss,█▅▄▂▂▁▁▁▂▁

0,1
epoch,9.0
train_accuracy,0.91171
train_loss,0.23141
val_accuracy,0.89361
val_loss,0.33096


[34m[1mwandb[0m: Agent Starting Run: 78961nic with config:
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_GRU_drop_0.3:
Train Loss: 0.2427 | Train Accuracy: 0.9099
Val   Loss: 0.3327 | Val   Accuracy: 0.8924



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇▇████
train_loss,█▃▂▂▂▂▁▁▁▁
val_accuracy,▁▄▆▆▇█▇▇██
val_loss,█▆▃▃▂▂▂▂▁▁

0,1
epoch,9.0
train_accuracy,0.90991
train_loss,0.24265
val_accuracy,0.89239
val_loss,0.33274


[34m[1mwandb[0m: Agent Starting Run: t72svme2 with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1



Final Run Metrics for ed_256_hs_256_enc_1_dec_1_cell_LSTM_drop_0.3:
Train Loss: 0.2301 | Train Accuracy: 0.9125
Val   Loss: 0.3245 | Val   Accuracy: 0.8950



0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_accuracy,▁▅▆▇▇▇▇███
val_loss,█▄▂▂▂▁▁▁▁▁

0,1
epoch,9.0
train_accuracy,0.9125
train_loss,0.2301
val_accuracy,0.89503
val_loss,0.32454


[34m[1mwandb[0m: Agent Starting Run: bnisekfv with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 1
