In [1]:
import wandb
import torch
import torch.nn as nn
import torch.optim as optim
import random
from torch.nn.utils.rnn import pad_sequence

# Log in to W&B
wandb.login(key='acdc26d2fc17a56e83ea3ae6c10e496128dee648')

# ---------- Model Definitions ----------
class Encoder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, num_layers, cell_type='LSTM', dropout=0.2, bidirectional=False):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim, padding_idx=0)
        rnn_cls = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[cell_type]
        self.rnn = rnn_cls(embed_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True, bidirectional=bidirectional)
        self.cell_type = cell_type
        self.bidirectional = bidirectional

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, num_layers, cell_type='LSTM', dropout=0.2, bidirectional=False):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim, padding_idx=0)
        rnn_cls = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[cell_type]
        self.rnn = rnn_cls(embed_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True, bidirectional=bidirectional)
        self.fc_out = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)
        self.cell_type = cell_type
        self.bidirectional = bidirectional

    def forward(self, input, hidden):
        input = input.unsqueeze(1)
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc_out(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, hidden_dim, enc_layers, dec_layers,
                 cell_type='LSTM', dropout=0.2, bidirectional=False):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(input_dim, embed_dim, hidden_dim, enc_layers, cell_type, dropout, bidirectional)
        self.decoder = Decoder(output_dim, embed_dim, hidden_dim, dec_layers, cell_type, dropout, bidirectional)
        self.cell_type = cell_type


    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        outputs = torch.zeros(batch_size, trg_len, self.decoder.fc_out.out_features, device=src.device)
    
        hidden = self.encoder(src)

        def flatten_bidirectional(hidden_state):
            # Convert (num_layers * 2, batch, hidden_size) -> (num_layers, batch, hidden_size * 2)
            num_layers = self.decoder.rnn.num_layers
            batch = hidden_state.size(1)
            hidden_size = hidden_state.size(2)
            return torch.cat(
                [hidden_state[i * 2:(i + 1) * 2] for i in range(num_layers)],
                dim=2
            )
    
        if self.cell_type == 'LSTM':
            h, c = hidden
            if self.encoder.bidirectional:
                h = flatten_bidirectional(h)
                c = flatten_bidirectional(c)
            else:
                h = h[:self.decoder.rnn.num_layers]
                c = c[:self.decoder.rnn.num_layers]
            decoder_hidden = (h, c)
        else:
            h = hidden
            if self.encoder.bidirectional:
                h = flatten_bidirectional(h)
            else:
                h = h[:self.decoder.rnn.num_layers]
            decoder_hidden = h
    
        input = trg[:, 0]
        for t in range(1, trg_len):
            output, decoder_hidden = self.decoder(input, decoder_hidden)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
    
        return outputs


# ---------- Utility Functions ----------
def build_vocab(sequences):
    chars = set(ch for seq in sequences for ch in seq)
    stoi = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    for ch in sorted(chars):
        stoi[ch] = len(stoi)
    itos = {i: ch for ch, i in stoi.items()}
    return stoi, itos

def encode_sequence(seq, stoi):
    return [stoi.get(c, stoi['<unk>']) for c in seq]

def prepare_batch(pairs, inp_stoi, out_stoi, device):
    src_seq = [torch.tensor(encode_sequence(src, inp_stoi) + [inp_stoi['<eos>']]) for src, _ in pairs]
    trg_seq = [torch.tensor([out_stoi['<sos>']] + encode_sequence(trg, out_stoi) + [out_stoi['<eos>']]) for _, trg in pairs]
    src_batch = pad_sequence(src_seq, batch_first=True, padding_value=inp_stoi['<pad>'])
    trg_batch = pad_sequence(trg_seq, batch_first=True, padding_value=out_stoi['<pad>'])
    return src_batch.to(device), trg_batch.to(device)

def read_dataset(path):
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
        return [(l.split('\t')[1], l.split('\t')[0]) for l in lines if '\t' in l]

def calculate_accuracy(preds, targets, ignore_index=0):
    preds = preds.argmax(dim=-1)
    mask = targets != ignore_index
    correct = (preds == targets) & mask
    return (correct.sum().item() / mask.sum().item())*100

def evaluate(model, data, src_vocab, tgt_vocab, device, criterion, batch_size):
    model.eval()
    total_loss = 0
    total_acc = 0
    with torch.no_grad():
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            src, trg = prepare_batch(batch, src_vocab, tgt_vocab, device)
            output = model(src, trg)
            loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
            acc = calculate_accuracy(output[:, 1:], trg[:, 1:])
            total_loss += loss.item()
            total_acc += acc
    return total_loss / len(data), total_acc / (len(data) // batch_size)

# ---------- Test Function ----------
def predict_and_log_test_examples(model, test_path, src_vocab, tgt_vocab, device, num_examples=10):
    model.eval()
    itos = tgt_vocab[1]  # index to string
    stoi = src_vocab[0]  # string to index

    test_data = read_dataset(test_path)
    examples = random.sample(test_data, num_examples)
    predictions_log = []

    for src_text, tgt_text in examples:
        src_tensor = torch.tensor(encode_sequence(src_text, stoi) + [stoi['<eos>']], device=device).unsqueeze(0)
        trg_indexes = [tgt_vocab[0]['<sos>']]
        hidden = model.encoder(src_tensor)

        if model.cell_type == 'LSTM':
            decoder_hidden = (hidden[0][:model.decoder.rnn.num_layers], hidden[1][:model.decoder.rnn.num_layers])
        else:
            decoder_hidden = hidden[:model.decoder.rnn.num_layers]

        input = torch.tensor([tgt_vocab[0]['<sos>']], device=device)

        decoded_tokens = []
        for _ in range(30):  # max length
            output, decoder_hidden = model.decoder(input, decoder_hidden)
            top1 = output.argmax(1)
            if top1.item() == tgt_vocab[0]['<eos>']:
                break
            decoded_tokens.append(itos[top1.item()])
            input = top1

        prediction = ''.join(decoded_tokens)

        # Print to notebook output
        print(f"Input: {src_text} | Target: {tgt_text} | Prediction: {prediction}")
        predictions_log.append(wandb.Html(f"<b>Input:</b> {src_text} &nbsp; <b>Target:</b> {tgt_text} &nbsp; <b>Pred:</b> {prediction}"))

    # Log to W&B as a table or HTML
    wandb.log({"Test Predictions": wandb.Html("<br>".join([str(p) for p in predictions_log]))})



# ---------- Train Function ----------


def train():
    wandb.init(config={
        "embed_dim": 128,
        "hidden_dim": 256,
        "enc_layers": 2,
        "dec_layers": 2,
        "cell_type": "LSTM",
        "dropout": 0.2,
        "epochs": 10,
        "batch_size": 64,
        "bidirectional": False,
        "learning_rate": 0.001,
        "optimizer": "adam",
        "teacher_forcing_ratio": 0.5,
        "beam_width": 1
    })
    config = wandb.config
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_data = read_dataset("/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
    dev_data = read_dataset("/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")

    src_vocab, tgt_vocab = build_vocab([src for src, _ in train_data]), build_vocab([tgt for _, tgt in train_data])
    model = Seq2Seq(len(src_vocab[0]), len(tgt_vocab[0]), config.embed_dim, config.hidden_dim,
                    config.enc_layers, config.dec_layers, config.cell_type, config.dropout, config.bidirectional).to(device)

    if config.optimizer == "adam":
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    elif config.optimizer == "nadam":
        optimizer = optim.NAdam(model.parameters(), lr=config.learning_rate)
    else:
        raise ValueError("Unsupported optimizer")

    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(config.epochs):
        model.train()
        total_loss = 0
        total_acc = 0
        random.shuffle(train_data)

        for i in range(0, len(train_data), config.batch_size):
            batch = train_data[i:i + config.batch_size]
            src, trg = prepare_batch(batch, src_vocab[0], tgt_vocab[0], device)

            optimizer.zero_grad()
            output = model(src, trg, teacher_forcing_ratio=config.teacher_forcing_ratio)
            loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
            acc = calculate_accuracy(output[:, 1:], trg[:, 1:])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total_acc += acc

        avg_train_loss = total_loss / len(train_data)
        avg_train_acc = total_acc / (len(train_data) // config.batch_size)
        val_loss, val_acc = evaluate(model, dev_data, src_vocab[0], tgt_vocab[0], device, criterion, config.batch_size)

        wandb.log({
            "Train Loss": avg_train_loss,
            "Train Accuracy": avg_train_acc,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_acc,
            "Epoch": epoch + 1,
            "Learning Rate": config.learning_rate,
            "Teacher Forcing Ratio": config.teacher_forcing_ratio,
            "Optimizer": config.optimizer,
            "Bidirectional": config.bidirectional,
            "Beam Width": config.beam_width
        })

        print(f"Epoch {epoch + 1}/{config.epochs} | Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.2f}% | Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")
        # At the end of train()
    
    test_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"
    predict_and_log_test_examples(model, test_path, src_vocab, tgt_vocab, device)
    wandb.finish()

# ---------- Sweep Setup ----------
sweep_config = {
    'method': 'grid',
    'metric': {'name': 'Validation Loss', 'goal': 'minimize'},
    'parameters': {
        'embed_dim': {'values': [32, 64, 256]},
        'hidden_dim': {'values': [64, 128]},
        'enc_layers': {'values': [1, 2]},
        'dec_layers': {'values': [1, 2]},
        'cell_type': {'values': ['LSTM', 'GRU']},
        'dropout': {'values': [0.2, 0.3]},
        'batch_size': {'value': 32},
        'epochs': {'value': 10},
        'bidirectional': {'values': [False]},
        'learning_rate': {'values': [0.001, 0.002, 0.0001]},
        'optimizer': {'values': ['adam', 'nadam']},
        'teacher_forcing_ratio': {'values': [0.2, 0.5, 0.7]},
        'beam_width': {'values': [1, 3, 5]}
    }
}

sweep_id = wandb.sweep(sweep_config, project="Vinod_Assignment_3_new")
wandb.agent(sweep_id, function=train, count=1)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mviinod9[0m ([33mviinod9-iitm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: lo2q1yvj
Sweep URL: https://wandb.ai/viinod9-iitm/Vinod_Assignment_3_new/sweeps/lo2q1yvj


[34m[1mwandb[0m: Agent Starting Run: kt3gkud0 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embed_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.2




Epoch 1/10 | Train Loss: 0.0862, Train Acc: 27.57% | Val Loss: 0.0680, Val Acc: 39.72%
Epoch 2/10 | Train Loss: 0.0592, Train Acc: 44.71% | Val Loss: 0.0496, Val Acc: 53.19%
Epoch 3/10 | Train Loss: 0.0482, Train Acc: 52.75% | Val Loss: 0.0431, Val Acc: 58.62%
Epoch 4/10 | Train Loss: 0.0430, Train Acc: 57.09% | Val Loss: 0.0393, Val Acc: 62.09%
Epoch 5/10 | Train Loss: 0.0398, Train Acc: 60.10% | Val Loss: 0.0373, Val Acc: 64.15%
Epoch 6/10 | Train Loss: 0.0375, Train Acc: 62.11% | Val Loss: 0.0358, Val Acc: 65.26%
Epoch 7/10 | Train Loss: 0.0358, Train Acc: 63.87% | Val Loss: 0.0342, Val Acc: 66.93%
Epoch 8/10 | Train Loss: 0.0345, Train Acc: 65.12% | Val Loss: 0.0332, Val Acc: 67.62%
Epoch 9/10 | Train Loss: 0.0333, Train Acc: 66.25% | Val Loss: 0.0327, Val Acc: 68.44%
Epoch 10/10 | Train Loss: 0.0325, Train Acc: 66.99% | Val Loss: 0.0321, Val Acc: 68.70%
Input: pakaude | Target: पकौड़े | Prediction: पौकाड़
Input: kabaddi | Target: कबड्डी | Prediction: कबबददीी
Input: neelaabh | Targ

0,1
Beam Width,▁▁▁▁▁▁▁▁▁▁
Epoch,▁▂▃▃▄▅▆▆▇█
Learning Rate,▁▁▁▁▁▁▁▁▁▁
Teacher Forcing Ratio,▁▁▁▁▁▁▁▁▁▁
Train Accuracy,▁▄▅▆▇▇▇███
Train Loss,█▄▃▂▂▂▁▁▁▁
Validation Accuracy,▁▄▆▆▇▇████
Validation Loss,█▄▃▂▂▂▁▁▁▁

0,1
Beam Width,1
Bidirectional,False
Epoch,10
Learning Rate,0.001
Optimizer,adam
Teacher Forcing Ratio,0.2
Train Accuracy,66.99424
Train Loss,0.03252
Validation Accuracy,68.69594
Validation Loss,0.03208
