# AFML Hackathon Part 2 - Team 44_XLR8
## LSTM Translation: Encoded Text → English

In [None]:
# Mount Google Drive and switch to the project folder when running on Colab.
# Outside Colab the google.colab package is not importable, so the notebook
# simply keeps using the current working directory.
try:
    from google.colab import drive
except ImportError:
    print("✓ Running locally")
else:
    import os
    try:
        drive.mount('/content/drive')
        os.chdir('/content/drive/MyDrive/AFML_KAAGLE')  # Adjust path
        print("✓ Running on Google Colab")
    except Exception as exc:
        # Best effort: keep going from the current directory, but do not hide
        # the failure behind a misleading "running locally" message.
        print(f"⚠ Google Colab detected but Drive setup failed ({exc}); using current directory")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm

# Seed PyTorch's global RNG, then select the GPU when CUDA is available and
# the CPU otherwise.
torch.manual_seed(42)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

## Model Architecture (DO NOT MODIFY)

In [None]:
class CharLSTMTranslator(nn.Module):
    """Encoder-decoder LSTM that maps source token ids to target-token logits.

    The source sequence is embedded (token + position), run through the
    encoder LSTM, and the encoder's final (hidden, cell) state initialises the
    decoder LSTM, which consumes the embedded target sequence. A linear layer
    projects every decoder output to ``output_vocab_size`` logits.

    NOTE: the weight loader in this notebook copies a flat weight vector into
    ``model.parameters()`` in iteration order, so the order in which the
    sub-modules are defined below must not change.
    """

    def __init__(self, input_vocab_size, output_vocab_size, emb_size=64, hidden_size=128, num_layers=1, max_len=512):
        """Create the embedding tables, the two LSTMs and the output projection.

        Args:
            input_vocab_size: number of rows in the source embedding table.
            output_vocab_size: number of rows in the target embedding table and
                number of logits produced per decoder step.
            emb_size: embedding width shared by source, target and position tables.
            hidden_size: hidden width of both LSTMs.
            num_layers: number of stacked layers in both LSTMs.
            max_len: number of rows in the positional embedding table.
        """
        super().__init__()
        # Index 0 is the padding id for both vocabularies.
        self.src_embedding = nn.Embedding(input_vocab_size, emb_size, padding_idx=0)
        self.tgt_embedding = nn.Embedding(output_vocab_size, emb_size, padding_idx=0)
        # Learned positional embedding; only added to the source side in forward().
        self.pos_embedding = nn.Embedding(max_len, emb_size)
        self.encoder = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True)
        self.decoder = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_vocab_size)
        
    def forward(self, src, tgt):
        """Compute target logits for a batch.

        Args:
            src: 2-D tensor of source token ids, unpacked as (batch, seq_len).
            tgt: tensor of target token ids fed to the decoder; its batch size
                must match ``src`` because the encoder state seeds the decoder.

        Returns:
            Logits from the final linear layer, one ``output_vocab_size``-wide
            vector per decoder output position.
        """
        batch_size, seq_len = src.size()
        # Position ids 0..seq_len-1, replicated for every sequence in the batch.
        pos_idx = torch.arange(seq_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)
        # Positions past 511 reuse the last row. The bound is hard-coded (it
        # equals max_len - 1 only for the default max_len=512).
        pos_idx = torch.clamp(pos_idx, max=511)
        pos_embedded = self.pos_embedding(pos_idx)
        embedded_src = self.src_embedding(src) + pos_embedded
        # Only the encoder's final state is kept; its per-step outputs are discarded.
        _, (hidden, cell) = self.encoder(embedded_src)
        # The target side uses token embeddings only (no positional term).
        embedded_tgt = self.tgt_embedding(tgt)
        outputs, _ = self.decoder(embedded_tgt, (hidden, cell))
        logits = self.fc(outputs)
        return logits

## Load Denoised Weights

In [None]:
def load_model_from_matrix(model, weights_matrix, original_len):
    """Fill ``model``'s parameters from a flattened weight matrix.

    The matrix is flattened in row-major order, truncated to its first
    ``original_len`` values, and consumed sequentially in
    ``model.parameters()`` order.

    Args:
        model: ``nn.Module`` whose parameters are overwritten in place.
        weights_matrix: array-like (NumPy array, nested list or tensor)
            holding the weights; converted to float32.
        original_len: number of leading values of the flattened matrix that
            are real weights; anything after them is ignored.

    Returns:
        The same ``model`` instance, for chaining.

    Raises:
        ValueError: if fewer weights are available than the model needs. The
            check runs before any copy, so the model is left untouched.
    """
    flat_weights = torch.as_tensor(weights_matrix, dtype=torch.float32).reshape(-1)[:original_len]

    params = list(model.parameters())
    needed = sum(p.numel() for p in params)
    if flat_weights.numel() < needed:
        raise ValueError(
            f"Weight matrix provides {flat_weights.numel()} values but the model needs {needed}"
        )

    offset = 0
    with torch.no_grad():  # in-place writes to leaf parameters must not be tracked
        for p in params:
            numel = p.numel()
            p.copy_(flat_weights[offset : offset + numel].view_as(p))
            offset += numel
    print(f"✓ Loaded {offset} parameters")
    return model

# Read the weight matrix from disk, build the translator with the fixed
# vocabulary sizes, fill in its parameters and move it to the chosen device.
df_weights = pd.read_csv("submission.csv").to_numpy()
model = load_model_from_matrix(
    CharLSTMTranslator(input_vocab_size=73, output_vocab_size=96),
    df_weights,
    254624,  # only this many leading values of the flattened matrix are used
).to(device)

## Load Training Data

In [None]:
# Training pairs: the 'encoded_text' column is the source side and the 'text'
# column is the English target side.
df_train = pd.read_csv("train-part2.csv")
encoded_texts, english_texts = (
    df_train[column].tolist() for column in ('encoded_text', 'text')
)
print(f"Loaded {len(encoded_texts)} samples")

## Build Vocabularies

In [None]:
# Distinct characters that occur on each side of the corpus.
all_encoded_chars = set().union(*encoded_texts)
all_english_chars = set().union(*english_texts)

# Ids start at 1 in sorted character order; id 0 is reserved for padding.
encoded_vocab = dict(zip(sorted(all_encoded_chars), range(1, len(all_encoded_chars) + 1)))
encoded_vocab['<PAD>'] = 0

english_vocab = dict(zip(sorted(all_english_chars), range(1, len(all_english_chars) + 1)))
english_vocab['<PAD>'] = 0

# Inverse lookup (id -> character) used when decoding model output.
rev_english_vocab = {idx: ch for ch, idx in english_vocab.items()}

# NOTE(review): the start/end marker ids are fixed constants, not derived from
# english_vocab. If the English side has 70 or more distinct characters these
# ids coincide with real character ids - confirm against the pretrained weights.
sos_token, eos_token = 71, 70

print(f"Encoded vocab: {len(encoded_vocab)}, English vocab: {len(english_vocab)}")

## Dataset and DataLoader

In [None]:
def text_to_seq(texts, vocab):
    """Encode each text as a list of vocabulary ids, one id per character.

    Characters that are missing from ``vocab`` are encoded as 0.
    """
    sequences = []
    for text in texts:
        sequences.append([vocab.get(ch, 0) for ch in text])
    return sequences

# Integer-encode each side of the corpus with its own vocabulary.
encoded_seqs = text_to_seq(texts=encoded_texts, vocab=encoded_vocab)
english_seqs = text_to_seq(texts=english_texts, vocab=english_vocab)

class TranslationDataset(Dataset):
    """Pairs of id sequences served as (source, target) LongTensors.

    Sources are cut to ``max_len`` ids. Targets are cut to ``max_len - 2`` ids
    and then wrapped in the start and end marker ids.
    """

    def __init__(self, src_seqs, tgt_seqs, sos_token, eos_token, max_len=512):
        self.src_seqs, self.tgt_seqs = src_seqs, tgt_seqs
        self.sos_token, self.eos_token = sos_token, eos_token
        self.max_len = max_len

    def __len__(self):
        # One sample per source sequence.
        return len(self.src_seqs)

    def __getitem__(self, idx):
        source_ids = self.src_seqs[idx][:self.max_len]
        # Two positions of the length budget are taken by the SOS/EOS markers.
        target_body = self.tgt_seqs[idx][:self.max_len-2]
        target_ids = [self.sos_token] + target_body + [self.eos_token]
        return torch.LongTensor(source_ids), torch.LongTensor(target_ids)

def collate_fn(batch):
    """Turn a list of (src, tgt) tensor pairs into two zero-padded batch tensors.

    Each side is right-padded with 0 to the longest sequence on that side and
    returned batch-first as ``(src_padded, tgt_padded)``.
    """
    def _pad(seqs):
        return pad_sequence(seqs, batch_first=True, padding_value=0)

    sources, targets = zip(*batch)
    return _pad(sources), _pad(targets)

# Wrap the encoded pairs and hold out the last 10% (by count) for validation;
# random_split decides which samples land in each part.
dataset = TranslationDataset(encoded_seqs, english_seqs, sos_token, eos_token)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    lengths=[train_size, val_size],
)

# Only the training loader is shuffled.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

## Training

In [None]:
# Training hyperparameters.
NUM_EPOCHS = 15
LEARNING_RATE = 1e-3

# Targets equal to 0 (the padding id) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# Halve the learning rate when the monitored loss has not improved for more
# than 2 consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
)

def train_epoch(model, loader):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the module-level ``device``, ``criterion`` and ``optimizer``.
    """
    model.train()
    running_loss = 0
    for src, tgt in tqdm(loader, desc="Training", leave=False):
        src = src.to(device)
        tgt = tgt.to(device)

        # Teacher forcing: the decoder is fed every token but the last and is
        # scored against the same sequence shifted left by one.
        decoder_in, expected = tgt[:, :-1], tgt[:, 1:]

        logits = model(src, decoder_in)
        vocab_dim = logits.size(-1)
        loss = criterion(logits.reshape(-1, vocab_dim), expected.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(loader)

def validate(model, loader):
    """Return the mean batch loss of ``model`` over ``loader`` without training.

    Runs in eval mode with gradients disabled; uses the module-level ``device``
    and ``criterion``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, tgt in loader:
            src = src.to(device)
            tgt = tgt.to(device)
            # Same input/target shift as in training.
            decoder_in, expected = tgt[:, :-1], tgt[:, 1:]
            logits = model(src, decoder_in)
            vocab_dim = logits.size(-1)
            batch_loss = criterion(logits.reshape(-1, vocab_dim), expected.reshape(-1))
            running_loss += batch_loss.item()
    return running_loss / len(loader)

# Track the lowest validation loss seen so far; the matching weights are
# checkpointed to 'best_translation.pth'.
best_val_loss = float('inf')

for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader)
    val_loss = validate(model, val_loader)

    # Read the learning rate on both sides of the scheduler step so that a
    # reduction can be reported below.
    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    new_lr = optimizer.param_groups[0]['lr']

    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_translation.pth')
        print(f"✓ Epoch {epoch+1}/{NUM_EPOCHS} - Train: {train_loss:.4f}, Val: {val_loss:.4f} [SAVED]")
    else:
        print(f"  Epoch {epoch+1}/{NUM_EPOCHS} - Train: {train_loss:.4f}, Val: {val_loss:.4f}")

    lr_changed = old_lr != new_lr
    if lr_changed:
        print(f"  → LR: {old_lr:.6f} → {new_lr:.6f}")

print(f"\nBest val loss: {best_val_loss:.4f}")

## Inference Function

In [None]:
def translate(model, src_text, encoded_vocab, rev_english_vocab, sos_token, eos_token, max_len=512):
    """Greedily decode the translation of ``src_text``.

    The source is encoded once; the decoder then runs one token at a time,
    carrying its LSTM state forward, which reproduces what the model computes
    for the same prefix in ``forward``. Feeding the whole prefix again on
    every step while also carrying the state would process the prefix
    repeatedly from an already-advanced state.

    Args:
        model: translator exposing ``src_embedding``, ``tgt_embedding``,
            ``pos_embedding``, ``encoder``, ``decoder`` and ``fc``.
        src_text: string to translate; characters missing from
            ``encoded_vocab`` are encoded as 0.
        encoded_vocab: mapping from source character to id.
        rev_english_vocab: mapping from target id to character; ids without an
            entry contribute nothing to the output.
        sos_token: id fed to the decoder as its first input.
        eos_token: id that stops decoding; it is not emitted.
        max_len: maximum number of tokens to generate.

    Returns:
        The decoded string, or ``''`` when ``src_text`` is empty.
    """
    model.eval()
    src_seq = [encoded_vocab.get(c, 0) for c in src_text]
    if not src_seq:
        # Nothing to encode; an empty sequence cannot be run through the LSTM.
        return ''

    # Run on whatever device the model lives on.
    model_device = next(model.parameters()).device
    src_tensor = torch.LongTensor(src_seq).unsqueeze(0).to(model_device)

    decoded = []
    with torch.no_grad():
        pos_idx = torch.arange(src_tensor.size(1), device=model_device).unsqueeze(0)
        # Positions beyond the positional table reuse its last row.
        pos_idx = torch.clamp(pos_idx, max=model.pos_embedding.num_embeddings - 1)
        embedded_src = model.src_embedding(src_tensor) + model.pos_embedding(pos_idx)
        _, state = model.encoder(embedded_src)

        prev_token = sos_token
        for _ in range(max_len):
            # Feed only the newest token; `state` already summarises the prefix.
            step_input = torch.LongTensor([[prev_token]]).to(model_device)
            outputs, state = model.decoder(model.tgt_embedding(step_input), state)
            prev_token = model.fc(outputs)[0, -1].argmax().item()
            if prev_token == eos_token:
                break
            decoded.append(prev_token)

    # Drop padding ids and ids that have no character.
    return ''.join(rev_english_vocab.get(i, '') for i in decoded if i != 0)

## Test on Training Examples

In [None]:
# Restore the best checkpoint written during training. map_location remaps the
# stored tensors onto the current device, so a checkpoint saved on a GPU can
# still be loaded on a CPU-only runtime.
model.load_state_dict(torch.load('best_translation.pth', map_location=device))

print("Testing on training examples:\n")
# Show up to three training pairs; slicing avoids an IndexError when the
# corpus holds fewer than three samples.
for i, (encoded_text, english_text) in enumerate(zip(encoded_texts[:3], english_texts[:3])):
    enc = encoded_text[:80]
    exp = english_text[:80]
    # The full source is translated; only the printed text is cut to 80 chars.
    pred = translate(model, encoded_text, encoded_vocab, rev_english_vocab, sos_token, eos_token)
    print(f"Example {i+1}:")
    print(f"Encoded:  {enc}...")
    print(f"Expected: {exp}...")
    print(f"Predicted: {pred[:80]}...\n")

## Translate Test Phrases

In [None]:
# Read the phrases to translate: one per line, whitespace-trimmed, with lines
# that are empty after trimming dropped.
with open('test-part2.txt', 'r', encoding='utf-8') as f:
    test_phrases = list(filter(None, map(str.strip, f)))

print(f"Translating {len(test_phrases)} phrases...\n")

# Translate one phrase at a time and echo each result as it is produced.
translations = []
for i, phrase in enumerate(test_phrases):
    translation = translate(model, phrase, encoded_vocab, rev_english_vocab, sos_token, eos_token)
    translations.append(translation)
    print(f"{i+1}. {translation}")

# Save: one translation per line, in input order.
with open('44_XLR8_part2.txt', 'w', encoding='utf-8') as f:
    f.writelines(t + '\n' for t in translations)

print(f"\n✓ Saved: 44_XLR8_part2.txt")

## Next Steps

1. ✅ Share this notebook with all 6 TAs
2. ✅ Submit `44_XLR8_part2.txt`

### TA Kaggle IDs:
- adyabhat
- anaghakini  
- namitaachyuth
- tejasvenugopalan
- shusrith
- siddhiz