In [None]:
import ast
import collections
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from nltk import WordNetLemmatizer
import nltk
import spacy

# fetch the NLTK data packages (WordNetLemmatizer is used for lemmatisation later in the notebook)
nltk.download("wordnet")
nltk.download("omw-1.4")
# load the small English spaCy pipeline with the parser, NER and text-categoriser components
# disabled; the notebook only reads tok.text and tok.is_space from it
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])

# Colab-only: mount Google Drive at /content/drive
# NOTE(review): the data paths configured below are relative file names, so confirm whether
# the Drive mount is actually required for this notebook
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Mounted at /content/drive


# Configurations

In [None]:
# input / output files
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
OUTPUT_SUB_PATH = "submission.csv"

# reproducibility
RANDOM_SEED = 42

# network size
EMB_DIM = 300          # embedding width (previously 100)
HIDDEN_SIZE = 256      # LSTM hidden units per direction (previously 128)
DROPOUT = 0.3

# sequence / optimisation settings
MAX_SEQ_LEN = 384      # maximum number of token ids per encoded sequence
BATCH_SIZE = 32
EPOCHS = 15            # every epoch is run; the best checkpoint is picked on validation accuracy
LR = 5e-4

# seed every random number generator the notebook relies on
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# prefer the GPU whenever one is visible to PyTorch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cuda


# Load and split data

In [None]:
# read the labelled data, the unlabelled test set and the sample submission (in that order)
full_df, test_df, sample_sub_df = (
  pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH)
)

# hold out 10% of the labelled rows for validation; the split is seeded so it is repeatable
train_df, val_df = train_test_split(
  full_df,
  shuffle=True,
  test_size=0.1,
  random_state=RANDOM_SEED,
)

print("Train shape:", train_df.shape)
print("Val shape:", val_df.shape)
print("Test shape:", test_df.shape)

Train shape: (4174, 5)
Val shape: (464, 5)
Test shape: (500, 4)


# Tokenization, vocabulary and sequence encoding

In [None]:
def tokenize(text: str):
  """Split ``text`` into lowercase token strings with the spaCy pipeline ``nlp``.

  The input is coerced with ``str()`` before it is handed to ``nlp``; tokens that
  spaCy flags as whitespace are left out of the result.
  """
  return [token.text.lower() for token in nlp(str(text)) if not token.is_space]

def lemmatize(words):
  """Return the WordNet lemma of every word in ``words``, keeping the input order."""
  to_lemma = WordNetLemmatizer().lemmatize
  return list(map(to_lemma, words))

# create vocab: start from an empty token-frequency counter that update_counter() fills in
counter = collections.Counter()

def update_counter(df: pd.DataFrame):
  """Add the lemmatised tokens of every row of ``df`` to the global ``counter``.

  For each row the ``context`` and ``question`` strings are counted first, then every
  option in the ``answers`` column (a string parsed with ``ast.literal_eval``).
  """
  for _, record in df.iterrows():
    for field in ("context", "question"):
      counter.update(lemmatize(tokenize(record[field])))
    for option in ast.literal_eval(record["answers"]):
      counter.update(lemmatize(tokenize(option)))

# count tokens from the training and test splits
update_counter(train_df)
update_counter(test_df)

# reserved entries: padding, out-of-vocabulary words, and the separator between segments
special_tokens = ["<pad>", "<unk>", "[sep]"]
vocab = dict(zip(special_tokens, range(len(special_tokens))))

# every other token gets the next free id, most frequent tokens first
for tok, _ in counter.most_common():
  vocab.setdefault(tok, len(vocab))

PAD_IDX, UNK_IDX, SEP_IDX = (vocab[name] for name in special_tokens)

print("Vocab size:", len(vocab))

def encode_tokens(tokens):
  """Map each token to its id in ``vocab``; tokens missing from it become ``UNK_IDX``."""
  ids = []
  for token in tokens:
    ids.append(vocab.get(token, UNK_IDX))
  return ids

def build_sequence(context: str, question: str, answer: str):
  """Encode one (context, question, answer option) triple as a list of vocabulary ids.

  The layout is ``context [sep] question [sep] answer`` over lemmatised tokens, and the
  result never holds more than ``MAX_SEQ_LEN`` ids.

  When the full sequence is too long, the *context* is shortened rather than the end of
  the sequence. Cutting from the end would drop the answer option (and possibly the
  question), so every option of a row with a long context would encode to the same ids
  and the model could not tell the options apart. Inputs that already fit are encoded
  exactly as before.

  Args:
    context: passage text.
    question: question text.
    answer: text of a single answer option.

  Returns:
    list[int]: token ids, at most ``MAX_SEQ_LEN`` long.
  """
  context_tokens = lemmatize(tokenize(context))
  tail_tokens = (
    ["[sep]"]
    + lemmatize(tokenize(question))
    + ["[sep]"]
    + lemmatize(tokenize(answer))
  )

  # degenerate case: question + answer alone exceed the limit -> keep the end of the
  # tail so the answer option is still present
  if len(tail_tokens) > MAX_SEQ_LEN:
    tail_tokens = tail_tokens[len(tail_tokens) - MAX_SEQ_LEN:]

  # whatever room is left goes to the leading part of the context
  context_budget = MAX_SEQ_LEN - len(tail_tokens)
  return encode_tokens(context_tokens[:context_budget] + tail_tokens)

Vocab size: 18149


# Dataloaders

In [None]:

class MCQTrainDataset(Dataset):
  """Labelled multiple-choice rows (used for both the training and validation splits).

  Each item is a dict with the row ``id``, one encoded sequence per answer option
  (``seqs``) and the integer ``label`` of the correct option.

  Encoded items are memoised per index: building a sequence runs tokenisation and
  lemmatisation for every option, and without the cache that work would be repeated for
  every row on every epoch. The cache keeps the encoded id lists in memory, and the
  cached dict itself is returned, so callers must treat it as read-only.
  """
  def __init__(self, df: pd.DataFrame):
    self.df = df.reset_index(drop=True)
    # idx -> already-encoded sample, filled lazily by __getitem__
    self._cache = {}

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    """Return ``{"id", "seqs", "label"}`` for row ``idx``, encoding it on first access only."""
    cached = self._cache.get(idx)
    if cached is not None:
      return cached

    row = self.df.iloc[idx]
    context = row["context"]
    question = row["question"]
    # the answers column stores the option list as a string literal
    answers = ast.literal_eval(row["answers"])
    label = int(row["label"])  # index of the correct option

    seqs = [build_sequence(context, question, str(opt)) for opt in answers]

    sample = {"id": row["id"], "seqs": seqs, "label": label}
    self._cache[idx] = sample
    return sample

class MCQTestDataset(Dataset):
  """Unlabelled multiple-choice rows for test-time inference.

  Each item is a dict with the row ``id`` and one encoded sequence per answer option
  (``seqs``).

  Encoded items are memoised per index so that tokenisation and lemmatisation run at
  most once per row, however many times the dataset is iterated. The cached dict itself
  is returned, so callers must treat it as read-only.
  """
  def __init__(self, df: pd.DataFrame):
    self.df = df.reset_index(drop=True)
    # idx -> already-encoded sample, filled lazily by __getitem__
    self._cache = {}

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    """Return ``{"id", "seqs"}`` for row ``idx``, encoding it on first access only."""
    cached = self._cache.get(idx)
    if cached is not None:
      return cached

    row = self.df.iloc[idx]
    context = row["context"]
    question = row["question"]
    # the answers column stores the option list as a string literal
    answers = ast.literal_eval(row["answers"])

    seqs = [build_sequence(context, question, str(opt)) for opt in answers]

    sample = {"id": row["id"], "seqs": seqs}
    self._cache[idx] = sample
    return sample

def collate_fn_train(batch):
  """Stack labelled samples into one ``PAD_IDX``-padded id tensor plus labels and ids.

  Returns ``(input_ids, labels, ids)`` where ``input_ids`` is a long tensor of shape
  (items, options, longest sequence in the batch), ``labels`` is a long tensor with one
  entry per item and ``ids`` is the list of sample ids. The number of options is read
  from the first item of the batch.
  """
  ids = [sample["id"] for sample in batch]
  labels = torch.tensor([sample["label"] for sample in batch], dtype=torch.long)

  n_items = len(batch)
  n_options = len(batch[0]["seqs"])
  longest = max(len(token_ids) for sample in batch for token_ids in sample["seqs"])

  # start from an all-padding tensor and copy each sequence into its left-aligned slot
  input_ids = torch.full((n_items, n_options, longest), PAD_IDX, dtype=torch.long)
  for row, sample in enumerate(batch):
    for col, token_ids in enumerate(sample["seqs"]):
      input_ids[row, col, : len(token_ids)] = torch.tensor(token_ids, dtype=torch.long)

  return input_ids, labels, ids

def collate_fn_test(batch):
  """Stack unlabelled samples into one ``PAD_IDX``-padded id tensor plus their ids.

  Returns ``(input_ids, ids)`` where ``input_ids`` is a long tensor of shape
  (items, options, longest sequence in the batch) and ``ids`` is the list of sample
  ids. The number of options is read from the first item of the batch.
  """
  ids = [sample["id"] for sample in batch]

  n_items = len(batch)
  n_options = len(batch[0]["seqs"])
  longest = max(len(token_ids) for sample in batch for token_ids in sample["seqs"])

  # start from an all-padding tensor and copy each sequence into its left-aligned slot
  input_ids = torch.full((n_items, n_options, longest), PAD_IDX, dtype=torch.long)
  for row, sample in enumerate(batch):
    for col, token_ids in enumerate(sample["seqs"]):
      input_ids[row, col, : len(token_ids)] = torch.tensor(token_ids, dtype=torch.long)

  return input_ids, ids

# wrap each split in its dataset class (validation rows carry labels, so they reuse the train class)
train_dataset = MCQTrainDataset(train_df)
val_dataset = MCQTrainDataset(val_df)
test_dataset = MCQTestDataset(test_df)

# only the training loader shuffles; validation and test keep the row order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn_train)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn_train)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn_test)

# BiLSTM model with attention pooling

In [None]:
class BiLSTMAttentionMCQ(nn.Module):
  """Scores every answer option of a multiple-choice question with a BiLSTM + attention.

  Each option arrives as one id sequence. The sequence is embedded, run through a
  stacked bidirectional LSTM, pooled with additive attention over the non-padding
  positions and mapped to a single logit by a small MLP.

  Args:
    vocab_size: number of rows in the embedding table.
    emb_dim: embedding width.
    hidden_size: LSTM hidden units per direction.
    pad_idx: id of the padding token (zero embedding, excluded from LSTM and attention).
    dropout: dropout probability used between LSTM layers, on the pooled vector and
      inside the classifier.
    num_layers: number of stacked BiLSTM layers (default 3, the value previously
      hard-coded).
  """
  def __init__(self, vocab_size, emb_dim, hidden_size, pad_idx,
              dropout=0.3, num_layers=3):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)

    # stacked BiLSTM; inter-layer dropout only exists when there is more than one layer
    self.lstm = nn.LSTM(
      emb_dim,
      hidden_size,
      batch_first=True,
      bidirectional=True,
      num_layers=num_layers,
      dropout=dropout if num_layers > 1 else 0.0,
    )

    # attention mechanism
    self.att_w = nn.Linear(2 * hidden_size, 2 * hidden_size)
    self.att_v = nn.Linear(2 * hidden_size, 1, bias=False)
    self.dropout = nn.Dropout(dropout)
    self.classifier = nn.Sequential(
      nn.Linear(2 * hidden_size, 128),
      nn.ReLU(),
      nn.Dropout(dropout),
      nn.Linear(128, 1),
    )
    self.pad_idx = pad_idx

  def forward(self, input_ids):
    """
    input_ids: [B, O, L] token ids, padded with ``pad_idx``
    returns: logits [B, O]
    """
    B, O, L = input_ids.shape

    # flatten options; reshape (not view) so non-contiguous inputs are accepted too
    x = input_ids.reshape(B * O, L)       # [B*O, L]
    mask = x.ne(self.pad_idx)             # [B*O, L]

    # sequence length = 1-based position of the last non-padding token (at least 1 so
    # that packing never receives an empty sequence)
    positions = torch.arange(1, L + 1, device=x.device)
    lengths = (mask.long() * positions).max(dim=1).values.clamp(min=1)

    emb = self.embedding(x)               # [B*O, L, E]

    # pack so the LSTM never steps over padding: otherwise the backward direction would
    # consume the padding first and the result would depend on how far a batch is padded
    packed = nn.utils.rnn.pack_padded_sequence(
      emb, lengths.cpu(), batch_first=True, enforce_sorted=False
    )
    packed_h, _ = self.lstm(packed)
    h, _ = nn.utils.rnn.pad_packed_sequence(
      packed_h, batch_first=True, total_length=L
    )                                     # [B*O, L, 2H]

    scores = self.att_v(
      torch.tanh(self.att_w(h))
    ).squeeze(-1)                         # [B*O, L]
    # padding positions get (numerically) zero attention weight
    scores = scores.masked_fill(~mask, -1e9)
    attn = torch.softmax(scores, dim=-1).unsqueeze(-1)  # [B*O, L, 1]

    context = (h * attn).sum(dim=1)       # [B*O, 2H]
    context = self.dropout(context)

    logits = self.classifier(context).view(B, O)  # [B, O]
    return logits

# build the network from the configured sizes, then move it to the selected device
model = BiLSTMAttentionMCQ(len(vocab), EMB_DIM, HIDDEN_SIZE, PAD_IDX, dropout=DROPOUT)
model = model.to(device)

# Adam over every model parameter
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

# Model Training

In [None]:
def train_one_epoch(model, loader, optimizer, epoch_idx):
  """Run one optimisation pass over ``loader`` and print the example-weighted mean loss.

  ``loader`` yields ``(input_ids, labels, ids)`` batches; the loss is the cross-entropy
  between the model's per-option logits and the label. ``epoch_idx`` is only used in the
  printed message. Nothing is returned.
  """
  model.train()
  loss_sum, seen = 0.0, 0

  for batch_inputs, batch_labels, _ in loader:
    batch_inputs = batch_inputs.to(device)
    batch_labels = batch_labels.to(device)

    optimizer.zero_grad()
    batch_loss = F.cross_entropy(model(batch_inputs), batch_labels)
    batch_loss.backward()
    optimizer.step()

    # weight the batch mean by its size so a smaller last batch is not over-counted
    batch_size = batch_inputs.size(0)
    loss_sum += batch_loss.item() * batch_size
    seen += batch_size

  avg_loss = loss_sum / seen
  print(f"Epoch {epoch_idx}: train loss={avg_loss:.4f}")

def evaluate(model, loader):
  """Return the fraction of examples in ``loader`` whose arg-max logit equals the label.

  The model is switched to eval mode and gradients are disabled; ``loader`` yields
  ``(input_ids, labels, ids)`` batches.
  """
  model.eval()
  hits = 0
  seen = 0
  with torch.no_grad():
    for batch_inputs, batch_labels, _ in loader:
      batch_labels = batch_labels.to(device)
      predicted = model(batch_inputs.to(device)).argmax(dim=-1)
      hits += predicted.eq(batch_labels).sum().item()
      seen += batch_labels.size(0)
  return hits / seen

# best validation accuracy seen so far and a CPU snapshot of the weights that produced it
best_val = 0.0
best_state = None

for epoch in range(1, EPOCHS + 1):
  train_one_epoch(model, train_loader, optimizer, epoch)
  val_acc = evaluate(model, val_loader)
  print(f"Epoch {epoch}: val acc={val_acc:.4f}")

  # only a strict improvement replaces the stored checkpoint
  if not val_acc > best_val:
    continue

  best_val = val_acc
  best_state = {
    name: tensor.cpu().clone() for name, tensor in model.state_dict().items()
  }
  print(f"  -> new best model (val acc={best_val:.4f})")

# restore the best checkpoint (if any epoch produced one) before test inference
if best_state is not None:
  model.load_state_dict(best_state)
  model.to(device)
  print(f"Loaded best model with val acc={best_val:.4f}")

Epoch 1: train loss=1.3862
Epoch 1: val acc=0.2845
  -> new best model (val acc=0.2845)
Epoch 2: train loss=1.3754
Epoch 2: val acc=0.2608
Epoch 3: train loss=1.3124
Epoch 3: val acc=0.3103
  -> new best model (val acc=0.3103)
Epoch 4: train loss=1.1811
Epoch 4: val acc=0.2953
Epoch 5: train loss=0.9423
Epoch 5: val acc=0.3125
  -> new best model (val acc=0.3125)
Epoch 6: train loss=0.6293
Epoch 6: val acc=0.3103
Epoch 7: train loss=0.3545
Epoch 7: val acc=0.3147
  -> new best model (val acc=0.3147)
Epoch 8: train loss=0.2088
Epoch 8: val acc=0.3168
  -> new best model (val acc=0.3168)
Epoch 9: train loss=0.1196
Epoch 9: val acc=0.3082
Epoch 10: train loss=0.0662
Epoch 10: val acc=0.2974
Epoch 11: train loss=0.0978
Epoch 11: val acc=0.3254
  -> new best model (val acc=0.3254)
Epoch 12: train loss=0.0688
Epoch 12: val acc=0.3082
Epoch 13: train loss=0.0386
Epoch 13: val acc=0.3103
Epoch 14: train loss=0.0428
Epoch 14: val acc=0.3448
  -> new best model (val acc=0.3448)
Epoch 15: train l

# Predicting Test Answers

In [None]:
# inference only: eval mode, no gradient tracking
model.eval()
all_ids, all_preds = [], []

with torch.no_grad():
  for input_ids, ids in test_loader:
    input_ids = input_ids.to(device)
    logits = model(input_ids)                # one logit per answer option
    # predicted option = index of the largest logit
    preds = logits.argmax(dim=-1).cpu().tolist()
    all_ids += ids
    all_preds += preds

# one row per test question: its id and the predicted option index
submission_df = pd.DataFrame({"id": all_ids, "label": all_preds})
submission_df.to_csv(OUTPUT_SUB_PATH, index=False)
print("Saved submission to", OUTPUT_SUB_PATH)


Saved submission to /content/drive/MyDrive/submission.csv
