In [None]:
# Install the libraries this notebook uses; -q keeps pip's output short.
!pip install -q torch transformers datasets scikit-learn tqdm sentencepiece

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW
from tqdm import tqdm
import random

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

In [None]:
class LanguageAgnosticClassifier(nn.Module):
    """Pretrained encoder followed by masked mean pooling and a linear head.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes of the linear classifier.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # Width of the encoder output, read from its config; sizes the linear head.
        self.hidden = self.encoder.config.hidden_size
        self.classifier = nn.Linear(self.hidden, num_labels)

    def mean_pool(self, hidden, mask):
        """Average ``hidden`` over dimension 1, counting only positions where ``mask`` is set.

        Args:
            hidden: Per-token states; assumed (batch, seq_len, hidden_size) — confirm
                against the encoder in use.
            mask: Attention mask with one entry per token, 1 = keep, 0 = ignore.

        Returns:
            One pooled vector per batch row. A row whose mask is entirely zero
            yields a zero vector instead of NaN.
        """
        mask = mask.unsqueeze(-1).float()
        # Clamp the divisor: an all-padding row would otherwise compute 0 / 0 = NaN
        # and poison the loss. Rows with at least one real token are unaffected.
        token_counts = mask.sum(1).clamp(min=1e-9)
        return (hidden * mask).sum(1) / token_counts

    def forward(self, input_ids, attention_mask):
        """Encode a batch and classify it.

        Returns:
            A ``(logits, pooled)`` tuple: the classifier output and the mean-pooled
            embedding it was computed from.
        """
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.mean_pool(out.last_hidden_state, attention_mask)
        logits = self.classifier(pooled)
        return logits, pooled

In [None]:
class ContrastiveLoss(nn.Module):
    """Cross-entropy over temperature-scaled cosine similarities of two views.

    Row ``i`` of the first input is expected to match row ``i`` of the second;
    every other row of the second input in the batch acts as a negative.
    """

    def __init__(self, temperature=0.1):
        super().__init__()
        self.temp = temperature

    def forward(self, z1, z2):
        """Return the scalar loss for two equally sized batches of embeddings."""
        anchors = F.normalize(z1, dim=1)
        candidates = F.normalize(z2, dim=1)

        # Unit-length rows make the dot product a cosine similarity.
        scaled_sim = (anchors @ candidates.T) / self.temp

        # The matching pair for row i sits on the diagonal, i.e. at column i.
        diagonal = torch.arange(anchors.size(0), device=anchors.device)
        return F.cross_entropy(scaled_sim, diagonal)


In [None]:
# Checkpoint name; used for the tokenizer here and for the encoder when the model is built.
MODEL_NAME = "bert-base-multilingual-cased"
# Module-level tokenizer, read as a global by EnglishDataset.__getitem__ and predict().
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class EnglishDataset(torch.utils.data.Dataset):
    """IMDB reviews served as two tokenized views (original and word-shuffled) plus the label.

    Args:
        split: Split name passed to ``load_dataset("imdb", split=...)``.
        max_length: Length every sequence is truncated/padded to (default 128).

    Tokenization uses the module-level ``tokenizer``.
    """

    def __init__(self, split="train", max_length=128):
        self.data = load_dataset("imdb", split=split)
        self.max_length = max_length

    def augment(self, text):
        """Return ``text`` with its whitespace-separated words in random order.

        Texts of six words or fewer keep their word order; in both cases the
        words are re-joined with single spaces.
        """
        words = text.split()
        if len(words) > 6:
            random.shuffle(words)
        return " ".join(words)

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text and return its ``(input_ids, attention_mask)`` without the batch axis."""
        enc = tokenizer(text, truncation=True, padding="max_length",
                        max_length=self.max_length, return_tensors="pt")
        # squeeze(0) removes only the leading batch axis; a bare squeeze() would
        # also drop the sequence axis if it ever had length 1.
        return enc["input_ids"].squeeze(0), enc["attention_mask"].squeeze(0)

    def __getitem__(self, idx):
        """Return a dict with keys ``ids1``/``mask1`` (original), ``ids2``/``mask2`` (shuffled) and ``label``."""
        # Fetch the row once instead of indexing the dataset separately per field.
        row = self.data[idx]
        text, label = row["text"], row["label"]

        aug = self.augment(text)

        ids1, mask1 = self._encode(text)
        ids2, mask2 = self._encode(aug)

        return {
            "ids1": ids1,
            "mask1": mask1,
            "ids2": ids2,
            "mask2": mask2,
            "label": torch.tensor(label)
        }


In [None]:
# Number of reviews per optimisation step.
BATCH_SIZE = 16

# Training split, reshuffled every epoch.
train_ds = EnglishDataset(split="train")
train_loader = DataLoader(dataset=train_ds, shuffle=True, batch_size=BATCH_SIZE)


In [None]:
# Weight of the contrastive term in the total loss, and number of passes over the data.
LAMBDA = 0.5
EPOCHS = 2

# The two objectives combined in the training loop.
ce_loss = nn.CrossEntropyLoss()
ctr_loss = ContrastiveLoss()

# Binary classifier on top of the multilingual encoder, moved to the chosen device.
model = LanguageAgnosticClassifier(model_name=MODEL_NAME, num_labels=2)
model = model.to(DEVICE)

optimizer = AdamW(params=model.parameters(), lr=2e-5)


In [None]:
# Enable training-time behaviour (e.g. dropout) before optimising.
model.train()

for epoch in range(EPOCHS):
    # Running statistics, reset at the start of every epoch.
    total_loss, correct, total = 0, 0, 0

    for batch in tqdm(train_loader):
        # Move every tensor in the batch onto the training device.
        batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
        targets = batch["label"]

        optimizer.zero_grad()

        # View 1 (original text) feeds the classifier and is the contrastive anchor.
        logits, anchor_emb = model(batch["ids1"], batch["mask1"])
        # View 2 (word-shuffled text) contributes only its pooled embedding.
        shuffled_emb = model(batch["ids2"], batch["mask2"])[1]

        # Classification loss plus the weighted contrastive loss.
        loss = ce_loss(logits, targets) + LAMBDA * ctr_loss(anchor_emb, shuffled_emb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total += targets.size(0)
        correct += (logits.argmax(1) == targets).sum().item()

    print(f"Epoch {epoch+1} | Loss {total_loss/len(train_loader):.4f} | Acc {correct/total:.4f}")

In [None]:
# Save only the weights (state_dict) under a relative path, i.e. in the current working directory.
torch.save(model.state_dict(), "english_trained_multilingual.pt")

In [None]:
# Switch the model to evaluation mode before the predictions that follow.
model.eval()

def predict(text):
    """Classify one text and return ``"POSITIVE"`` or ``"NEGATIVE"``.

    Uses the module-level ``tokenizer``, ``model`` and ``DEVICE``. The model is
    forced into evaluation mode for the forward pass and then put back into
    whatever mode it was in, so the result does not depend on whether
    ``model.eval()`` happened to be run after the last training cell.

    Args:
        text: The sentence to classify.

    Returns:
        ``"POSITIVE"`` when the highest logit is at index 1, otherwise ``"NEGATIVE"``.
    """
    enc = tokenizer(text, return_tensors="pt",
                    truncation=True, padding="max_length", max_length=128)

    # Remember the current mode so this call has no lasting side effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits, _ = model(enc["input_ids"].to(DEVICE),
                              enc["attention_mask"].to(DEVICE))
    finally:
        model.train(was_training)

    return "POSITIVE" if logits.argmax(1).item() == 1 else "NEGATIVE"

In [None]:
# Quick check: one English sentence and two Hindi sentences.
for lang_tag, sample in (
    ("English:", "This movie was absolutely amazing"),
    ("Hindi:", "यह फिल्म बहुत शानदार थी"),
    ("Hindi:", "यह फिल्म बहुत खराब थी"),
):
    print(lang_tag, predict(sample))

In [None]:
# Hand-written evaluation sentences, grouped by kind. They are concatenated
# into one list and passed to predict() in a later cell.

# English sentences.
tests_en = [
    "This movie was absolutely fantastic",
    "I really enjoyed the story and acting",
    "The film was boring and a complete waste of time",
    "Terrible movie, I regret watching it",
    "An excellent performance by the lead actor",
    "The plot was weak and predictable",
]
# Short Hindi sentences.
tests_hi = [
    "यह फिल्म बहुत शानदार थी",
    "मुझे यह फिल्म बहुत पसंद आई",
    "यह फिल्म बहुत खराब थी",
    "कहानी बिल्कुल बेकार थी",
    "अभिनय शानदार था",
    "यह समय की पूरी बर्बादी थी",
]
# Longer Hindi sentences.
tests_hi_long = [
    "यह फिल्म देखने लायक है और कहानी भी अच्छी है",
    "फिल्म की कहानी कमजोर थी लेकिन अभिनय अच्छा था",
    "मुझे यह फिल्म बिल्कुल पसंद नहीं आई",
    "फिल्म बहुत लंबी और उबाऊ लग रही थी",
]

# Sentences that mix Hindi and English words.
tests_code_mixed = [
    "यह movie बहुत अच्छी थी",
    "Story अच्छी थी but execution खराब था",
    "Acting तो अच्छी थी लेकिन movie boring थी",
    "यह फिल्म totally waste of time थी",
]
# Harder cases (per the list name), in English and Hindi.
tests_tricky = [
    "The movie was not bad",
    "यह फिल्म बुरी नहीं थी",
    "I expected more from this movie",
    "फिल्म ठीक-ठाक थी",
    "The movie was average at best",
]


In [None]:
# Merge every suite into one flat list, keeping the suites in their original order.
all_tests = [
    *tests_en,
    *tests_hi,
    *tests_hi_long,
    *tests_code_mixed,
    *tests_tricky,
]

# Show each sentence next to the label predicted for it.
for text in all_tests:
    print(f"{text} --> {predict(text)}")


In [None]:
import os

# Destination folder for the saved weights.
# NOTE(review): this looks like a Google Drive path as seen from Colab; presumably
# Drive is mounted at /content/drive beforehand — confirm, since no mount step
# appears in the visible cells.
SAVE_DIR = "/content/drive/MyDrive/language_agnostic_classifier"
# Create the folder and any missing parents; exist_ok avoids an error on re-runs.
os.makedirs(SAVE_DIR, exist_ok=True)

# Echo the folder as the cell output.
SAVE_DIR

In [None]:
# Save the weights again, this time inside SAVE_DIR.
MODEL_PATH = f"{SAVE_DIR}/english_trained_multilingual.pt"
torch.save(model.state_dict(), MODEL_PATH)

# Echo the checkpoint location as the cell output.
MODEL_PATH