In [None]:
!pip install transformers

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader, Sampler
from collections import defaultdict
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

In [None]:
# Read the training split of the competition data
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/contradictory-my-dear-watson/train.csv'
)
# Cell output: the distinct values of the language column
pd.unique(data['language'])

In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Model and Tokenizer
MODEL_NAME = 'joeddav/xlm-roberta-large-xnli'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Dropout must be configured while the model is being built: assigning to
# model.config after from_pretrained() leaves the already-constructed
# nn.Dropout layers at the checkpoint's original rate. Keyword arguments that
# match a config attribute override it before the layers are created.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    hidden_dropout_prob=0.3,
)
# Rebind rather than leaving `model.to(dev)` as the cell's last expression,
# which would dump the whole module tree as the cell output.
model = model.to(dev)

In [None]:
# Inverse-frequency weight per label id: n_rows / (3 * rows_with_that_label).
# sort_index() orders the counts by label id so position i belongs to label i.
label_counts = data['label'].value_counts().sort_index()
class_weights = torch.tensor(
    len(data) / (3 * label_counts.to_numpy()),
    dtype=torch.float32,
).to(dev)
print("Class weights:", class_weights)

In [None]:
# Train-Eval Split with stratification by language and label
# Cast the language column to str up front so every value is a plain string
# when the column is used as the groupby key for the split.
data['language'] = data['language'].astype(str)

In [None]:
# Split language by language, and label by label inside each language, so both
# sets keep each language's label proportions. Rows are taken in file order:
# the head of every (language, label) slice trains, the tail evaluates.
train_parts, eval_parts = [], []
for _, lang_frame in data.groupby('language'):
    n_lang_rows = len(lang_frame)
    # Languages with fewer than 500 rows keep 90% for training, the rest 80%
    train_fraction = 0.9 if n_lang_rows < 500 else 0.8
    lang_train_target = int(train_fraction * n_lang_rows)

    label_frames = [frame for _, frame in lang_frame.groupby('label')]
    # Each label gets a share of the training target proportional to its size
    cutoffs = [
        int(lang_train_target * len(frame) / n_lang_rows)
        for frame in label_frames
    ]

    train_parts.append(
        pd.concat([frame.iloc[:cut] for frame, cut in zip(label_frames, cutoffs)])
    )
    eval_parts.append(
        pd.concat([frame.iloc[cut:] for frame, cut in zip(label_frames, cutoffs)])
    )

train_data = pd.concat(train_parts).reset_index(drop=True)
eval_data = pd.concat(eval_parts).reset_index(drop=True)

In [None]:
# Row counts per language in each split, ordered by language name
lang_train_dist, lang_eval_dist = (
    split['language'].value_counts().sort_index()
    for split in (train_data, eval_data)
)
print("Training language distribution:", lang_train_dist)
print("Eval language distribution:", lang_eval_dist)

In [None]:
# Visualize Data Distribution with histplot
# Both splits are drawn on one Axes; the legend is what tells them apart
# (the label= arguments are only shown once a legend is drawn).
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(train_data['language'], label='Train', color='skyblue', alpha=0.7, ax=ax)
sns.histplot(eval_data['language'], label='Eval', color='royalblue', alpha=0.7, ax=ax)
ax.set_title('Language Distribution in Train and Eval Sets', fontsize=14)
ax.set_xlabel('Language', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Dataset Class
class NLI_Dataset(Dataset):
    """Premise/hypothesis pairs, tokenized on access, with their labels.

    Args:
        dataframe: frame with 'premise', 'hypothesis', 'language' and 'label'
            columns. Its index may be anything; rows are addressed by position.
        tokenizer: callable invoked as
            ``tokenizer(premise, hypothesis, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping of tensors with a leading batch dimension of 1.
        max_length: length every pair is padded/truncated to.

    Attributes:
        language_indices: language -> list of row positions with that language.
        label_indices: label -> list of row positions with that label.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Create language and label mappings for the sampler.
        # The stored values are row *positions* (0..len-1), not index labels,
        # because __getitem__ looks rows up with .iloc; this keeps the mappings
        # valid even when the frame's index was not reset.
        self.language_indices = defaultdict(list)
        self.label_indices = defaultdict(list)

        for position, (language, label) in enumerate(
            zip(self.data['language'], self.data['label'])
        ):
            self.language_indices[language].append(position)
            self.label_indices[label].append(position)

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row at position ``idx``.

        ``features`` maps each tokenizer output name to a tensor with the
        tokenizer's leading batch dimension removed; ``label`` is a 0-d tensor.
        """
        row = self.data.iloc[idx]
        tokens = self.tokenizer(
            row['premise'],
            row['hypothesis'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when it happens to be 1.
        features = {key: val.squeeze(0) for key, val in tokens.items()}
        return features, torch.tensor(row['label'])

In [None]:
class FixedLanguageSampler(Sampler):
    """Batch sampler that draws every batch from a few randomly chosen languages.

    Each batch picks up to ``langs_per_batch`` distinct languages and splits
    ``batch_size`` as evenly as possible between them. A language with fewer
    rows than its share is sampled with replacement. Intended to be passed as
    ``batch_sampler=`` to a DataLoader, since it yields whole batches.

    Args:
        dataset: object exposing ``language_indices`` (language -> list of row
            positions) and ``__len__``.
        batch_size: number of indices in every yielded batch.
        langs_per_batch: languages to mix per batch; capped at the number of
            languages the dataset actually contains.

    Randomness comes from NumPy's global ``np.random`` state.
    """

    def __init__(self, dataset, batch_size=32, langs_per_batch=5):
        self.dataset = dataset
        self.batch_size = batch_size
        self.langs_per_batch = langs_per_batch
        self.languages = list(dataset.language_indices.keys())
        # Only full batches are counted; the remainder of the dataset size is
        # not turned into an extra batch.
        self.num_batches = len(dataset) // batch_size

    def __iter__(self):
        """Yield ``num_batches`` lists of ``batch_size`` row positions."""
        # Sampling languages without replacement fails if more are requested
        # than exist, so cap the request at what the dataset offers.
        n_langs = min(self.langs_per_batch, len(self.languages))

        for _ in range(self.num_batches):
            langs = np.random.choice(self.languages, n_langs, replace=False)
            samples_per_lang, remainder = divmod(self.batch_size, len(langs))

            batch = []
            for i, lang in enumerate(langs):
                # The first `remainder` languages absorb the leftover slots
                n = samples_per_lang + (1 if i < remainder else 0)
                indices = self.dataset.language_indices[lang]
                # Fall back to sampling with replacement only when the
                # language has too few rows to fill its share.
                selected = np.random.choice(indices, n, replace=len(indices) < n)
                # Hand plain Python ints to the DataLoader / dataset
                batch.extend(int(index) for index in selected)
            yield batch

    def __len__(self):
        """Return the number of batches produced per pass."""
        return self.num_batches

In [None]:
# Wrap both splits in NLI_Dataset, padding/truncating every pair to 150 tokens
train_dataset, eval_dataset = (
    NLI_Dataset(frame, tokenizer, max_length=150)
    for frame in (train_data, eval_data)
)

# Training batches come from the language-mixing batch sampler (24 rows drawn
# from 5 languages each); evaluation walks its split in order.
train_sampler = FixedLanguageSampler(
    dataset=train_dataset,
    batch_size=24,
    langs_per_batch=5,
)
train_loader = DataLoader(dataset=train_dataset, batch_sampler=train_sampler)
eval_loader = DataLoader(dataset=eval_dataset, batch_size=32, shuffle=False)

In [None]:
# Training Setup with weighted loss
criterion = nn.CrossEntropyLoss(weight=class_weights)
# Use PyTorch's own AdamW (torch.optim is imported as `optim`): the AdamW class
# shipped with transformers is deprecated in favour of this implementation.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

num_epochs = 3
# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch, with a linear warm-up over the first 10% of those steps.
total_steps = len(train_loader) * num_epochs
num_warmup_steps = int(total_steps * 0.1)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps
)

# Best evaluation accuracy seen so far; used to decide when to checkpoint
best_acc = 0.0

In [None]:
# Training Loop with language-specific accuracy tracking
# Each epoch: one pass over train_loader, then a full evaluation pass that
# reports overall and per-language accuracy and checkpoints the model whenever
# overall evaluation accuracy improves.
model.train()
for epoch in range(num_epochs):
    total_loss, correct, total = 0, 0, 0

    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        inputs, labels = batch
        inputs = {key: val.to(dev) for key, val in inputs.items()}
        labels = labels.to(dev)

        outputs = model(**inputs).logits
        loss = criterion(outputs, labels)
        loss.backward()

        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        # Log every 100 batches
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")

    epoch_loss = total_loss / len(train_loader)
    epoch_acc = correct / total
    print(f"Epoch {epoch+1}: Loss = {epoch_loss:.4f}, Accuracy = {epoch_acc:.4f}")

    # Evaluate on each epoch
    model.eval()
    eval_preds, eval_true = [], []

    with torch.no_grad():
        for batch in eval_loader:
            inputs, labels = batch
            inputs = {key: val.to(dev) for key, val in inputs.items()}
            labels = labels.to(dev)
            preds = model(**inputs).logits.argmax(dim=1)

            eval_preds.extend(preds.cpu().tolist())
            eval_true.extend(labels.cpu().tolist())

    # Per-language bookkeeping. The training batches carry no language
    # information (the sampler draws random rows), so this is done for the
    # evaluation pass only: eval_loader is not shuffled, hence prediction i
    # belongs to row i of the evaluation frame.
    eval_lang_correct = defaultdict(int)
    eval_lang_total = defaultdict(int)
    eval_languages = eval_loader.dataset.data['language']
    for lang, pred, truth in zip(eval_languages, eval_preds, eval_true):
        eval_lang_total[lang] += 1
        eval_lang_correct[lang] += int(pred == truth)

    eval_correct = sum(eval_lang_correct.values())
    eval_total = len(eval_true)
    eval_acc = eval_correct / eval_total
    print(f"Evaluation Accuracy: {eval_acc:.4f}")
    print("Evaluation accuracy by language:")
    for lang in sorted(eval_lang_total):
        lang_acc = eval_lang_correct[lang] / eval_lang_total[lang]
        print(f"  {lang}: {lang_acc:.4f} ({eval_lang_total[lang]} samples)")

    # Save Best Model
    if eval_acc > best_acc:
        best_acc = eval_acc
        model.save_pretrained("/kaggle/working/best_xlmr")
        tokenizer.save_pretrained("/kaggle/working/best_xlmr")
        print("Best model saved!")

    # Switch back to training mode
    model.train()

In [None]:
# Load Best Model for Final Evaluation
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/best_xlmr")
model.to(dev)
model.eval()

# Class names indexed by label id, following the competition's encoding of the
# `label` column: 0 = entailment, 1 = neutral, 2 = contradiction.
# NOTE(review): the pretrained checkpoint's own config.id2label may list the
# classes in a different order -- these names describe the dataset labels the
# model was fine-tuned on, not that config.
label_ids = [0, 1, 2]
label_names = ['entailment', 'neutral', 'contradiction']

# Final Evaluation
preds, true_labels = [], []
with torch.no_grad():
    for batch in eval_loader:
        inputs, labels = batch
        inputs = {key: val.to(dev) for key, val in inputs.items()}
        labels = labels.to(dev)
        outputs = model(**inputs).logits
        preds.extend(outputs.argmax(dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

print("Final Classification Report:")
# labels= pins the row order so the names stay aligned even if some class is
# never predicted or absent from the evaluation split.
print(classification_report(true_labels, preds, labels=label_ids, target_names=label_names))

# Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(true_labels, preds, labels=label_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, cmap="YlGnBu", annot=True, fmt='g',
           xticklabels=label_names,
           yticklabels=label_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

In [None]:
# Read the competition test split
test_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/contradictory-my-dear-watson/test.csv'
)
print(f"Loaded test dataset with {test_data.shape[0]} samples")

# Row counts per language in the test split, most frequent first
test_lang_dist = test_data.loc[:, 'language'].value_counts()
print("Test language distribution:", test_lang_dist)

# Test dataset: same tokenization as training, but each item carries the row's
# id and language instead of a label.
class TestDataset(Dataset):
    """Unlabelled premise/hypothesis pairs, tokenized on access.

    Args:
        dataframe: frame with 'id', 'premise', 'hypothesis' and 'language'
            columns.
        tokenizer: callable invoked as
            ``tokenizer(premise, hypothesis, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping of tensors with a leading batch dimension of 1.
        max_length: length every pair is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=150):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.ids = dataframe['id'].values
        self.max_length = max_length
        self.languages = dataframe['language'].values

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, id, language)`` for the row at position ``idx``."""
        row = self.data.iloc[idx]
        tokens = self.tokenizer(
            row['premise'],
            row['hypothesis'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when it happens to be 1.
        features = {key: val.squeeze(0) for key, val in tokens.items()}
        return features, self.ids[idx], self.languages[idx]

test_dataset = TestDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Predict the arg-max class for every test row, keeping each row's id and
# language alongside its prediction. No temperature scaling is applied: only
# the arg-max of the logits is used.
model.eval()  # make sure dropout is off even if this cell is run on its own
predictions = []
prediction_ids = []
prediction_langs = []
with torch.no_grad():
    for inputs, ids, langs in test_loader:
        inputs = {key: val.to(dev) for key, val in inputs.items()}
        logits = model(**inputs).logits

        predictions.extend(logits.argmax(dim=1).cpu().tolist())
        # String ids are collated into a list; numeric ids would arrive as a
        # tensor, so unwrap that case into plain Python values.
        prediction_ids.extend(ids.tolist() if torch.is_tensor(ids) else ids)
        prediction_langs.extend(langs)

# Create results dataframe with predictions
results_df = pd.DataFrame({
    'id': prediction_ids,
    'prediction': predictions,
    'language': prediction_langs
})
assert len(results_df) == len(test_data), "every test row needs exactly one prediction"

# Save predictions to CSV
results_df[['id', 'prediction']].to_csv('/kaggle/working/submission.csv', index=False)
print("Predictions saved to 'submission.csv'")