## 1. Setup & Install Dependencies

In [None]:
!pip install -q transformers datasets peft accelerate tqdm
%pip install google.colab


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# Google Drive setup (for Colab)
from google.colab import drive
drive.mount('/content/drive')

# Checkpoint setup (Google Drive)
import os
CHECKPOINT_DIR = "/content/drive/My Drive/nlp_project/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
print(f"Checkpoints will be saved to: {CHECKPOINT_DIR}")

Checkpoints will be saved to: checkpoints/xnli_lora_adapter


In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler
from torch.utils.data import DataLoader, RandomSampler
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from tqdm.auto import tqdm

# Device detection (Colab: CUDA, Mac: MPS, fallback: CPU)
if torch.cuda.is_available():
    device = torch.device('cuda')
    device_name = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device('mps')
    device_name = 'mps'
else:
    device = torch.device('cpu')
    device_name = 'cpu'

print(f"Using device: {device_name}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Using device: mps


In [12]:
# Configuration for partial fine-tuning
CONFIG = {
    "model_name": "xlm-roberta-base",
    "num_labels": 3,  # XNLI: entailment, neutral, contradiction
    "max_length": 128,
    "batch_size": 16,  # Can use larger batch on Colab GPU
    "learning_rate": 2e-5,  # Lower LR for partial fine-tuning
    "epochs": 3,
    "warmup_ratio": 0.1,
    "weight_decay": 0.01,
    "train_lang": "en",  # Train ONLY on English
    "eval_langs": ["en", "de", "zh", "ar", "ru", "hi"],
    "checkpoint_dir": CHECKPOINT_DIR,
}

## 3. Load & Preprocess XNLI Dataset

In [4]:
# Load XNLI dataset (prefer local folder, fallback to HuggingFace)
print("Loading XNLI dataset...")
data_dir = "data/raw/xnli"
if os.path.exists(data_dir):
    from datasets import load_from_disk
    dataset = load_from_disk(data_dir)
else:
    print("[Local] data_dir not found. Falling back to HuggingFace datasets...")
    dataset = load_dataset("xnli", "all_languages")

print(f"Train: {len(dataset['train']):,} examples")
print(f"Validation: {len(dataset['validation']):,} examples")
print(f"Test: {len(dataset['test']):,} examples")

Loading XNLI dataset...
[Local] data_dir not found. Falling back to HuggingFace datasets...
Train: 392,702 examples
Validation: 2,490 examples
Test: 5,010 examples


In [6]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

def tokenize_xnli(dataset, lang="en", max_length=128):
    """
    Tokenize XNLI dataset for a specific language.
    XNLI stores text as dicts: {"en": "...", "de": "...", ...}
    """
    def preprocess(examples):
        premises = [p[lang] if isinstance(p, dict) else p for p in examples["premise"]]
        hypotheses = [h[lang] if isinstance(h, dict) else h for h in examples["hypothesis"]]

        tokenized = tokenizer(
            premises,
            hypotheses,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        tokenized["labels"] = examples["label"]
        return tokenized

    tokenized = dataset.map(
        preprocess,
        batched=True,
        remove_columns=dataset.column_names,
    )
    tokenized.set_format("torch")
    return tokenized

In [7]:
# --- Start of fix for tokenize_xnli function ---# The tokenizer is already loaded in a previous cell and available in global scope.
# If running this cell independently without prior execution, uncomment the line below:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

def tokenize_xnli(dataset, lang="en", max_length=128):
    """
    Tokenize XNLI dataset for a specific language.
    XNLI stores text as dicts: {"en": "...", "de": "...", ...}
    For the train split, premise/hypothesis are typically direct strings.
    This modified version handles cases where language keys might be missing from dicts.
    """
    def preprocess(examples):
        premises = []
        for p in examples["premise"]:
            if isinstance(p, dict):
                # If it's a dict, try to get the specified language, fallback to empty string if not found
                premises.append(p.get(lang, ""))
            else:
                # If it's not a dict (e.g., string for train split), use it directly
                premises.append(p)

        hypotheses = []
        for h in examples["hypothesis"]:
            if isinstance(h, dict):
                # If it's a dict, try to get the specified language, fallback to empty string if not found
                hypotheses.append(h.get(lang, ""))
            else:
                # If it's not a dict, use it directly
                hypotheses.append(h)

        tokenized = tokenizer(
            premises,
            hypotheses,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        tokenized["labels"] = examples["label"]
        return tokenized

    tokenized = dataset.map(
        preprocess,
        batched=True,
        remove_columns=dataset.column_names,
    )
    tokenized.set_format("torch")
    return tokenized
# --- End of fix for tokenize_xnli function ---

# Tokenize training data (English only)
print(f"Tokenizing training data (lang={CONFIG['train_lang']})...")
train_tokenized = tokenize_xnli(
    dataset["train"],
    lang=CONFIG["train_lang"],
    max_length=CONFIG["max_length"]
)

print(f"Tokenizing validation data (lang={CONFIG['train_lang']})...")
val_tokenized = tokenize_xnli(
    dataset["validation"],
    lang=CONFIG["train_lang"],
    max_length=CONFIG["max_length"]
)

print(f"\nTrain samples: {len(train_tokenized):,}")
print(f"Val samples: {len(val_tokenized):,}")

Tokenizing training data (lang=en)...
Tokenizing validation data (lang=en)...

Train samples: 392,702
Val samples: 2,490


## 5. Training Functions

In [None]:
# ================ Load model =========================
def load_xlm_roberta_base_model(model_name="xlm-roberta-base", num_labels=3):

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        dtype=torch.float32,  # float32 for Mac compatibility
    )

    # Set correct label mapping for XNLI
    model.config.id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}
    model.config.label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

    # Freeze all parameters first
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the classifier head
    for param in model.classifier.parameters():
        param.requires_grad = True

    # Unfreeze the last 3 encoder layers (layers 9, 10, 11 for XLM-RoBERTa-base)
    for layer in model.roberta.encoder.layer[-3:]:
        for param in layer.parameters():
            param.requires_grad = True

    model = model.to(device)
    
    # Print trainable parameters info
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
    print(f"Total parameters: {total_params:,}")

    return model, tokenizer

In [None]:
def evaluate(model, dataset, batch_size=32):
    """Evaluate model accuracy on a dataset."""
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size)

    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == batch["labels"]).sum().item()
            total += len(batch["labels"])

    return correct / total


def finetune_model(model, train_dataset, val_dataset=None, config=None, checkpoint_path=None, resume_epoch=0):
    """
    Fine-tune the partially unfrozen model on training data.
    Supports resuming from a checkpoint.
    Uses layer-wise learning rates for partial fine-tuning.

    Args:
        model: Partially frozen model (classifier + last 3 layers unfrozen)
        train_dataset: Tokenized dataset with 'input_ids', 'attention_mask', 'labels'
        val_dataset: Optional validation dataset
        config: Dict to override default hyperparameters
        checkpoint_path: Path to save/load checkpoints (if resuming)
        resume_epoch: Epoch to resume from (0 if starting fresh)

    Returns:
        model: Fine-tuned model
        loss_history: List of training losses per epoch
        val_acc_history: List of validation accuracies per epoch
    """
    default_config = {
        'epochs': 3,
        'batch_size': 16,
        'learning_rate': 2e-5,  # Base LR, but overridden by param groups
        'warmup_ratio': 0.1,
        'weight_decay': 0.01,
    }
    if config is not None:
        default_config.update(config)
    cfg = default_config

    # Calculate training steps (remaining if resuming)
    num_batches = (len(train_dataset) + cfg['batch_size'] - 1) // cfg['batch_size']
    total_num_training_steps = num_batches * cfg['epochs']
    num_warmup_steps = int(total_num_training_steps * cfg['warmup_ratio'])

    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=cfg['batch_size']
    )

    model = model.to(device)

    # Layer-wise learning rates for partial fine-tuning
    param_groups = [
        {'params': [p for n, p in model.named_parameters() if p.requires_grad and 'classifier' in n], 'lr': 1e-4, 'weight_decay': cfg['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if p.requires_grad and 'layer.11' in n], 'lr': 5e-5, 'weight_decay': cfg['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if p.requires_grad and ('layer.9' in n or 'layer.10' in n)], 'lr': 3e-5, 'weight_decay': cfg['weight_decay']},
    ]

    optimizer = AdamW(param_groups)

    scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_num_training_steps
    )

    # If resuming, load optimizer/scheduler state if available
    if resume_epoch > 0 and checkpoint_path:
        checkpoint_file = os.path.join(checkpoint_path, f"checkpoint_epoch_{resume_epoch}.pt")
        if os.path.exists(checkpoint_file):
            checkpoint = torch.load(checkpoint_file, map_location=device)
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            print(f"Resumed from epoch {resume_epoch}")
        else:
            print(f"Warning: Checkpoint {checkpoint_file} not found, starting fresh")

    print(f"\n{'='*50}")
    print(f"Starting fine-tuning with layer-wise LRs...")
    print(f"  Classifier LR: 1e-4")
    print(f"  Layer 11 LR: 5e-5")
    print(f"  Layers 9-10 LR: 3e-5")
    print(f"  Device: {device}")
    print(f"  Epochs: {cfg['epochs']} (resuming from {resume_epoch})")
    print(f"  Batch size: {cfg['batch_size']}")
    print(f"  Total steps: {total_num_training_steps}")
    print(f"  Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
    print(f"{'='*50}\n")

    best_val_accuracy = 0
    loss_history = []
    val_acc_history = []

    for epoch in range(resume_epoch, cfg['epochs']):
        # training
        model.train()
        total_loss = 0

        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{cfg['epochs']}")

        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

        avg_train_loss = total_loss / len(train_dataloader)
        loss_history.append(avg_train_loss)

        # training validation
        if val_dataset is not None:
            val_accuracy = evaluate(model, val_dataset, cfg['batch_size'])
            val_acc_history.append(val_accuracy)
            print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                print(f"  ✓ New best accuracy!")
        else:
            val_acc_history.append(0.0)  # placeholder
            print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}")

        # Save checkpoint after each epoch
        if checkpoint_path:
            os.makedirs(checkpoint_path, exist_ok=True)
            checkpoint_file = os.path.join(checkpoint_path, f"checkpoint_epoch_{epoch+1}.pt")
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'loss': avg_train_loss,
                'val_accuracy': val_acc_history[-1] if val_dataset else 0.0,
            }, checkpoint_file)
            print(f"  Checkpoint saved: {checkpoint_file}")

    print(f"\nTraining complete!")
    if val_dataset is not None:
        print(f"Best validation accuracy: {best_val_accuracy:.4f}")

    return model, loss_history, val_acc_history

In [None]:
# Load model with partial fine-tuning setup (classifier + last 3 layers unfrozen)
print(f"Loading model: {CONFIG['model_name']}")
model, tokenizer = load_xlm_roberta_base_model(
    model_name=CONFIG["model_name"],
    num_labels=CONFIG["num_labels"]
)

# Print trainable parameters
model.print_trainable_parameters()

## 6. Train on English

In [None]:
# Train the model (English only)
checkpoint_dir = os.path.join(CONFIG["checkpoint_dir"], "xnli_partial_ft_checkpoints")
model, loss_history, val_acc_history = finetune_model(
    model,
    train_tokenized,
    val_tokenized,
    config={
        "epochs": CONFIG["epochs"],
        "batch_size": CONFIG["batch_size"],
        "learning_rate": CONFIG["learning_rate"],
        "warmup_ratio": CONFIG["warmup_ratio"],
        "weight_decay": CONFIG["weight_decay"],
    },
    checkpoint_path=checkpoint_dir,
    resume_epoch=0  # Set to last completed epoch if resuming
)

# Save the final fine-tuned model
import os
final_save_path = os.path.join(CONFIG["checkpoint_dir"], "xnli_partial_ft")
os.makedirs(final_save_path, exist_ok=True)
model.save_pretrained(final_save_path)
tokenizer.save_pretrained(final_save_path)
print(f"\nFinal model saved to: {final_save_path}")
print(f"Checkpoints saved to: {checkpoint_dir}")


Starting fine-tuning...
  Device: mps
  Epochs: 3
  Batch size: 32
  Learning rate: 0.0005
  Total steps: 36,816
  Trainable params: 1,477,635



Epoch 1/3:  35%|███▍      | 4285/12272 [10:25:15<19:25:27,  8.76s/it, loss=1.1015] 


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# Plot training loss and validation accuracy
epochs = range(1, len(loss_history) + 1)

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(epochs, loss_history, 'b-', label='Training Loss')
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, val_acc_history, 'r-', label='Validation Accuracy')
plt.title('Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
def load_partial_ft_model_from_checkpoint(checkpoint_path, device):
    """
    Loads a partially fine-tuned model from a checkpoint.
    The checkpoint should contain both the model and tokenizer.
    """
    print(f"Loading tokenizer from: {checkpoint_path}")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

    print(f"Loading model from: {checkpoint_path}")
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_path,
        torch_dtype=torch.float32,
    )

    model = model.to(device)
    model.eval()  # Set to evaluation mode by default after loading

    print(f"Model and tokenizer loaded successfully from {checkpoint_path}")
    return model, tokenizer


def load_checkpoint_for_resume(checkpoint_path, epoch, device, model_class=AutoModelForSequenceClassification):
    """
    Load model, optimizer, and scheduler state from a checkpoint for resuming training.
    """
    checkpoint_file = os.path.join(checkpoint_path, f"checkpoint_epoch_{epoch}.pt")
    if not os.path.exists(checkpoint_file):
        raise FileNotFoundError(f"Checkpoint {checkpoint_file} not found")

    print(f"Loading checkpoint from: {checkpoint_file}")
    checkpoint = torch.load(checkpoint_file, map_location=device)

    # Load model
    model_path = os.path.join(checkpoint_path, f"model_epoch_{epoch}")
    if os.path.exists(model_path):
        model = model_class.from_pretrained(model_path, torch_dtype=torch.float32)
    else:
        # Fallback: assume base model and load state dict
        model = model_class.from_pretrained("xlm-roberta-base", num_labels=3, torch_dtype=torch.float32)
        model.load_state_dict(checkpoint['model_state_dict'])

    model = model.to(device)

    # Recreate optimizer and scheduler (they need to be recreated with current params)
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)  # Use same config
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Scheduler recreation is trickier; for simplicity, we'll skip full scheduler resume
    # In a full implementation, you'd need to save/load scheduler state properly

    print(f"Checkpoint loaded: epoch {checkpoint['epoch']}, loss {checkpoint['loss']:.4f}")
    return model, optimizer


# Define the full path to your saved checkpoint
checkpoint_full_path = os.path.join(CONFIG["checkpoint_dir"], "xnli_partial_ft")

# Load the model and tokenizer
loaded_model, loaded_tokenizer = load_partial_ft_model_from_checkpoint(
    checkpoint_full_path,
    device
)

print("\nNow you can use `loaded_model` and `loaded_tokenizer` for inference.")

## 7. Zero-Shot Cross-Lingual Evaluation

In [None]:
print("\n" + "="*60)
print("Zero-Shot Cross-Lingual Evaluation")
print("="*60)

results = {}
for lang in CONFIG["eval_langs"]:
    print(f"\nEvaluating on {lang}...")
    eval_tokenized = tokenize_xnli(
        dataset["validation"],
        lang=lang,
        max_length=CONFIG["max_length"]
    )
    acc = evaluate(model, eval_tokenized, CONFIG["batch_size"])
    results[lang] = acc
    print(f"  {lang}: {acc:.4f}")

In [None]:
# Results Summary
print("\n" + "="*60)
print("Results Summary")
print("="*60)
print(f"{'Language':<10} {'Accuracy':<12} {'Gap from EN':<10}")
print("-" * 35)

en_acc = results.get("en", 0)
for lang, acc in results.items():
    gap = en_acc - acc
    print(f"{lang:<10} {acc:.4f}       {gap:+.4f}")

print("-" * 35)
avg_acc = sum(results.values()) / len(results)
print(f"{'Average':<10} {avg_acc:.4f}")
print(f"\nCheckpoint saved to: {CONFIG['checkpoint_dir']}/xnli_partial_ft")

In [None]:
import matplotlib.pyplot as plt

# Plot cross-lingual evaluation results
languages = list(results.keys())
accuracies = list(results.values())

plt.figure(figsize=(10, 6))
bars = plt.bar(languages, accuracies, color='skyblue')
plt.title('Zero-Shot Cross-Lingual NLI Accuracy')
plt.xlabel('Language')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.xticks(rotation=45)

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, f'{acc:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 8. Test Inference (Optional)

In [None]:
LABEL_NAMES = {0: "entailment", 1: "neutral", 2: "contradiction"}

def predict(premise, hypothesis):
    """Predict NLI label for any language."""
    model.eval()
    inputs = tokenizer(
        premise, hypothesis,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        pred = torch.argmax(probs, dim=-1).item()

    return LABEL_NAMES[pred], probs[0].cpu().tolist()

In [None]:
# Test on different languages (zero-shot!)

# English
label, probs = predict(
    "A man is playing guitar on stage.",
    "A musician is performing."
)
print(f"English: {label} (probs: {[f'{p:.2f}' for p in probs]})")

# German
label, probs = predict(
    "Ein Mann spielt Gitarre auf der Bühne.",
    "Ein Musiker tritt auf."
)
print(f"German: {label} (probs: {[f'{p:.2f}' for p in probs]})")

# Chinese
label, probs = predict(
    "一个男人在舞台上弹吉他。",
    "一位音乐家正在表演。"
)
print(f"Chinese: {label} (probs: {[f'{p:.2f}' for p in probs]})")

# Arabic
label, probs = predict(
    "رجل يعزف على الجيتار على المسرح.",
    "موسيقي يؤدي."
)
print(f"Arabic: {label} (probs: {[f'{p:.2f}' for p in probs]})")

## 9. Download Checkpoint

The checkpoint is already saved to your Google Drive at:
- Path: `My Drive/nlp_project/checkpoints/xnli_partial_ft/`

To download locally, copy from Google Drive to your local project:
```bash
# From your local machine
cp -r ~/Google\ Drive/My\ Drive/nlp_project/checkpoints/xnli_partial_ft checkpoints/
```

Run inference locally with:
```bash
python src/inference.py --mode demo
```

# Task
Inspect the trainable parameters of the model, verify the tokenized input and label structure for both training and validation datasets, adjust the learning rate in the `CONFIG` dictionary in cell `e71789c1` from `2e-4` to `2e-5`, retrain the model by executing cell `847d09e3`, and finally summarize the findings and retraining outcome.

## Inspect Trainable Parameters

### Subtask:
Add a new cell to print the model's trainable parameters to confirm that only LoRA adapter layers are being trained, addressing the 'Frozen Model' concern.


**Reasoning**:
The user wants to confirm that only LoRA adapter layers are trainable by printing the model's trainable parameters. The `model.print_trainable_parameters()` method is suitable for this task and has been used before.



In [None]:
print("\nVerifying trainable parameters after LoRA integration:")
model.print_trainable_parameters()


Verifying trainable parameters after LoRA integration:
trainable params: 1,182,723 || all params: 279,228,678 || trainable%: 0.4236


## Inspect Tokenized Input and Label Structure

To ensure the tokenization process works as expected and the data is correctly formatted for model input, let's inspect a sample from the `train_tokenized` and `val_tokenized` datasets. We will check the keys available, the shapes of the tensors (e.g., `input_ids`, `attention_mask`), and the labels.

**Reasoning**:
To verify the structure of the tokenized datasets, I will display a sample from both the `train_tokenized` and `val_tokenized` datasets, examining their keys, tensor shapes, and labels.



In [None]:
print("\n--- Inspecting train_tokenized dataset ---")
print(f"Keys: {train_tokenized[0].keys()}")
print(f"input_ids shape: {train_tokenized[0]['input_ids'].shape}")
print(f"attention_mask shape: {train_tokenized[0]['attention_mask'].shape}")
print(f"labels: {train_tokenized[0]['labels']}")

print("\n--- Inspecting val_tokenized dataset ---")
print(f"Keys: {val_tokenized[0].keys()}")
print(f"input_ids shape: {val_tokenized[0]['input_ids'].shape}")
print(f"attention_mask shape: {val_tokenized[0]['attention_mask'].shape}")
print(f"labels: {val_tokenized[0]['labels']}")


--- Inspecting train_tokenized dataset ---
Keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids shape: torch.Size([128])
attention_mask shape: torch.Size([128])
labels: 1

--- Inspecting val_tokenized dataset ---
Keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids shape: torch.Size([128])
attention_mask shape: torch.Size([128])
labels: 1


## Adjust Learning Rate and Retrain Model

As instructed, we will now adjust the `learning_rate` in the `CONFIG` dictionary from its current value of `2e-4` to `2e-5`. After this modification, we will retrain the model to observe the effect of the lower learning rate.

**Reasoning**:
To adjust the learning rate, I will update the `CONFIG` dictionary in a new code cell, specifically changing the `learning_rate` to `2e-5`.



In [None]:
CONFIG["learning_rate"] = 2e-5
print(f"Updated learning_rate in CONFIG to: {CONFIG['learning_rate']}")

Updated learning_rate in CONFIG to: 2e-05


**Reasoning**:
Now that the learning rate has been updated in the `CONFIG` dictionary, the next step is to retrain the model by executing the `train_model` function in cell `847d09e3`.

