In [None]:
!pip install transformers datasets evaluate




In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable W&B prompts

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch

from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset
import evaluate

# =============================
# 1. Load Training and Test Data
# =============================
# "train.csv" must have columns: id, text, target
train_df = pd.read_csv("/content/train.csv")

# "test.csv" must have columns: id, text
test_df = pd.read_csv("/content/test.csv")

# =============================
# 2. Split Training Data (Train/Val)
# =============================
# Hold out 20% of the labelled rows for validation. Stratifying on "target"
# keeps the class proportions the same in both splits, and the fixed
# random_state makes the split repeatable.
train_split_df, val_split_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df["target"]
)
print("Number of training samples:", len(train_split_df))
print("Number of validation samples:", len(val_split_df))

# =============================
# 3. Convert DataFrames to Datasets
# =============================
# train_test_split returns frames that keep the original (now shuffled) row
# labels as their index. With the default settings Dataset.from_pandas would
# store that index as an extra "__index_level_0__" column, so drop it
# explicitly to keep the datasets limited to the real CSV columns.
train_dataset = Dataset.from_pandas(train_split_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_split_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df, preserve_index=False)

# =============================
# 4. Tokenizer & Tokenization
# =============================
# The tokenizer is loaded from the same checkpoint name that is later used
# to load the classification model.
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; only its
            "text" entry is read.

    Returns:
        The tokenizer's output for the batch.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    batch_texts = examples["text"]
    return tokenizer(batch_texts, **encoding_options)

# Tokenize the three splits in order (train, validation, test). The tuple of
# inputs is built before any name is rebound, and the results are assigned
# back to the same names once all three have been mapped.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# IMPORTANT: Trainer computes the loss from a column called "labels", so the
# labelled splits rename "target" accordingly. The test split has no target.
train_dataset, val_dataset = (
    labelled.rename_column("target", "labels")
    for labelled in (train_dataset, val_dataset)
)

# Expose only the model inputs (plus labels where available) as PyTorch tensors.
for labelled_split in (train_dataset, val_dataset):
    labelled_split.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# =============================
# 5. Load Pre-trained BERT
# =============================
# num_labels=2 configures a two-class classification head on top of the
# pre-trained encoder.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# =============================
# 6. Define F1 Metric (using `evaluate`)
# =============================
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)`` as handed over by Trainer.

    Returns:
        The result of ``f1_metric.compute`` for the arg-max predictions,
        using ``average="weighted"``.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return f1_metric.compute(
        predictions=predicted_classes,
        references=gold_labels,
        average="weighted",
    )

# =============================
# 7. Training Arguments
# =============================
# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`, and recent ones reject the old keyword. The notebook
# installs transformers unpinned, so pick whichever name the installed
# TrainingArguments dataclass actually declares.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_strategy="epoch",     # Log once per epoch
    save_strategy="no",           # Do not save model checkpoints each epoch
    report_to=["none"],           # Disable any experiment trackers (like W&B)
    disable_tqdm=True,            # No progress bars
    seed=42,                      # Reproducibility
    **{_eval_strategy_key: "epoch"},  # Evaluate once per epoch
)

# =============================
# 8. Trainer
# =============================
# Wire together the model, the run configuration, both labelled splits and
# the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# =============================
# 9. Train the Model
# =============================
trainer.train()

# Run one more evaluation pass on the validation split with the final
# weights and print the resulting metrics.
eval_results = trainer.evaluate()
print("Validation Results:", eval_results)

# =============================
# 10. Predict on Test Data & Save Submission
# =============================
# The test split has no labels, so only the predictions are used.
pred_output = trainer.predict(test_dataset)
test_logits = pred_output.predictions
pred_labels = np.argmax(test_logits, axis=-1)

# Pair every test id with its predicted class; rows stay in test_df order.
submission_df = test_df[["id"]].assign(target=pred_labels)

# index=False keeps the pandas row index out of the written file.
submission_df.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")


Number of training samples: 6090
Number of validation samples: 1523


Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.4553, 'grad_norm': 3.7760798931121826, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}
{'eval_loss': 0.40479469299316406, 'eval_f1': 0.8353296065722233, 'eval_runtime': 607.909, 'eval_samples_per_second': 2.505, 'eval_steps_per_second': 0.158, 'epoch': 1.0}
{'loss': 0.2989, 'grad_norm': 3.3139126300811768, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}
{'eval_loss': 0.420592337846756, 'eval_f1': 0.8354721512240003, 'eval_runtime': 606.8811, 'eval_samples_per_second': 2.51, 'eval_steps_per_second': 0.158, 'epoch': 2.0}
{'loss': 0.1825, 'grad_norm': 6.505929946899414, 'learning_rate': 0.0, 'epoch': 3.0}
{'eval_loss': 0.5547046661376953, 'eval_f1': 0.8355463789481278, 'eval_runtime': 607.8734, 'eval_samples_per_second': 2.505, 'eval_steps_per_second': 0.158, 'epoch': 3.0}
{'train_runtime': 26109.1664, 'train_samples_per_second': 0.7, 'train_steps_per_second': 0.044, 'train_loss': 0.3122138409685484, 'epoch': 3.0}
{'eval_loss': 0.5547046661376953, 'eval_f1': 0.835