<a href="https://colab.research.google.com/github/tasinfrancesco/Practical_ML_PSL/blob/main/Attempt_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://raw.githubusercontent.com/tasinfrancesco/pml_challenge/main/train.csv
!wget https://raw.githubusercontent.com/tasinfrancesco/pml_challenge/main/test.csv


--2026-01-19 18:11:46--  https://raw.githubusercontent.com/tasinfrancesco/pml_challenge/main/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1054962 (1.0M) [text/plain]
Saving to: ‘train.csv’


2026-01-19 18:11:47 (172 MB/s) - ‘train.csv’ saved [1054962/1054962]

--2026-01-19 18:11:47--  https://raw.githubusercontent.com/tasinfrancesco/pml_challenge/main/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3243869 (3.1M) [text/plain]
Saving to: ‘test.csv’


2026-01-19 18:11:48 (383 MB/s) - ‘test.csv’ saved [3243869/3

In [None]:
from sklearn.model_selection import GroupKFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import strptime
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from transformers import Trainer, AutoModelForTokenClassification, AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments
from datasets import DatasetDict, load_dataset, Dataset
import torch.nn as nn
import torch.optim as optim
from scipy.special import softmax



def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
      logits = predictions[0]
    else:
      logits = predictions
    probabilities = softmax(logits, axis=-1)
    pos_class_probs = probabilities[:, 1]
    auc = roc_auc_score(labels, pos_class_probs)
    return {"roc_auc": np.round(auc, 3)}


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", padding = True, truncation = True, max_length = 128)
# #defining the model below
data_files = {"train": "train.csv", "test": "test.csv"}
loaded_ds = load_dataset("csv", data_files = data_files)
# print(f"shape of 'train': {loaded_ds["train"].shape}")
# print(f"shape of 'test': {loaded_ds["test"].shape}")

# small_train_val_ds = DatasetDict(
#     train = loaded_ds["train"].shuffle(seed=42).select(range(4000)),
#     val = loaded_ds["train"].shuffle(seed=42).select(range(4000, 5000)),
# )
# # Assuming you have a 'group' column in your original dataset
# # You'll need to keep track of groups before tokenization

def train_with_group_kfold(dataset, groups, num_epochs, lr, batch_size, n_splits=5):
    """
    Train model using GroupKFold cross-validation

    Args:
        dataset: Your tokenized HuggingFace dataset
        groups: Array of group identifiers (same length as dataset)
        n_splits: Number of folds
    """
    print(f"batch_size:{batch_size}, epochs:{num_epochs}, lr:{lr}")

    gkf = GroupKFold(n_splits=n_splits)
    fold_results = []

    # Convert dataset to indices for splitting
    indices = np.arange(len(dataset))

    for fold, (train_idx, val_idx) in enumerate(gkf.split(indices, groups=groups)):
        print(f"\n{'='*50}")
        print(f"Training Fold {fold + 1}/{n_splits}")
        print(f"{'='*50}")

        # Create train/val splits for this fold
        train_fold = dataset.select(train_idx.tolist())
        val_fold = dataset.select(val_idx.tolist())

        # Initialize a fresh model for each fold
        model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased-finetuned-sst-2-english",
            num_labels=2,
            problem_type="single_label_classification"
        )

        # Define training arguments for this fold
        arguments = TrainingArguments(
            output_dir=f"fold_{fold+1}_outputs",
            per_device_train_batch_size=batch_size[0],
            per_device_eval_batch_size=batch_size[1],
            gradient_accumulation_steps = 2,
            num_train_epochs=num_epochs,
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=lr,
            load_best_model_at_end=True,
            seed=42,
            metric_for_best_model="roc_auc",
            greater_is_better=True,
        )

        # Create trainer for this fold
        trainer = Trainer(
            model=model,
            args=arguments,
            train_dataset=train_fold,
            eval_dataset=val_fold,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        # Train
        trainer.train()

        # Evaluate on validation fold
        eval_results = trainer.evaluate()
        fold_results.append(eval_results)

        print(f"Fold {fold + 1} ROC AUC: {eval_results['eval_roc_auc']:.4f}")

    # Aggregate results across folds
    avg_auc = np.mean([r['eval_roc_auc'] for r in fold_results])
    std_auc = np.std([r['eval_roc_auc'] for r in fold_results])

    print(f"\n{'='*50}")
    print(f"Cross-Validation Results:")
    print(f"batch_size:{batch_size}, epochs:{num_epochs}, lr:{lr}")
    print(f"Average ROC AUC: {avg_auc:.4f} (+/- {std_auc:.4f})")
    print(f"{'='*50}")

    return fold_results

# Example usage:
# Make sure to extract groups BEFORE tokenization
groups = loaded_ds["train"]["id"]  # Replace with your group column name

# Tokenize your full training dataset
tokenized_full = loaded_ds["train"].map(
    lambda example: tokenizer(example["text"], padding=True, truncation=True, max_length=128),
    batched=True,
    batch_size=1000,
    num_proc=4,
)
epoch_range = [2, 3, 4]
lr_range = [2e-5, 5e-5]
batch_range = [(16, 32), (32, 64), (64, 64)]

print(f"size of tokenized full = {tokenized_full}")
tokenized_full = tokenized_full.remove_columns(["text", "id"])  # Remove text and group
tokenized_full = tokenized_full.rename_column("target", "labels")
tokenized_full.set_format("torch")

# Run cross-validation
# for num_epochs in epoch_range:
#   count = 0
#   for lr in lr_range:
#     # for batch_size in batch_range:
#       if count == 0:
#         count +=1
#         results =
#       else:
#         continue
train_with_group_kfold(tokenized_full, groups, 4, 2e-5, (32, 64), n_splits=5)

size of tokenized full = Dataset({
    features: ['id', 'text', 'creation_date', 'post_id', 'user_id', 'score', 'target', 'input_ids', 'attention_mask'],
    num_rows: 5000
})
batch_size:(32, 64), epochs:4, lr:2e-05

Training Fold 1/5


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.373242,0.717
2,No log,0.36792,0.744
3,No log,0.365186,0.748
4,No log,0.369157,0.749


Fold 1 ROC AUC: 0.7490

Training Fold 2/5


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.387389,0.657
2,No log,0.379378,0.698
3,No log,0.39029,0.708
4,No log,0.398453,0.71


Fold 2 ROC AUC: 0.7100

Training Fold 3/5


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.431872,0.711
2,No log,0.422394,0.729
3,No log,0.436852,0.727
4,No log,0.451236,0.729


Fold 3 ROC AUC: 0.7290

Training Fold 4/5


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.403722,0.679
2,No log,0.390408,0.705
3,No log,0.403829,0.711
4,No log,0.412244,0.71


Fold 4 ROC AUC: 0.7110

Training Fold 5/5


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.351364,0.702
2,No log,0.340909,0.73
3,No log,0.346089,0.728
4,No log,0.353546,0.728


Fold 5 ROC AUC: 0.7300

Cross-Validation Results:
batch_size:(32, 64), epochs:4, lr:2e-05
Average ROC AUC: 0.7258 (+/- 0.0144)


[{'eval_loss': 0.3691573739051819,
  'eval_roc_auc': 0.749,
  'eval_runtime': 3.4058,
  'eval_samples_per_second': 293.613,
  'eval_steps_per_second': 4.698,
  'epoch': 4.0},
 {'eval_loss': 0.39845335483551025,
  'eval_roc_auc': 0.71,
  'eval_runtime': 3.3569,
  'eval_samples_per_second': 297.897,
  'eval_steps_per_second': 4.766,
  'epoch': 4.0},
 {'eval_loss': 0.42239367961883545,
  'eval_roc_auc': 0.729,
  'eval_runtime': 3.3736,
  'eval_samples_per_second': 296.419,
  'eval_steps_per_second': 4.743,
  'epoch': 4.0},
 {'eval_loss': 0.40382909774780273,
  'eval_roc_auc': 0.711,
  'eval_runtime': 3.377,
  'eval_samples_per_second': 296.122,
  'eval_steps_per_second': 4.738,
  'epoch': 4.0},
 {'eval_loss': 0.34090906381607056,
  'eval_roc_auc': 0.73,
  'eval_runtime': 3.3729,
  'eval_samples_per_second': 296.482,
  'eval_steps_per_second': 4.744,
  'epoch': 4.0}]

In [None]:
arguments = TrainingArguments(
            metric_for_best_model="roc_auc",
            greater_is_better=True,
        )

len(list(loaded_ds["test"]["text"]))
model = AutoModelForSequenceClassification.from_pretrained("fold_5_outputs/checkpoint-375")

# Create new trainer for prediction
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args = arguments,
)

tokenized_test = loaded_ds["test"].map(
    lambda example: tokenizer(example["text"], padding=True, truncation=True, max_length=128)
)
tokenized_test = tokenized_test.remove_columns(["text", "id"])  # Remove text and group
tokenized_test = tokenized_test.rename_column("target", "labels")
tokenized_test.set_format("torch")

final_predictions = []
predictions = trainer.predict(tokenized_test)

logits = preds.predictions
if isinstance(logits, tuple):
  logits = logits[0]

probs = softmax(logits, axis = -1)
final_predictions.append(probs)

  trainer = Trainer(


NotImplementedError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'

In [None]:
test_tokenized = loaded_ds["test"].map(
    lambda example: tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    ),
    batched=True,
    batch_size=1000,
)
test_tokenized = test_tokenized.remove_columns(["text"])
test_tokenized.set_format("torch")

# Predict with all folds
n_folds = 5  # Change this to however many folds you ran
all_predictions = []

for fold in range(1, n_folds + 1):
    print(f"Loading and predicting with fold {fold}...")

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(f"fold_{fold}_outputs/checkpoint-375")

    # Create trainer
    trainer = Trainer(model=model, tokenizer=tokenizer)

    # Predict
    preds = trainer.predict(test_tokenized)
    logits = preds.predictions
    if isinstance(logits, tuple):
        logits = logits[0]

    # Convert to probabilities
    probs = softmax(logits, axis=-1)
    all_predictions.append(probs)

# Average predictions across folds
avg_probs = np.mean(all_predictions, axis=0)
final_predictions = np.argmax(avg_probs, axis=-1)

# Create submission
ids = [int(i) for i in test_tokenized["id"]]
submission = pd.DataFrame({
    'id': ids,
    'target': avg_probs[:, 1],
})

print(f"Prediction distribution:\n{submission['prediction'].value_counts()}")
print(f"\nFirst few predictions:\n{submission.head()}")

submission.to_csv('predictions_ensemble.csv', index=False)


Loading and predicting with fold 1...


  trainer = Trainer(model=model, tokenizer=tokenizer)


Loading and predicting with fold 2...


  trainer = Trainer(model=model, tokenizer=tokenizer)


Loading and predicting with fold 3...


  trainer = Trainer(model=model, tokenizer=tokenizer)


Loading and predicting with fold 4...


  trainer = Trainer(model=model, tokenizer=tokenizer)


Loading and predicting with fold 5...


  trainer = Trainer(model=model, tokenizer=tokenizer)


ValueError: Per-column arrays must each be 1-dimensional

In [None]:
ids = [int(i) for i in test_tokenized["id"]]
submission = pd.DataFrame({
    'id': ids,
    'target': avg_probs[:, 1],
})


KeyError: 'prediction'

In [None]:

print(f"Prediction distribution:\n{submission['target'].value_counts()}")
print(f"\nFirst few predictions:\n{submission.head()}")

submission.to_csv('predictions_ensemble.csv', index=False)

Prediction distribution:
target
0.377542    4
0.015722    4
0.155181    3
0.126935    2
0.036934    2
           ..
0.024299    1
0.007168    1
0.095174    1
0.025508    1
0.029170    1
Name: count, Length: 14981, dtype: int64

First few predictions:
         id    target
0   2011998  0.033419
1  65996680  0.273215
2  98232262  0.026997
3   4683404  0.173482
4  29595870  0.344224


In [None]:
submission.iloc[]

In [None]:

for i in range(15000):
  if int(loaded_ds["test"]["id"][i]) != int(test_tokenized["id"][i]):
      print(i

KeyboardInterrupt: 