In [1]:
%pip install "torch==2.5.0" "torchvision==0.20.0"
%pip install "setuptools<71.0.0" scikit-learn

%pip install  --upgrade \
  "datasets==3.1.0" \
  "accelerate==1.2.1" \
  "hf-transfer==0.1.8"

%pip install "git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1" --upgrade

Collecting torch==2.5.0
  Downloading torch-2.5.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.20.0
  Downloading torchvision-0.20.0-cp312-cp312-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.5.0)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.0)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.meta

In [1]:
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.iterable_dataset import IterableDataset

# Dataset id from huggingface.co/dataset
dataset_id = "wesley7137/question_complexity_classification"

# Load raw dataset
train_dataset = load_dataset(dataset_id, split='train')

split_dataset = train_dataset.train_test_split(test_size=0.1)
split_dataset['train'][5:7]

  from .autonotebook import tqdm as notebook_tqdm
Repo card metadata block was not found. Setting CardData to empty.
Generating train split: 100%|██████████| 14048/14048 [00:00<00:00, 139075.55 examples/s]


{'question': ['Which City in San Diego County is considered a great place to live and raise a family?',
  'What are the FAANG companies?'],
 'rating': [0.2, 0.2]}

In [None]:
# # 1. Define the bucketing logic clearly
# def get_bucket(example):
#     rating = example['rating']
    
#     # Handle None/Null values if they exist (assign to default or filter later)
#     if rating is None:
#         return {"labels": 0} # Defaulting to Easy, or you can filter these rows out first
        
#     if rating <= 0.3:
#         label = 0
#     elif rating <= 0.6:
#         label = 1
#     else:
#         label = 2
    
#     return {"labels": label}

# # 2. Apply it using .map()
# # This creates the new "labels" column efficiently
# split_dataset = split_dataset.map(get_bucket)

# # 3. (Optional) Remove the old 'rating' column to clean up
# split_dataset = split_dataset.remove_columns(["rating"])

# # Now save
# split_dataset.save_to_disk('question_complexity_classification_dataset')

Map: 100%|██████████| 12643/12643 [00:01<00:00, 10673.27 examples/s]
Map: 100%|██████████| 1405/1405 [00:00<00:00, 8858.95 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12643/12643 [00:00<00:00, 1273348.19 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1405/1405 [00:00<00:00, 269492.71 examples/s]


In [None]:
# Regression Approach
from datasets import Value

# 1. Rename 'rating' (the decimal) to 'labels'
split_dataset = split_dataset.rename_column("rating", "labels")

split_dataset = split_dataset.filter(lambda x: x['labels'] is not None)
# 2. Ensure it is a Float (Decimal number)
# This converts the column type to ensure regression works
split_dataset = split_dataset.cast_column("labels", Value("float32"))

# Verify it looks correct (should see decimals like 0.3, 0.7)
print(split_dataset['train'][0])

Casting the dataset: 100%|██████████| 12643/12643 [00:00<00:00, 16219.28 examples/s]
Casting the dataset: 100%|██████████| 1405/1405 [00:00<00:00, 16297.50 examples/s]

{'question': 'How would you build me a plane out of duct tape? You can leverage other materials, but the exterior of both the fuselage and the wings must be duct tape.', 'labels': 0.75}





In [None]:
#split_dataset = DatasetDict.load_from_disk('question_complexity_classification_dataset')

In [3]:
from transformers import AutoTokenizer

# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize helper function
def tokenize(batch):
    return tokenizer(batch['question'], truncation=True,padding=True)

tokenized_dataset = split_dataset.map(tokenize, batched=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
Map: 100%|██████████| 12643/12643 [00:06<00:00, 2081.05 examples/s]
Map: 100%|██████████| 1405/1405 [00:00<00:00, 3368.14 examples/s]


In [4]:
print(tokenized_dataset["train"].features)

{'question': Value(dtype='string', id=None), 'labels': Value(dtype='float32', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [None]:
# from transformers import AutoModelForSequenceClassification

# # Model id to load the tokenizer
# model_id = "answerdotai/ModernBERT-base"

# # Prepare model labels - useful for inference
# labels = [0, 1, 2]  # 0 - easy, 1 - medium, 2 - hard
# num_labels = len(labels)
# label2id, id2label = dict(), dict()
# for i, label in enumerate(labels):
#     label2id[label] = str(i)
#     id2label[str(i)] = label

# # Download the model from huggingface.co/models
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
# )

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import AutoModelForSequenceClassification

model_id = "answerdotai/ModernBERT-base"

# Download the model for REGRESSION
# num_labels=1 automatically triggers Regression (MSE Loss)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, 
    num_labels=1 
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model.save_pretrained("my_model/")
tokenizer.save_pretrained("my_model/")

('my_model/tokenizer_config.json',
 'my_model/special_tokens_map.json',
 'my_model/tokenizer.json')

In [None]:
#tokenized_dataset.save_to_disk("tokenized_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 12643/12643 [00:00<00:00, 209293.90 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1405/1405 [00:00<00:00, 167819.94 examples/s]


In [7]:
tokenized_dataset.save_to_disk("tokenized_dataset_regression")

Saving the dataset (1/1 shards): 100%|██████████| 12643/12643 [00:00<00:00, 241282.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1405/1405 [00:00<00:00, 135123.29 examples/s]


In [10]:
# Check the first item
print(tokenized_dataset["train"][10]["labels"]) 
# Output MUST be an integer like 0, 1, or 2. 
# If it is 0.15, the fix didn't work.

1


In [11]:
from collections import Counter
from datasets import load_from_disk
tokenized_dataset = load_from_disk("tokenized_dataset_regression")
print(Counter(tokenized_dataset['train']['labels']))

Counter({0.20000000298023224: 2028, 0.6000000238418579: 1792, 0.30000001192092896: 1685, 0.4000000059604645: 1641, 0.699999988079071: 1639, 0.800000011920929: 1415, 0.5: 1392, 0.8999999761581421: 431, None: 325, 0.05000000074505806: 66, 0.8500000238418579: 39, 0.10000000149011612: 33, 0.75: 32, 0.25: 32, 0.44999998807907104: 19, 0.6499999761581421: 14, 0.550000011920929: 12, 0.3499999940395355: 11, 0.949999988079071: 8, 0.0: 5, 0.15000000596046448: 5, 0.029999999329447746: 4, 0.9700000286102295: 2, 0.41999998688697815: 2, 0.009999999776482582: 2, 0.3199999928474426: 2, 0.6700000166893005: 1, 0.8799999952316284: 1, 0.5600000023841858: 1, 0.4300000071525574: 1, 0.9800000190734863: 1, 0.7599999904632568: 1, 0.0010000000474974513: 1})


In [5]:
import numpy as np
import torch
from sklearn.metrics import confusion_matrix
import pandas as pd

# 1. Get predictions from the trainer
print("Running prediction on test set...")
predictions_output = trainer.predict(tokenized_dataset["test"])

# 2. Convert raw logits to class IDs (0, 1, 2)
y_preds = np.argmax(predictions_output.predictions, axis=1)
y_true = predictions_output.label_ids

# 3. Create the Matrix
cm = confusion_matrix(y_true, y_preds)

# 4. Display nicely
labels = ["Easy (0)", "Medium (1)", "Hard (2)"]
df_cm = pd.DataFrame(cm, index=[f"True {l}" for l in labels], 
                         columns=[f"Pred {l}" for l in labels])
print("\n--- CONFUSION MATRIX ---")
print(df_cm)

Running prediction on test set...


NameError: name 'trainer' is not defined

In [6]:
from transformers import pipeline

# Point directly to the best checkpoint
model_path = "ModernBERT-domain-classifier/checkpoint-198"

# Load the classifier
classifier = pipeline("text-classification", model=model_path, tokenizer="my_model/")

# Test it
print(classifier("What is the square root of 144?"))

Device set to use cpu
Traceback (most recent call last):
  File "/gpfs/accounts/cse585f25_class_root/cse585f25_class/anikrish/.venv/lib64/python3.12/site-packages/torch/_inductor/compile_worker/__main__.py", line 7, in <module>
    from torch._inductor.async_compile import pre_fork_setup
  File "/gpfs/accounts/cse585f25_class_root/cse585f25_class/anikrish/.venv/lib64/python3.12/site-packages/torch/_inductor/async_compile.py", line 16, in <module>
    from torch._dynamo.device_interface import get_registered_device_interfaces
  File "/gpfs/accounts/cse585f25_class_root/cse585f25_class/anikrish/.venv/lib64/python3.12/site-packages/torch/_dynamo/__init__.py", line 39, in <module>
    from .polyfills import loader as _  # usort: skip # noqa: F401
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/gpfs/accounts/cse585f25_class_root/cse585f25_class/anikrish/.venv/lib64/python3.12/site-packages/torch/_dynamo/polyfills/loader.py", line 22, in <module>
    POLYFILLED_MODULES: Tuple["ModuleType", .

KeyboardInterrupt: 

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, DataCollatorWithPadding
from datasets import load_from_disk
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

# 1. Load the Best Model (Checkpoint-198)
# We load specifically from the checkpoint folder to ensure we test the best version
checkpoint_path = "ModernBERT-domain-classifier/checkpoint-198"

print(f"Loading model from {checkpoint_path}...")
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_path,
    num_labels=3,
    reference_compile=False # Keep this to avoid the Python.h error
)
tokenizer = AutoTokenizer.from_pretrained("my_model/") # Or the base model name

# 2. Load Dataset
tokenized_dataset = load_from_disk("tokenized_dataset")

# 3. Create a simple Trainer for prediction
# We don't need all the training args, just the model and collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    data_collator=data_collator
)

# 4. Get Predictions
print("Running prediction on test set...")
predictions_output = trainer.predict(tokenized_dataset["test"])

# 5. Process Results
y_preds = np.argmax(predictions_output.predictions, axis=1)
y_true = predictions_output.label_ids

# 6. Generate Matrix
cm = confusion_matrix(y_true, y_preds)

# 7. Display with Pandas
labels = ["Easy (0)", "Medium (1)", "Hard (2)"]
df_cm = pd.DataFrame(cm, index=[f"True {l}" for l in labels], 
                         columns=[f"Pred {l}" for l in labels])

print("\n--- CONFUSION MATRIX ---")
print(df_cm)

Loading model from ModernBERT-domain-classifier/checkpoint-198...


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Running prediction on test set...


KeyboardInterrupt: 