In [2]:
from src.data_loader import JeopardyDataLoader
from src.classifier import HuggingFaceJeopardyClassifier
from src.curator import DatasetCurator

In [3]:
# Run configuration: every tunable value used by the cells below lives here.

# Source data.
# NOTE(review): `input` shadows Python's builtin input() for the rest of the
# kernel session; the name is read by the data-loading cell.
input = "JEOPARDY_QUESTIONS1.json"
max_samples = None # Full data inference

# Classifier model settings.
model_name = "Qwen/Qwen3-4B-Instruct-2507"
batch_size = 256
# NOTE(review): max_new_tokens is not passed to any call visible in this
# notebook -- confirm whether the classifier is supposed to receive it.
max_new_tokens = 500

# Curation settings: where curated datasets are written and how many samples
# each curated dataset gets.
output_dir = "output"
n_samples = 1000 # Sub-class sample size

# Checkpointing of predictions during classification.
# NOTE(review): the unit of save_every_n is not evident from this notebook; the
# recorded run logged a checkpoint every 6,400 questions -- confirm semantics.
predictions_file = "predictions.json"
save_every_n = 100

In [4]:
# Build the loader for the configured input file, then load and clean the
# records. max_samples is forwarded unchanged (None in the config cell).
loader = JeopardyDataLoader(input)
data = loader.load_and_clean(max_samples=max_samples)

2025-11-29 15:00:34,073 - INFO - Loading data from JEOPARDY_QUESTIONS1.json
2025-11-29 15:00:34,602 - INFO - Loaded 216,930 samples
2025-11-29 15:00:34,604 - INFO - Cleaning data...
2025-11-29 15:00:34,785 - INFO - Cleaned 216,930 records


In [5]:
# Ask the loader for summary statistics of the cleaned data; the bare `stats`
# expression on the last line makes the notebook display the result.
stats = loader.get_statistics(data)
stats

{'total_questions': 216930, 'unique_categories': 27995, 'unique_shows': 3640}

In [6]:
# Record the number of cleaned records now; the curation cell passes this value
# on as total_dataset_size. The bare expression displays it.
total_dataset_size = len(data)
total_dataset_size

216930

In [7]:
# Instantiate the classifier with the configured model name and batch size.
# In the recorded run this step fetched the model files and logged cuda:0 as
# the device, so expect it to take a while on a fresh runtime.
classifier = HuggingFaceJeopardyClassifier(model_name=model_name, batch_size=batch_size)

2025-11-29 15:00:50,016 - INFO - Loading model: Qwen/Qwen3-4B-Instruct-2507
2025-11-29 15:00:50,017 - INFO - This may take a few moments...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

Device set to use cuda:0


2025-11-29 15:01:16,955 - INFO - Model loaded successfully on device: auto


In [8]:
# Classify the whole cleaned dataset. The recorded run took about 3h18m and
# logged periodic checkpoints plus a final save to predictions_file.
# NOTE(review): there is no cache guard here, so re-running this cell appears
# to repeat the full inference -- consider loading predictions_file when it
# already exists.
predictions = classifier.classify_dataset(data, save_every_n, predictions_file)

2025-11-29 15:01:16,962 - INFO - 
Classifying 216,930 questions with model Qwen/Qwen3-4B-Instruct-2507
2025-11-29 15:01:16,963 - INFO - Batch size: 256


Processing questions:   1%|          | 2560/216930 [02:18<3:11:47, 18.63it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing questions:   3%|▎         | 6400/216930 [05:51<3:10:35, 18.41it/s]

2025-11-29 15:07:09,622 - INFO - 
Checkpoint saved at question 6400/216930


Processing questions:   6%|▌         | 12800/216930 [11:45<3:01:35, 18.73it/s]

2025-11-29 15:13:03,669 - INFO - 
Checkpoint saved at question 12800/216930


Processing questions:   9%|▉         | 19200/216930 [17:41<3:19:41, 16.50it/s]

2025-11-29 15:18:59,796 - INFO - 
Checkpoint saved at question 19200/216930


Processing questions:  12%|█▏        | 25600/216930 [23:35<3:00:19, 17.68it/s]

2025-11-29 15:24:54,093 - INFO - 
Checkpoint saved at question 25600/216930


Processing questions:  15%|█▍        | 32000/216930 [29:25<2:48:47, 18.26it/s]

2025-11-29 15:30:44,064 - INFO - 
Checkpoint saved at question 32000/216930


Processing questions:  18%|█▊        | 38400/216930 [35:18<2:42:21, 18.33it/s]

2025-11-29 15:36:37,042 - INFO - 
Checkpoint saved at question 38400/216930


Processing questions:  21%|██        | 44800/216930 [41:14<2:36:26, 18.34it/s]

2025-11-29 15:42:33,381 - INFO - 
Checkpoint saved at question 44800/216930


Processing questions:  24%|██▎       | 51200/216930 [47:04<2:28:11, 18.64it/s]

2025-11-29 15:48:22,747 - INFO - 
Checkpoint saved at question 51200/216930


Processing questions:  27%|██▋       | 57600/216930 [53:00<2:27:11, 18.04it/s]

2025-11-29 15:54:19,717 - INFO - 
Checkpoint saved at question 57600/216930


Processing questions:  30%|██▉       | 64000/216930 [58:54<2:22:43, 17.86it/s]

2025-11-29 16:00:13,119 - INFO - 
Checkpoint saved at question 64000/216930


Processing questions:  32%|███▏      | 70400/216930 [1:04:42<2:12:51, 18.38it/s]

2025-11-29 16:06:01,062 - INFO - 
Checkpoint saved at question 70400/216930


Processing questions:  35%|███▌      | 76800/216930 [1:10:33<2:08:01, 18.24it/s]

2025-11-29 16:11:52,785 - INFO - 
Checkpoint saved at question 76800/216930


Processing questions:  38%|███▊      | 83200/216930 [1:16:28<2:04:03, 17.97it/s]

2025-11-29 16:17:47,747 - INFO - 
Checkpoint saved at question 83200/216930


Processing questions:  41%|████▏     | 89600/216930 [1:22:21<2:00:10, 17.66it/s]

2025-11-29 16:23:40,192 - INFO - 
Checkpoint saved at question 89600/216930


Processing questions:  44%|████▍     | 96000/216930 [1:28:08<1:49:36, 18.39it/s]

2025-11-29 16:29:27,254 - INFO - 
Checkpoint saved at question 96000/216930


Processing questions:  47%|████▋     | 102400/216930 [1:33:59<1:46:59, 17.84it/s]

2025-11-29 16:35:18,227 - INFO - 
Checkpoint saved at question 102400/216930


Processing questions:  50%|█████     | 108800/216930 [1:40:00<1:36:19, 18.71it/s]

2025-11-29 16:41:19,333 - INFO - 
Checkpoint saved at question 108800/216930


Processing questions:  53%|█████▎    | 115200/216930 [1:45:45<1:29:53, 18.86it/s]

2025-11-29 16:47:04,620 - INFO - 
Checkpoint saved at question 115200/216930


Processing questions:  56%|█████▌    | 121600/216930 [1:51:30<1:24:18, 18.85it/s]

2025-11-29 16:52:49,176 - INFO - 
Checkpoint saved at question 121600/216930


Processing questions:  59%|█████▉    | 128000/216930 [1:57:22<1:22:26, 17.98it/s]

2025-11-29 16:58:41,723 - INFO - 
Checkpoint saved at question 128000/216930


Processing questions:  62%|██████▏   | 134400/216930 [2:03:16<1:16:11, 18.05it/s]

2025-11-29 17:04:35,788 - INFO - 
Checkpoint saved at question 134400/216930


Processing questions:  65%|██████▍   | 140800/216930 [2:09:07<1:10:14, 18.06it/s]

2025-11-29 17:10:26,027 - INFO - 
Checkpoint saved at question 140800/216930


Processing questions:  68%|██████▊   | 147200/216930 [2:14:51<1:03:22, 18.34it/s]

2025-11-29 17:16:10,591 - INFO - 
Checkpoint saved at question 147200/216930


Processing questions:  71%|███████   | 153600/216930 [2:20:36<57:01, 18.51it/s]

2025-11-29 17:21:55,726 - INFO - 
Checkpoint saved at question 153600/216930


Processing questions:  74%|███████▍  | 160000/216930 [2:26:24<52:12, 18.17it/s]

2025-11-29 17:27:43,311 - INFO - 
Checkpoint saved at question 160000/216930


Processing questions:  77%|███████▋  | 166400/216930 [2:32:17<46:31, 18.10it/s]

2025-11-29 17:33:36,228 - INFO - 
Checkpoint saved at question 166400/216930


Processing questions:  80%|███████▉  | 172800/216930 [2:38:05<40:01, 18.37it/s]

2025-11-29 17:39:24,921 - INFO - 
Checkpoint saved at question 172800/216930


Processing questions:  83%|████████▎ | 179200/216930 [2:43:50<33:51, 18.57it/s]

2025-11-29 17:45:09,400 - INFO - 
Checkpoint saved at question 179200/216930


Processing questions:  86%|████████▌ | 185600/216930 [2:49:42<28:57, 18.04it/s]

2025-11-29 17:51:01,820 - INFO - 
Checkpoint saved at question 185600/216930


Processing questions:  89%|████████▊ | 192000/216930 [2:55:38<23:07, 17.97it/s]

2025-11-29 17:56:57,867 - INFO - 
Checkpoint saved at question 192000/216930


Processing questions:  91%|█████████▏| 198400/216930 [3:01:30<17:04, 18.09it/s]

2025-11-29 18:02:49,345 - INFO - 
Checkpoint saved at question 198400/216930


Processing questions:  94%|█████████▍| 204800/216930 [3:07:21<11:13, 18.01it/s]

2025-11-29 18:08:40,439 - INFO - 
Checkpoint saved at question 204800/216930


Processing questions:  97%|█████████▋| 211200/216930 [3:13:14<04:58, 19.18it/s]

2025-11-29 18:14:34,016 - INFO - 
Checkpoint saved at question 211200/216930


Processing questions: 100%|██████████| 216930/216930 [3:18:26<00:00, 18.16it/s]

2025-11-29 18:19:45,944 - INFO - 
Checkpoint saved at question 216930/216930


Processing questions: 100%|██████████| 216930/216930 [3:18:28<00:00, 18.22it/s]


2025-11-29 18:19:48,126 - INFO - 
Classification complete! Results saved to predictions.json


In [17]:
# Build the curated datasets from the predictions. The recorded run wrote
# three files of n_samples records each into output_dir (numbers, non-English,
# unusual proper nouns). The returned value is kept in curation_stats.
curator = DatasetCurator(output_dir=output_dir)
curation_stats = curator.create_curated_datasets(
    predictions, n_samples=n_samples, total_dataset_size=total_dataset_size
)

2025-11-29 18:28:14,730 - INFO - 
Creating curated datasets with 1000 samples each...
2025-11-29 18:28:14,775 - INFO - 
Classification Quality Analysis:
2025-11-29 18:28:14,775 - INFO -   Total predictions: 216,930
2025-11-29 18:28:14,776 - INFO -   Successful: 216,930 (100.00%)
2025-11-29 18:28:14,777 - INFO -   Failed (all fields None): 0 (0.00%)
2025-11-29 18:28:14,801 - INFO - 
Using 216,930 valid predictions for curation
2025-11-29 18:28:14,874 - INFO - 
Class distributions in classified dataset:
2025-11-29 18:28:14,875 - INFO -   Numbers: 80,275 (37.01%)
2025-11-29 18:28:14,876 - INFO -   Non-English: 37,168 (17.13%)
2025-11-29 18:28:14,876 - INFO -   Unusual Proper Nouns: 28,509 (13.14%)
2025-11-29 18:35:26,620 - INFO - 
Curated datasets saved to output:
2025-11-29 18:35:26,621 - INFO -   dataset_numbers.json: 1000 samples
2025-11-29 18:35:26,621 - INFO -   dataset_non_english.json: 1000 samples
2025-11-29 18:35:26,621 - INFO -   dataset_unusual_proper_nouns.json: 1000 samples
2

In [13]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can copy files into it.
# This relies on google.colab, so it only works inside a Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import shutil
import os

def persist_json_to_drive(local_json_path: str, drive_target_path: str):
    """
    Copy a file from the local Colab filesystem to a path in Google Drive.

    Google Drive must already be mounted; this function does not mount it.
    The operation is best-effort: a missing source file, a target directory
    that cannot be created, or a failed copy is reported with ``print`` and
    the function returns without raising, so one failed backup does not
    abort the rest of the notebook.

    Parameters
    ----------
    local_json_path : str
        Full path to the source file in the local Colab runtime
        (e.g., '/content/data.json').
    drive_target_path : str
        Full path where the file should be saved in Drive. This should start
        with '/content/drive/MyDrive/'
        (e.g., '/content/drive/MyDrive/MyProjectData/saved_data.json').
        Missing parent directories are created.

    Returns
    -------
    None
    """
    if not os.path.exists(local_json_path):
        print(f"Error: Local file not found at '{local_json_path}'")
        return

    target_dir = os.path.dirname(drive_target_path)
    # dirname() is '' for a bare filename; there is nothing to create then.
    if target_dir and not os.path.exists(target_dir):
        try:
            os.makedirs(target_dir, exist_ok=True)
        except OSError as e:
            # Report directory-creation failures the same way as copy
            # failures instead of letting them escape as an exception.
            print(f"An error occurred while creating the target directory: {e}")
            return
        print(f"Created target directory in Drive: '{target_dir}'")

    try:
        shutil.copyfile(local_json_path, drive_target_path)
    except OSError as e:
        # shutil.copyfile signals failures (including SameFileError) via
        # OSError subclasses; programming errors are left to propagate.
        print(f"An error occurred while copying the file: {e}")
    else:
        print(f"Success: JSON file copied from '{local_json_path}' to '{drive_target_path}'")

In [16]:
# Back up the predictions file from the Colab runtime to Google Drive.
persist_json_to_drive(
    local_json_path="/content/predictions.json",
    drive_target_path="/content/drive/MyDrive/data_curation/predictions.json",
)

Created target directory in Drive: '/content/drive/MyDrive/data_curation'
Success: JSON file copied from '/content/predictions.json' to '/content/drive/MyDrive/data_curation/predictions.json'


In [21]:
# Back up the curated "numbers" dataset to Google Drive.
persist_json_to_drive(
    local_json_path="/content/output/dataset_numbers.json",
    drive_target_path="/content/drive/MyDrive/data_curation/dataset_numbers.json",
)

Success: JSON file copied from '/content/output/dataset_numbers.json' to '/content/drive/MyDrive/data_curation/dataset_numbers.json'


In [22]:
# Back up the curated "non-English" dataset to Google Drive.
persist_json_to_drive(
    local_json_path="/content/output/dataset_non_english.json",
    drive_target_path="/content/drive/MyDrive/data_curation/dataset_non_english.json",
)

Success: JSON file copied from '/content/output/dataset_non_english.json' to '/content/drive/MyDrive/data_curation/dataset_non_english.json'


In [24]:
# Back up the curated "unusual proper nouns" dataset to Google Drive.
persist_json_to_drive(
    local_json_path="/content/output/dataset_unusual_proper_nouns.json",
    drive_target_path="/content/drive/MyDrive/data_curation/dataset_unusual_proper_nouns.json",
)

Success: JSON file copied from '/content/output/dataset_unusual_proper_nouns.json' to '/content/drive/MyDrive/data_curation/dataset_unusual_proper_nouns.json'


In [25]:
# Back up classification_stats.json from the output directory to Google Drive.
persist_json_to_drive(
    local_json_path="/content/output/classification_stats.json",
    drive_target_path="/content/drive/MyDrive/data_curation/classification_stats.json",
)

Success: JSON file copied from '/content/output/classification_stats.json' to '/content/drive/MyDrive/data_curation/classification_stats.json'
