# Main Experiment Runner for One Dataset Example


In [2]:
from ICL_modules import data_loader, dataset_interface, s_random, experiment_basics, functions
from ICL_inference import inference
from ICL_calibrations import calibration_methods
from run import run_multiple_calibration_experiments_generic
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LogitsProcessorList,
    LogitsProcessor
)
import random
import torch
import numpy as np
import pandas as pd
import itertools
import functools
import copy

In [3]:
# ---------------------------------------------
# Model identifier on the HuggingFace Hub.
# This should be the path or name of a pre-trained language model that supports
# causal (auto-regressive) generation.
MODEL_ID = "<your-model-id>"

# HuggingFace access token, only needed when the model is gated or private.
# It is read from the HF_TOKEN environment variable so the secret is never saved
# inside the notebook; when the variable is unset this is None and the library
# falls back to its default credential lookup (e.g. a stored CLI login).
import os
HF_TOKEN = os.environ.get("HF_TOKEN")

# ---------------------------------------------
# Load the tokenizer associated with the model.
# The tokenizer converts raw text into token IDs (and vice versa).
# `token=` is the current name of the authentication argument; the older
# `use_auth_token=` spelling is deprecated in transformers.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)

# ---------------------------------------------
# Load the pre-trained causal language model.
# - device_map="auto": Automatically places model components on available devices (GPU/CPU).
# - torch_dtype=torch.float16: Loads the model in half-precision for reduced memory usage and faster inference.
# - .eval(): Sets the model to evaluation mode (disabling any training-specific behavior).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    token=HF_TOKEN,
).eval()




tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [4]:
# ---------------------------------------------
# Example dataset: AGNews, loaded through data_loader with from_cache=True.
# The split sizes are fixed up front: test_samples examples are reserved for
# testing and the training size is whatever remains of the dataset.
seed = 107
test_samples = 512

ag_news = data_loader.agnews(from_cache=True)
train_samples = len(ag_news) - test_samples

# DatasetSplitter receives the dataset, both split sizes, and the seed.
splitted_ag = dataset_interface.DatasetSplitter(
    ag_news,
    train_samples,
    test_samples,
    seed,
)

In [6]:
# --------------------------------------------------------
# param_dic: Hyperparameter config for LR calibration
# --------------------------------------------------------
# This dictionary defines cosine-threshold and invariance-strength values
# used during logistic regression calibration.

# Usage:
# param_dic can be either:
#
# (1) Generic (used for all datasets):
# ---------------------------------------------------
# param_dic = {
#     4: { 0: [15, 0.8], 1: [5.5, 0.8], ..., 3: [4.5, 0.8] },
#     8: { 0: [...], ..., 7: [...] }
# }
#
# (2) Dataset-specific:
# ---------------------------------------------------
# param_dic = {
#     "sst2": {
#         4: { 0: [15, 0.8], ..., 3: [4.5, 0.8] },
#         8: { 0: [...], ..., 7: [...] }
#     },
#     "agnews": {
#         4: { ... },
#         8: { ... }
#     }
# }

# Meaning of inner values:
# param_dic[k][i] = [cosine_factor, invariance_strength]
# Where:
#   - `k` is the total number of shots
#   - `i` is the current sub-shot index (0 ≤ i < k)
#   - `cosine_factor`: used as denominator in cos(π / cosine_factor) for threshold
#                      → higher values mean stricter similarity filter
#   - `invariance_strength`: regularization weight (typically 0–1)

#   Example:
#   cosine_threshold = cos(π / 4.5) ≈ 0.77
#   param_dic[4][3] = [4.5, 0.8]



# Generic (dataset-independent) configuration for k = 4 shots.
# Each row is [cosine_factor, invariance_strength]; enumerate() assigns the
# sub-shot index (0..3) as the key, in the order the rows are listed.
param_dic = {
    4: dict(
        enumerate(
            (
                [15, 0.8],
                [5.5, 0.8],
                [5, 0.7],
                [4.5, 0.8],
            )
        )
    ),
}


In [8]:
# Run a calibration experiment using the logistic regression (LR) calibration method.
#
# Arguments:
# - model / tokenizer: HuggingFace-compatible model and tokenizer.
# - splitted_datasets: One or more dataset splits (train/test) ready for calibration.
# - seeds: List of seeds for random sampling of demonstration examples (for reproducibility).
# - k_values: List of values for k-shot learning (number of demonstrations in-context).
# - param_dic: Dictionary of hyperparameters (used mainly for LR).
# - methods_to_run: Specifies which calibration methods to run. Here, we use only 'LR'.
#
# Returns:
# - result: A dictionary containing performance metrics for each dataset, seed, and k.
# - The other two return values (ignored here) include:
#     - dfs: Calibration-specific dataframes (mostly used by LR).
#     - coefficients_dic: Calibration coefficients (also for LR).
#
# This call executes LR calibration on the provided model and dataset, printing progress logs as it runs.

# Only the first return value (the metrics dictionary) is kept; the other two
# are discarded.
result, _, _ = run_multiple_calibration_experiments_generic(
    # What to run
    methods_to_run=['LR'],
    param_dic=param_dic,
    # Experiment grid: a single seed and a single k
    seeds=[2206632489],
    k_values=[4],
    # Data and model
    splitted_datasets=splitted_ag,
    model=model,
    tokenizer=tokenizer,
)


Starting experiments on dataset: AGNews
No dataset-specific param_dic found for 'AGNews', using generic config.

=== Running experiment for AGNews | Seed 2206632489, k: 4 ===
Running LR Calibration
Training LR for 0-shot with index: [105838, 40576, 116599, 39740]
[105838, 40576, 116599, 39740]
Process: 100% [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 4/4


  self.H.update(delta_x, delta_g)



[SCIPY] Calibration Training Finished.

Testing LR, Seed 2206632489, k: 0, demo index: []

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Training LR for 1-shot with index: [105838, 40576, 116599, 39740]
[105838, 40576, 116599, 39740]
Process: 100% [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 12/12


  self.H.update(delta_x, delta_g)
  self.H.update(self.x - self.x_prev, self.g - self.g_prev)



[SCIPY] Calibration Training Finished.

Testing LR, Seed 2206632489, k: 1, demo index: [105838]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Testing LR, Seed 2206632489, k: 1, demo index: [39740]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Averaging probabilities for LR-1
0.763671875
Training LR for 2-shot with index: [105838, 40576, 116599, 39740]
[105838, 40576, 116599, 39740]
Process: 100% [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 24/24


  self.H.update(delta_x, delta_g)



[SCIPY] Calibration Training Finished.

Testing LR, Seed 2206632489, k: 2, demo index: [39740, 40576]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Testing LR, Seed 2206632489, k: 2, demo index: [105838, 116599]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Testing LR, Seed 2206632489, k: 2, demo index: [105838, 40576]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Testing LR, Seed 2206632489, k: 2, demo index: [40576, 116599]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Testing LR, Seed 2206632489, k: 2, demo index: 

  self.H.update(delta_x, delta_g)



[SCIPY] Calibration Training Finished.

Testing LR, Seed 2206632489, k: 3, demo index: [39740, 40576, 105838]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Testing LR, Seed 2206632489, k: 3, demo index: [105838, 116599, 39740]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Testing LR, Seed 2206632489, k: 3, demo index: [105838, 40576, 116599]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Testing LR, Seed 2206632489, k: 3, demo index: [40576, 116599, 105838]

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Testing LR, Seed

In [9]:
# Display the nested metrics dictionary (dataset -> seed -> k -> method/run label -> metrics).
result

{'AGNews': {'seed_2206632489': {'4': {'LR-0': {'accuracy': 0.7421875,
     'averaged_truelabel_likelihood': 0.7432745506910318,
     'macro_F1': 0.7063868395123042,
     'expected_calibration_error_1': 0.2365214109683676},
    'LR-1-[105838]': {'accuracy': 0.724609375,
     'averaged_truelabel_likelihood': 0.2500000188309639,
     'macro_F1': 0.6172070137910985,
     'expected_calibration_error_1': 0.4746093527636165},
    'LR-1-[39740]': {'accuracy': 0.68359375,
     'averaged_truelabel_likelihood': 0.2500000176435237,
     'macro_F1': 0.5943106353454257,
     'expected_calibration_error_1': 0.4335937255541975},
    'LR-1': {'accuracy': np.float64(0.7041015625),
     'averaged_truelabel_likelihood': np.float64(0.2500000182372438),
     'macro_F1': np.float64(0.6057588245682621),
     'expected_calibration_error_1': np.float64(0.454101539158907),
     'roc_auc': None},
    'LR-1-average_voting': {'accuracy': 0.763671875,
     'averaged_truelabel_likelihood': 0.2500000182372439,
     'm

# Main Experiment Runner for All Datasets

In [4]:
import inspect

# Settings
test_samples = 512
seed = 107


def _accepts_from_cache(loader_cls):
    """Return True if ``loader_cls(from_cache=True)`` matches the class signature.

    A class qualifies when its constructor has a ``from_cache`` parameter or
    accepts arbitrary keyword arguments. Classes whose signature cannot be
    introspected are kept, so the guarded call in the loading loop still
    decides whether they are usable.
    """
    try:
        parameters = inspect.signature(loader_cls).parameters.values()
    except (TypeError, ValueError):
        return True
    return any(
        p.name == "from_cache" or p.kind is inspect.Parameter.VAR_KEYWORD
        for p in parameters
    )


# Auto-detect the dataset loader classes exposed by data_loader (e.g., glue_sst2, agnews, trec).
# Only public classes whose constructor takes `from_cache` are kept; this leaves out
# non-loader classes such as `datasets_loader`, which cannot be constructed that way.
dataset_loaders = {
    name: cls for name, cls in inspect.getmembers(data_loader, inspect.isclass)
    if not name.startswith("_") and _accepts_from_cache(cls)  # skip private classes
}

# Now split and store them
all_splitted_datasets = []

for name, load_fn in dataset_loaders.items():
    try:
        print(f"Loading and splitting: {name}")
        dataset = load_fn(from_cache=True)
        # Reserve test_samples examples for testing; the rest is the training size.
        train_samples = len(dataset) - test_samples
        splitted = dataset_interface.DatasetSplitter(dataset, train_samples, test_samples, seed)
        all_splitted_datasets.append(splitted)
    except Exception as e:
        # Best effort: one dataset failing to load must not stop the others,
        # but the reason is always reported.
        print(f"Skipping {name} due to error: {e}")


Loading and splitting: agnews
Loading and splitting: datasets_loader
Skipping datasets_loader due to error: datasets_loader.__init__() got an unexpected keyword argument 'from_cache'
Loading and splitting: financial_phrasebank
Loading and splitting: glue_sst2
Loading and splitting: rotten_tomatoes
Loading and splitting: sst5
Loading and splitting: subjective
Loading and splitting: trec
Loading and splitting: tweet_eval_emotion
Loading and splitting: tweet_eval_hate


In [22]:
# Run the Baseline and Contextual Calibration (CC) methods on every dataset split
# collected in all_splitted_datasets.
# The two seeds are written out explicitly (they are the ones recorded in this
# cell's saved output) so the cell does not rely on a `seeds` variable that no
# cell in the notebook defines.
# Only the first return value (the metrics dictionary) is kept.
result, _, _ = run_multiple_calibration_experiments_generic(
    model=model,
    tokenizer=tokenizer,
    splitted_datasets=all_splitted_datasets,
    seeds=[2206632489, 2481609806],
    k_values=[4],
    param_dic=param_dic,
    methods_to_run=['CC', 'Baseline'],
)

Starting experiments on dataset: AGNews
No dataset-specific param_dic found for 'AGNews', using generic config.

=== Running experiment for AGNews | Seed 2206632489, k: 4 ===
Running Baseline...

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Running Contextual Calibration (CC)...
[105838, 40576, 116599, 39740]
Process: 100%, 3 in 3 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Calibration Training Finished.


Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
=== Running experiment for AGNews | Seed 2481609806, k: 4 ===
Running Baseline...

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Running Contextual Calibration (CC)...

In [23]:
# Display the nested metrics dictionary (dataset -> seed -> k -> method -> metrics).
result

{'AGNews': {'seed_2206632489': {'4': {'Baseline': {'accuracy': 0.806640625,
     'averaged_truelabel_likelihood': np.float32(0.7641576),
     'macro_F1': 0.7572663723666802,
     'expected_calibration_error_1': np.float32(0.06762034)},
    'CC': {'accuracy': 0.810546875,
     'averaged_truelabel_likelihood': 0.6936887978744313,
     'macro_F1': 0.7599705119131871,
     'expected_calibration_error_1': 0.09593932651276135}}},
  'seed_2481609806': {'4': {'Baseline': {'accuracy': 0.751953125,
     'averaged_truelabel_likelihood': np.float32(0.70814496),
     'macro_F1': 0.6724641295584752,
     'expected_calibration_error_1': np.float32(0.08280731)},
    'CC': {'accuracy': 0.75,
     'averaged_truelabel_likelihood': 0.6660673491577983,
     'macro_F1': 0.6678602581822496,
     'expected_calibration_error_1': 0.07627313297367812}}}},
 'financial_phrasebank': {'seed_2206632489': {'4': {'Baseline': {'accuracy': 0.796875,
     'averaged_truelabel_likelihood': np.float32(0.72628176),
     'macr