# Main Experiment Runner for One Dataset Example


In [6]:
%load_ext autoreload
%autoreload 2

In [1]:
from ICL_modules import data_loader, dataset_interface, s_random, experiment_basics, functions
from ICL_inference import inference
from ICL_calibrations import calibration_methods
from run import run_multiple_calibration_experiments_generic
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LogitsProcessorList,
    LogitsProcessor
)
import random
import torch
import numpy as np
import pandas as pd
import itertools
import functools
import copy

# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
# ---------------------------------------------
# Define the model identifier from HuggingFace Hub
# This should be the path or name of a pre-trained language model that supports causal (auto-regressive) generation.
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
            

# Define the HuggingFace authentication token if the model is gated or requires private access.
HF_TOKEN = "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"

# ---------------------------------------------
# Load the tokenizer associated with the model.
# The tokenizer is responsible for converting raw text into token IDs (and vice versa).
# Authentication token is passed if required.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_auth_token=HF_TOKEN)

# ---------------------------------------------
# Load the pre-trained causal language model.
# - device_map=\"auto\": Automatically places model components on available devices (GPU/CPU).
# - torch_dtype=torch.float16: Loads the model in half-precision for reduced memory usage and faster inference.
# - .eval(): Sets the model to evaluation mode (disabling any training-specific behavior).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    use_auth_token=HF_TOKEN
).eval()




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# ---------------------------------------------
# Use data_loader class to upload data from cache
# As an example, AGNews is used
ag_news= data_loader.agnews(from_cache = True)
seed = 107
test_samples = 256
train_samples = len(ag_news) - test_samples
splitted_ag = dataset_interface.DatasetSplitter(ag_news, train_samples,test_samples,seed)

In [16]:
# --------------------------------------------------------
# param_dic: Hyperparameter config for LR calibration
# --------------------------------------------------------
# This dictionary defines cosine-threshold and invariance-strength values
# used during logistic regression calibration.

# Usage:
# param_dic can be either:
#
# (1) Generic (used for all datasets):
# ---------------------------------------------------
# param_dic = {
#     4: { 0: [15, 0.8], 1: [5.5, 0.8], ..., 3: [4.5, 0.8] },
#     8: { 0: [...], ..., 7: [...] }
# }
#
# (2) Dataset-specific:
# ---------------------------------------------------
# param_dic = {
#     "sst2": {
#         4: { 0: [15, 0.8], ..., 3: [4.5, 0.8] },
#         8: { 0: [...], ..., 7: [...] }
#     },
#     "agnews": {
#         4: { ... },
#         8: { ... }
#     }
# }

# Meaning of inner values:
# param_dic[k][i] = [cosine_factor, invariance_strength]
# Where:
#   - `k` is the total number of shots
#   - `i` is the current sub-shot index (0 ≤ i < k)
#   - `cosine_factor`: used as denominator in cos(π / cosine_factor) for threshold
#                      → higher values mean stricter similarity filter
#   - `invariance_strength`: regularization weight (typically 0–1)

#   Example:
#   cosine_threshold = cos(π / 4.5) ≈ 0.22
#   param_dic[4][3] = [4.5, 0.8]



# param_dic = {
#     4: {
#         0: [15, 0.8],
#         1: [5.5, 0.8],
#         2: [5, 0.7],
#         3: [4.5, 0.8]
#     }
# }

param_dic = {
  8: {
    0: [90, 10],
    1: [90, 10],
    2: [90, 10],
    3: [90, 10],
    4: [90, 10],
    5: [90, 10],
    6: [90, 10],
    7: [90, 10],
  }
}

In [11]:
from run_experiments import load_param_config
param_dic = load_param_config('/hpc/home/jd420/Projects/ICL/configs/Llama_8.json', 8)

Loading param_dic from /hpc/home/jd420/Projects/ICL/configs/Llama_8.json


In [17]:
# Run a calibration experiment using the Contextual Calibration (CC) method.
#
# Arguments:
# - model / tokenizer: HuggingFace-compatible model and tokenizer.
# - splitted_datasets: One or more dataset splits (train/test) ready for calibration.
# - seeds: List of seeds for random sampling of demonstration examples (for reproducibility).
# - k_values: List of values for k-shot learning (number of demonstrations in-context).
# - param_dic: Dictionary of hyperparameters (used mainly for LR).
# - methods_to_run: Specifies which calibration methods to run. Here, we use only 'LR'.
#
# Returns:
# - result: A dictionary containing performance metrics for each dataset, seed, and k.
# - The other two return values (ignored here) include:
#     - dfs: Calibration-specific dataframes (mostly used by LR).
#     - coefficients_dic: Calibration coefficients (also for LR).
#
# This call executes LR calibration on the provided model and dataset, printing progress logs as it runs.

result, _, _ = run_multiple_calibration_experiments_generic(
    model=model,
    tokenizer=tokenizer,
    splitted_datasets=splitted_ag,
    seeds=[2206632489],
    k_values=[8],
    param_dic=param_dic,
    methods_to_run=['CC','LR']
)


Starting experiments on dataset: AGNews
No dataset-specific param_dic found for 'AGNews', using generic config.

=== Running experiment for AGNews | Seed 2206632489, k: 8 ===
Running Contextual Calibration (CC)...
[116599, 5520, 40092, 39714, 36867, 105958, 75871, 108187]
Process: 100%, 3 in 3 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Calibration Training Finished.


Start testing the forward inference function on the dataset: AGNews
Running LR Calibration
Training LR for 0-shot with index: [116599, 5520, 40092, 39714, 36867, 105958, 75871, 108187]
Optimization terminated successfully.
         Current function value: 0.000010
         Iterations: 33
         Function evaluations: 245
         Gradient evaluations: 35
[SCIPY] Optimization terminated successfully.
Training Elapsed time: 0.02 minutes
Selecting demonstration set for LR...
Testing LR, Seed 2206632489, k: 0, demo index: []

Start testing the forward inference function on the dataset: AGNews
Testing El

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 657.960999
         Iterations: 73
         Function evaluations: 859
         Gradient evaluations: 121
[SCIPY] Desired error not necessarily achieved due to precision loss.
Training Elapsed time: 23.61 minutes
Selecting demonstration set for LR...
Testing LR, Seed 2206632489, k: 4, demo index: [75871, 5520, 36867, 108187]

Start testing the forward inference function on the dataset: AGNews
Testing LR, Seed 2206632489, k: 4, demo index: [5520, 116599, 105958, 75871]

Start testing the forward inference function on the dataset: AGNews
Testing LR, Seed 2206632489, k: 4, demo index: [116599, 40092, 75871, 39714]

Start testing the forward inference function on the dataset: AGNews
Testing LR, Seed 2206632489, k: 4, demo index: [108187, 5520, 36867, 105958]

Start testing the forward inference function on the dataset: AGNews
Testing LR, Seed 2206632489, k: 4, demo index: [40092, 105958, 75871, 36867]

Start testing the forward inference function on the data

  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


         Current function value: 2217.056438
         Iterations: 59
         Function evaluations: 616
         Gradient evaluations: 88
[SCIPY] Desired error not necessarily achieved due to precision loss.
Training Elapsed time: 82.55 minutes
Selecting demonstration set for LR...
Testing LR, Seed 2206632489, k: 5, demo index: [75871, 5520, 36867, 108187, 39714]

Start testing the forward inference function on the dataset: AGNews
Testing LR, Seed 2206632489, k: 5, demo index: [5520, 116599, 105958, 75871, 40092]

Start testing the forward inference function on the dataset: AGNews
Testing LR, Seed 2206632489, k: 5, demo index: [116599, 40092, 75871, 39714, 5520]

Start testing the forward inference function on the dataset: AGNews
Testing LR, Seed 2206632489, k: 5, demo index: [108187, 5520, 36867, 105958, 39714]

Start testing the forward inference function on the dataset: AGNews
Testing LR, Seed 2206632489, k: 5, demo index: [40092, 105958, 75871, 36867, 5520]

Start testing the forwa

In [18]:
result

{'AGNews': {'seed_2206632489': {'8': {'CC': {'accuracy': 0.7734375,
     'averaged_truelabel_likelihood': 0.6794060457737429,
     'macro_F1': 0.6616333511016347,
     'expected_calibration_error_1': 0.09861836961608951},
    'LR-0': {'accuracy': 0.77734375,
     'averaged_truelabel_likelihood': 0.7772035412572371,
     'macro_F1': 0.7704264233424202,
     'expected_calibration_error_1': 0.023293506361596356},
    'LR-1-[5520]': {'accuracy': 0.75,
     'averaged_truelabel_likelihood': 0.6819447164950956,
     'macro_F1': 0.7013794023563733,
     'expected_calibration_error_1': 0.05090517836975013},
    'LR-1-[116599]': {'accuracy': 0.8203125,
     'averaged_truelabel_likelihood': 0.7635070219513804,
     'macro_F1': 0.8188427825910726,
     'expected_calibration_error_1': 0.058311366716639106},
    'LR-1-[105958]': {'accuracy': 0.6015625,
     'averaged_truelabel_likelihood': 0.6112124095870373,
     'macro_F1': 0.5899747433230508,
     'expected_calibration_error_1': 0.240821269516759

# Main Experiment Runner for All Datasets

In [8]:
import inspect

# Settings
test_samples = 512
seed = 107

# Auto-detect all dataset loader functions in data_loader (e.g., glue_sst2, agnews, yelp, etc.)
dataset_loaders = {
    name: func for name, func in inspect.getmembers(data_loader, inspect.isclass)
    if not name.startswith("_")  # skip private functions
}

# Now split and store them
all_splitted_datasets = []

for name, load_fn in dataset_loaders.items():
    try:
        print(f"Loading and splitting: {name}")
        dataset = load_fn(from_cache=True)
        train_samples = len(dataset) - test_samples
        splitted = dataset_interface.DatasetSplitter(dataset, train_samples, test_samples, seed)
        all_splitted_datasets.append(splitted)
    except Exception as e:
        print(f"Skipping {name} due to error: {e}")


Loading and splitting: agnews
Loading and splitting: datasets_loader
Skipping datasets_loader due to error: datasets_loader.__init__() got an unexpected keyword argument 'from_cache'
Loading and splitting: financial_phrasebank
Loading and splitting: glue_sst2
Loading and splitting: rotten_tomatoes
Loading and splitting: sst5
Loading and splitting: subjective
Loading and splitting: trec
Loading and splitting: tweet_eval_emotion
Loading and splitting: tweet_eval_hate


In [14]:
dataset_loaders.keys()

dict_keys(['agnews', 'datasets_loader', 'financial_phrasebank', 'glue_sst2', 'rotten_tomatoes', 'sst5', 'subjective', 'trec', 'tweet_eval_emotion', 'tweet_eval_hate'])

In [10]:
result, _, _ = run_multiple_calibration_experiments_generic(model=model,
                                                            tokenizer=tokenizer,
                                                            splitted_datasets=all_splitted_datasets,
                                                            seeds=[2206632489],
                                                            k_values=[4],
                                                            param_dic=param_dic,
                                                            methods_to_run=['CC','Baseline'])

Starting experiments on dataset: AGNews
No dataset-specific param_dic found for 'AGNews', using generic config.

=== Running experiment for AGNews | Seed 2206632489, k: 4 ===
Running Baseline...

Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Running Contextual Calibration (CC)...
[105838, 40576, 116599, 39740]
Process: 100%, 3 in 3 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Calibration Training Finished.


Start testing the forward inference function on the dataset: AGNews
Process: 100.0% |   512 / 512 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Starting experiments on dataset: financial_phrasebank
No dataset-specific param_dic found for 'financial_phrasebank', using generic config.

=== Running experiment for financial_phrasebank | Seed 2206632489, k: 4 ===
Running Baseline...

Start testing the forward inference function on 

In [11]:
result

{'AGNews': {'seed_2206632489': {'4': {'Baseline': {'accuracy': 0.8046875,
     'averaged_truelabel_likelihood': 0.7641413473719553,
     'macro_F1': 0.7536930728074809,
     'expected_calibration_error_1': 0.06561180607939154},
    'CC': {'accuracy': 0.810546875,
     'averaged_truelabel_likelihood': 0.6937629945875696,
     'macro_F1': 0.7599705119131871,
     'expected_calibration_error_1': 0.09705838704801596}}}},
 'financial_phrasebank': {'seed_2206632489': {'4': {'Baseline': {'accuracy': 0.796875,
     'averaged_truelabel_likelihood': 0.7263263075424782,
     'macro_F1': 0.7163742559340863,
     'expected_calibration_error_1': 0.03540531476133088},
    'CC': {'accuracy': 0.861328125,
     'averaged_truelabel_likelihood': 0.6850761934791803,
     'macro_F1': 0.8255112659520267,
     'expected_calibration_error_1': 0.1300423525579212}}}},
 'GLUE-SST2': {'seed_2206632489': {'4': {'Baseline': {'accuracy': 0.93359375,
     'averaged_truelabel_likelihood': 0.8619928194188831,
     'macr

In [12]:
result.keys()

dict_keys(['AGNews', 'financial_phrasebank', 'GLUE-SST2', 'rotten_tomatoes', 'SST5', 'Subjective', 'TREC', 'tweet_eval_emotion', 'tweet_eval_hate'])

In [13]:
result['AGNews']

{'seed_2206632489': {'4': {'Baseline': {'accuracy': 0.8046875,
    'averaged_truelabel_likelihood': 0.7641413473719553,
    'macro_F1': 0.7536930728074809,
    'expected_calibration_error_1': 0.06561180607939154},
   'CC': {'accuracy': 0.810546875,
    'averaged_truelabel_likelihood': 0.6937629945875696,
    'macro_F1': 0.7599705119131871,
    'expected_calibration_error_1': 0.09705838704801596}}}}