In [1]:
from poi.llm.evaluate import top_one_accuracy
from poi.dataset.llm import load_prompt_completion_llm_dataset, find_all_user_sids_in_dataset, filter_test
from poi.llm import LLMConfig, load_fast_inference_model
from poi import settings
from pathlib import Path

def get_test_ds(ds_dir: Path):
    """Load the test split found in ``ds_dir`` and run ``filter_test`` over it.

    Both ``train_codebook.json`` and ``test_codebook.json`` are read from
    ``ds_dir``. The user and SID sets of each split are collected, and the
    users / SIDs that occur in the test split but not in the train split are
    handed to ``filter_test`` as ``filter_users`` / ``filter_sids``.

    Args:
        ds_dir: Directory that contains the two codebook JSON files.

    Returns:
        The result of ``test_split.filter(filter_test, fn_kwargs=...)``.
    """
    train_split = load_prompt_completion_llm_dataset(ds_dir / "train_codebook.json")
    test_split = load_prompt_completion_llm_dataset(ds_dir / "test_codebook.json")

    users_in_test, sids_in_test = find_all_user_sids_in_dataset(test_split)
    users_in_train, sids_in_train = find_all_user_sids_in_dataset(train_split)

    # Set difference: identifiers the train split never contains.
    # NOTE(review): presumably filter_test drops examples that involve these
    # unseen users / SIDs -- confirm against poi.dataset.llm.filter_test.
    test_only = {
        "filter_users": users_in_test - users_in_train,
        "filter_sids": sids_in_test - sids_in_train,
    }
    return test_split.filter(filter_test, fn_kwargs=test_only)


# One LLMConfig per city, identified by its run name.
nyc_config, tky_config = (
    LLMConfig(run_name="new-llama3-nyc-base"),
    LLMConfig(run_name="new-llama3-tky-base"),
)

# Per-city dataset directories under settings.DATASETS_DIR; each is expected to
# hold the train/test codebook JSON files that get_test_ds reads.
nyc_ds_dir = (
    settings.DATASETS_DIR
    / "NYC"
    / "New LLM Dataset"
    / "Nrqvae-NYC-div0.25-commit0.25-lr1e-3"
)
tky_ds_dir = (
    settings.DATASETS_DIR
    / "TKY"
    / "New LLM Dataset"
    / "Nrqvae-TKY-div0.25-commit0.25-lr1e-3"
)

# Filtered test sets for each city (NYC is built first, then TKY).
nyc_ds, tky_ds = get_test_ds(nyc_ds_dir), get_test_ds(tky_ds_dir)


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


Skipping import of cpp extensions due to incompatible torch version 2.8.0+cu128 for torchao version 0.14.0         Please see GitHub issue #2919 for more info


🦥 Unsloth Zoo will now patch everything to make training faster!


Generating train split: 2848 examples [00:00, 159543.99 examples/s]
Formatting prompt and completion: 100%|██████████| 2848/2848 [00:00<00:00, 63133.24 examples/s]
Filter: 100%|██████████| 826/826 [00:00<00:00, 53262.23 examples/s]
Generating train split: 7308 examples [00:00, 148266.26 examples/s]
Formatting prompt and completion: 100%|██████████| 7308/7308 [00:00<00:00, 60766.29 examples/s]
Filter: 100%|██████████| 1985/1985 [00:00<00:00, 59631.09 examples/s]


In [2]:
# Cross-city evaluation: score the NYC run's model on the TKY test set.
# NOTE(review): from_hub=True presumably pulls the weights from a model hub
# rather than local disk -- confirm in poi.llm.load_fast_inference_model.
nyc_model = load_fast_inference_model(nyc_config, from_hub=True)

# NOTE(review): the meaning of the trailing positional `True` is defined by
# top_one_accuracy and is not visible from this notebook -- confirm there.
nyc_model_on_tky_ds_acc = top_one_accuracy(nyc_config, nyc_model, tky_ds, True)

# Separator rule, label and score, each on its own line.
print(
    "=" * 100,
    "NYC model on TKY dataset accuracy:",
    nyc_model_on_tky_ds_acc,
    sep="\n",
)

==((====))==  Unsloth 2025.10.9: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.516 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.10.9 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
100%|██████████| 1947/1947 [09:14<00:00,  3.51it/s]

NYC model on TKY dataset accuracy:
0.24550590652285567





In [3]:
# Cross-city evaluation: score the TKY run's model on the NYC test set.
# NOTE(review): from_hub=True presumably pulls the weights from a model hub
# rather than local disk -- confirm in poi.llm.load_fast_inference_model.
tky_model = load_fast_inference_model(tky_config, from_hub=True)

# NOTE(review): the meaning of the trailing positional `True` is defined by
# top_one_accuracy and is not visible from this notebook -- confirm there.
tky_model_on_nyc_ds_acc = top_one_accuracy(tky_config, tky_model, nyc_ds, True)

# Separator rule, label and score, each on its own line.
print(
    "=" * 100,
    "TKY model on NYC dataset accuracy:",
    tky_model_on_nyc_ds_acc,
    sep="\n",
)


==((====))==  Unsloth 2025.10.9: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.516 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


100%|██████████| 805/805 [03:37<00:00,  3.70it/s]

TKY model on NYC dataset accuracy:
0.31180124223602484



