# Kaggle LLM Science Exam Perplexity Ranking Ensemble

This code is based on the discussion and notebooks from [Psi](https://www.kaggle.com/code/philippsinger) and [Takamichi Toda](https://www.kaggle.com/takamichitoda)

- https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/424242
- https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
- https://www.kaggle.com/code/takamichitoda/llm-perplexity-ranking-ensemble/notebook

The idea is, rather than using or training an LLM to predict the correct answer given the question and multiple choice answers, we instead:
- Input each answer, prepended with the question, to a pretrained LLM and perform inference
- For each token, see what the model predicts as its logprob
- Calculate the perplexity of each question-answer sequence from these token logprobs -- effectively treating each question plus candidate answer as a tiny dataset, and using the perplexity as a measure of how well the model predicts that text. The correct answer should be the one the model finds most likely, i.e. the one with the lowest perplexity
- For each question, sort the question-answer pairs by perplexity, lowest first, and use that ranking as the model's answer to the question

In [1]:
# !pip install bitsandbytes ipywidgets python-dotenv

In [2]:
import torch

import gc
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [3]:
from huggingface_hub import login, notebook_login
login()
# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import os
from pathlib import Path

# TODO fix up for colab or kaggle if needed. No need for local or docker with volume mapped for HF cache
# cwd = Path(os.getcwd())
# data_dir =  cwd / 'data'
# model_dir = [cwd / 'models']

# GCP version: every path hangs off a hard-coded /workspace root
cwd = Path('/workspace')
# competition data is downloaded to, and read from, this directory
data_dir =  cwd / 'data'
# NOTE(review): model_dir is a one-element list rather than a Path, and no cell shown
# in this notebook reads it -- confirm whether it is still needed
model_dir = [cwd / 'models']

In [6]:
from dotenv import load_dotenv
load_dotenv('/workspace/.env')

True

In [7]:
import os
import subprocess
from pathlib import Path
import zipfile

COMPETITION = 'kaggle-llm-science-exam'

# fix as needed
# data_dir = Path(os.getcwd()) / 'data'
# parents=True so the cell also works when the parent directory does not exist yet
data_dir.mkdir(parents=True, exist_ok=True)

file_path = data_dir / f'{COMPETITION}.zip'

# Download the competition archive only when it is not already cached locally.
# Assumes the `kaggle` CLI is installed and authenticated.
if not file_path.exists():
    subprocess.run(['kaggle', 'competitions', 'download', '-p', str(data_dir), '-c', COMPETITION], check=True)

# Extract independently of the download: an archive left behind by an interrupted
# earlier run (zip present, CSVs missing) would otherwise never be unpacked.
if not (data_dir / 'train.csv').exists():
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)

In [8]:
# Hugging Face model ids; each one is loaded and scored in turn by infer()
MODELS = ["mistralai/Mistral-7B-v0.1",
          "01-ai/Yi-6B"]

In [9]:
train_df = pd.read_csv(data_dir / 'train.csv')

# The KAGGLE_IS_COMPETITION_RERUN environment variable selects between the real
# test set and a local dry run on the labelled training data.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    test_df = pd.read_csv(data_dir / 'test.csv', index_col='id')
    # give test.csv a placeholder "answer" column so the scoring code, which reads
    # row["answer"], does not fail; MAP@3 computed against it is meaningless
    test_df["answer"] = "A"
else:
    # local run: reuse the training set as the evaluation set so MAP@3 can be computed
    test_df = train_df.copy()
    # NOTE(review): 'id' stays a regular column here but is the index in the rerun
    # branch above -- confirm the difference is intended
test_df.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [10]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking/notebook

class Perplexity(nn.Module):
    """Per-sequence next-token cross-entropy used as a perplexity score.

    Despite the name, the value produced is the mean cross-entropy of each
    sequence (i.e. log-perplexity); the exponentiation step is deliberately
    left out. Label positions equal to -100 are skipped by
    ``nn.CrossEntropyLoss``'s default ``ignore_index``.

    Args:
        reduce: when True (default) the per-sequence values are averaged into
            a single scalar; when False one value per sequence is returned.
    """

    def __init__(self, reduce: bool = True):
        super().__init__()
        self.reduce = reduce
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, logits, labels):
        """Return the cross-entropy of ``labels`` under ``logits``.

        ``logits`` is indexed [batch, position, vocab] and ``labels``
        [batch, position]; the logits at position t are scored against the
        label at position t + 1.
        """
        # drop the last prediction and the first label so they line up
        pred_logits = logits[..., :-1, :].contiguous()
        target_ids = labels[..., 1:].contiguous()

        # one loss per sequence, computed independently
        per_sequence = torch.stack(
            [self.loss_fn(pred_logits[row], target_ids[row]) for row in range(labels.shape[0])],
            dim=0,
        )
        # torch.exp(per_sequence) would turn this into true perplexity
        return per_sequence.mean() if self.reduce else per_sequence

In [11]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking/notebook
def precision_at_k(r, k):
    """Return the fraction of the first ``k`` entries of ``r`` that are hits.

    ``r`` is a sequence of 0/1 (or truthy-as-int) relevance flags in ranked
    order. ``k`` must be non-zero and no larger than ``len(r)``; both are
    enforced with assertions.
    """
    assert k <= len(r)
    assert k != 0
    hits = 0
    for flag in r[:k]:
        hits += int(flag)
    return hits / k

# The MAP@3 description in the competition rules
# (https://www.kaggle.com/competitions/kaggle-llm-science-exam) only credits the
# first occurrence of the correct label. The calling code ranks with
# candidate[np.argsort(perplexity)], which never repeats a label, but the metric
# short-circuits after the first hit anyway so that duplicated predictions are
# not counted more than once.
def MAP_at_3(predictions, true_items):
    """Score is mean average precision at 3.

    Args:
        predictions: sequence of ranked prediction lists, one per question;
            only the first three entries of each are considered.
        true_items: sequence with the single correct label for each question,
            aligned with ``predictions``.

    Returns:
        The mean over questions of 1/rank of the first correct prediction
        within the top 3 (0 when it is absent), or 0.0 when ``predictions``
        is empty.
    """
    U = len(predictions)
    if U == 0:
        # nothing to score; avoid dividing by zero
        return 0.0
    map_at_3 = 0.0
    for u in range(U):
        user_true = true_items[u]
        for rank, item in enumerate(predictions[u][:3], start=1):
            if item == user_true:
                # precision at the first hit is exactly 1/rank; stop so that
                # repeated copies of the correct label earn no extra credit
                map_at_3 += 1.0 / rank
                break
    return map_at_3 / U

In [12]:
# shared scorer instance called by infer(); the default reduce=True averages over the batch dimension
perplexity = Perplexity()
# answer column names, in the order used to index each question's list of perplexities
candidate = np.array(['A', 'B', 'C', 'D', 'E'])

In [13]:
# https://www.kaggle.com/code/takamichitoda/llm-perplexity-ranking-ensemble
def infer(model_name, infer_df):
    """Score every row of ``infer_df`` with one causal LM and rank its answers.

    For each question, every candidate answer is appended to the question,
    run through the model, and scored with the module-level ``perplexity``
    scorer (lower means the model finds the text more likely).

    Args:
        model_name: Hugging Face model id passed to ``from_pretrained``.
        infer_df: DataFrame with a ``prompt`` column, one column per entry of
            ``candidate`` and an ``answer`` column.

    Returns:
        ``(perps, scores)``. ``perps[q]`` is a list holding one 0-dim CPU
        tensor per candidate for question ``q``, left unsorted so several
        models can be ensembled later. ``scores[q]`` is the MAP@3 of that
        question's ranking against ``row["answer"]``.
    """
    # tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # padding=True below needs a pad token; fall back to EOS
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): trust_remote_code=True runs Python code shipped with the model
    # repository -- only use it with model ids you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        load_in_8bit=True,
        device_map="auto",
        trust_remote_code=True,
    )

    perps, scores = [], []
    for _, row in tqdm(infer_df.iterrows(), total=len(infer_df)):
        inp = row["prompt"]
        cands = [f"Question: {inp}\n Answer: {row[c]}" for c in candidate]

        with torch.no_grad():
            # use the model's own device instead of building "cuda:<index>", which
            # produces the invalid "cuda:None" when the device has no index
            inputs = tokenizer(cands, return_tensors="pt", padding=True, truncation=True).to(model.device)
            logits = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask).logits
            # out-of-place masked_fill: padding positions become -100 (ignored by the
            # loss) without mutating inputs.input_ids
            labels = inputs.input_ids.masked_fill(~inputs.attention_mask.bool(), -100)
            # score each candidate separately; .float() keeps the result convertible
            # to numpy even if the loss comes back in bfloat16
            perp = [
                perplexity(logits[i].unsqueeze(0), labels[i].unsqueeze(0)).float().cpu()
                for i in range(len(cands))
            ]

        # save perplexity not the sorted predictions for each question so we can use it for ensembling
        perps.append(perp)

        # calculate the MAP@3 score for each question, MAP_at_3 expects a list of lists and a list of answers
        # but we're doing them individually here so we need to wrap them in lists
        preds = candidate[np.argsort(np.asarray(perp))]
        score = MAP_at_3([preds], [row["answer"]])
        scores.append(score)

        torch.cuda.empty_cache()
        gc.collect()

    return perps, scores


In [14]:
# TODO delete
# sanity check: identical logits over 3 classes should give a cross-entropy of ln(3) ~= 1.0986
perplexity(torch.tensor([[[0.1, 0.1, 0.1], [0.1, 0.1, 0.1]]]), torch.tensor([[0, 0]])).cpu()

tensor(1.0986)

In [15]:
# run every model over the evaluation set, keeping the raw per-answer perplexities for ensembling
model_perps = []
for model_name in MODELS:
    perps, scores = infer(model_name, test_df)
    model_perps.append(perps)
    # free cached GPU memory before the next model is loaded
    torch.cuda.empty_cache()
    gc.collect()

    print(f"{model_name} MAP@3: {np.mean(scores)}")

# NOTE: `perps` is rebound here from the last model's list to an array indexed [model][question][answer]
perps = np.array(model_perps)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 200/200 [02:42<00:00,  1.23it/s]


mistralai/Mistral-7B-v0.1 MAP@3: 0.7208333333333333


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 200/200 [01:33<00:00,  2.15it/s]

01-ai/Yi-6B MAP@3: 0.6283333333333333





In [18]:
perps[0][:10]

array([[1.9469231, 1.8957705, 2.1129675, 1.846193 , 1.9246615],
       [2.5738037, 2.6259425, 2.5766556, 2.5453894, 2.6985378],
       [2.350564 , 2.1702247, 2.1937566, 2.3108087, 2.1098208],
       [3.1203482, 2.9559467, 2.8621874, 3.0236456, 3.1727817],
       [1.7092881, 1.6327369, 1.7643096, 1.5720744, 1.7078724],
       [2.2709692, 2.0655148, 1.9405072, 1.9165689, 2.0688221],
       [1.9622561, 2.2139924, 1.9954762, 2.094616 , 2.4284868],
       [1.6194817, 1.6595483, 1.6707814, 1.7516892, 1.5812576],
       [3.0847971, 2.9435732, 3.0494156, 3.3751962, 3.3799398],
       [2.4534338, 2.6544611, 2.344752 , 2.5890644, 2.217326 ]],
      dtype=float32)

In [19]:
# ensemble by averaging each answer's perplexity across models (axis 0 is the model axis)
perps_avg = perps[0] if len(perps) == 1 else np.mean(perps, axis=0)

Let's see whether the ensembling has actually helped

In [62]:
# Score each model on its own, then the averaged ensemble.
# `[*perps, perps_avg]` works for any number of models (the hard-coded perps[1]
# failed with a single model), and pairing perplexities with answers by position
# avoids relying on test_df's index labels being 0..N-1.
for _perps in [*perps, perps_avg]:
    _scores = []
    for perp, answer in zip(_perps, test_df["answer"]):
        preds = candidate[np.argsort(np.asarray(perp))]
        _scores.append(MAP_at_3([preds], [answer]))

    print(f"MAP@3: {np.mean(_scores)}")
    # [" ".join(candidate[np.argsort(perp)][:3]) for perp in _perps]

MAP@3: 0.7208333333333333
MAP@3: 0.6283333333333333
MAP@3: 0.675


No! Averaging gives 0.675, which is below Mistral on its own (0.721). The `"01-ai/Yi-6B"` model isn't as strong as `Mistral-7b` for this task, so it drags the ensemble down. There are likely to be combinations that help here, but working that out can be for another time when it's actually needed. The code is here and working.

For now, just go with the Mistral model

In [63]:
# sample_submission.csv provides the submission frame that the predictions are written into
# NOTE(review): assumes its rows are in the same order and number as test_df -- confirm
sub_df = pd.read_csv(data_dir / 'sample_submission.csv')

# Uncomment to use ensembled predictions
# sub_df["prediction"] = [" ".join(candidate[np.argsort(perp)][:3]) for perp in perps_avg]

# otherwise use the first model
# each prediction is the three lowest-perplexity answer letters, space separated
sub_df["prediction"] = [" ".join(candidate[np.argsort(perp)][:3]) for perp in perps[0]]

In [64]:
sub_df.head()

Unnamed: 0,id,prediction
0,0,D B E
1,1,D A C
2,2,E B C
3,3,C B D
4,4,D B E


In [65]:
sub_df.to_csv(data_dir / 'submission.csv', index=False)

In [66]:
pd.read_csv(data_dir / "submission.csv")

Unnamed: 0,id,prediction
0,0,D B E
1,1,D A C
2,2,E B C
3,3,C B D
4,4,D B E
...,...,...
195,195,E C B
196,196,C B D
197,197,B C A
198,198,D C B


## TESTING -- TODO delete below

In [53]:
assert 0, "TODO: delete below here"

AssertionError: TODO: delete below here

In [None]:
# scratch copy of the evaluation loop, scoring a single set of perplexities
_scores = []
# _perps = perps_avg
_perps = perps[0]
# _perps = perps[1]
# NOTE(review): `row` here is the (index, Series) tuple from iterrows() and is unused;
# the row is fetched again by position on the next line
for i, row in enumerate(test_df.iterrows()):
    _row = test_df.iloc[i]
    # _perp = perps_avg[i]
    _perp = _perps[i]
    _preds = candidate[np.argsort(np.asarray(_perp))]
    m = MAP_at_3([_preds], [_row["answer"]])
    _scores.append(m)

print(f"MAP@3: {np.mean(_scores)}")
# top-3 answer letters per question, in submission format
[" ".join(candidate[np.argsort(perp)][:3]) for perp in _perps]

In [54]:
# NOTE(review): model, tokenizer and inputs are only created at notebook scope by the
# scratch cells further down, so this cell raises NameError on a fresh Restart & Run All
del model
del tokenizer
del inputs
torch.cuda.empty_cache()
gc.collect()

968

In [31]:
# scratch: load the first model at notebook scope so tokenization can be inspected.
# Unlike infer(), this tokenizer is created with padding_side="left".
model_name = MODELS[0]
print(f"Using model {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    load_in_8bit=True,
    device_map="auto",
    trust_remote_code=True,
)

Using model mistralai/Mistral-7B-v0.1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
# scratch: tokenize the five candidates for the first question and show the first
# one decoded together with its attention mask, to check where the padding lands
row = test_df.iloc[0]
inp = row["prompt"]
cands = [f"Question: {inp}\n Answer: {row[c]}" for c in candidate]
inputs = tokenizer(cands, return_tensors="pt", padding=True, truncation=True).to(f"cuda:{model.device.index}")
print(tokenizer.decode(inputs.input_ids[0]), inputs.attention_mask[0])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


</s></s></s></s></s></s></s></s></s><s> Question: Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?
 Answer: MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter." tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')


In [35]:
# scratch: the same per-question scoring as infer(), but using the notebook-scope
# (left-padded) tokenizer and model loaded above
perps_testing = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    inp = row["prompt"]
    cands = [f"Question: {inp}\n Answer: {row[c]}" for c in candidate]

    with torch.no_grad():
        inputs = tokenizer(cands, return_tensors="pt", padding=True, truncation=True).to(f"cuda:{model.device.index}")
        output = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask,)
        output = output.logits
        # labels alias inputs.input_ids, so the in-place fill below also overwrites the padded input ids
        labels = inputs.input_ids
        labels.masked_fill_(~inputs["attention_mask"].bool(), -100)
        perp = [perplexity(output[i].unsqueeze(0), labels[i].unsqueeze(0)).cpu() for i in range(len(cands))]
        perps_testing.append(perp)
        # print(perp)
        # break

100%|██████████| 200/200 [02:02<00:00,  1.63it/s]


In [36]:
candidate[np.argsort(perp)], row.answer

(array(['E', 'A', 'B', 'C', 'D'], dtype='<U1'), 'C')

In [51]:
torch.cuda.empty_cache()
gc.collect()

10016

In [None]:
# scratch: quick smoke test of infer() on the first model and the first 10 rows only
model_perps = []
for model_name in [MODELS[0]]:
    perps, scores = infer(model_name, test_df[:10])
    model_perps.append(perps)
    torch.cuda.empty_cache()
    gc.collect()

    print(f"{model_name} MAP@3: {np.mean(scores)}")

# NOTE(review): this rebinds the global `perps`, replacing the full-run results with the 10-row subset
perps = np.array(model_perps)