In [None]:
model_id: str = (
    # "microsoft/Phi-3-mini-128k-instruct"
    "model_artifacts/Phi-3-mini-dpo-sardukar"
)
config: str = "anatomy"
eval_split: str = "test"
instruction_type: str = "cot"

In [2]:
from datasets import load_dataset

dataset = load_dataset("cais/mmlu", config, split=eval_split)

display(dataset)
display(dataset[0])

Dataset({
    features: ['question', 'subject', 'choices', 'answer'],
    num_rows: 135
})

{'question': 'A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral',
 'subject': 'anatomy',
 'choices': ['paralysis of the facial muscles.',
  'paralysis of the facial muscles and loss of taste.',
  'paralysis of the facial muscles, loss of taste and lacrimation.',
  'paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.'],
 'answer': 0}

In [3]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv("../.env")
hf_token = os.environ["HF_TOKEN"]
print(hf_token[:8] + "*" * (len(hf_token) - 13) + hf_token[-5:])

login(hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


hf_AAlNN************************hCQBo


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
if "gemma" in model_id:
    bnb_config = None

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    attn_implementation=("eager" if "gemma" in model_id else None),
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
answer_map = ["A", "B", "C", "D"]

user_message_template = """
You are an expert on {config} who is tasked with answering multiple choice questions as shown in the example below.
{instruction}

Example: How many legs does a banana have?

A: 0
B: 1
C: 2
D: 3

Answer: {example_answer}

Question: {question}

{choices}

Answer: """[1:]

instructions = {
    "standard": (
        "Provide only your final letter choice with no explanations. No yapping.",
        "A"
    ),
    "cod": (
        "Think step by step. Return the answer at the end of the response after a separator ####.",
        "Step 1: A banana is not an animal. Step 2: Therefore it has no legs. Step 3: The correct answer is A. #### A"
    ),
    "cot": (
        "Think step by step. Return the answer at the end of the response after a separator ####.",
        "A banana is not an animal; therefore it has no legs. The correct answer is A. #### A"
    ),
}

def generate_user_message(sample: dict, instruction_type: str = instruction_type) -> str:
    instruction, example_answer = instructions[instruction_type]
    return user_message_template.format(
        config=config,
        instruction=instruction,
        example_answer=example_answer,
        question=sample["question"],
        choices="\n".join(
            f"{answer_map[i]}: {choice}"
            for i, choice in enumerate(sample["choices"])
        )
    )

print(generate_user_message(dataset[0]))

You are an expert on anatomy who is tasked with answering multiple choice questions as shown in the example below.
Think step by step. Return the answer at the end of the response after a separator ####.

Example: How many legs does a banana have?

A: 0
B: 1
C: 2
D: 3

Answer: A banana is not an animal; therefore it has no legs. The correct answer is A. #### A

Question: A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral

A: paralysis of the facial muscles.
B: paralysis of the facial muscles and loss of taste.
C: paralysis of the facial muscles, loss of taste and lacrimation.
D: paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.

Answer: 


In [6]:
from more_itertools import unzip
from tqdm import tqdm

def generate_prompt(sample) -> str:
    user_message = generate_user_message(sample)
    messages = [{"role": "user", "content": user_message}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    assert isinstance(prompt, str)
    return prompt

def batch_predict(dataset, batch_size=8):
    all_predictions = dict[int, str]()
    indexed_prompts = sorted(
        enumerate(map(generate_prompt, dataset)),
        key=lambda i_prompt: len(i_prompt[1])
    )

    for i in tqdm(range(0, len(indexed_prompts), batch_size), desc="Batch"):
        batch_idxs, batch_prompts = unzip(indexed_prompts[i: i + batch_size])
        batch_idxs, batch_prompts = list(batch_idxs), list(batch_prompts)

        batch_inputs = tokenizer(batch_prompts, padding=True, return_tensors="pt")
        batch_input_length = batch_inputs["input_ids"].shape[1]

        with torch.no_grad():
            batch_outputs = model.generate(
                **batch_inputs.to(model.device),
                max_new_tokens=512,
                do_sample=True,
                temperature=0.001,
                top_p=0.999,
            )

        for j, output in zip(batch_idxs, batch_outputs):
            all_predictions[j] = tokenizer.decode(
                output[batch_input_length:], skip_special_tokens=True
            )

    return dataset.add_column(
        "prediction", [all_predictions[i] for i in range(len(dataset))]
    )

example_preds = batch_predict(dataset.select(range(4)), batch_size=2)
display(example_preds["prediction"])

Batch:   0%|          | 0/2 [00:00<?, ?it/s]

Batch: 100%|██████████| 2/2 [00:08<00:00,  4.22s/it]


['The facial nerve (cranial nerve VII) is responsible for innervating the muscles of facial expression, carrying taste sensations from the anterior two-thirds of the tongue, and stimulating the lacrimal glands for tear production. A lesion at the stylomastoid foramen, which is the exit point of the facial nerve from the skull, would affect all these functions. Therefore, the correct answer is D. #### D',
 'A "dished face" profile is often associated with a protruding mandible due to reactivation of the condylar cartilage by acromegaly. The correct answer is A. #### A',
 'The structure that collects urine in the body is the bladder. The correct answer is A. #### A',
 'Ectomesenchyme is a type of embryonic connective tissue that contributes to the formation of certain structures in the body. Among the options provided, skeletal muscles are derived from ectomesenchyme. Therefore, the correct answer is B. #### B']

In [7]:
def postprocess_predictions(sample: dict) -> dict:
    prediction_letter = sample["prediction"].split("####")[-1].strip()
    if len(prediction_letter) > 1:
        prediction_letter = ""
    return {
        "answer_letter": answer_map[sample["answer"]],
        "prediction_letter": prediction_letter,
    }

dataset = batch_predict(dataset).map(postprocess_predictions)
display(dataset.to_pandas())

Batch: 100%|██████████| 17/17 [02:06<00:00,  7.42s/it]


Map:   0%|          | 0/135 [00:00<?, ? examples/s]

Unnamed: 0,question,subject,choices,answer,prediction,answer_letter,prediction_letter
0,A lesion causing compression of the facial ner...,anatomy,"[paralysis of the facial muscles., paralysis o...",0,The facial nerve (cranial nerve VII) is respon...,A,D
1,"A ""dished face"" profile is often associated with",anatomy,[a protruding mandible due to reactivation of ...,1,"A ""dished face"" profile is often associated wi...",B,A
2,Which of the following best describes the stru...,anatomy,"[Bladder, Kidney, Ureter, Urethra]",0,The structure that collects urine in the body ...,A,A
3,Which of the following structures is derived f...,anatomy,"[Motor neurons, Skeletal muscles, Melanocytes,...",2,Ectomesenchyme is a type of embryonic connecti...,C,B
4,Which of the following describes the cluster o...,anatomy,"[Afferent arteriole, Glomerulus, Loop of Henle...",1,The cluster of blood capillaries found in each...,B,B
...,...,...,...,...,...,...,...
130,The dorsal roots of all spinal nerves contain,anatomy,"[sensory neuronal processes., sensory and auto...",0,The dorsal roots of all spinal nerves contain ...,A,A
131,Which of the following is the master gland of ...,anatomy,"[Adrenal, Pancreas, Pineal, Pituitary]",3,The pituitary gland is known as the master gla...,D,D
132,Loss of somatic sensation over the anterior tw...,anatomy,[lingual branch of the mandibular trigeminal n...,0,The anterior two-thirds of the tongue's somati...,A,A
133,"In men, specimens for gonococcal cultures are ...",anatomy,"[Anus, Bladder, Urethra, Testicle]",2,Gonococcal cultures are used to diagnose infec...,C,C


In [8]:
fmt_acc = sum(
    pred.strip() in answer_map
    for pred in dataset["prediction_letter"]
) / len(dataset)
ans_acc = sum(
    pred.strip() == ans.strip()
    for pred, ans in zip(dataset["prediction_letter"], dataset["answer_letter"])
) / len(dataset)
print(f"Format Accuracy: {fmt_acc}")
print(f"Answer Accuracy: {ans_acc}")

Format Accuracy: 0.9777777777777777
Answer Accuracy: 0.6888888888888889


In [None]:
#                                 standard               cod               cot
#                           format  answer    format  answer    format  answer
# Phi-3-mini-4k-instruct     1.000   0.615     1.000   0.652     0.970   0.667
# Phi-3-mini-128k-instruct   1.000   0.630     0.985   0.637     0.985   0.696
#   fintuned-sardukar        1.000   0.622     0.993   0.667     0.978   0.689