In [1]:
! pip -q install transformers pandas torch sentencepiece python-dotenv

In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re

In [3]:
df = pd.read_csv('data/ABCD.csv')

In [4]:
from dotenv import load_dotenv
import os
from huggingface_hub import login
import os

load_dotenv()

login(os.getenv('HF_TOKEN'))


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
# Replace 'path_to_llama_model' with the actual path to your local LLaMA model
model_path = "meta-llama/Llama-3.2-1B"
 
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [6]:
def get_model_prediction(question, options):
    prompt = f"Question: {question}\nOptions:\n"
    for option, text in options.items():
        prompt += f"{option}. {text}\n"
    prompt += "Answer:"

    inputs = tokenizer(prompt, return_tensors='pt').to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=10,
        temperature=0.7,
        do_sample=False, 
        pad_token_id=tokenizer.eos_token_id
    )
    
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract the answer using regex
    match = re.search(r'Answer:\s*([A-D])', output_text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    else:
        return None

def clean_answer(text):
    cleaned_text = re.sub(r'\d', '', text)
    return cleaned_text

In [7]:
total_questions = len(df)
correct_predictions = 0
wrong_answers = []

for index, row in df.iterrows():
    question = row['Question:']
    options = {
        'A': row['Option A'],
        'B': row['Option B'],
        'C': row['Option C'],
        'D': row['Option D']
    }
    correct_answer = row['Correct answer'].upper()

    prediction = get_model_prediction(question, options)

    if prediction == correct_answer:
        correct_predictions += 1
    else:
        if len(wrong_answers) < 4:
            wrong_answers.append({
                'Question': question,
                'Model Prediction': clean_answer(prediction),
                'Correct Answer': correct_answer,
                'Options': options
            })

    print(f"Processed Question {index + 1}/{total_questions}", end='\r')

accuracy = (correct_predictions / total_questions) * 100



Processed Question 90/90

In [8]:
print(f"\nBenchmark Results:")
print(f"Total Questions: {total_questions}")
print(f"Correct Predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.2f}%\n")

if wrong_answers:
    print("Examples of Wrong Answers:")
    for i, wa in enumerate(wrong_answers, 1):
        print(f"\nExample {i}:")
        print(f"Question: {wa['Question']}")
        for opt, text in wa['Options'].items():
            print(f"  {opt}. {text}")
        print(f"Model Prediction: {wa['Model Prediction']}")
        print(f"Correct Answer: {wa['Correct Answer']}")
else:
    print("No wrong answers found.")


Benchmark Results:
Total Questions: 90
Correct Predictions: 36
Accuracy: 40.00%

Examples of Wrong Answers:

Example 1:
Question: Jaký máme rok?
  A. 1876
  B. 2014
  C. 2023
  D. 2024
Model Prediction: C
Correct Answer: D

Example 2:
Question: Kdo jako první vyřešil Basilejský problém?
  A. Jacob Bernoulli
  B. Leonard Euler
  C. Karl Weierstrass
  D.  Joseph-Louis Lagrange
Model Prediction: D
Correct Answer: B

Example 3:
Question: Jaký je nejstarší ze starověkých 7 divů světa? 
  A. Pyramidy v Gize
  B. Diova socha v Olympii
  C. Rhodsky kolos
  D. Visuté zahrady Semiramidiny
Model Prediction: B
Correct Answer: A

Example 4:
Question: Která z následujících planet je nejblíže Slunci?
  A. Země
  B. Venuše
  C. Merkur
  D. Mars
Model Prediction: D
Correct Answer: C
