# Task 01



In [1]:
import torch
import transformers

from datasets import load_dataset

dataset = load_dataset("csv", data_files="ABCD.csv")

dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Timestamp', 'Question:', 'Option A', 'Option B', 'Option C', 'Option D', 'Correct answer'],
        num_rows: 90
    })
})

In [None]:
from huggingface_hub import login
login(token="...")

In [3]:
dataset["train"][0]

{'Timestamp': '11/11/2024 11:18:42',
 'Question:': 'Jaký máme rok?',
 'Option A': '1876',
 'Option B': '2014',
 'Option C': '2023',
 'Option D': '2024',
 'Correct answer': 'D'}

In [4]:
def get_prompt(x):
    question, options = x["Question:"], (
        x["Option A"],
        x["Option B"],
        x["Option C"],
        x["Option D"],
    )
    text = f"""Which option A/B/C/D is the best answer for the question. Answer with one letter, no explanation.

Question (in Czech): {question}

Options:
A) {options[0]}
B) {options[1]}
C) {options[2]}
D) {options[3]}

Answer (just 1 letter, A/B/C/D):"""
    return text


prompts = [get_prompt(x) for x in dataset["train"]]

print(prompts[0])

Which option A/B/C/D is the best answer for the question. Answer with one letter, no explanation.

Question (in Czech): Jaký máme rok?

Options:
A) 1876
B) 2014
C) 2023
D) 2024

Answer (just 1 letter, A/B/C/D):


In [5]:
prompts[2]

'Which option A/B/C/D is the best answer for the question. Answer with one letter, no explanation.\n\nQuestion (in Czech): Který z následujících kontinentů je největší podle rozlohy?\n\nOptions:\nA) Afrika\nB) Evropa\nC) Austrálie\nD) Asie\n\nAnswer (just 1 letter, A/B/C/D):'

In [25]:
from transformers import pipeline
pipe = pipeline("text-generation", model="Qwen/Qwen2.5-7B-Instruct")


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [26]:
messages = [
    {"role": "user", "content": prompts[40]},
]

response = pipe(
    messages,
    max_length=150, 
    do_sample=False,
    temperature=0.1, 
    top_p=0.1, 
    top_k=1,
    num_return_sequences=1,
    pad_token_id=pipe.tokenizer.eos_token_id
)
response

[{'generated_text': [{'role': 'user',
    'content': 'Which option A/B/C/D is the best answer for the question. Answer with one letter, no explanation.\n\nQuestion (in Czech): Mám tři jablka, ze dvou udělám štrůdl a jedno další koupím. Kolik budu mít jablek?\n\nOptions:\nA) pět\nB) čtyři\nC) jedno\nD) dvě\n\nAnswer (just 1 letter, A/B/C/D):'},
   {'role': 'assistant', 'content': 'D'}]}]

In [30]:


def get_answer(prompt):
    messages = [
        {"role": "user", "content": prompt},
    ]

    response = pipe(
        messages,
        max_length=1000, 
        do_sample=False,
        temperature=0.001, 
        top_p=0.1, 
        top_k=1,
        num_return_sequences=1,
        pad_token_id=pipe.tokenizer.eos_token_id
    )
    answer = response[0]['generated_text'][-1]['content'].strip().upper()[:1]
    print(answer)
    return answer

answer = get_answer(prompts[40])


D


In [31]:
from tqdm.notebook import tqdm

answers = []

for prompt in tqdm(prompts):
    a = get_answer(prompt)
    answers.append(a)

  0%|          | 0/90 [00:00<?, ?it/s]

C
A
D
C
B
C
B
A
A
B
D
D
C
B
A
A
B
C
B
C
A
A
A
C
D
C
D
B
D
C
A
B
C
B
A
D
B
D
D
B
D
C
C
C
B
D
A
A
C
C
C
B
A
B
C
A
D
B
B
C
A
A
C
A
A
A
B
C
B
D
A
C
C
B
A
A
A
C
B
C
D
B
B
B
C
A
C
D
B
D


In [32]:
correct = 0

for answer, solution in zip(answers, dataset["train"]["Correct answer"]):
    if answer == solution:
        correct += 1

# 80% for GPT3.5 (11 mistakes)
# 94% for GPT4 (3 mistakes)
correct / len(answers)

0.6444444444444445

In [35]:
# List the mistakes
wrong_answers = []

for answer, x in zip(answers, dataset["train"]):
    if answer != x["Correct answer"]:
        wrong_answers.append((answer, str(x)))

wrong_answers[:4]

[('C',
  "{'Timestamp': '11/11/2024 11:18:42', 'Question:': 'Jaký máme rok?', 'Option A': '1876', 'Option B': '2014', 'Option C': '2023', 'Option D': '2024', 'Correct answer': 'D'}"),
 ('A',
  "{'Timestamp': '11/11/2024 11:19:23', 'Question:': 'Kdo jako první vyřešil Basilejský problém?', 'Option A': 'Jacob Bernoulli', 'Option B': 'Leonard Euler', 'Option C': 'Karl Weierstrass', 'Option D': ' Joseph-Louis Lagrange', 'Correct answer': 'B'}"),
 ('C',
  "{'Timestamp': '11/11/2024 11:19:41', 'Question:': 'Jaký je nejstarší ze starověkých 7 divů světa? ', 'Option A': 'Pyramidy v Gize', 'Option B': 'Diova socha v Olympii', 'Option C': 'Rhodsky kolos', 'Option D': 'Visuté zahrady Semiramidiny', 'Correct answer': 'A'}"),
 ('B',
  "{'Timestamp': '11/11/2024 11:20:37', 'Question:': 'Kolika let se dožil Bedřich Smetana?', 'Option A': '60', 'Option B': '84', 'Option C': '69', 'Option D': '47', 'Correct answer': 'A'}")]