### 1. Download and Load Required Libraries

In [None]:
# Install the third-party packages this notebook uses; only transformers is pinned to an exact version.
# NOTE(review): peft is imported in the next cell but is not installed here — presumably it is
# preinstalled in the runtime; confirm, or add it to this list.
!pip install datasets
!pip install transformers==4.48.2
!pip install rouge-score
!pip install openai

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
import torch
import time
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import PeftModel
from datasets import load_dataset, Dataset
from google.colab import files, drive
from rouge_score import rouge_scorer
from openai import OpenAI

### 2. Setting up System Prompt

In [None]:
# Instruction text asking the model to structure its reply in four tagged sections
# (<goal_detector>, <plan_generator>, <projector>, <executer>). The benchmark later
# reads the final answer from the <executer> section, so the tag spelling here must
# match the one used by the answer extractor.
SYSTEM_PROMPT = """
You will be given a question. What you have to do is generate your answer by strictly following the below instructions,
First identify the goal and wrap it inside <goal_detector>...</goal_detector>tags.
Then generate the step wise plan (with numeric calculation if needed) to achieve the goal (solve the question), and wrap it inside <plan_generator>....</plan_generator> tags.
Think above plan works or not, and wrap it inside <projector>....</projector> tags.
Finally wrap your final answer inside <executer>...</executer> tags.

Your final answer must be in the below format,
<goal_detector>
........
</goal_detector>
<plan_generator>
........
</plan_generator>
<projector>
........
</projector>
<executer>
........
</executer>
Make sure to include all the 8 opening and closing tags in your final answer.
"""

### 3. Load Model and Tokenizer

In [None]:
# Mount Google Drive so the adapter directory stored there is readable from this runtime.
drive.mount('/content/drive')
# Hugging Face identifier of the base checkpoint; also used to load the tokenizer.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
# Drive path handed to PeftModel.from_pretrained as the LoRA adapter to load.
lora_model_name = "/content/drive/My Drive/base_model_v3"

Mounted at /content/drive


In [None]:
def _load_base_model():
    """Load a fresh copy of the base checkpoint in bfloat16 and move it to the GPU."""
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map=None
    ).to("cuda")

# Baseline model: untouched base weights, used for the base-model benchmark.
model = _load_base_model()

# Novel Model
# PeftModel.from_pretrained injects the adapter layers into the model object it is
# given, and merge_and_unload() folds the LoRA weights into that same object and
# returns it. Wrapping `model` directly would therefore leave `model` and
# `merged_model` pointing at the same merged weights, so the "base" benchmark would
# silently evaluate the fine-tuned model. Give the adapter its own copy instead.
peft_model = PeftModel.from_pretrained(_load_base_model(), lora_model_name)
merged_model = peft_model.merge_and_unload()

# Common Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

### 4. Load Dataset

In [None]:
def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_svamp_questions(split = "test") -> Dataset:
    """Load one split of ChilleD/SVAMP and add chat-style fields to every row.

    Each row gains:
      * 'prompt': a system message holding SYSTEM_PROMPT followed by a user
        message made of the row's Body and Question joined by a single space.
      * 'answer': the result of extract_hash_answer on the row's Answer.
    """
    def add_prompt_and_answer(row):
        system_message = {'role': 'system', 'content': SYSTEM_PROMPT}
        user_message = {'role': 'user', 'content': row['Body'] + ' ' + row['Question']}
        return {
            'prompt': [system_message, user_message],
            # NOTE(review): extract_hash_answer returns None when Answer has no
            # "####" marker — confirm SVAMP answers actually use that format.
            'answer': extract_hash_answer(row['Answer']),
        }

    svamp_splits = load_dataset('ChilleD/SVAMP', 'default')
    return svamp_splits[split].map(add_prompt_and_answer)

In [None]:
def extract_xml_answer(text: str) -> str:
    """Return the stripped text between the last "<executer>" and the next "</executer>".

    If the opening tag is missing the whole text is used; if the closing tag is
    missing everything after the opening tag is used.
    """
    after_last_open = text.rpartition("<executer>")[2]
    inner = after_last_open.partition("</executer>")[0]
    return inner.strip()

### 5. Benchmark for SVAMP

In [None]:
# Evaluate the novel (LoRA-merged) model
def generate(question):
    """Return merged_model's decoded reply to a single question.

    The chat consists of an empty system message and one user message that
    holds SYSTEM_PROMPT, a newline, and the question. Only the newly generated
    tokens (not the prompt) are decoded, with special tokens skipped.
    """
    chat = [
        {"role": "system", "content": ""},
        {"role": "user", "content": SYSTEM_PROMPT + '\n' + question}
    ]
    prompt_text = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )
    encoded = tokenizer([prompt_text], return_tensors="pt").to(merged_model.device)

    output_ids = merged_model.generate(
        **encoded,
        max_new_tokens=1024,
        temperature=0.1,
    )

    # The output starts with the prompt tokens; drop them so only the completion is decoded.
    prompt_length = encoded.input_ids.shape[1]
    completion_ids = output_ids[:, prompt_length:]

    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

def extract_final_answer(response):
    """Return the final answer of a tagged response by delegating to extract_xml_answer."""
    return extract_xml_answer(response)

def _answer_matches(ground_truth, answer_text):
    """Return True when a number from ground_truth also occurs as a number in answer_text.

    Numbers are compared by value (so "27" matches "27.0") and as whole tokens
    (so "27" does not match inside "127").
    """
    def numbers_in(text):
        # Drop thousands separators ("1,000" -> "1000") before tokenising.
        cleaned = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', str(text))
        return {float(token) for token in re.findall(r'\d+(?:\.\d+)?', cleaned)}

    return bool(numbers_in(ground_truth) & numbers_in(answer_text))


def accuracy_check(num_examples=100):
    """Benchmark the current generate/extract_final_answer pair on SVAMP.

    Runs the first ``num_examples`` rows of the SVAMP test split (fewer if the
    split is smaller), prints each response, and returns the fraction of
    examples whose extracted answer contains that example's ground-truth number.

    Each example is scored against its own response only. The previous
    implementation pooled all responses and counted a ground-truth value as
    correct if any response anywhere contained it as a substring, which
    overstated accuracy.

    Returns:
        float in [0, 1]; 0.0 when no examples were evaluated.
    """
    dataset = get_svamp_questions()
    dataset = dataset.select(range(min(num_examples, len(dataset))))
    correct = 0
    total = 0
    for example in dataset:
        question = example["Body"] + " " + example['Question']
        ground_truth = example['Answer']
        response = generate(question)
        final_answer = extract_final_answer(response)
        print("\n==============================================")
        print(f"🔹 Full response: {response}")
        print(f"🔹 Extracted: {final_answer}")
        print(f"🎯 Ground Truth: {ground_truth}")
        print("==============================================\n")
        total += 1
        if _answer_matches(ground_truth, final_answer):
            correct += 1
    # Guard against an empty selection instead of dividing by zero.
    return correct / total if total else 0.0

accuracy_check()



🔹 Full response: <goal_detector>
Determine how many more bird families flew away to Africa compared to those that flew away to Asia.
</goal_detector>

<plan_generator>
1. Calculate the number of bird families that flew away to Africa: 
   Number of bird families flying to Africa = 62

2. Calculate the difference between the number of bird families flying to Africa and those flying to Asia:
   Difference = Number of bird families flying to Africa - Number of bird families flying to Asia
                = 62 - 35
                = 27

The projector confirms that there are 27 more bird families who flew away to Africa than those who flew away to Asia.
</projector>

<executer>
There were 41 bird families initially. Out of these, 35 flew to Asia and 62 flew to Africa. The difference in the number of birds flying to each continent is 27. Therefore, 27 more bird families flew to Africa than to Asia.
</executer>
🔹 Extracted: There were 41 bird families initially. Out of these, 35 flew to Asia

0.95

In [None]:
# Evaluate the base model
def generate(question):
    """Return model's decoded reply to a single question.

    The chat consists of an empty system message and one user message holding
    the question as-is. Only the newly generated tokens (not the prompt) are
    decoded, with special tokens skipped.
    """
    chat = [
        {"role": "system", "content": ""},
        {"role": "user", "content": question}
    ]
    prompt_text = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )
    encoded = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    output_ids = model.generate(
        **encoded,
        max_new_tokens=1024,
        temperature=0.1,
    )

    # The output starts with the prompt tokens; drop them so only the completion is decoded.
    prompt_length = encoded.input_ids.shape[1]
    completion_ids = output_ids[:, prompt_length:]

    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

def extract_final_answer(response):
    """Return the contents of the first \\boxed{...} in the response.

    The capture is non-greedy, so it ends at the first closing brace. Returns
    the string "No match found" when the response has no \\boxed{...}.
    """
    boxed = re.search(r'\\boxed\{(.*?)\}', response)
    return boxed.group(1) if boxed else "No match found"

def _answer_matches(ground_truth, answer_text):
    """Return True when a number from ground_truth also occurs as a number in answer_text.

    Numbers are compared by value (so "27" matches "27.0") and as whole tokens
    (so "27" does not match inside "127").
    """
    def numbers_in(text):
        # Drop thousands separators ("1,000" -> "1000") before tokenising.
        cleaned = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', str(text))
        return {float(token) for token in re.findall(r'\d+(?:\.\d+)?', cleaned)}

    return bool(numbers_in(ground_truth) & numbers_in(answer_text))


def accuracy_check(num_examples=100):
    """Benchmark the current generate/extract_final_answer pair on SVAMP.

    Runs the first ``num_examples`` rows of the SVAMP test split (fewer if the
    split is smaller), prints each response, and returns the fraction of
    examples whose extracted answer contains that example's ground-truth number.

    Two defects of the previous version are fixed here:
      * it called get_gsm8k_questions, which this notebook never defines, while
        reading SVAMP fields (Body/Question/Answer); it now uses
        get_svamp_questions.
      * it pooled all responses and counted a ground-truth value as correct if
        any response anywhere contained it as a substring; each example is now
        scored against its own response only.

    Returns:
        float in [0, 1]; 0.0 when no examples were evaluated.
    """
    dataset = get_svamp_questions()
    dataset = dataset.select(range(min(num_examples, len(dataset))))
    correct = 0
    total = 0
    for example in dataset:
        question = example["Body"] + " " + example['Question']
        ground_truth = example['Answer']
        response = generate(question)
        capped_response = extract_final_answer(response)
        print("\n==============================================")
        print(f"🔹 Full response: {response}")
        print(f"🎯 Ground Truth: {ground_truth}")
        print("==============================================\n")
        total += 1
        if _answer_matches(ground_truth, capped_response):
            correct += 1
    # Guard against an empty selection instead of dividing by zero.
    return correct / total if total else 0.0

accuracy_check()



🔹 Full response: To determine how many more bird families flew away to Africa than those that flew away to Asia, we need to follow these steps:

1. Identify the number of bird families that flew away to each destination.
   - Bird families flying to Asia: 35
   - Bird families flying to Africa: 62

2. Calculate the difference between the number of bird families flying to Africa and those flying to Asia.
   \[
   62 - 35 = 27
   \]

So, the number of bird families that flew away to Africa is 27 more than those that flew away to Asia.

The final answer is:
\[
\boxed{27}
\]
🎯 Ground Truth: 27


🔹 Full response: To determine how many fish disappeared, we need to find out how many fish Paige originally had and then subtract the number of fish that are left.

First, let's calculate the total number of fish Paige initially had:
\[
7 \text{ (goldfish)} + 12 \text{ (catfish)} = 19 \text{ (total fish)}
\]

Next, we know that after some fish were eaten by stray cats, Paige now has 15 fish left. 

0.8