### 1. Download and Install Required Libraries

In [None]:
!pip install datasets
!pip install transformers==4.48.2
!pip install rouge-score
!pip install openai

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import torch
import time
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import PeftModel
from datasets import load_dataset, Dataset
from google.colab import files, drive
from rouge_score import rouge_scorer
from openai import OpenAI

In [None]:
# Print the installed transformers package metadata to confirm which version is active.
!pip show transformers

Name: transformers
Version: 4.48.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers


### 2. System Prompt

In [None]:
# Instruction text used when building chat messages in this notebook. It asks
# the model to answer in four tagged sections: <goal_detector>, <plan_generator>,
# <projector> and <executer>; the final answer is later read from <executer>.
# This is runtime text sent to the model: any edit changes what the model sees.
SYSTEM_PROMPT = """
You will be given a question. What you have to do is generate your answer by strictly following the below instructions,
First identify the goal and wrap it inside <goal_detector>...</goal_detector>tags.
Then generate the step wise plan (with numeric calculation if needed) to achieve the goal (solve the question), and wrap it inside <plan_generator>....</plan_generator> tags.
Think above plan works or not, and wrap it inside <projector>....</projector> tags.
Finally wrap your final answer inside <executer>...</executer> tags.

Your final answer must be in the below format,
<goal_detector>
........
</goal_detector>
<plan_generator>
........
</plan_generator>
<projector>
........
</projector>
<executer>
........
</executer>
Make sure to include all the 8 opening and closing tags in your final answer.
"""

### 3. Loading Model

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive.
drive.mount('/content/drive')
# Identifier of the base checkpoint handed to the from_pretrained loaders.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
# Drive folder handed to PeftModel.from_pretrained as the LoRA adapter location.
lora_model_name = "/content/drive/My Drive/base_model_v3"

Mounted at /content/drive


In [None]:
def _load_base_model():
    """Load a fresh bfloat16 copy of the base checkpoint and move it to the GPU."""
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map=None
    ).to("cuda")


# Untouched base weights, kept for the "vanilla" benchmark.
model = _load_base_model()

# PEFT injects the LoRA layers into the module it is given, and
# merge_and_unload() folds the adapter weights into that same module. If the
# adapter were attached to `model` directly, `model` would silently become the
# fine-tuned network too, so the adapter gets its own copy of the base weights.
peft_model = PeftModel.from_pretrained(_load_base_model(), lora_model_name)
merged_model = peft_model.merge_and_unload()

# A single tokenizer is shared by both models; EOS doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_questions(split = "test") -> Dataset:
    """Load one split of ``openai/gsm8k`` ('main' config) and map each row.

    The mapping function returns a ``prompt`` (a system turn carrying
    SYSTEM_PROMPT followed by a user turn carrying the question) and an
    ``answer`` reduced to the text after the ``####`` marker.
    """
    def _to_chat_example(row):
        system_turn = {'role': 'system', 'content': SYSTEM_PROMPT}
        user_turn = {'role': 'user', 'content': row['question']}
        return {
            'prompt': [system_turn, user_turn],
            'answer': extract_hash_answer(row['answer'])
        }

    all_splits = load_dataset('openai/gsm8k', 'main')
    return all_splits[split].map(_to_chat_example)

In [None]:
def extract_xml_answer(text: str) -> str:
    """Return the stripped text inside the last ``<executer>`` section.

    Takes everything after the final ``<executer>`` tag (the whole input when
    the tag is absent) and cuts it at the first ``</executer>`` that follows
    (or keeps it all when there is no closing tag).
    """
    after_open = text.rpartition("<executer>")[2]
    before_close = after_open.partition("</executer>")[0]
    return before_close.strip()

In [None]:
# Dry run: send one hand-written word problem through the merged model to check
# that the response follows the tagged format requested by SYSTEM_PROMPT.
# NOTE(review): the prompt text mixes names and pronouns (Sara / his / Bob); it
# is left untouched here because it is the exact input sent to the model.
prompt = "Every day, Sara bakes 10 cakes and puts them in his refrigerator. He does this for 5 days. Carol then comes over and eats 12 of his cakes. If it takes 2 cans of frosting to frost a single cake, how many cans of frosting does Bob need to frost the remaining cakes?"
# The system turn is left empty; the instructions travel inside the user turn.
messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": SYSTEM_PROMPT + '\n' + prompt}
]
# Render the chat as a plain string (no tokenization yet) ending with the
# generation prompt.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# Tokenize as a batch of one and move the tensors to the model's device.
model_inputs = tokenizer([text], return_tensors="pt").to(merged_model.device)

generated_ids = merged_model.generate(
    **model_inputs,
    max_new_tokens=1024,
    temperature=0.1,
)
# Drop the leading prompt-length slice from each output sequence so only the
# newly generated continuation is decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

<goal_detector>
Sara needs to determine how many cans of frosting are required to frost the remaining cakes after Carol eats some.
</goal_detector>

<plan_generator>
1. Calculate the total number of cakes baked by Sara: 
   Total cakes = Number of cakes per day * Number of days = 10 * 5 = 50 cakes.

2. Subtract the number of cakes eaten by Carol from the total:
   Remaining cakes = Total cakes - Cakes eaten by Carol = 50 - 12 = 38 cakes.

3. Calculate the number of cans of frosting needed:
   Cans of frosting = Number of remaining cakes * Cans of frosting per cake = 38 * 2 = 76 cans.
</plan_generator>

<projector>
The calculated steps show that Sara needs 76 cans of frosting to frost the remaining cakes.
</projector>

<executer>
76
</executer>


### 4. Benchmarking on the GSM8K Test Set

In [None]:
# novel model
def generate(question):
    """Answer ``question`` with ``merged_model`` and return the decoded text.

    The user turn is SYSTEM_PROMPT, a newline, then the question; the system
    turn is an empty string. Only the tokens produced after the prompt are
    decoded, with special tokens skipped.
    """
    chat = [
        {"role": "system", "content": ""},
        {"role": "user", "content": SYSTEM_PROMPT + '\n' + question}
    ]
    rendered_chat = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )
    encoded = tokenizer([rendered_chat], return_tensors="pt").to(merged_model.device)

    full_sequences = merged_model.generate(
        **encoded,
        max_new_tokens=1024,
        temperature=0.1,
    )

    # Keep only what comes after the prompt-length prefix of each sequence.
    continuations = []
    for prompt_ids, sequence in zip(encoded.input_ids, full_sequences):
        continuations.append(sequence[len(prompt_ids):])

    decoded = tokenizer.batch_decode(continuations, skip_special_tokens=True)
    return decoded[0]

def extract_final_answer(response):
    """Return the final answer of a tagged response by delegating to ``extract_xml_answer``."""
    return extract_xml_answer(response)

def accuracy_check(num_examples=400):
    """Measure per-example answer accuracy on the GSM8K test split.

    Every question is answered with ``generate``; the text returned by
    ``extract_final_answer`` is compared against the reference answer of that
    same example. An example counts as correct when the last number found in
    the extracted answer equals the reference number (``,`` and ``$`` are
    ignored, so "1,000" and "$1000" agree).

    The previous implementation searched each reference number as a substring
    across the responses of *all* examples, so a question was scored correct
    whenever its number appeared anywhere in the run.

    Args:
        num_examples: Number of leading test examples to evaluate; capped at
            the size of the split. Defaults to 400.

    Returns:
        Fraction of evaluated examples answered correctly, or 0.0 when no
        example was evaluated.
    """
    # The lookbehind keeps "50-12" from being read as the negative number -12.
    number_pattern = re.compile(r'(?<!\d)-?\d+(?:\.\d+)?')

    def _last_number(text):
        """Return the last number in ``text`` as a float, or None if there is none."""
        if not text:
            return None
        cleaned = text.replace(",", "").replace("$", "")
        found = number_pattern.findall(cleaned)
        return float(found[-1]) if found else None

    dataset = get_gsm8k_questions()
    dataset = dataset.select(range(min(num_examples, len(dataset))))

    total = len(dataset)
    correct = 0
    for example in dataset:
        question = example['question']
        ground_truth = example['answer']
        response = generate(question)
        final_answer = extract_final_answer(response)
        print("\n==============================================")
        print(f"🔹 Full response: {response}")
        print(f"🔹 Extracted: {final_answer}")
        print(f"🎯 Ground Truth: {ground_truth}")
        print("==============================================\n")

        predicted_value = _last_number(final_answer)
        expected_value = _last_number(ground_truth)
        # Unparseable predictions or references are scored as incorrect.
        if (
            predicted_value is not None
            and expected_value is not None
            and abs(predicted_value - expected_value) < 1e-6
        ):
            correct += 1

    return correct / total if total else 0.0

# Run the benchmark; the returned accuracy is shown as the cell's output.
accuracy_check()



🔹 Full response: <goal_detector>
Calculate how many eggs Janet's ducks produce each day and determine her earnings from selling those eggs at the farmers' market.
</goal_detector>

<plan_generator>
1. Calculate the total number of eggs laid by the ducks each day: \(16 \text{ eggs/day}\).
2. Subtract the eggs eaten for breakfast: \(3 \times 7 = 21\) eggs are eaten for breakfast.
3. Subtract the eggs used to bake muffins: \(4 \times 5 = 20\) eggs are used to bake muffins.
4. Calculate the remaining eggs after subtracting the ones eaten and baked: \(16 - 21 - 20 = -7\). Since this result is negative, there seems to be an error in the problem setup as no eggs can be negative. Let's assume the correct interpretation should be that Janet doesn't eat any eggs for breakfast or uses them to bake muffins, so we'll adjust accordingly.
5. If Janet doesn't eat any eggs for breakfast or use them to bake muffins, then the remaining eggs are simply the total produced minus what was eaten/baked: \(16 

0.875

In [None]:
# vanilla model
def generate(question):
    """Answer ``question`` with ``model`` and return the decoded text.

    The question is sent as the user turn on its own (SYSTEM_PROMPT is not
    used) together with an empty system turn. Only the tokens produced after
    the prompt are decoded, with special tokens skipped.
    """
    chat = [
        {"role": "system", "content": ""},
        {"role": "user", "content": question}
    ]
    rendered_chat = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )
    encoded = tokenizer([rendered_chat], return_tensors="pt").to(model.device)

    full_sequences = model.generate(
        **encoded,
        max_new_tokens=1024,
        temperature=0.1,
    )

    # Keep only what comes after the prompt-length prefix of each sequence.
    continuations = []
    for prompt_ids, sequence in zip(encoded.input_ids, full_sequences):
        continuations.append(sequence[len(prompt_ids):])

    decoded = tokenizer.batch_decode(continuations, skip_special_tokens=True)
    return decoded[0]

def extract_final_answer(response):
    """Return the contents of the first ``\\boxed{...}`` in ``response``.

    The capture is non-greedy, so it ends at the first closing brace. When no
    boxed expression is present the literal string "No match found" is returned.
    """
    boxed = re.search(r'\\boxed\{(.*?)\}', response)
    return boxed.group(1) if boxed else "No match found"

def accuracy_check(num_examples=400):
    """Measure per-example answer accuracy on the GSM8K test split.

    Every question is answered with ``generate``; the text returned by
    ``extract_final_answer`` is compared against the reference answer of that
    same example. An example counts as correct when the last number found in
    the extracted answer equals the reference number (``,`` and ``$`` are
    ignored, so "1,000" and "$1000" agree).

    The previous implementation searched each reference number as a substring
    across the responses of *all* examples, so a question was scored correct
    whenever its number appeared anywhere in the run.

    Args:
        num_examples: Number of leading test examples to evaluate; capped at
            the size of the split. Defaults to 400.

    Returns:
        Fraction of evaluated examples answered correctly, or 0.0 when no
        example was evaluated.
    """
    # The lookbehind keeps "50-12" from being read as the negative number -12.
    number_pattern = re.compile(r'(?<!\d)-?\d+(?:\.\d+)?')

    def _last_number(text):
        """Return the last number in ``text`` as a float, or None if there is none."""
        if not text:
            return None
        cleaned = text.replace(",", "").replace("$", "")
        found = number_pattern.findall(cleaned)
        return float(found[-1]) if found else None

    dataset = get_gsm8k_questions()
    dataset = dataset.select(range(min(num_examples, len(dataset))))

    total = len(dataset)
    correct = 0
    for example in dataset:
        question = example['question']
        ground_truth = example['answer']
        response = generate(question)
        boxed_answer = extract_final_answer(response)
        print("\n==============================================")
        print(f"🔹 Full response: {response}")
        print(f"🎯 Ground Truth: {ground_truth}")
        print("==============================================\n")

        predicted_value = _last_number(boxed_answer)
        expected_value = _last_number(ground_truth)
        # Unparseable predictions or references are scored as incorrect.
        if (
            predicted_value is not None
            and expected_value is not None
            and abs(predicted_value - expected_value) < 1e-6
        ):
            correct += 1

    return correct / total if total else 0.0

# Run the benchmark; the returned accuracy is shown as the cell's output.
accuracy_check()



🔹 Full response: To determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the number of eggs laid by the ducks each day.
2. Determine how many eggs are eaten for breakfast.
3. Subtract the number of eggs eaten from the total number of eggs laid to find out how many eggs are left for baking muffins.
4. Subtract the number of eggs used for baking muffins from the remaining eggs to find out how many eggs are sold at the farmers' market.
5. Calculate the revenue from selling the eggs at the farmers' market.

Let's start with step 1:
The ducks lay 16 eggs per day.

Step 2: Calculate the number of eggs eaten for breakfast.
Janet eats 3 eggs for breakfast every morning.
So, the number of eggs eaten for breakfast is \(3\).

Step 3: Calculate the number of eggs left after breakfast.
Subtract the number of eggs eaten for breakfast from the total number of eggs laid.
\[16 - 3 = 13\]
So, there are 13 eggs left after breakfast.

Step 4: Calcu

0.87