### 1. Installing and Loading Libraries

In [None]:
# Install the packages this notebook imports that may be missing from the runtime.
# Only transformers is version-pinned; the other packages resolve to whatever is current.
# NOTE(review): `peft` is imported below but not installed here - presumably it is
# preinstalled in the Colab image; confirm, or add it to this list.
!pip install datasets
!pip install transformers==4.48.2
!pip install rouge-score
!pip install openai

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import torch
import time
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import PeftModel
from datasets import load_dataset, Dataset
from google.colab import files, drive
from rouge_score import rouge_scorer
from openai import OpenAI

### 2. Setting up the System Prompt

In [None]:
# Instruction text that asks the model to answer in four tagged sections:
# <goal_detector>, <plan_generator>, <projector> and <executer>.
# It is used as the system message in the dataset builders and is prepended to the
# user question in generate_novel(). extract_xml_answer() reads the <executer> section.
# NOTE(review): the wording assumes a 4-option multiple-choice question; confirm this is
# intended for the non-multiple-choice datasets benchmarked later in the notebook.
SYSTEM_PROMPT = """
You will be given a question. What you have to do is generate your answer by strictly following the below instructions,

Select a single label as the answer, which represents the answer from given 4 answers
First identify the goal and wrap it inside <goal_detector>...</goal_detector>tags.
Then generate the step wise plan (with numeric calculation if needed) to achieve the goal (solve the question), and wrap it inside <plan_generator>....</plan_generator> tags.
Think above plan works or not, and wrap it inside <projector>....</projector> tags.
Finally wrap your final answer inside <executer>...</executer> tags.

Your final answer must be in the below format,
<goal_detector>
........
</goal_detector>
<plan_generator>
........
</plan_generator>
<projector>
........
</projector>
<executer>
........
</executer>
Make sure to include all the 8 opening and closing tags in your final answer.
"""

### 3. Path Configs and Model Loading

In [None]:
# Mount Google Drive into the Colab filesystem so files under /content/drive are readable.
drive.mount('/content/drive')
# Identifier of the base checkpoint; passed to AutoModelForCausalLM / AutoTokenizer below.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
# Location on the mounted Drive that is handed to PeftModel.from_pretrained as the adapter.
lora_model_name = "/content/drive/My Drive/base_model_v3"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

In [None]:
def extract_xml_answer(text: str) -> str:
    """Return the stripped contents of the last ``<executer>`` section in ``text``.

    Args:
        text: Model response, expected to contain ``<executer>...</executer>``.

    Returns:
        Whatever lies after the last ``<executer>`` tag and before the next
        ``</executer>`` tag, with surrounding whitespace removed. A missing tag
        is treated as absent, so text with neither tag is returned stripped.
    """
    # rpartition keeps everything after the LAST opening tag (or the whole text if absent).
    after_open = text.rpartition("<executer>")[2]
    # partition keeps everything before the FIRST closing tag (or everything if absent).
    before_close = after_open.partition("</executer>")[0]
    return before_close.strip()

In [None]:
def _load_base_model():
    """Load a fresh, independent copy of the base checkpoint onto the GPU."""
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map=None
    ).to("cuda")


# Vanilla Model
# This copy is never handed to PEFT, so it keeps the untouched base weights.
model = _load_base_model()

# Novel Model
# PeftModel.from_pretrained injects LoRA layers into the module it is given, and
# merge_and_unload() returns that same underlying module with the adapter folded in.
# The adapter is therefore applied to a SEPARATE copy of the base model; reusing
# `model` here would make `model` and `merged_model` the same merged network and
# invalidate the vanilla-vs-novel comparison.
peft_model = PeftModel.from_pretrained(_load_base_model(), lora_model_name)
merged_model = peft_model.merge_and_unload()

# Common Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

### 4. Setup OpenAI as Validator

In [None]:
# Instructions for the judge model: it receives a batch of questions plus the answers
# from both models and is asked to return a dictionary with a "rate" and an "accuracy"
# score (1-10) for "novel_model" and "vanilla_model".
# Sent as the first message of every call_openai() request.
OPENAI_PROMPT = """
You will be provided with 10 questions. along with that you will be given 10 answers generated by a small language model which is trained under
a novel Reinforcement Learning pipeline to acheive stochastic reasoning (does the language model stick to a  fixed reasoning structure or is it
stochastic by exploring different reasoning paradigms). And you will be given another 10 answers generated by the vanilla model. What you have
to do is give two rates to both approaches from 1-10 based on their stochastic reasoning. The novel small language model follows a structure which
has 8 opening and closing tags to guide it for a hierachical reasoning, don't misunderstood that structure as a deterministic structure. And rate
the accuracy as well (1-10). The final answer must not be the optimal solution since both the models have small parameter count. In that case you can
evaluate accuracy on intermediate steps. Your answer should be a python dictionary as below,
ex :-
{
  "novel_model": {
    "rate": 10,
    "accuracy": 10
  },
  "vanilla_model": {
    "rate": 10,
    "accuracy": 10
  }
}
"""

# SECURITY: never embed the API key in the notebook. With no `api_key` argument the
# OpenAI client reads it from the OPENAI_API_KEY environment variable and raises if it
# is missing. Set that variable (e.g. from Colab secrets) before running this cell.
# The key that was previously hardcoded on this line has been exposed and must be revoked.
client = OpenAI()

# Send one grading request to the OpenAI chat-completions endpoint
def call_openai(CONTENT):
    """Ask the judge model to grade one batch and return its raw reply text.

    Args:
        CONTENT: User-message text containing the questions and both models' answers.

    Returns:
        The ``content`` string of the first choice in the completion.
    """
    conversation = [
        {"role": "developer", "content": OPENAI_PROMPT},
        {"role": "user", "content": CONTENT},
    ]
    response = client.chat.completions.create(model="gpt-4o", messages=conversation)
    return response.choices[0].message.content

### 5. Benchmarking For ARC dataset

In [None]:
def get_arc_questions(split = "test") -> Dataset:
    """Load ARC-Challenge and attach a chat-style ``prompt`` and gold ``answer``.

    Args:
        split: Name of the dataset split to load (defaults to ``"test"``).

    Returns:
        The selected split with two added columns: ``prompt`` (system + user
        messages) and ``answer`` (the row's ``answerKey``).
    """
    data = load_dataset('allenai/ai2_arc', 'ARC-Challenge')[split]

    def _to_chat_example(x):
        # Same question format as the benchmarking loop, including the space
        # before "corresponding" that was previously missing here.
        user_content = (
            x['question'] + ' ' + str(x['choices']['text'])
            + ' corresponding labels for the above answers are as follows : '
            + str(x['choices']['label'])
        )
        return {
            'prompt': [
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': user_content}
            ],
            # answerKey has no "####" marker, so running it through
            # extract_hash_answer always produced None; keep the label itself.
            'answer': x['answerKey']
        }

    return data.map(_to_chat_example)

In [None]:
def _generate_response(lm, user_content):
    """Run one single-turn chat generation on ``lm`` and return the decoded reply.

    Shared by generate_novel() and generate_vanilla() so both models are queried
    with identical templating, generation settings and decoding.
    """
    messages = [
        # The system turn is passed explicitly as an empty string.
        {"role": "system", "content": ""},
        {"role": "user", "content": user_content}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(lm.device)

    # NOTE(review): `temperature` only matters when sampling is enabled; confirm the
    # checkpoint's generation config turns sampling on, and consider seeding for
    # reproducible runs.
    generated_ids = lm.generate(
        **model_inputs,
        max_new_tokens=1024,
        temperature=0.1,
    )
    # generate() returns prompt + completion; drop the prompt tokens of each row.
    completion_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]


def generate_novel(question):
    """Answer ``question`` with the LoRA-merged model.

    SYSTEM_PROMPT is prepended to the question inside the user turn.

    Args:
        question: Question text to answer.

    Returns:
        The decoded completion, without the prompt and without special tokens.
    """
    return _generate_response(merged_model, SYSTEM_PROMPT + '\n' + question)


def generate_vanilla(question):
    """Answer ``question`` with the baseline ``model``, with no structured prompt.

    Args:
        question: Question text to answer.

    Returns:
        The decoded completion, without the prompt and without special tokens.
    """
    return _generate_response(model, question)

def extract_final_answer(response):
    """Return the final answer of ``response`` as extracted by extract_xml_answer()."""
    return extract_xml_answer(response)

def accuracy_check(num_batches=5, batch_size=10):
    """Benchmark both models on ARC-Challenge and have the judge grade each batch.

    For every batch, each question is answered by generate_novel() and
    generate_vanilla(); the questions and both answer lists are then sent to
    call_openai() and the verdict is printed.

    Args:
        num_batches: Number of consecutive batches to evaluate (default 5).
        batch_size: Number of questions per batch (default 10).

    Returns:
        The judge's reply for the LAST batch only (earlier replies are only
        printed), or ``None`` when ``num_batches`` is 0.
    """
    dataset = get_arc_questions()
    eval_results = None
    for batch_index in range(num_batches):
        ds_start = batch_index * batch_size
        ds_end = ds_start + batch_size
        print(f'start range {ds_start}, end range {ds_end}')
        current_dataset = dataset.select(range(ds_start, ds_end))
        all_novel_responses = []
        all_vanilla_responses = []
        all_questions = []
        # Report the real 1-based batch number (this used to print a literal 1).
        print(f'Executing set: {batch_index + 1}')
        # A distinct loop variable avoids shadowing the batch counter.
        for question_index, example in enumerate(current_dataset):
            question = example['question'] + ' ' + str(example['choices']['text']) + ' corresponding labels for the above answers are as follows : ' + str(example['choices']['label'])
            all_novel_responses.append(generate_novel(question) + '\n\n')
            all_vanilla_responses.append(generate_vanilla(question) + '\n\n')
            all_questions.append(question)
            print(f'Question {question_index + 1} completed')
        eval_results = call_openai(f'Questions : \n{all_questions}\n\nNovel Answers : \n{all_novel_responses}\n\nVanilla Answers : \n{all_vanilla_responses}')
        print(eval_results)
    return eval_results

# Run the ARC benchmark defined above; the returned value is the judge's reply for the final batch.
eval_results = accuracy_check()

start range 0, end range 10
Executing set: 1
Question 1 completed
Question 2 completed
Question 3 completed
Question 4 completed
Question 5 completed
Question 6 completed
Question 7 completed
Question 8 completed
Question 9 completed
Question 10 completed
```python
{
  "novel_model": {
    "rate": 8,
    "accuracy": 7
  },
  "vanilla_model": {
    "rate": 6,
    "accuracy": 6
  }
}
```
start range 10, end range 20
Executing set: 1
Question 1 completed
Question 2 completed
Question 3 completed
Question 4 completed
Question 5 completed
Question 6 completed
Question 7 completed
Question 8 completed
Question 9 completed
Question 10 completed
```python
{
  "novel_model": {
    "rate": 8,
    "accuracy": 5
  },
  "vanilla_model": {
    "rate": 3,
    "accuracy": 6
  }
}
```
start range 20, end range 30
Executing set: 1
Question 1 completed
Question 2 completed
Question 3 completed
Question 4 completed
Question 5 completed
Question 6 completed
Question 7 completed
Question 8 completed
Questio

### 6. Benchmarking for CodeAlpaca Dataset

In [None]:
def get_code_alpaca(split = "test") -> Dataset:
    """Load CodeAlpaca_20K and attach a chat-style ``prompt`` and reference ``answer``.

    Args:
        split: Name of the dataset split to load (defaults to ``"test"``).

    Returns:
        The selected split with two added columns: ``prompt`` (system + user
        messages) and ``answer`` (the row's ``completion``).
    """
    data = load_dataset('HuggingFaceH4/CodeAlpaca_20K', 'default')[split]

    def _to_chat_example(x):
        return {
            'prompt': [
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': x['prompt']}
            ],
            # The reference completion is used as-is. Passing it through
            # extract_hash_answer returned None whenever it lacked a "####" marker.
            'answer': x['completion']
        }

    return data.map(_to_chat_example)

In [None]:
def accuracy_check(num_batches=5, batch_size=10):
    """Benchmark both models on CodeAlpaca and have the judge grade each batch.

    NOTE: this redefines ``accuracy_check`` and replaces any earlier definition
    of the same name in the notebook.

    For every batch, each prompt is answered by generate_novel() and
    generate_vanilla(); the prompts and both answer lists are then sent to
    call_openai() and the verdict is printed.

    Args:
        num_batches: Number of consecutive batches to evaluate (default 5).
        batch_size: Number of prompts per batch (default 10).

    Returns:
        The judge's reply for the LAST batch only (earlier replies are only
        printed), or ``None`` when ``num_batches`` is 0.
    """
    dataset = get_code_alpaca()
    eval_results = None
    for batch_index in range(num_batches):
        ds_start = batch_index * batch_size
        ds_end = ds_start + batch_size
        print(f'start range {ds_start}, end range {ds_end}')
        current_dataset = dataset.select(range(ds_start, ds_end))
        all_novel_responses = []
        all_vanilla_responses = []
        all_questions = []
        # Report the real 1-based batch number (this used to print a literal 1).
        print(f'Executing set: {batch_index + 1}')
        # A distinct loop variable avoids shadowing the batch counter.
        for question_index, example in enumerate(current_dataset):
            question = str(example['prompt'])
            all_novel_responses.append(generate_novel(question) + '\n\n')
            all_vanilla_responses.append(generate_vanilla(question) + '\n\n')
            all_questions.append(question)
            print(f'Question {question_index + 1} completed')
        eval_results = call_openai(f'Questions : \n{all_questions}\n\nNovel Answers : \n{all_novel_responses}\n\nVanilla Answers : \n{all_vanilla_responses}')
        print(eval_results)
    return eval_results

# Run the CodeAlpaca benchmark defined above; the returned value is the judge's reply for the final batch.
eval_results = accuracy_check()

start range 0, end range 10
Executing set: 1
Question 1 completed
Question 2 completed
Question 3 completed
Question 4 completed
Question 5 completed
Question 6 completed
Question 7 completed
Question 8 completed
Question 9 completed
Question 10 completed
```python
{
  "novel_model": {
    "rate": 7,
    "accuracy": 6
  },
  "vanilla_model": {
    "rate": 3,
    "accuracy": 5
  }
}
```
start range 10, end range 20
Executing set: 1
Question 1 completed
Question 2 completed
Question 3 completed
Question 4 completed
Question 5 completed
Question 6 completed
Question 7 completed
Question 8 completed
Question 9 completed
Question 10 completed
```python
{
  "novel_model": {
    "rate": 8,
    "accuracy": 8
  },
  "vanilla_model": {
    "rate": 6,
    "accuracy": 6
  }
}
```
start range 20, end range 30
Executing set: 1
Question 1 completed
Question 2 completed
Question 3 completed
Question 4 completed
Question 5 completed
Question 6 completed
Question 7 completed
Question 8 completed
Questio

### 7. Benchmarking for CommonsenseQA Dataset

In [None]:
def get_commonsense_qa(split = "validation") -> Dataset:
    """Load commonsense-qa-2 and attach a chat-style ``prompt`` column.

    Args:
        split: Name of the dataset split to load (defaults to ``"validation"``).

    Returns:
        The selected split with an added ``prompt`` column (system + user
        messages) and its ``answer`` column carried over unchanged.
    """
    data = load_dataset('chiayewken/commonsense-qa-2', 'default')[split]

    def _to_chat_example(x):
        return {
            'prompt': [
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': x['question']}
            ],
            # Keep the dataset's own gold answer. Passing it through
            # extract_hash_answer overwrote the column with None whenever the
            # answer lacked a "####" marker.
            'answer': x['answer']
        }

    return data.map(_to_chat_example)

In [None]:
def accuracy_check(num_batches=5, batch_size=10):
    """Benchmark both models on commonsense-qa-2 and have the judge grade each batch.

    NOTE: this redefines ``accuracy_check`` and replaces any earlier definition
    of the same name in the notebook.

    For every batch, each question is answered by generate_novel() and
    generate_vanilla(); the questions and both answer lists are then sent to
    call_openai() and the verdict is printed.

    Args:
        num_batches: Number of consecutive batches to evaluate (default 5).
        batch_size: Number of questions per batch (default 10).

    Returns:
        The judge's reply for the LAST batch only (earlier replies are only
        printed), or ``None`` when ``num_batches`` is 0.
    """
    dataset = get_commonsense_qa()
    eval_results = None
    for batch_index in range(num_batches):
        ds_start = batch_index * batch_size
        ds_end = ds_start + batch_size
        print(f'start range {ds_start}, end range {ds_end}')
        current_dataset = dataset.select(range(ds_start, ds_end))
        all_novel_responses = []
        all_vanilla_responses = []
        all_questions = []
        # Report the real 1-based batch number (this used to print a literal 1).
        print(f'Executing set: {batch_index + 1}')
        # A distinct loop variable avoids shadowing the batch counter.
        for question_index, example in enumerate(current_dataset):
            question = str(example['question'])
            all_novel_responses.append(generate_novel(question) + '\n\n')
            all_vanilla_responses.append(generate_vanilla(question) + '\n\n')
            all_questions.append(question)
            print(f'Question {question_index + 1} completed')
        eval_results = call_openai(f'Questions : \n{all_questions}\n\nNovel Answers : \n{all_novel_responses}\n\nVanilla Answers : \n{all_vanilla_responses}')
        print(eval_results)
    return eval_results

# Run the CommonsenseQA benchmark defined above; the returned value is the judge's reply for the final batch.
eval_results = accuracy_check()

README.md:   0%|          | 0.00/756 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/849k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9264 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2541 [00:00<?, ? examples/s]

Map:   0%|          | 0/2541 [00:00<?, ? examples/s]

start range 0, end range 10
Executing set: 1
Question 1 completed
Question 2 completed
Question 3 completed
Question 4 completed
Question 5 completed
Question 6 completed
Question 7 completed
Question 8 completed
Question 9 completed
Question 10 completed
```python
{
  "novel_model": {
    "rate": 8,
    "accuracy": 7
  },
  "vanilla_model": {
    "rate": 5,
    "accuracy": 6
  }
}
```
start range 10, end range 20
Executing set: 1
Question 1 completed
Question 2 completed
Question 3 completed
Question 4 completed
Question 5 completed
Question 6 completed
Question 7 completed
Question 8 completed
Question 9 completed
Question 10 completed
```python
{
  "novel_model": {
    "rate": 8,
    "accuracy": 6
  },
  "vanilla_model": {
    "rate": 5,
    "accuracy": 6
  }
}
```
start range 20, end range 30
Executing set: 1
Question 1 completed
Question 2 completed
Question 3 completed
Question 4 completed
Question 5 completed
Question 6 completed
Question 7 completed
Question 8 completed
Questio