In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell execution so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
# File paths -- every artifact lives in the same data directory, so the
# directory is spelled once and each file name is appended to it.
data_dir = '../grade-school-math/grade_school_math/data'

# Input: JSONL of training items, each carrying a 'group_number' field.
grouped_training_file = f'{data_dir}/train_gr20.jsonl'
# Input: JSON list of groups, each group being a list of sampled QA items.
train_sample_file = f'{data_dir}/train_gr20_samples.json'
# Output: JSONL pairing every training item with a CoT example from its group.
output_file = f'{data_dir}/train_gr20_cot_qa.jsonl'
# Intermediate: the sampled groups after a 'cot_answer' was added to each item.
train_sample_cot_file = f'{data_dir}/train_gr20_sample_cot.json'

In [3]:
import json
import os
from openai import OpenAI
from tqdm import tqdm  # Optional: For a progress bar

# Load train sample data: a JSON list of groups, where each group is a list of
# {'question', 'answer', 'group_number'} items.
# The encoding is given explicitly so the load does not depend on the
# platform's default locale encoding (JSON text is UTF-8).
with open(train_sample_file, 'r', encoding='utf-8') as f:
    train_sample = json.load(f)
# Display the first group as a quick sanity check of the loaded structure.
train_sample[0]

[{'question': 'For an operations manager job at a company, a person with a degree earns three times the amount paid to a diploma holder for the same position. How much will Jared earn from the company in a year after graduating with a degree if the pay for a person holding a diploma certificate is $4000 per month?',
  'answer': 'Since the pay for a person holding a degree is three times the amount paid for a diploma holder, Jared will earn 3*$4000 = $<<4000*3=12000>>12000 per month.\nIn a year with 12 months, Jared will earn a total of 12*12000 = <<12*12000=144000>>144000\n#### 144000',
  'group_number': 0},
 {'question': 'Susan is taking a two week vacation. She works five days a week and has six days of paid vacation. The rest of her workdays will be unpaid vacation time. She gets paid $15 per hour and works 8 hours a day. How much pay will she miss on her vacation?',
  'answer': 'Susan works 2 * 5 = <<2*5=10>>10 days in two weeks.\nShe is taking 10 - 6 = <<10-6=4>>4 days of unpaid v

In [4]:
# Enhanced data storage: one list per group, in the same order as train_sample.
enhanced_data = []
# Every item whose generation failed, kept so failures are inspectable after
# the run instead of being visible only in the scrolled-away print output.
failed_items = []

# API key and endpoint come from the environment so no secret lives in the notebook.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url=os.environ.get("OPENAI_API_BASE")
)

COT_MODEL = "gpt-4o-mini"
COT_SYSTEM_PROMPT = "You are a helpful assistant specializing in generating well structured explanations for educational content."


def _build_cot_user_prompt(answer):
    """Return the user message asking the model to expand `answer` into a chain of thought.

    The instructions ask the model to keep the original answer's conventions:
    plain arithmetic operators, `<<expr=result>>` calculation annotations, and a
    trailing `#### <final answer>` line.

    Args:
        answer: The original reference answer text of one training item.

    Returns:
        The full prompt string with `answer` embedded.
    """
    return f"Generate a detailed chain of thought explanation for the answer, do not include the question itself. In other words, you need to write a step by step explanation that enhances the original answer. Also, you need to follow the original answer's format, that is 1. Do not use latex formula symbols cause that is not used in orignal answer. Instead, use simple  arithmetic operators like +,-,*,/ to denote the calculation.\n2. Wrap each formula inside << >> followd by the result of the calculation. For example: 3 - $1.50 = $<<3-1.5=1.50>>1.50  or 2.50 x 4= $<<2.5*4=10>>10\n3. At the end of your cot answer, add four # followed by a whitespace and then the final numeric answer. For example, if the answer gives a final result of 126, append this to your explanation:'\\n#### 126'  \n\n Here is the original answer, you can see that it follows the format we just talked about: {answer}\n\nYour step by step explanation:"


# Iterate through each group. tqdm wraps the list itself (not the enumerate
# object) so it knows the total and can render a real progress bar with an ETA.
for group_number, group_items in enumerate(tqdm(train_sample, desc="Processing groups")):
    enhanced_group = []
    for item in group_items:
        question = item["question"]
        answer = item["answer"]

        # Generate enhanced CoT explanation using the OpenAI API
        try:
            response = client.chat.completions.create(
                model=COT_MODEL,
                messages=[
                    {"role": "system", "content": COT_SYSTEM_PROMPT},
                    {"role": "user", "content": _build_cot_user_prompt(answer)},
                ],
            )
            cot_answer = response.choices[0].message.content
            # The API may return no text; treat that as a failure rather than
            # storing an empty/None 'cot_answer' that would poison later prompts.
            if not cot_answer:
                raise ValueError("model returned an empty completion")
        except Exception as e:
            # Deliberately best-effort: one failed request must not abort a
            # multi-minute run, but the failure is recorded for follow-up.
            print(
                f"Error processing group {group_number}, item: {question}. Error: {e}")
            failed_items.append(
                {"group_number": group_number, "question": question, "error": str(e)})
            continue

        # Build a new dict instead of mutating `item`, so train_sample stays
        # untouched and re-running this cell starts from the same input.
        enhanced_group.append({**item, "cot_answer": cot_answer})

    # Append enhanced group to the data
    enhanced_data.append(enhanced_group)

    # Checkpoint after every group so a crash loses at most one group of work.
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8 instead of relying on the platform default.
    with open(train_sample_cot_file, 'w', encoding='utf-8') as f:
        json.dump(enhanced_data, f, indent=2, ensure_ascii=False)

# Surface problems that would otherwise only show up downstream: a group with
# no successful items cannot be sampled from later.
if failed_items:
    print(f"{len(failed_items)} item(s) failed; see `failed_items` for details.")
empty_groups = [idx for idx, group in enumerate(enhanced_data) if not group]
if empty_groups:
    print(f"Warning: groups with no enhanced items: {empty_groups}")



Processing groups: 20it [13:02, 39.10s/it]


In [5]:
# Spot-check: display the first enhanced item of the first group, which should
# now carry a 'cot_answer' key alongside the original fields.
enhanced_data[0][0]

{'question': 'For an operations manager job at a company, a person with a degree earns three times the amount paid to a diploma holder for the same position. How much will Jared earn from the company in a year after graduating with a degree if the pay for a person holding a diploma certificate is $4000 per month?',
 'answer': 'Since the pay for a person holding a degree is three times the amount paid for a diploma holder, Jared will earn 3*$4000 = $<<4000*3=12000>>12000 per month.\nIn a year with 12 months, Jared will earn a total of 12*12000 = <<12*12000=144000>>144000\n#### 144000',
 'group_number': 0,
 'cot_answer': "First, we start by recognizing that there is a distinction between the earnings of two different levels of education: a degree holder and a diploma holder. Jared, in this scenario, has a degree which significantly impacts his potential earnings.\n\n1. Given that the pay for a diploma holder is stated to be $4000 per month, we can first establish the baseline salary for 

In [6]:
import json
import random

# Dedicated, seeded RNG so the pairing of training items with CoT examples is
# reproducible across runs and does not disturb the global `random` state.
PAIRING_SEED = 42
pairing_rng = random.Random(PAIRING_SEED)

# Load grouped training data (JSONL: one item with a 'group_number' per line)
grouped_training = []
with open(grouped_training_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if line.strip():
            grouped_training.append(json.loads(line))

# Load sampled training data enriched with 'cot_answer'. That file is written
# with ensure_ascii=False, so it is read back explicitly as UTF-8.
with open(train_sample_cot_file, 'r', encoding='utf-8') as f:
    train_sample = json.load(f)

# Ensure train_sample has exactly 20 lists (one for each group)
assert len(train_sample) == 20, "Train sample must contain 20 groups."
# A group can be empty if every API call for it failed; sampling from it would
# raise an opaque IndexError, so fail early with the offending group numbers.
empty_groups = [idx for idx, group in enumerate(train_sample) if not group]
assert not empty_groups, f"Train sample groups with no items: {empty_groups}"

# Create the output file
with open(output_file, 'w', encoding='utf-8') as out_f:
    for item in grouped_training:
        group_number = item['group_number']
        # Get a random sample from the same group
        cot_prompt = pairing_rng.choice(train_sample[group_number])
        # Construct the new JSON object: the CoT example plus the item itself
        new_item = {
            "cot_prompt": cot_prompt,
            "qa": item
        }
        # Write to the output file in JSONL format
        out_f.write(json.dumps(new_item) + '\n')

print(f"File '{output_file}' has been successfully created.")


File '../grade-school-math/grade_school_math/data/train_gr20_cot_qa.jsonl' has been successfully created.
