# Imports

In [None]:
from datasets import load_dataset, DatasetDict
import os
import json

# Load Raw Dataset

In [2]:
def load_raw_dataset(dataset_name: str = "microsoft/orca-math-word-problems-200k"):
    return load_dataset(dataset_name)["train"]

# Split Dataset

In [3]:
def split_dataset(raw_ds, train_frac: float = 0.8, eval_frac: float = 0.1, test_frac: float = 0.1, seed: int = 42) -> DatasetDict:
    train_ds, temp_ds = raw_ds.train_test_split(train_size=train_frac, seed=seed).values()
    relative_eval = eval_frac / (eval_frac + test_frac)
    eval_ds, test_ds = temp_ds.train_test_split(train_size=relative_eval, seed=seed).values()
    return DatasetDict({"train": train_ds, "eval": eval_ds, "test": test_ds})

# Process Splits for SFT Alignment

In [4]:
def process_split(split_ds, prompt_key: str = "question", answer_key: str = "answer") -> list:
    system_message = """Solve the given high school math problem by providing a clear explanation of each step leading to the final solution.
    
    Provide a detailed breakdown of your calculations, beginning with an explanation of the problem and describing how you derive each formula, value, or conclusion. Use logical steps that build upon one another, to arrive at the final answer in a systematic manner.
    
    # Steps
    
    1. **Understand the Problem**: Restate the given math problem and clearly identify the main question and any important given values.
    2. **Set Up**: Identify the key formulas or concepts that could help solve the problem (e.g., algebraic manipulation, geometry formulas, trigonometric identities).
    3. **Solve Step-by-Step**: Iteratively progress through each step of the math problem, justifying why each consecutive operation brings you closer to the solution.
    4. **Double Check**: If applicable, double check the work for accuracy and sense, and mention potential alternative approaches if any.
    5. **Final Answer**: Provide the numerical or algebraic solution clearly, accompanied by appropriate units if relevant.
    
    # Notes
    
    - Always clearly define any variable or term used.
    - Wherever applicable, include unit conversions or context to explain why each formula or step has been chosen.
    - Assume the level of mathematics is suitable for high school, and avoid overly advanced math techniques unless they are common at that level.
    """
    return [
        {
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": ex[prompt_key]},
                {"role": "assistant", "content": ex[answer_key].strip()}
            ]
        }
        for ex in split_ds
    ]

# Save Processed Splits to Disk

In [5]:
def save_splits(processed_splits: dict, output_dir: str = "./processed_data") -> None:
    os.makedirs(output_dir, exist_ok=True)
    for split_name, records in processed_splits.items():
        out_path = os.path.join(output_dir, f"{split_name}.jsonl")
        with open(out_path, "w", encoding="utf-8") as f:
            for rec in records:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        print(f"Saved {len(records)} examples to {out_path}")

# Load Processed Splits from Disk

In [6]:
def load_for_sft(data_dir: str = "./lora_processed_data") -> DatasetDict:
    files = {
        "train": os.path.join(data_dir, "train.jsonl"),
        "eval": os.path.join(data_dir, "eval.jsonl"),
        "test": os.path.join(data_dir, "test.jsonl")
    }
    ds = load_dataset("json", data_files=files)
    return ds

# Observe Data

In [7]:
raw_ds = load_raw_dataset()
raw_ds

Dataset({
    features: ['question', 'answer'],
    num_rows: 200035
})

In [12]:
splits = split_dataset(raw_ds)
splits

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 160028
    })
    eval: Dataset({
        features: ['question', 'answer'],
        num_rows: 20003
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 20004
    })
})

In [13]:
splits = {name: process_split(ds) for name, ds in splits.items()}
len(splits)

3

In [14]:
len(splits['train'])

160028

In [15]:
save_splits(splits, output_dir="./lora_processed_data")

Saved 160028 examples to ./lora_processed_data/train.jsonl
Saved 20003 examples to ./lora_processed_data/eval.jsonl
Saved 20004 examples to ./lora_processed_data/test.jsonl


In [16]:
sft_ds = load_for_sft("lora_processed_data")
sft_ds

Generating train split: 160028 examples [00:00, 296460.61 examples/s]
Generating eval split: 20003 examples [00:00, 342176.76 examples/s]
Generating test split: 20004 examples [00:00, 320904.99 examples/s]


DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 160028
    })
    eval: Dataset({
        features: ['messages'],
        num_rows: 20003
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 20004
    })
})

In [21]:
sft_ds['train']

Dataset({
    features: ['messages'],
    num_rows: 160028
})

In [22]:
sft_ds['train'][2]

{'messages': [{'role': 'system',
   'content': 'Solve the given high school math problem by providing a clear explanation of each step leading to the final solution.\n\n    Provide a detailed breakdown of your calculations, beginning with an explanation of the problem and describing how you derive each formula, value, or conclusion. Use logical steps that build upon one another, to arrive at the final answer in a systematic manner.\n\n    # Steps\n\n    1. **Understand the Problem**: Restate the given math problem and clearly identify the main question and any important given values.\n    2. **Set Up**: Identify the key formulas or concepts that could help solve the problem (e.g., algebraic manipulation, geometry formulas, trigonometric identities).\n    3. **Solve Step-by-Step**: Iteratively progress through each step of the math problem, justifying why each consecutive operation brings you closer to the solution.\n    4. **Double Check**: If applicable, double check the work for accu

In [23]:
sft_ds['train'][2]['messages']

[{'role': 'system',
  'content': 'Solve the given high school math problem by providing a clear explanation of each step leading to the final solution.\n\n    Provide a detailed breakdown of your calculations, beginning with an explanation of the problem and describing how you derive each formula, value, or conclusion. Use logical steps that build upon one another, to arrive at the final answer in a systematic manner.\n\n    # Steps\n\n    1. **Understand the Problem**: Restate the given math problem and clearly identify the main question and any important given values.\n    2. **Set Up**: Identify the key formulas or concepts that could help solve the problem (e.g., algebraic manipulation, geometry formulas, trigonometric identities).\n    3. **Solve Step-by-Step**: Iteratively progress through each step of the math problem, justifying why each consecutive operation brings you closer to the solution.\n    4. **Double Check**: If applicable, double check the work for accuracy and sense