## Extract answers from solutions

In [None]:
import os
import re
import time

import google.generativeai as genai
import pandas as pd
from tqdm import tqdm

# Load the CSV of model solutions that the answers are extracted from.
dataset_path = "/Users/tanalpha_aditya/Desktop/ALM/Project/SPARK-Maths/Baseline/Qwen-Math/Results/MATH/checkpoint_math_answers.csv"
df = pd.read_csv(dataset_path)

# Gemini API client.
# The key is read from the environment so that no credential is stored in the
# notebook itself (and therefore none leaks through version control or sharing).
# NOTE(review): any key that was previously saved in this notebook should be
# treated as leaked and revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "Set the GEMINI_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=GEMINI_API_KEY)

client = genai.GenerativeModel(model_name="gemini-2.0-flash")

def extract_integer_answer(response):
    """Return the last number found in ``response``, rounded to an integer.

    Args:
        response: Text to scan. Anything that is not a ``str`` (for example
            ``None``) is treated as containing no number.

    Returns:
        The last number in the text passed through Python's built-in
        ``round()`` (ties go to the nearest even integer), or ``None`` when
        the text contains no number.
    """
    if not isinstance(response, str):
        return None

    # First alternative: digits grouped with thousands separators, e.g.
    # "12,345" or "1,234.5". The (?!\d) lookahead rejects malformed groups
    # such as "12,3456" so they fall back to the plain form.
    # Second alternative: a plain integer or decimal ("42", "3.14", ".5").
    number_pattern = (
        r"[-+]?(?:[0-9]{1,3}(?:,[0-9]{3})+(?![0-9])(?:\.[0-9]+)?"
        r"|[0-9]*\.?[0-9]+)"
    )
    numbers = re.findall(number_pattern, response)
    if not numbers:
        return None

    # Strip the separators before conversion; float() does not accept commas.
    return round(float(numbers[-1].replace(",", "")))

def get_model_answer(model_response):
    """Ask Gemini for the final numerical answer of a solution and parse it.

    Args:
        model_response: Solution text to send to the model. Values that are
            not strings (e.g. NaN from an empty CSV cell) are not sent.

    Returns:
        The integer parsed from the model's reply by
        ``extract_integer_answer``, or ``None`` when the input is not a
        string, the reply contains no number, or every attempt failed.
    """
    # A non-string input would raise TypeError on concatenation inside the
    # try block below and be retried max_retries times for no benefit, so it
    # is rejected up front.
    if not isinstance(model_response, str):
        return None

    sys_instruct = (
        "Given the extracted solution, extract only the final numerical answer "
        "without any additional text. Provide the closest integer representation."
    )
    max_retries = 5
    for attempt in range(max_retries):
        # Fixed pause before every request (presumably to stay under the API
        # rate limit -- confirm against the quota in use).
        time.sleep(1)
        try:
            response = client.generate_content(contents=[sys_instruct + "\n" + model_response])
            response_text = response.text if hasattr(response, "text") else ""
            print(response_text)
            ans = extract_integer_answer(response_text)
            print(ans)
            return ans
        except Exception as e:
            print(f"API Error: {e}. Retrying {attempt+1}/{max_retries}...")
            # Back off only when another attempt will actually follow.
            if attempt + 1 < max_retries:
                time.sleep(5)
    return None  # Every attempt failed

# Run the Gemini-based extraction over every solution in the dataset.
total = len(df)
df["Answer"] = None  # empty column, filled in one row at a time below

solutions = tqdm(df["model_solution"].items(), total=total, desc="Processing Responses")
for index, model_response in solutions:
    df.at[index, "Answer"] = get_model_answer(model_response)

# Write the dataset, including the new column, to disk.
df.to_csv("output_qwen_math_with_answers.csv", index=False)
print("CSV file updated with extracted answers: output_qwen_math_with_answers.csv")

In [16]:
import re

def extract_boxed_answer(solution):
    r"""Return the contents of the first balanced, non-empty ``\boxed{...}``.

    The braces are matched by counting depth rather than with a regular
    expression, so arbitrarily nested LaTeX such as
    ``\boxed{\frac{a}{\sqrt{b}}}`` is supported and an unterminated
    ``\boxed{`` costs only a linear scan.

    Args:
        solution: Text to search; it is converted with ``str()`` first.

    Returns:
        The text between the outer braces with surrounding whitespace
        stripped, or ``None`` if no ``\boxed{...}`` with balanced braces and
        non-empty contents is present.
    """
    text = str(solution)
    marker = r"\boxed{"

    start = text.find(marker)
    while start != -1:
        begin = start + len(marker)
        depth = 1  # the opening brace of \boxed{ is already consumed
        for pos in range(begin, len(text)):
            ch = text[pos]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    if pos > begin:
                        return text[begin:pos].strip()
                    # Empty \boxed{}: ignore it and try the next occurrence.
                    break
        # Unbalanced or empty here; look for a later \boxed{.
        start = text.find(marker, start + 1)
    return None

# Sample inputs: fractions, decimals, integers, times, symbolic expressions,
# with and without the surrounding \( ... \) math delimiters.
test_cases = [
    r"\boxed{\frac{60}{43}}",
    r"\(\boxed{\frac{5}{8}}\)",
    r"\boxed{4.5}",
    r"\(\boxed{420}\)",
    r"\(\boxed{1:03}\)",
    r"\(\boxed{m+2}\)",
    # Currently disabled samples:
    # r"\(\boxed{11 \frac{2}{3}}\)",
    # r"\(\boxed{28\%}\)",
    # r"\(\boxed{10.5\pi}\)"
    # r"\boxed{14\sqrt{15}}",
    r"\boxed{\frac{a}{b} + \sqrt{x}}"
]

# Show what the extractor returns for each sample.
for test in test_cases:
    extracted = extract_boxed_answer(test)
    print(f"Input: {test} → Extracted Answer: {extracted}")


Input: \boxed{\frac{60}{43}} → Extracted Answer: \frac{60}{43}
Input: \(\boxed{\frac{5}{8}}\) → Extracted Answer: \frac{5}{8}
Input: \boxed{4.5} → Extracted Answer: 4.5
Input: \(\boxed{420}\) → Extracted Answer: 420
Input: \(\boxed{1:03}\) → Extracted Answer: 1:03
Input: \(\boxed{m+2}\) → Extracted Answer: m+2
Input: \boxed{\frac{a}{b} + \sqrt{x}} → Extracted Answer: \frac{a}{b} + \sqrt{x}


In [17]:
import pandas as pd
import re
from tqdm import tqdm

# Function to extract the boxed answer
def extract_boxed_answer(solution):
    r"""Return the contents of the first balanced, non-empty ``\boxed{...}``.

    The braces are matched by counting depth rather than with a regular
    expression, so arbitrarily nested LaTeX such as
    ``\boxed{\frac{a}{\sqrt{b}}}`` is supported and an unterminated
    ``\boxed{`` costs only a linear scan.

    Args:
        solution: Text to search; it is converted with ``str()`` first.

    Returns:
        The text between the outer braces with surrounding whitespace
        stripped, or ``None`` if no ``\boxed{...}`` with balanced braces and
        non-empty contents is present.
    """
    text = str(solution)
    marker = r"\boxed{"

    start = text.find(marker)
    while start != -1:
        begin = start + len(marker)
        depth = 1  # the opening brace of \boxed{ is already consumed
        for pos in range(begin, len(text)):
            ch = text[pos]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    if pos > begin:
                        return text[begin:pos].strip()
                    # Empty \boxed{}: ignore it and try the next occurrence.
                    break
        # Unbalanced or empty here; look for a later \boxed{.
        start = text.find(marker, start + 1)
    return None

# Load dataset
dataset_path = "/Users/tanalpha_aditya/Desktop/ALM/Project/SPARK-Maths/Baseline/Qwen-Math/Results/MATH/checkpoint_math_answers.csv"
df = pd.read_csv(dataset_path)

# Make sure there is an "Answer" column to fill in.
if "Answer" not in df.columns:
    df["Answer"] = None
# Force object dtype: a pre-existing column that is empty or numeric is read
# as float64/int64, and writing LaTeX strings into it would depend on pandas
# silently upcasting the column, which is deprecated.
df["Answer"] = df["Answer"].astype(object)

# Fill in the answer for every row that does not have one yet.
for index, row in tqdm(df.iterrows(), total=len(df), desc="Extracting Answers"):
    if pd.notna(row["Answer"]):  # Keep answers that are already present
        continue
    df.at[index, "Answer"] = extract_boxed_answer(row["model_solution"])

# Save the updated dataset
output_path = "output_dataset_with_answers.csv"
df.to_csv(output_path, index=False)

print(f"CSV file updated with extracted answers: {output_path}")


Extracting Answers: 100%|██████████| 2538/2538 [00:00<00:00, 37208.68it/s]

CSV file updated with extracted answers: output_dataset_with_answers.csv



