In [283]:
import pickle
from typing import Type, Callable, Optional

import numpy as np
from datasets import load_dataset
from tqdm import tqdm
from datetime import datetime

from reasoners import LanguageModel, Reasoner, SearchAlgorithm
from reasoners.algorithm import MCTS

from world_model import MATHWorldModel
from search_config import MATHConfig
import utils

from datasets import Dataset
import os
import re
import json

In [284]:
# def data_reader(dataset,dataset_path, split=None, sample_size=100):
#     questions = []
#     answers = []
#     options = []
#     check = []
#     filename = os.path.join(dataset_path, 'AQuA.json')
#     with open(filename, 'r') as file:
#         lines = file.readlines()
#         if split is not None:
#             start, end = split
#             lines = lines[start:end]
#         for line in lines:
#             data = json.loads(line)
#             if isinstance(data, dict):
#                 options_list = data['options']
#                 options_dict = {}
#                 transformed_options = []
#                 for option in options_list:
#                     match = re.search(r'([A-E])\)[^0-9]*([\d.]+)', option)
#                     if match:
#                         formatted_option = match.group(1) + ") " + match.group(2)  # Formatted option
#                         transformed_options.append(formatted_option)
#                         options_dict[match.group(1)] = float(match.group(2))
#                 question_with_options = data['question'] + " " + " ".join(transformed_options)
#                 questions.append(question_with_options)
#                 check.append(transformed_options)
#                 # answers.append(options_dict.get(data['correct']))
#                 answers.append(data['correct'])
#                 options.append(options_list)
#             else:
#                 raise ValueError("Unexpected data format")
#     return Dataset.from_dict({"question": questions, "answer": answers, "options":options, "check": check})

In [285]:
# Symbols that are dropped wherever they appear in an option string.
_OPTION_SYMBOLS = ("%", "$", "#")

# Unit / filler tokens that are dropped only when they stand on their own,
# i.e. when they are not embedded in a longer alphabetic word.
_OPTION_UNITS = (
    "step/minute", "minutes", "seconds", "months", "miles", "units", "kmph",
    "feet", "days", "Loss", "Min", "min", "men", "Rs.", "km", "cm", "m",
)

# Longest alternatives first so that e.g. "kmph" is never consumed as "km",
# and letter guards on both sides so that "m" cannot eat the m in "sum".
_OPTION_UNIT_RE = re.compile(
    r"(?<![A-Za-z])(?:"
    + "|".join(re.escape(unit) for unit in sorted(_OPTION_UNITS, key=len, reverse=True))
    + r")(?![A-Za-z])"
)

# Leading option label such as "A)" together with any whitespace after it.
_OPTION_LABEL_RE = re.compile(r"^([A-E])\)\s*")


def clean_option_value(value: str) -> str:
    """Strip currency/percent symbols and unit words from one answer option.

    The option label is normalised to ``"<letter>) "`` followed by the
    remaining text, e.g. ``"A)5 cm"`` becomes ``"A) 5"``.

    Args:
        value: Raw option string, typically of the form ``"A)<text>"``.

    Returns:
        The cleaned option with runs of whitespace collapsed to a single
        space and no leading or trailing whitespace.

    Note:
        A stand-alone ``m`` is treated as a unit (metres), so an algebraic
        variable named ``m`` is removed as well.
    """
    for symbol in _OPTION_SYMBOLS:
        value = value.replace(symbol, "")

    value = _OPTION_UNIT_RE.sub("", value)

    # Removing tokens can leave gaps behind ("10 km" -> "10 "); tidy them up.
    value = " ".join(value.split())

    # Only the label at the very start is rewritten, and exactly one space is
    # placed after it regardless of how many were there before.
    return _OPTION_LABEL_RE.sub(r"\1) ", value).rstrip()

In [286]:
def data_reader(dataset,dataset_path, split=None, sample_size=100):
    """Load AQuA questions from ``<dataset_path>/AQuA.json`` into a ``Dataset``.

    The file is read as JSON Lines: every non-blank line must decode to a
    JSON object with the keys ``question``, ``options`` and ``correct``.

    Args:
        dataset: Dataset name. Currently unused; kept so existing callers
            keep working.
        dataset_path: Directory that contains ``AQuA.json``.
        split: Optional ``(start, end)`` pair; when given, only
            ``lines[start:end]`` of the file are loaded.
        sample_size: Currently unused; kept so existing callers keep working.

    Returns:
        A ``Dataset`` with the columns ``question`` (question text followed by
        the cleaned options), ``answer`` (the ``correct`` field) and
        ``options`` (the raw, uncleaned option strings).

    Raises:
        ValueError: If a line decodes to something other than a JSON object.
    """
    filename = os.path.join(dataset_path, 'AQuA.json')
    # The questions contain non-ASCII maths symbols, so decode explicitly as
    # UTF-8 instead of relying on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    if split is not None:
        start, end = split
        lines = lines[start:end]

    questions = []
    answers = []
    options = []
    for line in lines:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        if not isinstance(data, dict):
            raise ValueError("Unexpected data format")
        options_list = data['options']
        cleaned_options = [clean_option_value(opt) for opt in options_list]
        questions.append(data['question'] + " " + " ".join(cleaned_options))
        answers.append(data['correct'])
        options.append(options_list)
    return Dataset.from_dict({"question": questions, "answer": answers, "options":options})

In [287]:
datasetname: str = 'AQuA'
# The dataset location is machine-specific. Setting the AQUA_DATASET_PATH
# environment variable overrides it without editing the notebook; when the
# variable is unset the original absolute path is used.
dataset_path: str = os.environ.get(
    'AQUA_DATASET_PATH',
    '/data/yueshan/llm-reasoners/examples/AQuA/dataset/AQuA',
)
dataset = data_reader(datasetname, dataset_path)


In [288]:
# Sanity check: print the first formatted question, i.e. the question text
# followed by its cleaned answer options.
for i in range(1):
    print(dataset[i]["question"])

A car is being driven, in a straight line and at a uniform speed, towards the base of a vertical tower. The top of the tower is observed from the car and, in the process, it takes 10 minutes for the angle of elevation to change from 45° to 60°. After how much more time will this car reach the base of the tower? A) 5(√3 + 1) B) 6(√3 + √2) C) 7(√3 – 1) D) 8(√3 – 2) E) None of these


In [289]:
def retrieve_answer(output: str) -> Optional[str]:
    """Extract the final answer token from a model completion.

    Looks for the phrase ``answer is`` (case-insensitive), optionally followed
    by spaces and/or opening parentheses, and captures the alphanumeric token
    right after it. When the phrase occurs several times, the last occurrence
    wins, also when the occurrences are on different lines.

    Args:
        output: Text generated by the model.

    Returns:
        The captured token in upper case (e.g. ``"B"``), or ``None`` if the
        phrase is not followed by an alphanumeric token anywhere in ``output``.
    """
    # findall scans the whole string, so line breaks between an early
    # "answer is ..." and the final one do not hide the final one.
    candidates = re.findall(r'(?i)answer is[ (]*([A-Za-z0-9]+)', output)
    if not candidates:
        return None
    # The capture group only admits letters and digits, so no further
    # stripping of punctuation is needed.
    return candidates[-1].upper()

In [290]:
# Hand-written sample completions for spot-checking retrieve_answer. They vary
# the bracket/punctuation around the option letter, use lower-case letters,
# put text after the answer, and (last entry) contain an earlier
# "the answer is" phrase before the final one.
outputs = [
    "after calculation 18-9=9, The answer is B)",
    "after calculation 18-9=9, The answer is (B",
    "after calculation 18-9=9, The answer is (B)",
    "after calculation 18-9=9, The answer is B ",
    "after calculation 18-9=9, The answer is B.",
    "after calculation 18-9=9, The answer is B, ",
    "after calculation 18-9=9, The answer is a",
    "after calculation 18-9=9, so the answer is d",
    'When we plug in 2 × 19 + 15 + 7 for the expression in parentheses, we get 5 + 126 = 6, so the answer is E. "Civil War"',
    'If he increases the price by 20%, the profit percentage will be the same as it was on Day 1. So the answer is A.',
    'The correct option is option B) 13x/2 - 6, where x is the length of a picket in inches. The correct answer is B.',
    'Since the answer is a square root, it can be found by using the following equation:X = (-b +- sqrt(b^2 - 4ac)) / 2a, the answer is d'
]

In [291]:
# Print the answer extracted from each sample completion, one per line, in the
# same order as the `outputs` list.
for output in outputs:
    print(retrieve_answer(output))

B
B
B
B
B
B
A
D
E
A
B
D
