In [1]:
from datasets import Dataset, DatasetDict
import pandas as pd
import json
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_split(path):
    """Read one JSON split file and return its contents as a DataFrame.

    Parameters
    ----------
    path : str
        Location of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The result of passing the parsed JSON document to ``pd.DataFrame``.
    """
    # JSON text is UTF-8 by specification; be explicit so decoding does not
    # depend on the platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return pd.DataFrame(json.load(f))


train_data = load_split("MathQA/train.json")
test_data = load_split("MathQA/test.json")

# Stack both splits into one frame with a fresh 0..n-1 index.
data = pd.concat([train_data, test_data], ignore_index=True)

In [3]:
# Number of rows in the combined frame.
len(data.index)

32822

In [6]:
def extract_answer_text(row):
    """Return the text of the option whose label equals ``row['correct']``.

    ``row['options']`` is expected to be one string of labelled choices such
    as ``"a ) 10 , b ) 20 , c ) 30"``. The string is cut at each option label
    (a letter ``a``-``e`` followed by ``)``) and the text after the matching
    label is returned.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'options'`` and ``'correct'`` (a DataFrame
        row, a dict, ...). Both values must be strings.

    Returns
    -------
    str
        The whitespace-stripped option text, or ``"unknown"`` when no option
        carries the requested label.
    """
    options = row['options']
    correct_label = row['correct'].strip()

    # A label only counts when it opens the string or directly follows a
    # comma, so letters inside an option's own text ("( a + b )") are not
    # mistaken for a new option. Leading quotes/brackets are tolerated so a
    # list-style string "['a ) 1', 'b ) 2']" is still recognised.
    # NOTE(review): list-style inputs are an assumption - confirm against data.
    label_pattern = r'(?:^|,)[\s\[\'"]*([a-e])\s*\)\s*'
    list_style = options.lstrip().startswith('[')

    option_dict = {}
    current_label = None
    text_start = 0
    for match in re.finditer(label_pattern, options):
        label = match.group(1)
        if current_label is not None:
            # Labels must strictly increase; a repeated or earlier letter is
            # part of the current option's text (e.g. "f ( a , b )").
            if label <= current_label:
                continue
            # The option text runs up to the comma that introduces the next
            # label, so commas inside the text itself are preserved.
            option_dict[current_label] = options[text_start:match.start()]
        current_label = label
        text_start = match.end()
    if current_label is not None:
        option_dict[current_label] = options[text_start:]

    answer = option_dict.get(correct_label)
    if answer is None:
        return "unknown"

    answer = answer.strip()
    if list_style:
        # Drop the closing bracket / quote left over from the list syntax.
        answer = answer.rstrip(']').strip().strip('\'"').strip()
    return answer

# Derive the answer column, drop the raw columns it was computed from (plus
# the formula columns) and normalise the category header in one chained
# expression. `data` is rebound only after every step has succeeded, so a
# failure part-way through leaves the previous frame intact instead of a
# half-mutated one.
data = (
    data
    .assign(Answer=data.apply(extract_answer_text, axis=1))
    .drop(columns=['options', 'correct', 'annotated_formula', 'linear_formula'])
    .rename(columns={'category': 'Category'})
)

In [7]:
# Preview the first five rows of the cleaned frame.
data.head(5)

Unnamed: 0,Problem,Rationale,Category,Answer
0,the banker ' s gain of a certain sum due 3 yea...,"""explanation : t = 3 years r = 10 % td = ( bg ...",gain,rs . 400
1,average age of students of an adult school is ...,"""explanation : let the original no . of studen...",general,240
2,sophia finished 2 / 3 of a book . she calculat...,let xx be the total number of pages in the boo...,general,270
3,120 is what percent of 50 ?,"""50 * x = 120 - - > x = 2.4 - - > 2.4 expresse...",gain,240 %
4,there are 10 girls and 20 boys in a classroom ...,"if girls is 10 and boys is 20 , then 10 / 20 ....",other,1 / 2


In [8]:
# Fixed seed so the 90/10 split is identical on every Restart & Run All;
# without it the rows landing in "test" change from run to run.
SPLIT_SEED = 42

hf_dataset = Dataset.from_pandas(data)

split = hf_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
dataset_dict = DatasetDict({
    "train": split["train"],
    "test": split["test"]
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['Problem', 'Rationale', 'Category', 'Answer'],
        num_rows: 29539
    })
    test: Dataset({
        features: ['Problem', 'Rationale', 'Category', 'Answer'],
        num_rows: 3283
    })
})

In [9]:
# Write both splits to a directory relative to the working directory.
output_dir = "dataset"
dataset_dict.save_to_disk(output_dir)

Saving the dataset (1/1 shards): 100%|██████████| 29539/29539 [00:00<00:00, 193357.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3283/3283 [00:00<00:00, 167798.74 examples/s]
