In [9]:
from datasets import load_dataset

# Load the "raw" configuration of the deepmind/aqua_rat dataset.
ds = load_dataset("deepmind/aqua_rat", "raw")

# Display the resulting DatasetDict to check the available splits, columns, and row counts.
ds

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 97467/97467 [00:00<00:00, 1411925.99 examples/s]
Generating test split: 100%|██████████| 254/254 [00:00<00:00, 215049.09 examples/s]
Generating validation split: 100%|██████████| 254/254 [00:00<00:00, 181676.88 examples/s]


DatasetDict({
    train: Dataset({
        features: ['question', 'options', 'rationale', 'correct'],
        num_rows: 97467
    })
    test: Dataset({
        features: ['question', 'options', 'rationale', 'correct'],
        num_rows: 254
    })
    validation: Dataset({
        features: ['question', 'options', 'rationale', 'correct'],
        num_rows: 254
    })
})

In [39]:
# Keep a handle on the training split; later cells draw both the few-shot examples and the working set from it.
ds_train = ds['train']

In [40]:
# Inspect a single training record to see its fields (question, options, rationale, correct).
ds_train[8]

{'question': 'The entrance fee for a fair is $5 for persons under the age of 18, and 20% more for persons older. Each ride at the fair costs $0.50. If Joe goes with her 6 years old twin brothers, and they each took 3 rides in total. How much money does Joe end up spending at the fair?',
 'options': ['A)16', 'B)20.5', 'C)17.5', 'D)20', 'E)4.5'],
 'rationale': 'Total entrance fee is (2*$5) + (1.20*5)= $16\nTotal rides fee is (0.50*3)*3= $4.50\nTotal money spent is $20.50\nAnswer is B',
 'correct': 'B'}

In [41]:
import re

def parse_options(options_list):
    """
    Parses a list of option strings into a dictionary mapping option letters to their texts.

    Each option is split at its first ')': the part before it, stripped and
    upper-cased, becomes the key and the remainder becomes the text. Some
    records repeat the label inside the text (e.g. 'A)A)$60.00'); one such
    leading 'letter)' prefix is removed from the text. The prefix is matched
    case-insensitively, in line with the case-insensitive key handling, so
    'a)a)5' yields {'A': '5'}.

    Options without a ')' are reported with a printed warning and skipped.
    When a key occurs more than once, a warning is printed and the last
    occurrence wins.

    Args:
        options_list (list): List of option strings (e.g., ['A)2', 'B)4', ...])

    Returns:
        dict: Dictionary mapping option letters to option texts (e.g., {'A': '2', 'B': '4', ...})
    """
    options_dict = {}
    for opt in options_list:
        label, separator, text = opt.partition(')')
        if not separator:
            print(f"Warning: Option '{opt}' is not in the expected format 'Letter)Text'. Skipping.")
            continue

        key = label.strip().upper()
        # Remove any leading option letters from the value (e.g., 'A)$60.00' -> '$60.00').
        # The pattern is anchored at the start, so at most one prefix is removed.
        value = re.sub(r'^[A-E]\)', '', text.strip(), flags=re.IGNORECASE).strip()
        if key in options_dict:
            print(f"Warning: Duplicate option key '{key}' found. Overwriting previous value.")
        options_dict[key] = value
    return options_dict

In [42]:
# Size of the few-shot prompt set, taken from the head of the training split
NUM_PROMPT_EXAMPLES = 10

prompt_examples = []

for i in range(NUM_PROMPT_EXAMPLES):
    example = ds_train[i]
    question, options, rationale = example['question'], example['options'], example['rationale']
    correct_option = example['correct'].strip().upper()

    # Letter -> text lookup for this record, used to resolve the labelled answer
    options_dict = parse_options(options)
    correct_answer = options_dict.get(correct_option)

    if correct_answer is not None:
        # The raw option strings are kept as-is; only the answer is resolved to its text
        prompt_examples.append({
            'question': question,
            'options': options,
            'rationale': rationale,
            'answer': correct_answer
        })
    else:
        # A record whose labelled answer cannot be resolved is left out of the prompt set
        print(f"Warning: Correct answer not found for prompt example index {i}. Skipping.")


In [43]:
# Show the selected few-shot examples to check their questions, rationales, and resolved answer texts.
prompt_examples

[{'question': "Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?",
  'options': ['A)21', 'B)21.5', 'C)22', 'D)22.5', 'E)23'],
  'rationale': 'If Q complete x kilometers, then P completes 1.15x kilometers.\nx + 1.15x = 43\n2.15x=43\nx = 43/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is E.',
  'answer': '23'},
 {'question': 'In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?',
  'options': ['A)4 and 1', 'B)1 and 5', 'C)5 and 1', 'D)3 and 5', 'E)5 and 3'],
  'rationale': 'Line k passes through the origin and has slope 1/5 means that its equation is y=1/5*x.\nThus: (x, 1)=(5, 1) and (5, y) = (5,1) -->x=5 and y=1\nAnswer: C',
  'answer': '5 and 1'},
 {'question': 'For all number

In [44]:
# Assemble the few-shot prompt: one formatted block per example, each closed by a "###" separator line
prompt_blocks = []
for example in prompt_examples:
    question = example['question']
    options_text = '\n'.join(example['options'])
    rationale = example['rationale']
    answer = example['answer']

    prompt_blocks.append(
        f"Question: {question}\n"
        f"Options:\n{options_text}\n"
        f"Answer Explanation: {rationale}\n"
        f"Answer: {answer}\n"
        "###\n"
    )
prompt_set = "".join(prompt_blocks)

# The working set is everything after the rows reserved for the prompt examples
start_index = NUM_PROMPT_EXAMPLES
dataset_D = ds_train.select(range(start_index, len(ds_train)))

# Quick look at what the working set and its rows are
print(f"Type of dataset_D: {type(dataset_D)}")
print(f"First element of dataset_D: {dataset_D[0]}")
print(f"Type of first element: {type(dataset_D[0])}")

Type of dataset_D: <class 'datasets.arrow_dataset.Dataset'>
First element of dataset_D: {'question': 'If Tim had lunch at $50 and he gave 20% tip, how much did he spend?', 'options': ['A)A)$60.00', 'B)B)$35.42', 'C)C)$60.60', 'D)D)$21.56', 'E)E)$78.45'], 'rationale': 'The tip is 20% of what he paid for lunch.\ntip = 20% of 50.00 = (20/100)*50.00 = = $10.00\nTotal spent\n50.00 + 10.00 = $60.00\ncorrect answer is A)$60.00', 'correct': 'A'}
Type of first element: <class 'dict'>


In [45]:
import re
import ollama

# Ollama model tags. `qwen` is the model used by generate_rationale_and_answer.
qwen = "qwen2.5:7b"
# `llama` is not referenced by name in the visible cells; rationalize() spells out the same tag as a string literal.
llama = "llama3.1:8b"

def _extract_answer_text(generated_rationale, options_dict):
    """Maps the first 'Answer: ...' statement in the model output to an option text, or None."""
    # Letter-based forms, tried in order: "Answer: A) $60.00", then "Answer: A".
    # The first form that matches decides the result, even if its letter is not a known option.
    for pattern in (r'Answer:\s*([A-E])\)\s*(.+)', r'Answer:\s*([A-E])\b'):
        match = re.search(pattern, generated_rationale, re.IGNORECASE)
        if match:
            return options_dict.get(match.group(1).upper(), None)

    # Text-only form: "Answer: $60.00" -- accepted only if it equals one of the option texts
    match = re.search(r'Answer:\s*(.+)', generated_rationale, re.IGNORECASE)
    if match:
        answer_text = match.group(1).strip().rstrip('.')
        if answer_text in options_dict.values():
            return answer_text
    return None


def _log_failed_extraction(question, generated_rationale, log_path="failed_extractions.log"):
    """Appends a failed-extraction record to the log file; a write failure only prints a warning."""
    try:
        # Explicit UTF-8: model output may contain characters the platform default encoding cannot write
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(
                f"Failed to extract answer for question: {question}\n"
                f"Generated Rationale:\n{generated_rationale}\n\n"
            )
    except (OSError, UnicodeError) as e:
        print(f"Warning: could not write to {log_path}: {e}")


def generate_rationale_and_answer(question, options_dict, prompt_set, model=None):
    """
    Generates a rationale and extracts the answer using the Ollama model.

    The few-shot prompt set is followed by format instructions, the question
    and its options (sorted by letter), and the model's reply is searched for
    an 'Answer: ...' statement. When no answer can be extracted, the question
    and reply are appended to 'failed_extractions.log'. A failure to write
    that log is reported but does not discard the generated rationale.

    Args:
        question (str): The question to solve.
        options_dict (dict): Dictionary of options mapping letters to texts.
        prompt_set (str): The constructed prompt set with examples.
        model (str, optional): Ollama model tag to query. Defaults to the
            module-level `qwen` tag.

    Returns:
        tuple: (generated_rationale (str), generated_answer_text (str or None)).
            ('', None) is returned when the model call itself fails.
    """
    # Prepare the options text in sorted order
    options_text = '\n'.join(f"{k}) {v}" for k, v in sorted(options_dict.items()))

    # Construct the input prompt with clear instructions
    input_text = (
        prompt_set
        + "\n\n"
        + "Please solve the following problem and provide the final answer in the exact format below:\n"
        + "Answer: [Option Letter]) [Answer Text]\n"
        + "Do not provide any additional text outside this format.\n"
        + f"Question: {question}\n"
        + "Options:\n"
        + options_text
        + "\nAnswer Explanation:"
    )

    # Only the model call is guarded: it is the step that depends on an external service
    try:
        response = ollama.chat(
            model=qwen if model is None else model,
            messages=[{'role': 'user', 'content': input_text}]
        )
        generated_rationale = response['message']['content'].strip()
    except Exception as e:
        print(f"Error generating rationale: {e}")
        return '', None

    generated_answer_text = _extract_answer_text(generated_rationale, options_dict)

    # Log the rationale and answer if extraction failed
    if generated_answer_text is None:
        _log_failed_extraction(question, generated_rationale)

    return generated_rationale, generated_answer_text

In [49]:
# Define the number of examples to process
# (each processed example triggers one generate_rationale_and_answer call in the loop further down this cell)
NUM_EXAMPLES_TO_PROCESS = 5

# Adjust the dataset_D to include only the specified number of examples
# (the first NUM_EXAMPLES_TO_PROCESS rows of dataset_D)
dataset_D_subset = dataset_D.select(range(NUM_EXAMPLES_TO_PROCESS))

# Initialize lists to hold correct and incorrect pairs.
# Re-running this cell resets both lists, discarding results collected by a previous run.
correct_pairs = []
incorrect_pairs = []

# Function to parse options (as defined earlier)
def parse_options(options_list):
    """
    Parses a list of option strings into a dictionary mapping option letters to their texts.

    This re-defines the helper from the earlier cell; once this cell runs,
    this definition is the one in effect.

    Each option is split at its first ')': the part before it, stripped and
    upper-cased, becomes the key and the remainder becomes the text. One
    repeated leading 'letter)' prefix (e.g. 'A)A)$60.00') is removed from the
    text. The prefix is matched case-insensitively, in line with the
    case-insensitive key handling, so 'a)a)5' yields {'A': '5'}.

    Options without a ')' are reported with a printed warning and skipped.
    When a key occurs more than once, a warning is printed and the last
    occurrence wins.

    Args:
        options_list (list): List of option strings (e.g., ['A)2', 'B)4', ...])

    Returns:
        dict: Dictionary mapping option letters to option texts (e.g., {'A': '2', 'B': '4', ...})
    """
    options_dict = {}
    for opt in options_list:
        label, separator, text = opt.partition(')')
        if not separator:
            print(f"Warning: Option '{opt}' is not in the expected format 'Letter)Text'. Skipping.")
            continue

        key = label.strip().upper()
        # Remove any leading option letters from the value (e.g., 'A)$60.00' -> '$60.00').
        # The pattern is anchored at the start, so at most one prefix is removed.
        value = re.sub(r'^[A-E]\)', '', text.strip(), flags=re.IGNORECASE).strip()
        if key in options_dict:
            print(f"Warning: Duplicate option key '{key}' found. Overwriting previous value.")
        options_dict[key] = value
    return options_dict

# Iterate over each example in the subset
# Report progress after every PROGRESS_EVERY examples, and once more after the last one
PROGRESS_EVERY = 10
total_examples = len(dataset_D_subset)

for idx, example in enumerate(dataset_D_subset):
    question = example['question']
    options_list = [opt.strip() for opt in example['options']]
    correct_option = example['correct'].strip().upper()

    # Parse options into a dictionary
    options_dict = parse_options(options_list)

    # Extract the correct answer text
    correct_answer = options_dict.get(correct_option, None)

    if correct_answer is None:
        print(f"Warning: Correct answer not found for index {idx}. Skipping this example.")
        continue  # Skip this example if the correct answer is missing

    # Generate rationale and answer using the updated function
    generated_rationale, generated_answer_text = generate_rationale_and_answer(question, options_dict, prompt_set)

    # If extraction failed, try once more on the text after the last '.' of the rationale.
    # NOTE(review): generate_rationale_and_answer already searches the whole reply for
    # 'Answer: <letter>', so this rarely recovers anything -- consider a broader pattern.
    if generated_answer_text is None:
        print(f"Fallback: Attempting secondary extraction for question: {question}")
        last_sentence = generated_rationale.strip().split('.')[-1]
        match = re.search(r'Answer:\s*([A-E])\)', last_sentence, re.IGNORECASE)
        if match:
            generated_answer_letter = match.group(1).upper()
            generated_answer_text = options_dict.get(generated_answer_letter, None)

    # Full record of this attempt; printed in both cases and stored as-is for incorrect answers
    result = {
        'question': question,
        'options': options_dict,
        'rationale': generated_rationale,
        'generated_answer': generated_answer_text,
        'correct_answer': correct_answer
    }

    # Categorize the example based on the accuracy of the generated answer
    if generated_answer_text == correct_answer:
        # Correct answer: keep the rationale together with the answer it reached
        correct_pairs.append({
            'question': question,
            'options': options_dict,
            'rationale': generated_rationale,
            'answer': correct_answer
        })
        print('Correct:', result)
    else:
        # Incorrect (or unextractable) answer
        incorrect_pairs.append(result)
        print("Incorrect:", result)

    # Print progress every PROGRESS_EVERY examples and at the end of the run
    # (the original condition `if (idx + 1):` was always true and printed on every iteration)
    if (idx + 1) % PROGRESS_EVERY == 0 or (idx + 1) == total_examples:
        print(f"Processed {idx + 1} questions.")

Incorrect: {'question': 'If Tim had lunch at $50 and he gave 20% tip, how much did he spend?', 'options': {'A': '$60.00', 'B': '$35.42', 'C': '$60.60', 'D': '$21.56', 'E': '$78.45'}, 'rationale': "Answer: C) $60.60\nTo find the amount Tim spent, we need to calculate the 20% tip on his lunch bill.\n\nFirst, let's calculate the tip:\nTip = 20% of $50\n= (20/100) × $50\n= $10\n\nNow, add the tip to the original bill:\nTotal Amount Spent = Lunch Bill + Tip\n= $50 + $10\n= $60.00\n\nHowever, since Tim is also paying sales tax on the total amount, we need to calculate the sales tax and add it to the total.\n\nAssuming a 5% sales tax rate (which is not mentioned in the question, but is a common rate):\nSales Tax = 5% of ($50 + $10)\n= (5/100) × $60\n= $3\n\nNow, add the sales tax to the total amount spent:\nTotal Amount Spent = $60.00 + $3\n= $63.00\n\nWait, that's not an option! Let me think for a moment...\n\nSince we're given the options and the question is straightforward, I'll go back an

KeyboardInterrupt: 

In [178]:
# Show the examples whose generated answer matched the labelled answer.
correct_pairs

[{'question': 'Mike took 5 mock tests before appearing for the GMAT. In each mock test he scored 10 points more than the previous mock test. If he scored 760 on the GMAT and his average score for the mocks and the GMAT was 716.67, what was the difference in the score of his last mock and his GMAT score?',
  'options': {'A': '20', 'B': '32', 'C': '40', 'D': '50', 'E': '60'},
  'rationale': "To solve this problem, we need to first find the total score for all the mocks and the GMAT.\n\nLet's assume Mike's score in his first mock test was x. Then his scores for the remaining four mock tests would be x+10, x+20, x+30, and x+40.\n\nThe average of these five scores is given as 716.67. So we can set up an equation:\n\n(x + (x+10) + (x+20) + (x+30) + (x+40))/5 = 716.67\n\nCombine like terms:\n\n(5x + 100)/5 = 716.67\n\nMultiply both sides by 5 to get rid of the denominator:\n\n5x + 100 = 3583.35\n\nSubtract 100 from both sides:\n\n5x = 3483.35\n\nDivide both sides by 5:\n\nx = 696.67\n\nNow th

In [179]:
# Show the examples whose generated answer was wrong or could not be extracted.
incorrect_pairs

[{'question': 'If Tim had lunch at $50 and he gave 20% tip, how much did he spend?',
  'options': {'A': '$60.00',
   'B': '$35.42',
   'C': '$60.60',
   'D': '$21.56',
   'E': '$78.45'},
  'rationale': 'Answer: C) $60.60',
  'generated_answer': '$60.60',
  'correct_answer': '$60.00'},
 {'question': 'Rs. 825 becomes Rs. 956 in 3 years at a certain rate of simple interest.If the rate of interest is increased by 4% ,What amount will Rs. 825 become in 3 years ?',
  'options': {'A': 'Rs. 1020.80',
   'B': 'Rs. 1025',
   'C': 'Rs. 1055',
   'D': 'Data inadequate',
   'E': 'None of these'},
  'rationale': 'To solve this problem, we need to first calculate the original rate of interest.\n\nThe amount after 3 years is Rs. 956 and the principal amount is Rs. 825. So, the interest earned in 3 years is Rs. 131 (956 - 825).\n\nUsing the formula for simple interest: I = P * r * t, where I is the interest, P is the principal amount, r is the rate of interest, and t is the time period.\n\nWe can rearr

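Sanity check: run the generate-and-extract step on a single example and compare the result with the labelled answer.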

In [53]:
# Run the pipeline on one hand-picked row (change the index to try another)
test_example = dataset_D_subset[1]
question = test_example['question']
options_list = list(map(str.strip, test_example['options']))
correct_option = test_example['correct'].strip().upper()

# Letter -> text lookup, then resolve the labelled answer through it
options_dict = parse_options(options_list)
correct_answer = options_dict.get(correct_option)

# Show the inputs before calling the model
for label, value in (
    ("Question:", question),
    ("Options:", options_dict),
    ("Correct Option:", correct_option),
    ("Correct Answer:", correct_answer),
):
    print(label, value)

# Ask the model and extract its answer
generated_rationale, generated_answer_text = generate_rationale_and_answer(question, options_dict, prompt_set)

# Show the model output next to the expected answer
print("\nGenerated Rationale:", generated_rationale, sep="\n")
print("\nGenerated Answer:", generated_answer_text)
print("Correct Answer:", correct_answer)


Question: Rs. 825 becomes Rs. 956 in 3 years at a certain rate of simple interest.If the rate of interest is increased by 4% ,What amount will Rs. 825 become in 3 years ?
Options: {'A': 'Rs. 1020.80', 'B': 'Rs. 1025', 'C': 'Rs. 1055', 'D': 'Data inadequate', 'E': 'None of these'}
Correct Option: C
Correct Answer: Rs. 1055

Generated Rationale:
To find the rate of interest, we can use the formula for simple interest:

Simple Interest = (Principal × Rate × Time)

Given that the principal amount is Rs. 825 and it becomes Rs. 956 in 3 years, we can calculate the simple interest as follows:

Simple Interest = Rs. 956 - Rs. 825 = Rs. 131

Now, using the formula for simple interest, we can find the rate of interest (R):

Rs. 131 = Rs. 825 × R × 3
R = Rs. 131 / (Rs. 825 × 3) = 0.0545 (or 5.45%)

If the rate of interest is increased by 4%, the new rate will be:

New Rate = 5.45% + 4% = 9.45%

Now, we need to find the amount that Rs. 825 will become in 3 years at this increased rate.

Simple Int

In [None]:
# Summarize the run: counts of correct and incorrect answers and the resulting accuracy
total = len(correct_pairs) + len(incorrect_pairs)
print(f"Total questions processed: {total}")
print(f"Correct answers: {len(correct_pairs)}")
print(f"Incorrect answers: {len(incorrect_pairs)}")

if total:
    accuracy = len(correct_pairs) / total * 100
    print(f"Accuracy: {accuracy:.2f}%")
else:
    # Nothing was processed (e.g. the loop was interrupted before finishing any example);
    # avoid the division by zero and say so explicitly.
    accuracy = None
    print("Accuracy: n/a (no questions processed)")

In [None]:
def _format_options(options):
    """Renders options one per line; accepts a letter->text dict or a list of ready-made strings."""
    if isinstance(options, dict):
        # Joining a dict directly would emit only its keys ("A", "B", ...) and drop the option texts
        return '\n'.join(f"{k}) {v}" for k, v in sorted(options.items()))
    return '\n'.join(options)


def rationalize(question, options, correct_answer, model='llama3.1:8b'):
    """
    Asks the model for an explanation of a question whose correct answer is given as a hint.

    The prompt consists of the module-level `prompt_set`, the question, its
    options, the correct answer in parentheses, and an open
    'Answer Explanation:' for the model to complete.

    Args:
        question (str): The question to explain.
        options (dict or list): Either a dictionary mapping option letters to
            texts (the form stored in `incorrect_pairs`) or a list of option
            strings such as ['A)2', 'B)4'].
        correct_answer (str): Text of the correct answer, given to the model as a hint.
        model (str, optional): Ollama model tag to query.

    Returns:
        str: The model's reply with surrounding whitespace removed, or '' if the call failed.
    """
    options_text = _format_options(options)
    input_text = (
        prompt_set
        + "\n\n"
        + "Please solve the following problem with the correct answer provided:\n"
        + f"Question: {question}\n"
        + "Options:\n"
        + options_text
        + f"\n(Answer: {correct_answer})\n"
        + "Answer Explanation:"
    )

    try:
        response = ollama.chat(model=model, messages=[
            {
                'role': 'user',
                'content': input_text,
            },
        ])
        generated_rationale = response['message']['content'].strip()
    except Exception as e:
        # Best effort: report the failure and signal it to the caller with an empty rationale
        print(f"Error during rationalization: {e}")
        generated_rationale = ''

    return generated_rationale

# Process the incorrect answers
for pair in incorrect_pairs:
    question = pair['question']
    options = pair['options']
    correct_answer = pair['correct_answer']

    # Generate the rationale with the correct answer as a hint
    generated_rationale = rationalize(question, options, correct_answer)

    # rationalize() returns '' when the model call fails; such an entry has no
    # explanation to learn from, so it is left out rather than stored as an example.
    if not generated_rationale:
        print(f"Warning: Rationalization failed for question: {question}. Skipping.")
        continue

    # Add the rationalized example to correct_pairs.
    # Note: this appends in place, so running the cell twice adds every example twice.
    correct_pairs.append({
        'question': question,
        'options': options,
        'rationale': generated_rationale,
        'answer': correct_answer
    })

In [None]:
import torch  # no earlier visible cell imports torch, and the class below needs it


class RationalesDataset(torch.utils.data.Dataset):
    """
    Dataset of tokenized question/options/rationale/answer texts.

    Every record is rendered as

        Question: ...
        Options:
        ...
        Answer Explanation: ...
        Answer: ...

    and encoded with the given tokenizer. Items are returned as tensors built
    from the token ids; no padding is requested here.

    Args:
        pairs (iterable of dict): Records with 'question', 'options',
            'rationale' and 'answer' keys. 'options' may be a dictionary
            mapping option letters to texts (the form stored in
            `correct_pairs`) or a list of option strings.
        tokenizer: Object exposing `encode(text, truncation=..., max_length=...)`
            that returns a sequence of token ids -- presumably a Hugging Face
            tokenizer; confirm against the caller.
        max_length (int, optional): Truncation length passed to the tokenizer.
    """

    def __init__(self, pairs, tokenizer, max_length=512):
        self.examples = []
        for pair in pairs:
            question = pair['question']
            options = self._format_options(pair['options'])
            rationale = pair['rationale']
            answer = pair['answer']
            input_text = (
                f"Question: {question}\n"
                f"Options:\n{options}\n"
                f"Answer Explanation: {rationale}\n"
                f"Answer: {answer}"
            )
            input_ids = tokenizer.encode(input_text, truncation=True, max_length=max_length)
            self.examples.append(torch.tensor(input_ids))

    @staticmethod
    def _format_options(options):
        """Renders options one per line; accepts a letter->text dict or a list of strings."""
        if isinstance(options, dict):
            # Joining a dict directly would emit only its keys and drop the option texts
            return '\n'.join(f"{k}) {v}" for k, v in sorted(options.items()))
        return '\n'.join(options)

    def __len__(self):
        """Number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Token-id tensor of the example at position `idx`."""
        return self.examples[idx]