In [1]:
import re
import ollama
from src.data_loading import load_data
from src.parsing import parse_options
from src.rationale_generation import generate_rationale_and_answer
from src.utils import is_rationale_correct
from src.rationalization import rationalize
from src.prompt_generation import create_prompt_examples, create_prompt_set, clean_options, get_correct_answer_text

  from .autonotebook import tqdm as notebook_tqdm


In [808]:
NUM_PROMPT_EXAMPLES = 5  # rows passed to the prompt builders; the first this-many rows are also excluded from the evaluation pool
NUM_EXAMPLES_TO_PROCESS = 10  # size of the shuffled evaluation subset

### Load Dataset

In [809]:
ds_train = load_data()

# Fixed seed so that re-running the notebook evaluates the same subset.
SHUFFLE_SEED = 42

# Step 1: Drop the first NUM_PROMPT_EXAMPLES rows from the evaluation pool
dataset_D = ds_train.select(range(NUM_PROMPT_EXAMPLES, len(ds_train)))

# Step 2: Shuffle the selected dataset deterministically
# NOTE(review): assumes load_data() returns a Hugging Face `datasets.Dataset`
# (the printed repr below looks like one), whose shuffle() accepts `seed=` -- confirm.
dataset_D_shuffled = dataset_D.shuffle(seed=SHUFFLE_SEED)

# Step 3: Select a subset of examples to process, never asking for more rows than exist
subset_size = min(NUM_EXAMPLES_TO_PROCESS, len(dataset_D_shuffled))
dataset_D_subset = dataset_D_shuffled.select(range(subset_size))

print(dataset_D_subset)

Dataset({
    features: ['question', 'options', 'rationale', 'correct'],
    num_rows: 10
})


### Create prompt examples

In [810]:
# Build the prompt examples from the training split.
prompt_examples = create_prompt_examples(ds_train, NUM_PROMPT_EXAMPLES)

# Show them for inspection (each entry carries 'question', 'rationale' and 'answer').
prompt_examples

[{'question': "Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?",
  'rationale': 'If Q complete x kilometers, then P completes 1.15x kilometers.\nx + 1.15x = 43\n2.15x=43\nx = 43/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is E.',
  'answer': 'E'},
 {'question': 'In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?',
  'rationale': 'Line k passes through the origin and has slope 1/5 means that its equation is y=1/5*x.\nThus: (x, 1)=(5, 1) and (5, y) = (5,1) -->x=5 and y=1\nAnswer: C',
  'answer': 'C'},
 {'question': 'For all numbers p and q, the operation @ is defined by p@q = p^2 - pq. If xy ≠ 0, then which of the following can be equal to zero?\nI. x@y\nII. (xy)@y\nIII. x@

### Create Prompt Sets

In [811]:
# Render the prompt text that is later handed to generate_rationale_and_answer() and rationalize().
prompt_set = create_prompt_set(ds_train, NUM_PROMPT_EXAMPLES)
print(prompt_set)

Question: Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?
Answer Explanation: If Q complete x kilometers, then P completes 1.15x kilometers.\nx + 1.15x = 43\n2.15x=43\nx = 43/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is E.
Answer: 23
###
Question: In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?
Answer Explanation: Line k passes through the origin and has slope 1/5 means that its equation is y=1/5*x.\nThus: (x, 1)=(5, 1) and (5, y) = (5,1) -->x=5 and y=1\nAnswer: C
Answer: 5 and 1
###
Question: For all numbers p and q, the operation @ is defined by p@q = p^2 - pq. If xy ≠ 0, then which of the following can be equal to zero?
I. x@y
II. (xy)@y
III. x@(x + y)
Answer Expl

### Initialize lists to hold correct, incorrect, and unanswered pairs

In [812]:
# Result buckets filled by the evaluation loop: one independent list per outcome.
# NOTE: `unanswered_paris` is a misspelling of "pairs"; the name is kept because
# later cells reference it.
correct_pairs, incorrect_pairs, unanswered_paris = [], [], []

### Iterate over each example in the subset

In [813]:
def extract_answer_text(rationale):
    """
    Extracts the answer text from the rationale using regex.

    The first 'Answer:' label (case-insensitive) is located and the run of
    non-newline text that follows it is returned with surrounding whitespace
    removed.

    Args:
        rationale (str or None): Generated rationale string.

    Returns:
        str or None: Extracted answer text if found, else None. None is also
        returned when `rationale` is not a string or when nothing but
        whitespace follows the label.
    """
    if not isinstance(rationale, str):
        # The generator is opaque from here; guard in case it hands back None
        # instead of letting re.search raise a TypeError.
        return None

    # Attempt to extract the answer text after 'Answer:'
    match = re.search(r'Answer:\s*(.+)', rationale, re.IGNORECASE)
    if match is None:
        return None

    # Trim stray spaces / carriage returns so the downstream comparison sees
    # only the answer itself; an all-whitespace capture counts as "no answer".
    answer = match.group(1).strip()
    return answer or None

In [814]:
# Ollama model used as the judge that compares the extracted and reference answers.
JUDGE_MODEL = "llama3.1:8b"

for idx, example in enumerate(dataset_D_subset):
    question = example['question']
    # Map 'correct' label to answer text
    raw_options = example['options']
    cleaned_options = clean_options(raw_options)
    correct_label = example['correct'].strip().upper()
    correct_answer_text = get_correct_answer_text(cleaned_options, correct_label)

    print(f"correct_answer_text: {correct_answer_text}")
    print(f"clean options: {cleaned_options}")

    if correct_answer_text is None:
        print(f"Skipping example {idx} due to missing correct answer.")
        continue  # Skip this example

    # Generate rationale and answer
    generated_rationale = generate_rationale_and_answer(question, prompt_set)

    # Extract the answer text from the rationale
    extracted_answer = extract_answer_text(generated_rationale)
    print(f"extracted answer: {extracted_answer}")

    if extracted_answer is None:
        # Nothing to compare: do not spend a judge call on the literal text "None".
        # The record is filed as incorrect with the "No Answer Extracted" placeholder.
        decision = "incorrect"
    else:
        response = ollama.chat(model=JUDGE_MODEL, messages=[
            {
                'role': 'user',
                'content':
                f"""
                    Your task is to compare two numerical answers and determine if they are the same answer, ignoring differences in units or formatting.\n\n
                    Comparison Rules:\n\n
                    - If the answers are the same, for example, First Answer: '90' and Second Answer: '90 miles' or ( km, %, sec, ml, etc) this is a match and return 'correct' in your response.\n
                    - If the answers are different, consider them NOT a match and return 'incorrect' in your response.\n\n
                    Ignore differences in formatting, such as trailing zeros.\n\n

                    Compare the Following Answers:\n
                    First answer: {extracted_answer}\n
                    Second answer: {correct_answer_text}\n\n
                    Respond with:\n
                    "correct" if the two answers are the same\n
                    "incorrect" if the two answers are not the same\n\n
                    Please respond with only one of the above options, without any explanations.
                """
            },
        ])
        decision = response['message']['content'].strip()
    print(decision)

    # Tolerate cosmetic noise around the one-word verdict (quotes, a trailing
    # period, markdown emphasis). The comparison below stays an exact match, so
    # "incorrect" can never be mistaken for "correct".
    verdict = decision.strip(" \t\r\n\"'`*.!").lower()

    # Build the record once. 'options' and 'correct_answer' are stored because
    # the rationalization cell reads pair['options'] and pair['correct_answer'].
    # NOTE(review): rationalize() is opaque from here -- confirm it expects the
    # raw dataset options and the answer text (rather than the letter label).
    record = {
        'question': question,
        'rationale': generated_rationale,
        'answer': extracted_answer,
        'options': raw_options,
        'correct_answer': correct_answer_text,
    }

    # Categorize based on the judge's verdict
    if verdict == "correct":
        correct_pairs.append(record)
        print('Correct:', record)
    else:
        record['answer'] = extracted_answer if extracted_answer else "No Answer Extracted"
        if verdict == "incorrect":
            incorrect_pairs.append(record)
            print("Incorrect:", record)
        else:
            # The judge replied with something other than the two allowed words.
            unanswered_paris.append(record)
            print("Unanswered:", record)

    # Print progress every example
    print(f"Processed {idx + 1} questions.\n")

correct_answer_text: 20
clean options: ['A) 20', 'B) 30', 'C) 40', 'D) 50', 'E) 60']
extracted answer: 20
correct
Correct: {'question': 'Working at a constant rate, P can finish a job in 3 hours. Q, also working at a constant rate, can finish the same job in 9 hours. If they work together for 2 hours, how many more minutes will it take P to finish the job, working alone at his constant rate?', 'rationale': "To solve this problem, we need to first find the individual rates of P and Q.\n\nLet's calculate their rates:\n\nRate of P = 1 job / 3 hours = 1/3 jobs per hour\nRate of Q = 1 job / 9 hours = 1/9 jobs per hour\n\nNow, let's calculate how much work they can do together in 2 hours:\n\nCombined rate = Rate of P + Rate of Q = 1/3 + 1/9\n= (3+1)/9\n= 4/9 jobs per hour\n\nWork done by both in 2 hours = Combined rate × Time = 4/9 × 2 = 8/9\n\nThis means they completed 8/9 of the job together.\n\nNow, let's find out how much work is left for P to finish alone:\n\nWork left for P = Total wor

In [818]:
# Hand-written record added to the accepted pairs (a literal, not produced by the loop).
# Re-running this cell adds the same record again.
correct_pairs += [{
    'question': 'This topic is locked. If you want to discuss this question please re-post it in the respective forum.\nMatt and Peter can do together a piece of work in 20 days. After they have worked together for 12 days Matt stops and Peter completes the remaining work in 12 days. In how many days Peter complete the work separately.',
    'rationale': "Let's solve this problem step by step:\n\nStep 1: Calculate the rate at which Matt and Peter can do the work together.\nSince they can complete the work together in 20 days, their combined rate is 1/20 of the work per day.\n\nStep 2: Calculate the portion of the work completed by Matt and Peter in 12 days.\nIn 12 days, they can complete (12/20) = 3/5 of the work.\n\nStep 3: Calculate the remaining portion of the work to be completed.\nSince they have completed 3/5 of the work, the remaining 2/5 is left for Peter to complete alone.\n\nStep 4: Determine the rate at which Peter can do the work alone.\nPeter completes (2/5) of the work in 12 days. Therefore, his rate is (2/5)/12 = 1/30 of the work per day.\n\nStep 5: Calculate the time it takes for Peter to complete the entire work.\nSince Peter's rate is 1/30 of the work per day, he can complete the entire work in 30 days.\n\nAnswer: 30",
    'answer': '30',
}]

In [819]:
correct_pairs

[{'question': 'Working at a constant rate, P can finish a job in 3 hours. Q, also working at a constant rate, can finish the same job in 9 hours. If they work together for 2 hours, how many more minutes will it take P to finish the job, working alone at his constant rate?',
  'rationale': "To solve this problem, we need to first find the individual rates of P and Q.\n\nLet's calculate their rates:\n\nRate of P = 1 job / 3 hours = 1/3 jobs per hour\nRate of Q = 1 job / 9 hours = 1/9 jobs per hour\n\nNow, let's calculate how much work they can do together in 2 hours:\n\nCombined rate = Rate of P + Rate of Q = 1/3 + 1/9\n= (3+1)/9\n= 4/9 jobs per hour\n\nWork done by both in 2 hours = Combined rate × Time = 4/9 × 2 = 8/9\n\nThis means they completed 8/9 of the job together.\n\nNow, let's find out how much work is left for P to finish alone:\n\nWork left for P = Total work - Work done by both\n= 1 - 8/9\n= 1/9\n\nSince P can finish 1 job in 3 hours, he can complete 1/9 of the job in (1/9) 

In [817]:
incorrect_pairs

[{'question': 'In a camp,there is a meal for 120 men or 200 children.If 120 children have taken the meal,how many men will be catered to with the remaining meal ?',
  'rationale': "To find out how many men can be catered to with the remaining meal, we first need to determine the total number of meals that were initially available.\n\nSince there is a meal for 120 men or 200 children, we can say that the initial number of meals is a multiple of both 120 and 200. Let's find the least common multiple (LCM) of 120 and 200.\n\nThe prime factorization of 120 is 2^3 * 3 * 5 and of 200 is 2^3 * 5 * 5. The LCM is 2^3 * 3 * 5 * 5 = 600.\n\nSo, the total number of meals initially available is 600.\n\nIf 120 children have taken the meal, then the remaining number of meals can be found by subtracting the number of meals consumed from the initial number of meals:\n\nRemaining meals = Initial meals - Meals consumed\n= 600 - (200 * 120/600)\n= 600 - 2400/6\n= 600 - 400\n= 200\n\nSince there is a meal 

In [764]:
unanswered_paris

[]

In [765]:
# Summarize the three outcome buckets filled by the evaluation loop.
total = len(correct_pairs) + len(incorrect_pairs) + len(unanswered_paris)
# Guard the empty case (e.g. every example was skipped) instead of dividing by zero.
accuracy = len(correct_pairs) / total * 100 if total else 0.0
print(f"Total questions processed: {total}")
print(f"Correct answers: {len(correct_pairs)}")
print(f"Incorrect answers: {len(incorrect_pairs)}")
print(f"Unanswered answers: {len(unanswered_paris)}")
print(f"Accuracy: {accuracy:.2f}%")

Total questions processed: 10
Correct answers: 2
Incorrect answers: 8
Unanswered answers: 0
Accuracy: 20.00%


In [26]:
new_correct_pairs = []

# Process the incorrect answers
for pair in incorrect_pairs:
    question = pair['question']
    options = pair.get('options')
    correct_answer = pair.get('correct_answer')

    # Records that lack the answer key cannot be rationalized; report and
    # move on rather than aborting the whole pass with a KeyError.
    if options is None or correct_answer is None:
        print(f"Warning: Missing options or correct answer in pair: {pair}. Skipping.")
        continue

    # Generate the rationale with the correct answer as a hint
    generated_rationale = rationalize(question, options, correct_answer, prompt_set)

    # Collect the rationalized example in new_correct_pairs
    rationalized_pair = {
        'question': question,
        'options': options,
        'rationale': generated_rationale,
        'answer': correct_answer
    }
    new_correct_pairs.append(rationalized_pair)

    print(rationalized_pair)

{'question': 'Rs. 825 becomes Rs. 956 in 3 years at a certain rate of simple interest.If the rate of interest is increased by 4% ,What amount will Rs. 825 become in 3 years ?', 'options': {'A': 'Rs. 1020.80', 'B': 'Rs. 1025', 'C': 'Rs. 1055', 'D': 'Data inadequate', 'E': 'None of these'}, 'rationale': "Let's break down the problem step by step.\n\nGiven:\n\n1. Principal (P) = Rs. 825\n2. Amount after 3 years (A) = Rs. 956\n\nWe need to find the rate of interest (R) at which P becomes A in 3 years. We'll use the formula for simple interest:\n\nSimple Interest (SI) = (P × R × T)/100\n\nwhere T is the time period, which is 3 years here.\n\nFirst, let's calculate the SI for the given scenario:\n\nA = P + SI\n956 = 825 + (SI)\nSI = 131\n\nNow, we can use the formula to find the rate of interest (R):\n\nSI = (P × R × T)/100\n131 = (825 × R × 3)/100\n131 = 24.75R\nR = 131/24.75 ≈ 5.29%\n\nThis is the original rate of interest.\n\nNow, let's consider what happens when the rate of interest is i

### Converting our correct pairs into a structure the Llama 3.1 model will understand for fine-tuning

In [23]:
import json
import re

def extract_question(human_message):
    """
    Pull the question text out of a human message.

    A case-insensitive 'Question:' label is searched for; the text captured
    after it stops at the end of that line because the pattern does not span
    newlines. Without a label the whole message is used.

    Args:
        human_message (str): The message content from the human.

    Returns:
        str: The captured question, or the original message when no label is
        present; whitespace-trimmed in both cases.
    """
    found = re.search(r'Question:\s*(.+)', human_message, re.IGNORECASE)
    candidate = found.group(1) if found else human_message
    return candidate.strip()

def extract_rationale(gpt_message):
    """
    Reduce a GPT message to its rationale text.

    A leading 'A: ...' line is dropped first. If a 'Rationale:' label is
    present, the text between it and the next '\\nAnswer:' (or the end of the
    message) is returned. Otherwise everything from the first 'Answer:' label
    onwards is cut off and the remainder is returned.

    Args:
        gpt_message (str): The message content from GPT.

    Returns:
        str: The whitespace-trimmed rationale; this is empty only when nothing
        is left after the labels are removed.
    """
    # Drop a leading "A: ..." line if the message starts with one.
    body = re.sub(r'^A:\s*.+\n', '', gpt_message, flags=re.IGNORECASE)

    # Preferred path: an explicit "Rationale:" section.
    labelled = re.search(
        r'Rationale:\s*(.*?)(?:\nAnswer:.*|$)',
        body,
        re.IGNORECASE | re.DOTALL,
    )
    if labelled is not None:
        return labelled.group(1).strip()

    # Fallback: no "Rationale:" label, so keep the text that precedes "Answer:".
    without_answer = re.sub(
        r'\n*Answer:\s*.+$', '', body, flags=re.IGNORECASE | re.DOTALL
    )
    return without_answer.strip()

def convert_existing_jsonl(input_file, output_file):
    """
    Rewrite a message-list JSONL file as one {"text": "Q: ...\\nA: ..."} object per line.

    Every input line is parsed as a JSON array of message dicts. The last
    message whose "from" is "human" supplies the question and the last one
    whose "from" is "gpt" supplies the rationale. Lines that fail to parse,
    hold fewer than two messages, or yield an empty question/rationale are
    reported on stdout and skipped.

    Args:
        input_file (str): Path to the existing JSONL file.
        output_file (str): Path to the new JSONL file to be created.

    Returns:
        None
    """
    with open(input_file, 'r', encoding='utf-8') as src, \
         open(output_file, 'w', encoding='utf-8') as dst:

        for line_num, raw_line in enumerate(src, 1):
            # Each line is expected to be a JSON array of messages
            try:
                messages = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_num}: {e}. Skipping.")
                continue

            # A usable record needs at least a human and a gpt message
            if len(messages) < 2:
                print(f"Warning: Line {line_num} has less than 2 messages. Skipping.")
                continue

            # Keep the most recent text seen for each of the two roles
            latest = {"human": "", "gpt": ""}
            for msg in messages:
                sender = msg.get("from")
                if sender in ("human", "gpt"):
                    latest[sender] = msg.get("value", "").strip()

            if not (latest["human"] and latest["gpt"]):
                print(f"Warning: Line {line_num} is missing human or gpt message. Skipping.")
                continue

            question = extract_question(latest["human"])
            rationale = extract_rationale(latest["gpt"])

            if not (question and rationale):
                print(f"Warning: Line {line_num} could not extract question or rationale. Skipping.")
                continue

            # Emit the record in the new single-field format
            new_entry = {"text": f"Q: {question}\nA: {rationale}"}
            dst.write(json.dumps(new_entry, ensure_ascii=False) + '\n')

    print(f"Conversion complete. New file saved as '{output_file}'.")

# One-off conversion of the existing message-list file into the Q/A text format.
if __name__ == "__main__":
    input_jsonl = 'finetuning_data.jsonl'        # Existing file
    output_jsonl = 'finetuning_data_new.jsonl'   # New file (opened for writing, so it is overwritten)

    # Perform the conversion
    convert_existing_jsonl(input_jsonl, output_jsonl)

Error decoding JSON on line 18: Extra data: line 1 column 1471 (char 1470). Skipping.
Error decoding JSON on line 84: Extra data: line 1 column 936 (char 935). Skipping.
Conversion complete. New file saved as 'finetuning_data_new.jsonl'.


### Appending New Data

In [520]:
import json
import re
import os

def convert_correct_pairs_to_conversations(correct_pairs):
    """
    Converts a list of correct_pairs to the new conversation format.

    For each pair the rationale is stripped of a final 'Answer: ...' line and
    of a leading 'Rationale:' label. Pairs whose question or rationale is
    missing, not a string, or empty (before or after that cleaning) are
    reported on stdout and skipped.

    Args:
        correct_pairs (list): List of dictionaries with keys 'question', 'rationale', and 'answer'.
            Extra keys are ignored.

    Returns:
        list: List of dictionaries in the format {"text": "Q: ...\\nA: ..."}
    """
    new_conversations = []
    for pair in correct_pairs:
        question = pair.get('question')
        rationale = pair.get('rationale')
        # A key that is present but holds None (or any non-string) counts as missing.
        question = question.strip() if isinstance(question, str) else ''
        rationale = rationale.strip() if isinstance(rationale, str) else ''

        if not question or not rationale:
            print(f"Warning: Missing question or rationale in pair: {pair}. Skipping.")
            continue

        # Remove a trailing "Answer: ..." line from the rationale.
        # Without DOTALL/MULTILINE the pattern only matches when that line is the last one.
        rationale_cleaned = re.sub(r'\n*Answer:\s*.*$', '', rationale, flags=re.IGNORECASE).strip()

        # Additionally, remove a leading "Rationale:" label if present
        rationale_cleaned = re.sub(r'^Rationale:\s*', '', rationale_cleaned, flags=re.IGNORECASE).strip()

        if not rationale_cleaned:
            # The rationale consisted only of labels; an example with an empty
            # answer would be useless as training data.
            print(f"Warning: Rationale is empty after cleaning in pair: {pair}. Skipping.")
            continue

        # Create the new conversation format
        new_conversations.append({
            "text": f"Q: {question}\nA: {rationale_cleaned}"
        })
    return new_conversations

def append_conversations_to_jsonl(conversations, file_path='finetuning_data.jsonl'):
    """
    Appends a list of conversations to a JSONL file in the desired format.

    Each conversation is a dict with 'text': 'Q: ...\\nA: ...'. Before writing,
    a trailing 'Answer: ...' line and a leading 'Answer:' label are removed
    from the 'A: ...' section. Conversations that do not follow the format are
    reported on stdout and skipped; the final message reports how many were
    actually written.

    Args:
        conversations (list): List of conversations to append, each as {"text": "Q: ...\\nA: ..."}.
        file_path (str): Path to the JSONL file (created if it does not exist).

    Returns:
        None
    """
    # If the existing file does not end with a newline, one must be written
    # first so the new records do not get glued onto its last line.
    newline_needed = False
    if os.path.exists(file_path):
        with open(file_path, 'rb') as f:
            try:
                f.seek(-1, os.SEEK_END)
                newline_needed = f.read(1) != b'\n'
            except OSError:
                # File is empty
                newline_needed = False

    written = 0
    with open(file_path, 'a', encoding='utf-8') as f:
        if newline_needed:
            f.write('\n')

        for convo in conversations:
            # `or ''` also covers a 'text' key that is present but None.
            text = (convo.get('text') or '').strip()

            # Ensure that the text starts with "Q: " and contains "A: "
            if not text.startswith("Q: ") or "\nA: " not in text:
                print(f"Warning: Conversation does not follow the 'Q: ...\nA: ...' format. Skipping: {convo}")
                continue

            # The separator is known to be present, so this always yields two parts.
            q_part, a_part = text.split("\nA: ", 1)

            # Clean the Answer part by removing any "Answer:" labels if present
            # Although it should already be cleaned, this is an extra safeguard
            a_part_cleaned = re.sub(r'\n*Answer:\s*.*$', '', a_part, flags=re.IGNORECASE).strip()
            a_part_cleaned = re.sub(r'^Answer:\s*', '', a_part_cleaned, flags=re.IGNORECASE).strip()

            # Reconstruct the cleaned conversation text
            processed_convo = {"text": f"{q_part}\nA: {a_part_cleaned}"}

            # Serialize and write to the file
            f.write(json.dumps(processed_convo, ensure_ascii=False) + '\n')
            written += 1

    # Report what was written, not what was passed in (skipped entries excluded).
    print(f"Successfully appended {written} conversations to '{file_path}'.")

In [820]:
# Convert the accepted pairs into {"text": "Q: ...\nA: ..."} records.
new_conversations = convert_correct_pairs_to_conversations(correct_pairs)

# Append to the new JSONL file.
# NOTE: the file is opened in append mode, so re-running this cell writes the same records again.
append_conversations_to_jsonl(new_conversations, 'finetuning_data_new.jsonl')

Successfully appended 8 conversations to 'finetuning_data_new.jsonl'.


### Data split

In [821]:
import json
import random

def split_jsonl_file(input_file, output_train, output_valid, output_test, seed=None):
    """
    Shuffle a JSONL file and split it 90% / 5% / 5% into train, valid and test files.

    Args:
        input_file (str): Path to the JSONL file to split. Blank lines are ignored.
        output_train (str): Destination for the first 90% of the shuffled records.
        output_valid (str): Destination for the next 5%.
        output_test (str): Destination for the remaining records.
        seed (int or None): When given, the shuffle uses a private
            random.Random(seed), so the split is reproducible. When None
            (default), the module-level random state is used as before.

    Returns:
        None
    """
    # Read as UTF-8 explicitly: the files in this notebook are written as
    # UTF-8 with ensure_ascii=False, so the platform default must not be used.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Shuffle the data to ensure random distribution
    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)

    # Calculate split indices for 90% train, 5% valid, 5% test
    train_split = int(0.90 * len(data))
    valid_split = int(0.95 * len(data))

    partitions = (
        (output_train, data[:train_split]),
        (output_valid, data[train_split:valid_split]),
        (output_test, data[valid_split:]),
    )

    # Write each partition to its own JSONL file
    for path, items in partitions:
        with open(path, 'w', encoding='utf-8') as f:
            for item in items:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

# Split the converted fine-tuning file into train/valid/test JSONL files (relative paths, overwritten on each run).
split_jsonl_file('finetuning_data_new.jsonl', 'train.jsonl', 'valid.jsonl', 'test.jsonl')