In [1]:
import re
import ollama
from src.data_loading import load_data
from src.parsing import parse_options
from src.rationale_generation import generate_rationale_and_answer
from src.utils import is_rationale_correct
from src.prompt_generation import create_prompt_examples, extract_answer_text, clean_options, get_correct_answer_text

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Size of the few-shot budget: passed to create_prompt_examples, and the same number
# of leading rows is sliced off the dataset before the evaluation subset is drawn.
NUM_PROMPT_EXAMPLES = 5 # number of prompt examples
# Number of shuffled evaluation rows that the main loop sends through the model.
NUM_EXAMPLES_TO_PROCESS = 10 # number of examples to process

### Load Dataset

In [12]:
# Fixed seed so that Restart Kernel -> Run All evaluates the same questions every time.
SHUFFLE_SEED = 42

ds_train = load_data()

# Step 1: Drop the first NUM_PROMPT_EXAMPLES rows from the evaluation pool
# (presumably the rows create_prompt_examples draws from -- TODO confirm).
dataset_D = ds_train.select(range(NUM_PROMPT_EXAMPLES, len(ds_train)))

# Step 2: Shuffle the remaining rows reproducibly
dataset_D_shuffled = dataset_D.shuffle(seed=SHUFFLE_SEED)

# Step 3: Select a subset of examples to process, never requesting more rows than exist
num_to_process = min(NUM_EXAMPLES_TO_PROCESS, len(dataset_D_shuffled))
dataset_D_subset = dataset_D_shuffled.select(range(num_to_process))

print(dataset_D_subset)

Dataset({
    features: ['question', 'options', 'rationale', 'correct'],
    num_rows: 10
})


### Create prompt examples

In [13]:
# Build NUM_PROMPT_EXAMPLES few-shot records from the training split; the rendered
# output shows each record carrying 'question', 'rationale' and 'answer' keys.
prompt_examples = create_prompt_examples(ds_train, NUM_PROMPT_EXAMPLES)

# Bare expression so the notebook renders the list as the cell output.
prompt_examples

[{'question': "Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?",
  'rationale': 'If Q complete x kilometers, then P completes 1.15x kilometers.\nx + 1.15x = 43\n2.15x=43\nx = 43/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is E.',
  'answer': 'E'},
 {'question': 'In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?',
  'rationale': 'Line k passes through the origin and has slope 1/5 means that its equation is y=1/5*x.\nThus: (x, 1)=(5, 1) and (5, y) = (5,1) -->x=5 and y=1\nAnswer: C',
  'answer': 'C'},
 {'question': 'For all numbers p and q, the operation @ is defined by p@q = p^2 - pq. If xy ≠ 0, then which of the following can be equal to zero?\nI. x@y\nII. (xy)@y\nIII. x@

### Create Prompt Sets

In [15]:
# prompt_set = create_prompt_set(ds_train, NUM_PROMPT_EXAMPLES)
# print(prompt_set)

# Hand-written few-shot prompt passed to generate_rationale_and_answer for every
# question. Each example is "Question / Answer Explanation / Answer" terminated by a
# "###" line, and the answers are given as values rather than option letters.
# NOTE(review): in the first example 43 / 2.15 is labelled "t ... hours", but solving
# 2.15x * t = 43 yields x * t (Friend Q's distance), not t -- consider rewording.
# NOTE(review): the loan example states r = 0.16 with n = 4 yet evaluates (1 + 0.04)^4;
# the numbers agree, the notation is just mixed.
prompt_set = """Question: Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?
Answer Explanation: Let Friend Q's speed be x km/h. Then Friend P's speed is 1.15x km/h. Since they start at the same time and meet after t hours, the total distance covered is:
x * t + 1.15x * t = 43
2.15x * t = 43
t = 43 / 2.15 ≈ 20 hours
Friend P walked 1.15x * t = 1.15 * 20 = 23 km.
Answer: 23
###
Question: In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?
Answer Explanation: The equation of line k is y = (1/5)x. Substituting (x, 1):
1 = (1/5)x → x = 5
Substituting (5, y):
y = (1/5)*5 → y = 1
Answer: 5 and 1
###
Question: For all numbers p and q, the operation @ is defined by p@q = p^2 - pq. If xy ≠ 0, then which of the following can be equal to zero?
I. x@y
II. (xy)@y
III. x@(x + y)
Answer Explanation: 
p@q = p^2 - pq = p(p - q). 
I. x@y = x(x - y) → zero if x = y
II. (xy)@y = (xy)^2 - (xy)y = x^2y^2 - xy^2 = xy^2(x - 1) → zero if x = 1
III. x@(x + y) = x^2 - x(x + y) = x^2 - x^2 - xy = -xy ≠ 0 since xy ≠ 0
Thus, only I and II can be zero.
Answer: I and II
###
Question: Carl is facing very difficult financial times and can only pay the interest on a $10,000 loan he has taken. The bank charges him a quarterly compound rate of 4%. What is the approximate interest he pays annually?
Answer Explanation: The quarterly interest rate is 4%. The annual compound interest can be calculated using the formula:
A = P(1 + r/n)^(nt) - P
Where P = $10,000, r = 0.16 (16% annual rate), n = 4 (quarterly), t = 1 year.
A = 10000*(1 + 0.04)^4 - 10000 ≈ 10000*(1.16985856) - 10000 ≈ $1,698.59
Rounded to the nearest dollar: $1,700
Answer: $1,700
###
Question: The speed at which a man can row a boat in still water is 25 kmph. If he rows downstream, where the speed of current is 11 kmph, what time will he take to cover 80 metres?
Answer Explanation: Downstream speed = 25 + 11 = 36 kmph = 36 * (1000/3600) = 10 m/s. Time = Distance / Speed = 80 / 10 = 8 seconds.
Answer: 8 seconds
###
"""

### Initialize lists to hold correct and incorrect pairs

In [16]:
# Result buckets filled by the evaluation loop; each entry is a dict describing one question.
correct_pairs = []
incorrect_pairs = []
# NOTE: the name is misspelled ("paris"), but other cells in this notebook reference
# it under exactly this spelling, so it must stay as-is unless they are all renamed together.
unanswered_paris = []

### Iterate over each example in the subset

In [17]:
# Evaluate every example in the subset: generate a rationale, extract its answer and
# let a judge model decide whether it matches the gold answer. Results are filed into
# correct_pairs / incorrect_pairs / unanswered_paris.
for idx, example in enumerate(dataset_D_subset):
    question = example['question']
    print("question:", question)
    # Map 'correct' label to answer text
    raw_options = example['options']
    cleaned_options = clean_options(raw_options)
    correct_label = example['correct'].strip().upper()
    correct_answer_text = get_correct_answer_text(cleaned_options, correct_label)

    print(f"correct_answer_text: {correct_answer_text}")
    print(f"clean options: {cleaned_options}")

    if correct_answer_text is None:
        print(f"Skipping example {idx} due to missing correct answer.")
        continue  # Skip this example

    # Generate rationale and answer
    generated_rationale = generate_rationale_and_answer(question, prompt_set)

    # Extract the answer text from the rationale
    extracted_answer = extract_answer_text(generated_rationale)
    print(f"extracted answer: {extracted_answer}")

    # The prompt text (including its indentation) is part of what the judge model
    # sees, so it is kept exactly as written.
    response = ollama.chat(model="llama3.1:8b", messages=[
            {
                'role': 'user',
                'content':
                f"""
                    Your task is to compare two numerical answers and determine if they are the same answer, ignoring differences in units or formatting.\n\n
                    Comparison Rules:\n\n
                    - If the answers are the same, for example, First Answer: '90' and Second Answer: '90 m' or ( km, %, sec, ml, etc) this is a correct match since numerically the same and return 'correct' in your response.\n
                    - If the answers are different, consider them NOT a match and return 'incorrect' in your response.\n\n
                    Ignore differences in formatting, such as trailing zeros.\n\n

                    Compare the Following Answers:\n
                    First answer: {extracted_answer}\n
                    Second answer: {correct_answer_text}\n\n
                    Respond with:\n
                    "correct" if the two answers are the same\n
                    "incorrect" if the two answers are not the same\n\n
                    Please respond with only one of the above options, without any explanations.
                """
            },
     ])

    # Normalise the verdict: the prompt shows the options in double quotes, so the
    # model may echo the quotes or add a full stop. Without stripping them, such
    # replies would be misfiled as "unanswered".
    decision = response['message']['content'].strip().strip('"\'.').lower()

    # Categorize based on the judge's verdict; each record is built once and reused
    # for both storage and logging.
    if decision == "correct":
        record = {
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer
        }
        correct_pairs.append(record)
        print('Correct:', record)
    elif decision == "incorrect":
        record = {
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer if extracted_answer else "No Answer Extracted",
            'correct_answer': correct_answer_text
        }
        incorrect_pairs.append(record)
        print("Incorrect:", record)
    else:
        # The judge returned something other than the two allowed words.
        record = {
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer if extracted_answer else "No Answer Extracted"
        }
        unanswered_paris.append(record)
        print("Unanswered:", record)

    # Print progress every example
    print(f"Processed {idx + 1} questions.\n")

question: What is the value of 18^2 + 19^2 + 20^2 + 21^2 + 22^2 ?
correct_answer_text: 2010
clean options: ['A) 2010', 'B) 2012', 'C) 2016', 'D) 2028', 'E) 2058']
extracted answer: 2010
Correct: {'question': 'What is the value of 18^2 + 19^2 + 20^2 + 21^2 + 22^2 ?', 'rationale': "To find the value of 18^2 + 19^2 + 20^2 + 21^2 + 22^2, let's calculate each term separately and then add them up.\n\nFirst, we'll find the squares of the given numbers:\n\n18^2 = 324\n19^2 = 361\n20^2 = 400\n21^2 = 441\n22^2 = 484\n\nNow, let's add these values together:\n\n324 + 361 = 685\n685 + 400 = 1085\n1085 + 441 = 1526\n1526 + 484 = 2010\n\nSo, the value of 18^2 + 19^2 + 20^2 + 21^2 + 22^2 is 2010.\n\nAnswer: 2010", 'answer': '2010'}
Processed 1 questions.

question: The speed of a boat in still water is 60kmph and the speed of the current is 5kmph. Find the speed downstream and upstream?
correct_answer_text: 65,55 kmph
clean options: ['A) 80,70 kmph', 'B) 80,40 kmph', 'C) 10,40 kmph', 'D) 65,55 kmph', 

In [18]:
# Summarise the evaluation run from the three result buckets.
total = len(correct_pairs) + len(incorrect_pairs) + len(unanswered_paris)
# Every example can be skipped (no usable gold answer), which leaves nothing to score;
# report 0% in that case instead of raising ZeroDivisionError.
accuracy = len(correct_pairs) / total * 100 if total else 0.0
print(f"Total questions processed: {total}")
print(f"Correct answers: {len(correct_pairs)}")
print(f"Incorrect answers: {len(incorrect_pairs)}")
print(f"Unanswered answers: {len(unanswered_paris)}")
print(f"Accuracy: {accuracy:.2f}%")

Total questions processed: 10
Correct answers: 4
Incorrect answers: 6
Unanswered answers: 0
Accuracy: 40.00%


### Appending New Correct Pair Data

In [19]:
from src.data_appending import convert_correct_pairs_to_conversations, append_conversations_to_jsonl

# Convert the judged-correct records into conversation objects for fine-tuning.
new_conversations = convert_correct_pairs_to_conversations(correct_pairs)
    
# Append to the new JSONL file
# (the path is relative, so it resolves against the kernel's working directory)
append_conversations_to_jsonl(new_conversations, './data/finetuning_data_new.jsonl')

Successfully appended 4 conversations to './data/finetuning_data_new.jsonl'.


### Appending Incorrect Pair Data

In [20]:
from src.data_appending import push_incorrect_pairs

# Persist the judged-incorrect records; the recorded run reports them being added to
# 'data/incorrect_pairs.json'.
push_incorrect_pairs(incorrect_pairs)

Successfully added 6 incorrect pair(s) to 'data/incorrect_pairs.json'.


### Rationalization on Incorrect Pairs (After Fine-Tuning)

In [None]:
import ollama
from src.rationalization import rationalize

# Upper bound on hinted generations per question. A pair is only removed from
# incorrect_pairs once it is judged correct, so without a bound a question the model
# keeps missing would be retried forever.
MAX_RATIONALIZATION_ATTEMPTS = 3


def _judge_rationalized_answer(correct_answer, extracted_answer):
    """Ask the judge model whether the extracted answer matches the gold answer.

    Args:
        correct_answer: Gold answer text stored with the incorrect pair.
        extracted_answer: Answer text pulled from the newly generated rationale.

    Returns:
        str: The judge's reply, lower-cased and with surrounding whitespace, quotes
        and full stops removed. It is "correct" or "incorrect" when the model
        follows the instructions, and arbitrary text otherwise.
    """
    # The prompt text (including its indentation) is sent to the model verbatim,
    # so it is kept exactly as written.
    eval_response = ollama.chat(model="llama3.2:3b", messages=[
            {
                'role': 'user',
                'content':
                f"""
                    Your task is to compare two numerical answers and determine if they are the same answer, ignoring differences in units or formatting.\n\n
                    Comparison Rules:\n\n
                    - If the answers are the same, for example, First Answer: '90' and Second Answer: '90 miles' or ( km, %, sec, ml, etc) this is a match and return 'correct' in your response.\n
                    - If the answers are different, consider them NOT a match and return 'incorrect' in your response.\n\n
                    Ignore differences in formatting, such as trailing zeros.\n\n

                    Compare the Following Answers:\n
                    First answer: {correct_answer}\n
                    Second answer: {extracted_answer}\n\n
                    Respond with:\n
                    "correct" if the two answers are the same or the Second answer is a phrase with the First answer in it.\n
                    "incorrect" if the two answers are not the same\n\n
                    Please respond with only one of the above options, without any explanations.
                """
            },
     ])
    return eval_response['message']['content'].strip().strip('"\'.').lower()


# Process the incorrect answers. Pairs that get rationalized are moved to
# correct_pairs; pairs that still fail after MAX_RATIONALIZATION_ATTEMPTS stay in
# incorrect_pairs so the cell can be re-run on them later.
position = 0
while position < len(incorrect_pairs):
    pair = incorrect_pairs[position]
    question = pair['question']
    correct_answer = pair['correct_answer']
    print(f"Question: {question}")
    print(f"Correct answer: {correct_answer}")

    rationalized = False
    for attempt in range(1, MAX_RATIONALIZATION_ATTEMPTS + 1):
        # Generate the rationale with the correct answer as a hint
        generated_rationale = rationalize(question, correct_answer)

        extracted_answer = extract_answer_text(generated_rationale)

        print(f"Model's answer: {extracted_answer}")

        decision = _judge_rationalized_answer(correct_answer, extracted_answer)

        # Built from this pair's own gold answer, not from a variable left over
        # from the evaluation cell.
        record = {
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer if extracted_answer else "No Answer Extracted",
            'correct_answer': correct_answer,
        }

        if decision == "correct":
            # Add the rationalized example to correct_pairs
            correct_pairs.append(record)
            print(record)
            rationalized = True
            break
        if decision == "incorrect":
            print("Incorrect:", record)
        else:
            print("Unanswered:", record)

    if rationalized:
        incorrect_pairs.pop(position)  # Done with this pair; the next one shifts into `position`
    else:
        if decision != "incorrect":
            # The last verdict was unusable; record it once rather than once per retry.
            unanswered_paris.append(record)
        print(f"Giving up on this question after {MAX_RATIONALIZATION_ATTEMPTS} attempt(s).")
        position += 1



### Convert to Llama3.2 chat template

In [198]:
import json
import os

def append_new_conversations(existing_file, new_correct_pairs, output_file=None):
    """
    Appends new conversations to the existing transformed_conversations.json file.

    Each dictionary in ``new_correct_pairs`` becomes one
    ``{'conversations': [user turn, assistant turn]}`` object, where the user turn
    holds the stripped 'question' and the assistant turn the stripped 'rationale'.
    Missing or ``None`` values are stored as empty strings.

    Args:
        existing_file (str): Path to the existing transformed_conversations.json.
        new_correct_pairs (list): List of new correct_pairs dictionaries.
            Entries that are not dictionaries are skipped with a warning.
        output_file (str, optional): Path to save the updated JSON. Defaults to existing_file.

    Returns:
        None. Progress and problems are reported with ``print``.

    Raises:
        ValueError: If the existing file holds valid JSON that is not a list.
    """
    if output_file is None:
        output_file = existing_file

    # Step 1: Load existing conversations
    if os.path.exists(existing_file):
        with open(existing_file, 'r', encoding='utf-8') as f:
            try:
                transformed_conversations = json.load(f)
            except json.JSONDecodeError:
                print("Error: Existing JSON file is not properly formatted.")
                return
        if not isinstance(transformed_conversations, list):
            raise ValueError("Existing JSON file does not contain a list.")
    else:
        # If the file doesn't exist, start with an empty list
        transformed_conversations = []
        print(f"Note: {existing_file} does not exist. A new file will be created.")

    # Step 2: Transform new correct_pairs into conversations
    appended = 0
    for idx, pair in enumerate(new_correct_pairs, start=1):
        # Ensure each pair is a dictionary
        if not isinstance(pair, dict):
            print(f"Warning: Entry {idx} is not a dictionary. Skipping this entry.")
            continue

        # `or ''` also covers keys that are present but set to None.
        question = (pair.get('question') or '').strip()
        rationale = (pair.get('rationale') or '').strip()

        # Construct the conversation object and append it to the list
        transformed_conversations.append({
            'conversations': [
                {
                    'content': question,
                    'role': 'user'
                },
                {
                    'content': rationale,
                    'role': 'assistant'
                }
            ]
        })
        appended += 1

    # Step 3: Save the updated conversations back to the JSON file.
    # Serialise before opening the file: opening with 'w' truncates it, so a failure
    # during serialisation would otherwise destroy the existing conversations.
    try:
        payload = json.dumps(transformed_conversations, ensure_ascii=False, indent=4)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)
        # Report what was actually appended, not the number of entries offered.
        print(f"Successfully appended {appended} new conversations to {output_file}.")
    except (OSError, TypeError, ValueError) as e:
        print(f"An error occurred while saving to {output_file}: {e}")

# Target file for the chat-formatted conversations; the path is relative, so it
# resolves against the kernel's working directory.
existing_json_path = 'transformed_conversations.json'

# correct_pairs is filled by the evaluation loop and extended by the rationalization
# cell, so both kinds of records are written here.
append_new_conversations(existing_json_path, correct_pairs)

Note: transformed_conversations.json does not exist. A new file will be created.
Successfully appended 9 new conversations to transformed_conversations.json.


# Fine-Tuning Data Structure

### Converting our corrected pairs into a structure the Llama3.1 model will understand for fine-tuning

In [None]:
import json
import re

def extract_question(human_message):
    """
    Pull the question text out of a human message.

    Looks for a case-insensitive ``Question:`` label and takes what follows it.
    Because ``.`` does not match a newline here, only the remainder of that one
    line is captured.

    Args:
        human_message (str): The message content from the human.

    Returns:
        str: The stripped text after the label, or the whole stripped message
        when no label is present.
    """
    found = re.search(r'Question:\s*(.+)', human_message, re.IGNORECASE)
    picked = found.group(1) if found else human_message
    return picked.strip()

def extract_rationale(gpt_message):
    """
    Reduce a GPT message to its rationale text.

    A leading ``A: ...`` line is dropped first. If a ``Rationale:`` label is
    present, the text between it and the next ``Answer:`` line (or the end of the
    message) is returned. Otherwise everything from the first ``Answer:`` onwards
    is cut off and the remainder is returned. Matching is case-insensitive.

    Args:
        gpt_message (str): The message content from GPT.

    Returns:
        str: The stripped rationale text; when no ``Rationale:`` label exists this
        is the stripped message without its ``Answer: ...`` tail.
    """
    # Only an "A:" line at the very start of the message is removed (no MULTILINE).
    body = re.sub(r'^A:\s*.+\n', '', gpt_message, flags=re.IGNORECASE)

    labelled = re.search(
        r'Rationale:\s*(.*?)(?:\nAnswer:.*|$)',
        body,
        re.IGNORECASE | re.DOTALL,
    )
    if labelled is None:
        # No label: fall back to trimming the trailing answer section.
        trimmed = re.sub(r'\n*Answer:\s*.+$', '', body, flags=re.IGNORECASE | re.DOTALL)
        return trimmed.strip()

    return labelled.group(1).strip()

def convert_existing_jsonl(input_file, output_file):
    """
    Converts the existing finetuning_data.jsonl file to the new Q&A format.

    Every input line is expected to be a JSON array of message objects with
    "from" ("human" / "gpt") and "value" keys. For each usable line one JSON
    object ``{"text": "Q: <question>\\nA: <rationale>"}`` is written to the
    output file. Lines that are malformed, have the wrong shape, or yield no
    question/rationale are reported with ``print`` and skipped; blank lines are
    skipped silently.

    Args:
        input_file (str): Path to the existing JSONL file.
        output_file (str): Path to the new JSONL file to be created
            (opened in write mode, so existing content is replaced).

    Returns:
        None
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:

        for line_num, line in enumerate(infile, 1):
            # A blank (e.g. trailing) line is not an error worth reporting.
            if not line.strip():
                continue

            try:
                # Each line is expected to be a JSON array of messages
                messages = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_num}: {e}. Skipping.")
                continue

            # A JSON object or scalar would otherwise fail further down with an
            # AttributeError/TypeError instead of being skipped.
            if not isinstance(messages, list):
                print(f"Warning: Line {line_num} is not a JSON array of messages. Skipping.")
                continue

            # Ensure there are at least two messages: human and gpt
            if len(messages) < 2:
                print(f"Warning: Line {line_num} has less than 2 messages. Skipping.")
                continue

            # Extract human and gpt messages (the last one of each kind wins)
            human_message = ""
            gpt_message = ""
            for msg in messages:
                if not isinstance(msg, dict):
                    continue
                value = msg.get("value", "")
                text = value.strip() if isinstance(value, str) else ""
                if msg.get("from") == "human":
                    human_message = text
                elif msg.get("from") == "gpt":
                    gpt_message = text

            if not human_message or not gpt_message:
                print(f"Warning: Line {line_num} is missing human or gpt message. Skipping.")
                continue

            # Extract question and rationale
            question = extract_question(human_message)
            rationale = extract_rationale(gpt_message)

            if not question or not rationale:
                print(f"Warning: Line {line_num} could not extract question or rationale. Skipping.")
                continue

            # Create the new format
            new_entry = {
                "text": f"Q: {question}\nA: {rationale}"
            }

            # Write to the new JSONL file
            outfile.write(json.dumps(new_entry, ensure_ascii=False) + '\n')

    print(f"Conversion complete. New file saved as '{output_file}'.")

# When this cell is executed in a notebook kernel __name__ is "__main__", so the
# guard passes and the conversion runs. The output file is opened in write mode by
# convert_existing_jsonl, so any previous content at that path is replaced.
if __name__ == "__main__":
    input_jsonl = 'finetuning_data.jsonl'        # Existing file
    output_jsonl = 'finetuning_data_new.jsonl'   # New file
        
    # Perform the conversion
    convert_existing_jsonl(input_jsonl, output_jsonl)

### Split the JSONL Data into Train/Valid/Test

In [821]:
import json
import random

def split_jsonl_file(input_file, output_train, output_valid, output_test, seed=None):
    """
    Shuffle a JSONL file and split it 90% / 5% / 5% into train, valid and test files.

    Args:
        input_file (str): Path of the JSONL file to split (read as UTF-8).
            Blank lines are ignored.
        output_train (str): Destination for the first 90% of the shuffled records.
        output_valid (str): Destination for the next 5%.
        output_test (str): Destination for the remaining records.
        seed (int, optional): When given, the shuffle uses its own
            ``random.Random(seed)`` and is reproducible. When ``None`` (default)
            the module-level ``random.shuffle`` is used, as before.

    Returns:
        None

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Load the jsonl file; upstream cells write their JSONL as UTF-8, so do not rely
    # on the platform default encoding here.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Shuffle the data to ensure random distribution
    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)

    # Calculate split indices for 90% train, 5% valid, 5% test
    train_split = int(0.90 * len(data))
    valid_split = int(0.95 * len(data))

    # Pair each destination with its slice, then write them all the same way
    partitions = (
        (output_train, data[:train_split]),
        (output_valid, data[train_split:valid_split]),
        (output_test, data[valid_split:]),
    )
    for path, items in partitions:
        with open(path, 'w', encoding='utf-8') as f:
            for item in items:
                f.write(json.dumps(item) + '\n')

# Example usage: split the Q&A JSONL in the working directory into the three files
# named here (all paths are relative to the kernel's working directory).
split_jsonl_file('finetuning_data_new.jsonl', 'train.jsonl', 'valid.jsonl', 'test.jsonl')

In [12]:
import json
import re

# Define the input and output file paths (relative to the kernel's working directory)
input_jsonl = "finetuning_data_new.jsonl"       # JSONL input; each line is read as a JSON object with a 'text' field
output_json = "formatted_data.json"    # Single JSON file that receives the reformatted entries

# Initialize a list to hold reformatted entries
reformatted_entries = []

# Define the instruction text (stored verbatim in the "instruction" field of every entry)
instruction_text = "Provide a detailed answer to the following question."

# Function to parse Q and A from the text
def parse_q_a(text):
    """
    Split a "Q: ... / A: ..." text into its question and answer parts.

    The question is whatever follows a line-initial ``Q:`` up to the first
    newline that is directly followed by ``A:``. The answer is everything after
    the first line-initial ``A:`` through the end of the text.

    Args:
        text (str): The input text containing Q and A.

    Returns:
        tuple: ``(question, answer)``, both stripped. Each element is ``None``
        on its own when its marker is not found.
    """
    flags = re.DOTALL | re.MULTILINE

    def _captured(pattern):
        # First capture group of the first match, stripped; None without a match.
        hit = re.search(pattern, text, flags)
        return hit.group(1).strip() if hit else None

    question = _captured(r'^Q:\s*(.*?)(?=\nA:)')
    answer = _captured(r'^A:\s*(.*)$')
    return question, answer

# Open and read the input JSONL file, turning every usable line into an
# instruction / input / output record. Unusable lines are reported and skipped.
with open(input_jsonl, 'r', encoding='utf-8') as fin:
    for idx, line in enumerate(fin, 1):  # idx is the 1-based line number used in messages
        try:
            data = json.loads(line)
            text = data.get('text', '').strip()

            if not text:
                print(f"Warning: Empty 'text' field in line {idx}. Skipping.")
                continue

            # Parse the Question and Answer
            question, answer = parse_q_a(text)

            # Either part may be None (marker not found) or an empty string.
            if not question or not answer:
                print(f"Warning: Missing Question or Answer in line {idx}. Skipping.")
                continue

            # Append the reformatted entry
            reformatted_entries.append({
                "instruction": instruction_text,
                "input": question,
                "output": answer
            })

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in line {idx}: {e}. Skipping.")
            continue

# Assemble the entries into a dictionary with 'train' split
# (every entry goes under the single "train" key; no other split is created here)
dataset_dict = {
    "train": reformatted_entries
}

# Save the dictionary as a single JSON file
with open(output_json, 'w', encoding='utf-8') as fout:
    json.dump(dataset_dict, fout, ensure_ascii=False, indent=4)

print(f"Reformatting complete. {len(reformatted_entries)} entries saved to '{output_json}'.")

Reformatting complete. 232 entries saved to 'formatted_data.json'.
