In [83]:
import ollama
from src.rationale_generation import generate_rationale_and_answer
from src.prompt_generation import create_prompt_examples, extract_answer_text, clean_options, get_correct_answer_text

In [84]:
from src.data_loading import load_data

In [85]:
# Ollama model tag of the fine-tuned reasoning model.
# NOTE(review): this name is not referenced by any other visible cell — confirm it is still needed.
llama_ft = "llama-reason-05:latest"

In [86]:
# Sizing knobs for this run.
# The first NUM_PROMPT_EXAMPLES rows of the training split are passed to
# create_prompt_examples and excluded from the evaluation pool below.
NUM_PROMPT_EXAMPLES = 5 # number of prompt examples
# How many shuffled evaluation rows the generation/grading loop walks through.
NUM_EXAMPLES_TO_PROCESS = 5 # number of examples to process

### Load Dataset

In [87]:
ds_train = load_data()

# Fixed seed so that Restart & Run All evaluates the same questions.
# Set to None to draw a fresh random sample on every run.
SHUFFLE_SEED = 42

# Step 1: Drop the leading rows reserved for the few-shot prompt examples
dataset_D = ds_train.select(range(NUM_PROMPT_EXAMPLES, len(ds_train)))

# Step 2: Shuffle the remaining rows reproducibly
dataset_D_shuffled = dataset_D.shuffle(seed=SHUFFLE_SEED)

# Step 3: Select a subset of examples to process, never asking for more rows
# than the dataset holds (select() rejects out-of-range indices)
num_examples_selected = min(NUM_EXAMPLES_TO_PROCESS, len(dataset_D_shuffled))
dataset_D_subset = dataset_D_shuffled.select(range(num_examples_selected))

print(dataset_D_subset)

Dataset({
    features: ['question', 'options', 'rationale', 'correct'],
    num_rows: 5
})


### Create prompt examples

In [88]:
# Build the few-shot examples from the training split.
# NOTE(review): presumably takes the first NUM_PROMPT_EXAMPLES rows, i.e. the ones the
# dataset cell excludes from evaluation — confirm in src.prompt_generation.
prompt_examples = create_prompt_examples(ds_train, NUM_PROMPT_EXAMPLES)

# Bare expression so the notebook renders the examples.
prompt_examples

[{'question': "Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?",
  'rationale': 'If Q complete x kilometers, then P completes 1.15x kilometers.\nx + 1.15x = 43\n2.15x=43\nx = 43/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is E.',
  'answer': 'E'},
 {'question': 'In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?',
  'rationale': 'Line k passes through the origin and has slope 1/5 means that its equation is y=1/5*x.\nThus: (x, 1)=(5, 1) and (5, y) = (5,1) -->x=5 and y=1\nAnswer: C',
  'answer': 'C'},
 {'question': 'For all numbers p and q, the operation @ is defined by p@q = p^2 - pq. If xy ≠ 0, then which of the following can be equal to zero?\nI. x@y\nII. (xy)@y\nIII. x@

### Create Prompt Sets

In [89]:
# prompt_set = create_prompt_set(ds_train, NUM_PROMPT_EXAMPLES)
# print(prompt_set)

# Hand-written few-shot prompt passed to generate_rationale_and_answer.
# Each exemplar follows the "Question / Answer Explanation / Answer" layout.
# The first exemplar solves for the distance x * t (not for t alone), so the
# worked steps are algebraically valid.
prompt_set = """Question: Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?
Answer Explanation: Let Friend Q's speed be x km/h. Then Friend P's speed is 1.15x km/h. Since they start at the same time and meet after t hours, the total distance covered is:
x * t + 1.15x * t = 43
2.15x * t = 43
x * t = 43 / 2.15 = 20 km (the distance Friend Q walked)
Friend P walked 1.15x * t = 1.15 * 20 = 23 km.
Answer: 23 km

Question: In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?
Answer Explanation: The equation of line k is y = (1/5)x. Substituting (x, 1):
1 = (1/5)x → x = 5
Substituting (5, y):
y = (1/5)*5 → y = 1
Answer: 5 and 1
"""
# Disabled exemplars: the lines below are comments and are NOT part of prompt_set.
# Question: Carl is facing very difficult financial times and can only pay the interest on a $10,000 loan he has taken. The bank charges him a quarterly compound rate of 4%. What is the approximate interest he pays annually?
# Answer Explanation: The quarterly interest rate is 4%. The annual compound interest can be calculated using the formula:
# A = P(1 + r/n)^(nt) - P
# Where P = $10,000, r = 0.16 (16% annual rate), n = 4 (quarterly), t = 1 year.
# A = 10000*(1 + 0.04)^4 - 10000 ≈ 10000*(1.16985856) - 10000 ≈ $1,698.59
# Rounded to the nearest dollar: $1,700
# Answer: $1,700

# Question: The speed at which a man can row a boat in still water is 25 kmph. If he rows downstream, where the speed of current is 11 kmph, what time will he take to cover 80 metres?
# Answer Explanation: Downstream speed = 25 + 11 = 36 kmph = 36 * (1000/3600) = 10 m/s. Time = Distance / Speed = 80 / 10 = 8 seconds.
# Answer: 8 seconds

# """

### Initialize lists to hold correct and incorrect pairs

In [90]:
# Accumulators filled by the evaluation loop, one list per judge verdict.
correct_pairs = []
incorrect_pairs = []
# Holds examples where the judge replied with something other than "correct"/"incorrect".
# NOTE(review): "paris" is a typo for "pairs"; the spelling is kept because other cells use this exact name.
unanswered_paris = []

### Iterate over each example in the subset

In [91]:
# Generate a rationale for every sampled question, let a judge model compare
# the extracted answer with the gold answer, and file the result into
# correct_pairs / incorrect_pairs / unanswered_paris.
for idx, example in enumerate(dataset_D_subset):
    question = example['question']
    print("question:", question)

    # Translate the gold label into the text of the matching option.
    raw_options = example['options']
    cleaned_options = clean_options(raw_options)
    correct_label = example['correct'].strip().upper()
    correct_answer_text = get_correct_answer_text(cleaned_options, correct_label)

    print(f"correct_answer_text: {correct_answer_text}")
    print(f"clean options: {cleaned_options}")

    # Nothing to grade against when the gold answer could not be resolved.
    if correct_answer_text is None:
        print(f"Skipping example {idx} due to missing correct answer.")
        continue

    # Few-shot generation, then pull the final answer out of the rationale.
    generated_rationale = generate_rationale_and_answer(question, prompt_set)
    extracted_answer = extract_answer_text(generated_rationale)
    print(f"extracted answer: {extracted_answer}")

    # Prompt for the judge model; it is asked to reply with a single word.
    judge_prompt = f"""
                    Your task is to compare two numerical answers and determine if they are the same answer, ignoring differences in units or formatting.\n\n
                    Comparison Rules:\n\n
                    - If the answers are the same, for example, First Answer: '90' and Second Answer: '90 m' or ( km, %, sec, ml, etc) this is a correct match since numerically the same and return 'correct' in your response.\n
                    - If the answers are different, consider them NOT a match and return 'incorrect' in your response.\n\n
                    Ignore differences in formatting, such as trailing zeros.\n\n

                    Compare the Following Answers:\n
                    First answer: {extracted_answer}\n
                    Second answer: {correct_answer_text}\n\n
                    Respond with:\n
                    "correct" if the two answers are the same\n
                    "incorrect" if the two answers are not the same\n\n
                    Please respond with only one of the above options, without any explanations.
                """
    response = ollama.chat(
        model="llama3.1:8b",
        messages=[{'role': 'user', 'content': judge_prompt}],
    )

    decision = response['message']['content'].strip()
    verdict = decision.lower()

    # Placeholder used by the non-correct buckets when no answer was extracted.
    answer_or_placeholder = extracted_answer if extracted_answer else "No Answer Extracted"

    if verdict == "correct":
        record = {
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer,
            'correct_answer': correct_answer_text,
        }
        correct_pairs.append(record)
        print('Correct:', record)
    elif verdict == "incorrect":
        record = {
            'question': question,
            'rationale': generated_rationale,
            'answer': answer_or_placeholder,
            'correct_answer': correct_answer_text,
        }
        incorrect_pairs.append(record)
        print("Incorrect:", record)
    else:
        # Any other judge reply: keep the example aside (no gold answer stored here).
        record = {
            'question': question,
            'rationale': generated_rationale,
            'answer': answer_or_placeholder,
        }
        unanswered_paris.append(record)
        print("Unanswered:", record)

    # Print progress every example
    print(f"Processed {idx + 1} questions.\n")

question: An engineer designed a ball so that when it was dropped, it rose with each bounce exactly one-half as high as it had fallen. The engineer dropped the ball from a 16-meter platform and caught it after it had traveled 45 meters. How many times did the ball bounce?
correct_answer_text: 4
clean options: ['A) 5', 'B) 4', 'C) 7', 'D) 8', 'E) 9']
extracted answer: The ball bounced twice.
Incorrect: {'question': 'An engineer designed a ball so that when it was dropped, it rose with each bounce exactly one-half as high as it had fallen. The engineer dropped the ball from a 16-meter platform and caught it after it had traveled 45 meters. How many times did the ball bounce?', 'rationale': "Step 1: Calculate the height of each bounce\nSince the ball rises to half its previous height with each bounce, we can set up a sequence where h_n is the height of the n-th bounce:\nh_0 = 16 (initial drop from platform)\nh_1 = h_0 / 2 = 8\nh_2 = h_1 / 2 = 4\nand so on.\n\nStep 2: Calculate the distanc

KeyboardInterrupt: 

In [71]:
# Summarise the judge verdicts collected by the evaluation loop.
total = len(correct_pairs) + len(incorrect_pairs) + len(unanswered_paris)
# Guard against an empty run (e.g. the loop was interrupted before any example
# was categorised); dividing by zero would otherwise abort the cell.
accuracy = len(correct_pairs) / total * 100 if total else 0.0
print(f"Total questions processed: {total}")
print(f"Correct answers: {len(correct_pairs)}")
print(f"Incorrect answers: {len(incorrect_pairs)}")
print(f"Unanswered answers: {len(unanswered_paris)}")
print(f"Accuracy: {accuracy:.2f}%")

Total questions processed: 5
Correct answers: 1
Incorrect answers: 4
Unanswered answers: 0
Accuracy: 20.00%


In [72]:
# Display the pairs the judge accepted (question, rationale, answer, correct_answer).
correct_pairs

[{'question': "Two people measure each other's height, the height of the taller person is H and the height of the other person is L. If the difference in their heights is equal to the average height, what is the Value of H/L",
  'rationale': 'Step 1: Define variables\n- Let H be the height of the taller person.\n- Let L be the height of the shorter person.\n\nStep 2: Understand the problem statement\nThe difference in their heights (H - L) is equal to the average height, which can be calculated as (H + L)/2.\n\nStep 3: Write down an equation based on this information\nWe have two equations:\n1. H - L = (H + L)/2\n2. The value of H/L needs to be found\n\nStep 4: Solve for the unknown quantity (H/L)\nSubstitute H-L with (H+L)/2 from Equation 1 into Equation 2:\n(H - L) / L = ((H + L) / 2) / L\n\nSimplifying this expression, we get:\n\n(H - L) / L = (H + L) / (2L)\n\nMultiply both sides by L to eliminate the fraction on the left-hand side:\n(H - L) = (H + L) / 2\nSimplifying further gives

In [73]:
# Display the pairs the judge rejected, for manual inspection of the rationales.
incorrect_pairs

[{'question': 'The y intercept of a line L is 1. If the slope of L is negative, which of the following could be the x intercept of L.\nI. -1\nII. 0\nIII. 6',
  'rationale': 'Step 1: Define variables\ny-intercept = 1 (given)\nslope = negative (given)\n\n\nStep 2: Write the equation of line L in slope-intercept form\nSince slope is negative, we can represent it as -m where m is a positive number. The equation becomes:\ny = -mx + 1\n\n\nStep 3: Determine the x-intercept\nTo find the x-intercept, set y to zero in the above equation:\n0 = -mx + 1\nmx = -1\nx = -1/m (since m > 0 and we represented negative slope as -m)\n\n\nStep 4: Consider each option separately\nI. -1 \nThis could be true because if m=1, then x=-1.\n\n\nII. 0 \nThis cannot be true since the x-intercept is not zero for any value of m.\n\n\nIII. 6 \nThis could also be true. If we take m=2/3 or m=-7/4 etc., then x will become -3 or -12 respectively and still satisfy the equation y = -mx + 1 at point (x,0).\n\n\nTherefore, opt

### Appending New Correct Pair Data

In [35]:
from src.data_appending import convert_correct_pairs_to_conversations, append_conversations_to_jsonl

# Turn the judge-approved pairs into conversation records for fine-tuning.
new_conversations = convert_correct_pairs_to_conversations(correct_pairs)
    
# Append to the new JSONL file
# NOTE(review): convert_existing_jsonl (defined in a later cell) opens this same path in
# 'w' mode, which would discard anything appended here — confirm the intended run order.
append_conversations_to_jsonl(new_conversations, './data/finetuning_data_new.jsonl')

Successfully appended 2 conversations to './data/finetuning_data_new.jsonl'.


### Appending Incorrect Pair Data

In [74]:
from src.data_appending import push_incorrect_pairs

# Persist the rejected pairs so they can be retried later.
# NOTE(review): presumably writes to data/incorrect_pairs.json, the file the
# rationalization cell reads via INCORRECT_PAIRS_FILE — confirm in src.data_appending.
push_incorrect_pairs(incorrect_pairs)

Successfully added 4 incorrect pair(s) to 'data/incorrect_pairs.json'.


### Rationalization on Incorrect Pairs (After Fine-Tuning)

In [None]:
import ollama
from src.rationalization import rationalize
from src.rationale_generation import eval_rationale

# Process the incorrect answers: retry each missed question with the correct
# answer supplied as a hint, and promote the pairs the judge accepts.

# Cap the retries so a pair the model never gets right cannot stall the cell.
MAX_RATIONALIZATION_ATTEMPTS = 2

# Iterate over a snapshot so resolved pairs can be removed from incorrect_pairs
# while looping; unresolved pairs simply stay in the list.
for pair in list(incorrect_pairs):
    question = pair['question']
    correct_answer = pair['correct_answer']
    print(f"Question: {question}")
    print(f"Correct answer: {correct_answer}")

    for attempt in range(1, MAX_RATIONALIZATION_ATTEMPTS + 1):
        # Generate the rationale with the correct answer as a hint
        generated_rationale = rationalize(question, correct_answer)

        extracted_answer = extract_answer_text(generated_rationale)
        print(f"Model's answer: {extracted_answer}")

        eval_response = eval_rationale(extracted_answer, correct_answer)
        decision = eval_response['message']['content'].strip()

        # One record per attempt; always reports this pair's own correct answer.
        record = {
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer if extracted_answer else "No Answer Extracted",
            'correct_answer': correct_answer,
        }

        if decision.lower() == "correct":
            # Add the rationalized example to correct_pairs and drop it from the backlog
            correct_pairs.append(record)
            print(record)
            incorrect_pairs.remove(pair)
            break
        elif decision.lower() == "incorrect":
            print("Incorrect:", record)
            if attempt < MAX_RATIONALIZATION_ATTEMPTS:
                print(f"Retrying ({attempt}/{MAX_RATIONALIZATION_ATTEMPTS})...")
        else:
            # Judge reply was neither verdict: set aside for manual review, keep in backlog.
            unanswered_paris.append(record)
            print("Unanswered:", record)
            break



In [1]:
import json
import os
import ollama
from src.rationalization import rationalize

# Configuration for the file-based rationalization pass.
# NUMBER_OF_PAIRS_TO_PROCESS is the batch size passed to process_incorrect_pairs;
# INCORRECT_PAIRS_FILE is the JSON backlog it loads and rewrites.
NUMBER_OF_PAIRS_TO_PROCESS = 3
INCORRECT_PAIRS_FILE = 'data/incorrect_pairs.json'
# NOTE(review): "OLLOMA" looks like a typo for "OLLAMA", and no visible code reads this
# constant (the call below passes a literal model name) — confirm which model is intended.
OLLOMA_MODEL_NAME = "llama-reason-04:latest"

# Initialize lists to store categorized pairs
# process_incorrect_pairs appends to correct_pairs and unanswered_pairs; it works on a
# local list named incorrect_pairs, so this module-level incorrect_pairs stays untouched by it.
correct_pairs = []
incorrect_pairs = []
unanswered_pairs = []

def load_incorrect_pairs(file_path):
    """
    Read the backlog of incorrect pairs from a JSON file.

    Every failure mode (missing file, malformed JSON, top-level value that is
    not a list) is reported on stdout and yields an empty list instead of
    raising.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        list: The parsed list, or [] when the file is absent or unusable.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            try:
                payload = json.load(handle)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON from {file_path}: {e}")
                return []

        # Only a top-level JSON array is an acceptable backlog.
        if isinstance(payload, list):
            return payload
        print(f"Unexpected data format in {file_path}. Expected a list.")
        return []

    print(f"File {file_path} does not exist.")
    return []

def save_incorrect_pairs(file_path, data):
    """
    Overwrite the JSON backlog file with the given list of incorrect pairs.

    The file is written as UTF-8 with 4-space indentation and non-ASCII
    characters kept verbatim; a one-line confirmation is printed afterwards.

    Args:
        file_path (str): Path to the JSON file.
        data (list): Updated list of incorrect pairs.
    """
    with open(file_path, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=4)

    remaining_count = len(data)
    print(f"Updated {file_path} with {remaining_count} incorrect pair(s).")

def process_incorrect_pairs(num_pairs, model_name="llama3.1:8b"):
    """
    Rationalize a batch of stored incorrect pairs and update the backlog file.

    The first ``num_pairs`` entries of INCORRECT_PAIRS_FILE are visited once
    each, in order. For every entry a rationale is generated with the correct
    answer as a hint (up to two attempts) and a judge model decides whether
    the rationale ends on that answer.

    Outcome per entry:
      - judged "correct": appended to the module-level ``correct_pairs`` and
        removed from the backlog;
      - judged "incorrect" on every attempt: left in the backlog;
      - any other judge reply: appended to the module-level
        ``unanswered_pairs`` for manual review and left in the backlog;
      - missing question or correct answer: dropped from the backlog.

    The remaining backlog is written back to INCORRECT_PAIRS_FILE at the end.

    Args:
        num_pairs (int): Number of pairs to process.
        model_name (str): Name of the model to use with Ollama.
    """
    # Load incorrect pairs from the JSON file
    incorrect_pairs = load_incorrect_pairs(INCORRECT_PAIRS_FILE)

    if not incorrect_pairs:
        print("No incorrect pairs to process.")
        return

    # Determine the actual number of pairs to process (never negative, so the
    # slice below cannot wrap around).
    num_to_process = max(0, min(num_pairs, len(incorrect_pairs)))
    print(f"Processing {num_to_process} out of {len(incorrect_pairs)} incorrect pair(s).")

    max_attempts = 2
    # Entries of this batch that stay in the backlog, in their original order.
    still_incorrect = []

    # Walk the batch by position so an unresolved pair is not picked up again
    # in place of the next one.
    for i, pair in enumerate(incorrect_pairs[:num_to_process]):
        # str(... or '') tolerates null or numeric values in the JSON file.
        question = str(pair.get('question') or '').strip()
        correct_answer = str(pair.get('correct_answer') or '').strip()

        if not question or not correct_answer:
            print(f"Skipping pair {i+1} due to missing question or correct_answer.")
            continue  # unusable entry: drop it from the backlog

        print(f"\nProcessing Pair {i+1}:")
        print(f"Question: {question}")
        print(f"Correct Answer: {correct_answer}")

        resolved = False
        for attempts in range(1, max_attempts + 1):
            # Generate the rationale with the correct answer as a hint
            generated_rationale = rationalize(question, correct_answer)

            verification_prompt = f"""
                        You are to determine if the correct answer is explicitly stated as the last line of the rationale.\n\n

                        Instructions:\n

                        - Read the rationale.\n
                        - Check if the last line of the rationale has the same answer as the 'Correct Answer' provided.\n
                        - If it is, respond with only 'correct'.
                        - If it is not, respond with only 'incorrect'.
                        - Do not consider any other factors.

                        **Evaluate:**

                        Rationale:
                        {generated_rationale}

                        Correct Answer:
                        {correct_answer}

                        **Respond with:**

                        'correct' or 'incorrect'

                        Only respond with one word.
                                """
            response = ollama.chat(model=model_name, messages=[
                {'role': 'user', 'content': verification_prompt},
            ])

            decision = response['message']['content'].strip()

            print("Decision", decision)

            # The prompt shows the verdicts in quotes, so tolerate echoed
            # quotes, a trailing period and capitalisation.
            verdict = decision.strip("'\"`. \t\n").lower()

            if verdict == "correct":
                # Add the rationalized example to correct_pairs
                accepted = {
                    'question': question,
                    'rationale': generated_rationale,
                    'correct_answer': correct_answer,
                }
                correct_pairs.append(accepted)
                print("Correct", accepted)
                resolved = True
                break  # Exit the attempts loop

            elif verdict == "incorrect":
                if attempts >= max_attempts:
                    print(f"Reached maximum attempts for pair {i+1}. Moving to next pair.")
                else:
                    print(f"Retrying ({attempts}/{max_attempts})...")
            else:
                # Add to unanswered_pairs for manual review
                undecided = {
                    'question': question,
                    'rationale': generated_rationale,
                    'correct_answer': correct_answer,
                }
                unanswered_pairs.append(undecided)
                print("Unanswered", undecided)
                break  # Exit the attempts loop, move to next pair

        if not resolved:
            still_incorrect.append(pair)

    # Unresolved entries from this batch, followed by the untouched tail.
    remaining_pairs = still_incorrect + incorrect_pairs[num_to_process:]

    # Save the updated incorrect_pairs back to the JSON file
    save_incorrect_pairs(INCORRECT_PAIRS_FILE, remaining_pairs)
    print("\nProcessing complete.")
    print(f"Correct Pairs: {len(correct_pairs)}")
    print(f"Incorrect Pairs Remaining: {len(remaining_pairs)}")
    print(f"Unanswered Pairs: {len(unanswered_pairs)}")

def save_results(data, file_path):
    """
    Append categorized pairs to a JSON file that holds a single list.

    Nothing is written when ``data`` is empty. A missing file is treated as an
    empty list; a file whose content is malformed JSON or not a list is
    reported on stdout and replaced by a fresh list containing only ``data``.

    Args:
        data (list): List of categorized pairs.
        file_path (str): Path to the output JSON file.
    """
    if not data:
        print(f"No data to save for {file_path}.")
        return

    # Start from whatever list is already stored, if any.
    stored = []
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as src:
            try:
                stored = json.load(src)
            except json.JSONDecodeError:
                print(f"Error decoding JSON from {file_path}. Overwriting with a new list.")
                stored = []
            else:
                if not isinstance(stored, list):
                    print(f"Unexpected data format in {file_path}. Overwriting with a new list.")
                    stored = []

    merged = stored + list(data)

    # Write the combined list back (creates the file on first use).
    with open(file_path, 'w', encoding='utf-8') as dst:
        json.dump(merged, dst, ensure_ascii=False, indent=4)

    print(f"Saved {len(data)} pair(s) to {file_path}.")


# Run the rationalization pass on the first NUMBER_OF_PAIRS_TO_PROCESS stored pairs.
# NOTE(review): the judge model is given as a literal; OLLOMA_MODEL_NAME is not used — confirm which model is intended.
process_incorrect_pairs(NUMBER_OF_PAIRS_TO_PROCESS, model_name="llama3.1:8b")

Processing 3 out of 86 incorrect pair(s).

Processing Pair 1:
Question: In May Mrs Lee's earnings were 60 percent of the Lee family's total income. In June Mrs Lee earned 20 percent more than in May. If the rest of the family's income was the same both months, then, in June, Mrs Lee's earnings were approximately what percent of the Lee family's total income ?
Correct Answer: 64%
Decision correct
Correct {'question': "In May Mrs Lee's earnings were 60 percent of the Lee family's total income. In June Mrs Lee earned 20 percent more than in May. If the rest of the family's income was the same both months, then, in June, Mrs Lee's earnings were approximately what percent of the Lee family's total income ?", 'rationale': "Let's assume that the total income of the Lee family is $100. In May, Mrs Lee earned $60 (which is 60% of the total income).\n\nIn June, her earnings increased by 20%, making them $72.\n\nThe rest of the family's income remained the same at $28.\n\nTo find what percent of 

In [2]:
# Display the pairs accepted by the judge during the rationalization pass.
correct_pairs

[{'question': "In May Mrs Lee's earnings were 60 percent of the Lee family's total income. In June Mrs Lee earned 20 percent more than in May. If the rest of the family's income was the same both months, then, in June, Mrs Lee's earnings were approximately what percent of the Lee family's total income ?",
  'rationale': "Let's assume that the total income of the Lee family is $100. In May, Mrs Lee earned $60 (which is 60% of the total income).\n\nIn June, her earnings increased by 20%, making them $72.\n\nThe rest of the family's income remained the same at $28.\n\nTo find what percent of the total income ($100) Mrs Lee's earnings were in June, we can do:\n\nMrs Lee's earnings / Total Income = $72 / $100 ≈ 0.72 or 72%.\n\nThis means that her earnings were approximately 64% of the family's total income in June.",
  'correct_answer': '64%'}]

### Convert to Llama3.2 chat template

In [198]:
import json
import os

def append_new_conversations(existing_file, new_correct_pairs, output_file=None):
    """
    Appends new conversations to the existing transformed_conversations.json file.

    Each usable pair becomes one two-turn conversation: the question as the
    ``user`` message and the rationale as the ``assistant`` message. Entries
    that are not dictionaries, or whose question or rationale is missing or
    blank, are skipped with a warning so no empty training example is written.

    Args:
        existing_file (str): Path to the existing transformed_conversations.json.
        new_correct_pairs (list): List of new correct_pairs dictionaries.
        output_file (str, optional): Path to save the updated JSON. Defaults to existing_file.

    Raises:
        ValueError: If the existing file parses as JSON but is not a list.

    Notes:
        A malformed existing file is reported on stdout and nothing is written.
        Errors while saving are reported on stdout rather than raised.
    """
    if output_file is None:
        output_file = existing_file

    # Step 1: Load existing conversations
    if os.path.exists(existing_file):
        with open(existing_file, 'r', encoding='utf-8') as f:
            try:
                transformed_conversations = json.load(f)
            except json.JSONDecodeError:
                print("Error: Existing JSON file is not properly formatted.")
                return
        if not isinstance(transformed_conversations, list):
            raise ValueError("Existing JSON file does not contain a list.")
    else:
        # If the file doesn't exist, start with an empty list
        transformed_conversations = []
        print(f"Note: {existing_file} does not exist. A new file will be created.")

    # Step 2: Transform new correct_pairs into conversations
    appended_count = 0
    for idx, pair in enumerate(new_correct_pairs, start=1):
        # Ensure each pair is a dictionary
        if not isinstance(pair, dict):
            print(f"Warning: Entry {idx} is not a dictionary. Skipping this entry.")
            continue

        # str(... or '') tolerates None or non-string values in a pair.
        question = str(pair.get('question') or '').strip()
        rationale = str(pair.get('rationale') or '').strip()

        # Validate that question and rationale are present
        if not question:
            print(f"Warning: Missing 'question' in new pair {idx}. Skipping this entry.")
            continue
        if not rationale:
            print(f"Warning: Missing 'rationale' in new pair {idx}. Skipping this entry.")
            continue

        # Construct the conversation object and append it to the list
        transformed_conversations.append({
            'conversations': [
                {
                    'content': question,
                    'role': 'user'
                },
                {
                    'content': rationale,
                    'role': 'assistant'
                }
            ]
        })
        appended_count += 1

    # Step 3: Save the updated conversations back to the JSON file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(transformed_conversations, f, ensure_ascii=False, indent=4)
        # Report what was actually appended, not how many inputs were offered.
        print(f"Successfully appended {appended_count} new conversations to {output_file}.")
    except Exception as e:
        print(f"An error occurred while saving to {output_file}: {e}")

# Target file for the chat-template conversations; relative to the notebook's working directory.
existing_json_path = 'transformed_conversations.json'

# Append every pair currently held in correct_pairs (the file is created if it is missing).
append_new_conversations(existing_json_path, correct_pairs)

Note: transformed_conversations.json does not exist. A new file will be created.
Successfully appended 9 new conversations to transformed_conversations.json.


# Fine-Tuning Data Structure

### Converting our corrected pairs into a structure the Llama3.1 model will understand for fine tuning.

In [8]:
import json
import re

def extract_question(human_message):
    """
    Pull the question text out of a human message.

    Looks for a case-insensitive "Question:" label and keeps the rest of that
    line (after any whitespace following the label). When no label is present
    the whole message is used. The result is stripped of surrounding
    whitespace in both cases.

    Args:
        human_message (str): The message content from the human.

    Returns:
        str: Text following "Question:", or the stripped original message.
    """
    labelled = re.search(r'Question:\s*(.+)', human_message, re.IGNORECASE)
    text = labelled.group(1) if labelled else human_message
    return text.strip()

def extract_rationale(gpt_message):
    """
    Reduce a GPT message to its rationale text.

    Processing order:
      1. A leading "A: ..." line at the very start of the message is dropped.
      2. If a "Rationale:" label exists, the text after it is kept, cut at the
         first following line that starts with "Answer:".
      3. Otherwise the message is kept up to (not including) the first
         "Answer:" occurrence.

    All label matching is case-insensitive and the result is stripped.

    Args:
        gpt_message (str): The message content from GPT.

    Returns:
        str: The rationale text; may be empty when nothing but an answer remains.
    """
    # Drop an initial "A: ..." line (only matches at the start of the message).
    body = re.sub(r'^A:\s*.+\n', '', gpt_message, flags=re.IGNORECASE)

    # Preferred path: explicit "Rationale:" label, stopping before "Answer:".
    labelled = re.search(
        r'Rationale:\s*(.*?)(?:\nAnswer:.*|$)',
        body,
        re.IGNORECASE | re.DOTALL,
    )
    if labelled is not None:
        return labelled.group(1).strip()

    # Fallback: no label, so just cut everything from "Answer:" onwards.
    return re.sub(r'\n*Answer:\s*.+$', '', body, flags=re.IGNORECASE | re.DOTALL).strip()

def convert_existing_jsonl(input_file, output_file):
    """
    Converts the existing finetuning_data.jsonl file to the new Q&A format.

    Each input line is expected to be a JSON array of message objects with
    "from" ("human" / "gpt") and "value" keys. For every usable line one JSON
    object ``{"text": "Q: <question>\\nA: <rationale>"}`` is written to the
    output file. Lines that cannot be used (invalid JSON, not an array, fewer
    than two messages, missing human or gpt text, empty extraction) are
    reported on stdout and skipped, so one bad line never aborts the run.

    Args:
        input_file (str): Path to the existing JSONL file.
        output_file (str): Path to the new JSONL file to be created
            (overwritten if it exists).

    Returns:
        None
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:

        for line_num, line in enumerate(infile, 1):
            try:
                # Each line is expected to be a JSON array of messages
                messages = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_num}: {e}. Skipping.")
                continue

            # A JSON object or scalar would otherwise fail on msg.get below.
            if not isinstance(messages, list):
                print(f"Warning: Line {line_num} is not a JSON array of messages. Skipping.")
                continue

            # Ensure there are at least two messages: human and gpt
            if len(messages) < 2:
                print(f"Warning: Line {line_num} has less than 2 messages. Skipping.")
                continue

            # Extract human and gpt messages (the last one of each kind wins)
            human_message = ""
            gpt_message = ""
            for msg in messages:
                if not isinstance(msg, dict):
                    continue  # ignore malformed array entries
                sender = msg.get("from")
                # str(... or "") tolerates a null or non-string "value".
                if sender == "human":
                    human_message = str(msg.get("value") or "").strip()
                elif sender == "gpt":
                    gpt_message = str(msg.get("value") or "").strip()

            if not human_message or not gpt_message:
                print(f"Warning: Line {line_num} is missing human or gpt message. Skipping.")
                continue

            # Extract question and rationale
            question = extract_question(human_message)
            rationale = extract_rationale(gpt_message)

            if not question or not rationale:
                print(f"Warning: Line {line_num} could not extract question or rationale. Skipping.")
                continue

            # Create the new format
            new_entry = {
                "text": f"Q: {question}\nA: {rationale}"
            }

            # Write to the new JSONL file
            outfile.write(json.dumps(new_entry, ensure_ascii=False) + '\n')

    print(f"Conversion complete. New file saved as '{output_file}'.")

# Entry point for the conversion; the recorded output shows it ran from this cell.
# Note that output_jsonl is opened in 'w' mode by convert_existing_jsonl, so any
# existing content at that path is replaced.
if __name__ == "__main__":
    input_jsonl = 'data/finetuning_data.jsonl'        # Existing file
    output_jsonl = 'data/finetuning_data_new.jsonl'   # New file
        
    # Perform the conversion
    convert_existing_jsonl(input_jsonl, output_jsonl)

Error decoding JSON on line 18: Extra data: line 1 column 1471 (char 1470). Skipping.
Error decoding JSON on line 84: Extra data: line 1 column 936 (char 935). Skipping.
Conversion complete. New file saved as 'data/finetuning_data_new.jsonl'.


### Split the JSONL Data into Train/Valid/Test Sets

In [821]:
import json
import random

def split_jsonl_file(input_file, output_train, output_valid, output_test, seed=None):
    """
    Shuffle the records of a JSONL file and split them 90% / 5% / 5%.

    Args:
        input_file (str): Path of the JSONL file to read (one JSON value per line).
        output_train (str): Receives the first 90% of the shuffled records.
        output_valid (str): Receives the next 5% of the shuffled records.
        output_test (str): Receives the remaining records.
        seed (int, optional): When given, the shuffle is performed with a
            private ``random.Random(seed)`` so the split is reproducible.
            When None (default) the module-level ``random.shuffle`` is used.

    Blank lines are ignored. Lines that are not valid JSON are reported on
    stdout and skipped, so one bad line does not abort the whole split.
    Split sizes are computed from the number of records actually loaded.
    """
    data = []
    # Read as UTF-8 explicitly: JSONL written with ensure_ascii=False contains
    # raw non-ASCII characters that a platform-default codec may fail to decode.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            if not line.strip():
                continue  # e.g. a trailing empty line at the end of the file
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_num}: {e}. Skipping.")

    # Shuffle the data to ensure random distribution
    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)

    # Calculate split indices for 90% train, 5% valid, 5% test
    train_split = int(0.90 * len(data))
    valid_split = int(0.95 * len(data))

    # Pair every destination with its slice so a single loop writes all three files
    partitions = (
        (output_train, data[:train_split]),
        (output_valid, data[train_split:valid_split]),
        (output_test, data[valid_split:]),
    )
    for path, items in partitions:
        with open(path, 'w', encoding='utf-8') as f:
            for item in items:
                f.write(json.dumps(item) + '\n')

# Example usage: shuffle the converted file and write the three splits.
# NOTE(review): the conversion cell writes 'data/finetuning_data_new.jsonl';
# this input path has no 'data/' prefix — confirm the working directory.
split_jsonl_file(
    input_file='finetuning_data_new.jsonl',
    output_train='train.jsonl',
    output_valid='valid.jsonl',
    output_test='test.jsonl',
)

In [37]:
import json
import re

# Define the input and output file paths (relative to the current working directory)
input_jsonl = "data/finetuning_data_new.jsonl"       # JSONL input; each line is expected to be an object with a 'text' field
output_json = "data/formatted_data.json"    # Desired output JSON file path

# Initialize a list to hold reformatted entries; the read loop below appends
# one {"instruction", "input", "output"} dict per successfully parsed line
reformatted_entries = []

# Define the instruction text; stored verbatim in the "instruction" field of every entry
instruction_text = "Provide a detailed answer to the following question."

# Function to parse Q and A from the text
def parse_q_a(text):
    """
    Split a "Q: ...\\nA: ..." string into its question and answer parts.

    Args:
        text (str): Text with a line starting with "Q:" followed later by a
            line starting with "A:".

    Returns:
        tuple: (question, answer), each stripped of surrounding whitespace.
            Each element is None independently when its pattern does not
            match, so a result such as (None, "some answer") is possible.
    """
    search_flags = re.DOTALL | re.MULTILINE

    def stripped_capture(pattern):
        # First capture group of the first match, stripped; None without a match
        found = re.search(pattern, text, search_flags)
        if found is None:
            return None
        return found.group(1).strip()

    # Question: from a line-leading "Q:" up to (not including) the first "\nA:"
    question = stripped_capture(r'^Q:\s*(.*?)(?=\nA:)')
    # Answer: from the first line-leading "A:" through the end of the text
    answer = stripped_capture(r'^A:\s*(.*)$')

    return question, answer

# Stream the input JSONL line by line; idx is the 1-based line number used in messages
with open(input_jsonl, 'r', encoding='utf-8') as fin:
    for idx, line in enumerate(fin, 1):
        # json.loads is the only call here that can raise JSONDecodeError,
        # so the handler wraps just the decode step
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in line {idx}: {e}. Skipping.")
            continue

        text = data.get('text', '').strip()
        if not text:
            print(f"Warning: Empty 'text' field in line {idx}. Skipping.")
            continue

        # Split the text into its question and answer parts; both must be non-empty
        question, answer = parse_q_a(text)
        if not (question and answer):
            print(f"Warning: Missing Question or Answer in line {idx}. Skipping.")
            continue

        # Record the entry in instruction / input / output form
        reformatted_entries.append({
            "instruction": instruction_text,
            "input": question,
            "output": answer
        })

# Wrap all entries under a single 'train' split
dataset_dict = {"train": reformatted_entries}

# Persist the whole dictionary as one pretty-printed JSON document
with open(output_json, 'w', encoding='utf-8') as fout:
    json.dump(dataset_dict, fout, ensure_ascii=False, indent=4)

print(f"Reformatting complete. {len(reformatted_entries)} entries saved to '{output_json}'.")

Error decoding JSON in line 308: Expecting ',' delimiter: line 1 column 2649 (char 2648). Skipping.
Reformatting complete. 326 entries saved to 'data/formatted_data.json'.
