In [38]:
import ollama
import os
import json
import re
import random
from src.rationalization import rationalize  # Ensure this module is accessible

# Run configuration for this cell.
# Number of pairs passed to process_incorrect_pairs() as `num_pairs`.
NUMBER_OF_PAIRS_TO_PROCESS = 3
# JSON file read by load_incorrect_pairs(); its top-level value must be a list.
INCORRECT_PAIRS_FILE = 'data/incorrect_pairs.json'
# Model tag forwarded to ollama.chat() via process_incorrect_pairs(model_name=...).
# NOTE(review): "OLLOMA" looks like a typo of "OLLAMA"; rename together with its
# use at the call site if desired.
OLLOMA_MODEL_NAME = "llama-reason-04:latest"

# Module-level accumulators appended to by process_incorrect_pairs():
#   correct_pairs    - dicts with 'question', 'rationale', 'correct_answer'
#   unanswered_pairs - the original pair dicts set aside for manual review
correct_pairs = []
unanswered_pairs = []

def load_incorrect_pairs(file_path):
    """
    Read the list of incorrect pairs stored in a JSON file.

    Every failure mode (missing file, malformed JSON, a top-level value that
    is not a list) is reported on stdout and results in an empty list.

    Args:
        file_path (str): Location of the JSON file.

    Returns:
        list: The decoded list, or [] when nothing usable could be read.
    """
    file_is_missing = not os.path.exists(file_path)
    if file_is_missing:
        print(f"File {file_path} does not exist.")
        return []

    with open(file_path, 'r', encoding='utf-8') as handle:
        try:
            payload = json.load(handle)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {file_path}: {e}")
            return []

    # Only a JSON array is accepted as a collection of pairs.
    if not isinstance(payload, list):
        print(f"Unexpected data format in {file_path}. Expected a list.")
        return []

    return payload

def save_results(data, file_path):
    """
    Append categorized pairs to the JSON list stored at file_path.

    A missing file is first created containing an empty list. If the existing
    file cannot be decoded or does not hold a list, the problem is printed and
    the file is left untouched.

    NOTE: a second function with this same name is defined later in the file;
    when the whole file runs, that later definition replaces this one.

    Args:
        data (list): Pairs to append. Nothing is written when it is empty.
        file_path (str): Path to the output JSON file.
    """
    if not data:
        print(f"No data to save for {file_path}.")
        return

    # Seed a missing file with [] so the read below always has input.
    if not os.path.exists(file_path):
        with open(file_path, 'w', encoding='utf-8') as seed:
            json.dump([], seed, ensure_ascii=False, indent=4)

    problem = None
    stored = None
    with open(file_path, 'r', encoding='utf-8') as reader:
        try:
            stored = json.load(reader)
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            problem = e
    if problem is None and not isinstance(stored, list):
        problem = "Data is not a list"

    if problem is not None:
        print(f"Error reading {file_path}: {problem}. Please check the file format.")
        return

    # Existing entries first, new entries after them.
    combined = [*stored, *data]
    with open(file_path, 'w', encoding='utf-8') as writer:
        json.dump(combined, writer, ensure_ascii=False, indent=4)

    print(f"Saved {len(data)} pair(s) to {file_path}.")

def extract_decision(response_content):
    """
    Pull the model's verdict out of its raw reply.

    The reply is lower-cased and scanned for the first standalone occurrence
    of 'correct' or 'incorrect' (word boundaries on both sides, so words such
    as 'incorrectly' do not count).

    Args:
        response_content (str): The raw response from the model.

    Returns:
        str: 'correct', 'incorrect', or '' when neither word is present.
    """
    lowered = response_content.lower()
    hit = re.search(r'\b(correct|incorrect)\b', lowered)
    return hit.group(1) if hit else ''

def extract_final_answer(rationale):
    """
    Return the text that follows the first 'Answer:' marker in a rationale.

    The marker is matched case-insensitively. Whitespace after it (including
    line breaks) is skipped, and the rest of that line is returned stripped
    and lower-cased.

    Args:
        rationale (str): The generated rationale.

    Returns:
        str: The extracted answer, or '' when no 'Answer:' marker exists.
    """
    answer_pattern = re.compile(r'Answer:\s*(.*)', flags=re.IGNORECASE)
    found = answer_pattern.search(rationale)
    if found is None:
        return ''
    captured = found.group(1)
    return captured.strip().lower()

def compare_answers(extracted, expected, margin=1.0):
    """
    Compares the extracted answer with the expected answer numerically.

    The first number found in each argument is parsed and the two values are
    compared with an absolute tolerance. A leading minus sign is honoured
    (so "-5" no longer matches "5") and thousands separators are understood
    (so "1,200" is read as 1200 rather than 1).

    NOTE(review): only the first number of each string is considered, so two
    non-numeric answers that merely share a number (e.g. both containing
    "m/2") still compare equal. Callers that need textual comparison must do
    it separately.

    Args:
        extracted (str): The extracted answer from the rationale.
        expected (str): The expected correct answer. Non-string values are
            converted with str() before parsing.
        margin (float): Allowed absolute difference between the two numbers.

    Returns:
        bool: True if both arguments contain a number and the numbers differ
        by at most `margin`, False otherwise (including when either argument
        contains no number).
    """
    extracted_num = _first_number(extracted)
    expected_num = _first_number(expected)
    if extracted_num is None or expected_num is None:
        return False
    return abs(extracted_num - expected_num) <= margin


# Optional minus sign (ASCII or U+2212) that is not glued to a preceding word
# character or closing bracket (that would be a hyphen or a subtraction),
# followed by either a comma-grouped integer or a plain run of digits, and an
# optional fractional part.
_NUMBER_PATTERN = re.compile(
    r'(?:(?<![\w)\]])[-\u2212])?'
    r'(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)'
    r'(?:\.\d*)?'
)


def _first_number(text):
    """Return the first number in `text` as a float, or None if there is none."""
    if not isinstance(text, str):
        text = str(text)
    found = _NUMBER_PATTERN.search(text)
    if found is None:
        return None
    # float() accepts neither grouping commas nor the Unicode minus sign.
    token = found.group(0).replace(',', '').replace('\u2212', '-')
    try:
        return float(token)
    except ValueError:
        return None

def process_incorrect_pairs(num_pairs, model_name="llama-reason-04:latest"):
    """
    Processes a specified number of incorrect pairs with combined evaluation.

    For each randomly sampled pair a rationale is generated with
    rationalize(question, correct_answer) and its final answer is checked
    with compare_answers(). When that numeric check passes, the pair is
    appended to the module-level `correct_pairs` list. Otherwise the model is
    asked for a one-word verdict:

    * 'incorrect' - the rationale is regenerated, up to two attempts in
      total; after the last attempt the pair goes to `unanswered_pairs`.
    * 'correct'   - the model disagrees with the numeric check that has just
      failed, so the pair goes to `unanswered_pairs` for manual review.
    * anything else, or a failed/malformed model call - the pair goes to
      `unanswered_pairs`.

    The previous implementation re-ran the numeric check after a 'correct'
    verdict. That check is deterministic and had already failed for the same
    inputs, so the branch could never succeed; it has been removed without
    changing the observable outcome.

    Args:
        num_pairs (int): Number of pairs to process (capped at the number of
            pairs available).
        model_name (str): Name of the model to use with Ollama.
    """
    # Load incorrect pairs from the JSON file
    incorrect_pairs = load_incorrect_pairs(INCORRECT_PAIRS_FILE)

    if not incorrect_pairs:
        print("No incorrect pairs to process.")
        return

    # Determine the actual number of pairs to process
    num_to_process = min(num_pairs, len(incorrect_pairs))
    print(f"Processing {num_to_process} out of {len(incorrect_pairs)} incorrect pair(s).")

    # Select num_to_process random pairs
    pairs_to_process = random.sample(incorrect_pairs, num_to_process)

    max_attempts = 2  # Total attempts: initial + one retry

    for idx, pair in enumerate(pairs_to_process, 1):
        question = pair.get('question', '').strip()
        correct_answer = pair.get('correct_answer', '').strip()

        if not question or not correct_answer:
            print(f"Skipping pair {idx} due to missing question or correct_answer.")
            continue

        print(f"\nProcessing Pair {idx}:")
        print(f"Question: {question}")
        print(f"Correct Answer: {correct_answer}")

        for attempts in range(1, max_attempts + 1):
            print(f"Attempt {attempts} for Pair {idx}.")

            # Generate the rationale with the correct answer as a hint
            generated_rationale = rationalize(question, correct_answer)

            # Extract the final answer from the rationale
            extracted_answer = extract_final_answer(generated_rationale)

            print(f"Extracted Answer: '{extracted_answer}'")

            # Programmatic evaluation: a numeric match is accepted outright.
            if compare_answers(extracted_answer, correct_answer):
                correct_pairs.append({
                    'question': question,
                    'rationale': generated_rationale,
                    'correct_answer': correct_answer,
                })
                print(f"Pair {idx} marked as correct.")
                break  # Done with this pair

            # The numeric check failed; ask the model whether a retry is worthwhile.
            evaluation_prompt = f"""
                You are an assistant that evaluates rationales for correctness.\n\n

                Example 1:\n
                Correct Answer: 4\n
                Rationale:\n
                To solve this problem, we need to add 2 + 2.\n
                This gives us a answer of 4.\n
                Decision:\n
                correct\n\n

                Example 2:\n
                Correct Answer: 10\n
                Rationale:\n
                The sum of 5 and 5 is 11.\n
                Therefore, the answer is 11.\n
                Decision:\n
                incorrect\n\n

                Instructions:\n
                1. Rationale: Read the rationale provided below.\n
                2. Correct Answer: Compare the last line of the rationale to the 'Correct Answer'.\n
                3. Decision:\n
                - If the last line of the rationale matches the 'Correct Answer', respond with the single word: correct.\n
                - If it does not, respond with the single word: incorrect.\n
                4. Response Format:\n
                - Do not provide any additional text or explanations.\n
                - Only respond with one word: 'correct' or 'incorrect'.\n\n

                Correct Answer:\n
                {correct_answer}\n\n

                Rationale:\n
                {generated_rationale}\n\n

                Decision:\n
                """

            # Interact with Ollama to evaluate the rationale. Reading the reply
            # is inside the try so that a malformed response is handled like a
            # failed call instead of aborting the whole run.
            try:
                response = ollama.chat(model=model_name, messages=[
                    {
                        'role': 'user',
                        'content': evaluation_prompt
                    },
                ])
                raw_response = response['message']['content']
            except Exception as e:
                print(f"Error communicating with Ollama: {e}")
                print("Adding pair to unanswered_pairs for manual review.")
                unanswered_pairs.append(pair)
                break  # Move to the next pair

            print(f"Raw Model Response: '{raw_response}'")

            # Extract decision using regex
            decision = extract_decision(raw_response)

            print(f"Decision: {decision}")

            if decision == "incorrect":
                if attempts >= max_attempts:
                    print(f"Reached maximum attempts for Pair {idx}. Adding to unanswered_pairs.")
                    unanswered_pairs.append(pair)
                else:
                    print(f"Retrying Pair {idx} ({attempts}/{max_attempts})...")
                continue

            if decision == "correct":
                # The model contradicts the numeric check that just failed;
                # leave the conflict to a human reviewer.
                print("Model indicated 'correct', but programmatic evaluation failed. Adding to unanswered_pairs.")
            else:
                # Unexpected response; add to unanswered_pairs for manual review
                print(f"Unexpected decision '{decision}' for Pair {idx}. Adding to unanswered_pairs.")
            unanswered_pairs.append(pair)
            break  # No retry for this pair

    print("\nProcessing complete.")
    print(f"Correct Pairs: {len(correct_pairs)}")
    print(f"Unanswered Pairs: {len(unanswered_pairs)}")
    
def save_results(data, file_path):
    """
    Append categorized pairs to the JSON list stored at file_path.

    A missing file is first created containing an empty list. If the existing
    file cannot be decoded as JSON or does not hold a list, a message is
    printed and its previous content is discarded: the file then ends up
    containing only `data`.

    NOTE: an earlier function with this same name exists in the file; this
    later definition is the one in effect once the whole file has run.

    Args:
        data (list): Pairs to append. Nothing is written when it is empty.
        file_path (str): Path to the output JSON file.
    """
    if not data:
        print(f"No data to save for {file_path}.")
        return

    # Seed a missing file with [] so the read below always has input.
    if not os.path.exists(file_path):
        with open(file_path, 'w', encoding='utf-8') as seed:
            json.dump([], seed, ensure_ascii=False, indent=4)

    with open(file_path, 'r', encoding='utf-8') as reader:
        try:
            stored = json.load(reader)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from {file_path}. Overwriting with a new list.")
            stored = []
        else:
            if not isinstance(stored, list):
                print(f"Unexpected data format in {file_path}. Overwriting with a new list.")
                stored = []

    # Existing entries first, new entries after them.
    combined = [*stored, *data]
    with open(file_path, 'w', encoding='utf-8') as writer:
        json.dump(combined, writer, ensure_ascii=False, indent=4)

    print(f"Saved {len(data)} pair(s) to {file_path}.")

# Entry point of this cell: sample up to NUMBER_OF_PAIRS_TO_PROCESS pairs from
# INCORRECT_PAIRS_FILE and evaluate them with the configured model. Results
# accumulate in the module-level correct_pairs / unanswered_pairs lists;
# save_results() is defined above but is not invoked by this call.
process_incorrect_pairs(NUMBER_OF_PAIRS_TO_PROCESS, model_name=OLLOMA_MODEL_NAME)

Processing 3 out of 49 incorrect pair(s).

Processing Pair 1:
Question: If m and n are positive integers, and m=2n and k=9m, then -
Correct Answer: m/2 is a factor of k.
Attempt 1 for Pair 1.
Extracted Answer: 'yes, m/2 is a factor of k.'
Pair 1 marked as correct.

Processing Pair 2:
Question: A rectangular farm has to be fenced one long side, one short side and the diagonal. If the cost of fencing is Rs.10 per meter. The area of farm is 1200 m2 and the short side is 30 m long. How much would the job cost?
Correct Answer: 1200
Attempt 1 for Pair 2.
Extracted Answer: 'rs.1800'
Raw Model Response: ''
Decision: 
Unexpected decision '' for Pair 2. Adding to unanswered_pairs.

Processing Pair 3:
Question: In 1995, the Johns spent $800 on the family’s water bills. Anticipating that water rates would increase in 1996 by 50%, the Johns cut back their water usage. By how much must the Johns have reduce their 1996 water usage to pay exactly the same amount in 1996 as they paid in 1995?
Correct A

In [39]:
correct_pairs

[{'question': 'If m and n are positive integers, and m=2n and k=9m, then -',
  'rationale': "To find out if m/2 is a factor of k, we need to understand what happens when we multiply m by 9.\n\nSince m = 2n and k = 9m:\n\nk = 9 * (2n)\nk = 18n\n\nNow, let's see if n is a multiple of 3. If it is, then m/2 will also be a factor of k because m/2 would be equal to n/1.\n\nSince m=2n and k=9m:\n\nk = 9 * (2n)\nk = 18n\n\nIf we divide 18 by 6, which is a multiple of 3:\n\n18 / 6 = 3\nn = 3\n\nNow that we know n is a multiple of 3, let's check if m/2 is indeed a factor of k.\n\nSince m=2n and k=9m:\nk = 9 * (2 * 3)\nk = 54\n\nWe can divide 54 by 6:\n\n54 / 6 = 9\nm/2 = 3\nThis proves that m/2 is indeed a factor of k.\n\n\nAnswer: Yes, m/2 is a factor of k.",
  'correct_answer': 'm/2 is a factor of k.'},
 {'question': 'In 1995, the Johns spent $800 on the family’s water bills. Anticipating that water rates would increase in 1996 by 50%, the Johns cut back their water usage. By how much must the

In [41]:
unanswered_pairs

[{'question': 'A rectangular farm has to be fenced one long side, one short side and the diagonal. If the cost of fencing is Rs.10 per meter. The area of farm is 1200 m2 and the short side is 30 m long. How much would the job cost?',
  'rationale': '.\nStep-by-step reasoning:\n\n1. We are given that the cost of fencing is Rs.10 per meter.\n2. The area of the farm is 1200 m^2 and its short side is 30 m long.\n3. To find the total length of fence required, we need to consider the perimeter formed by fencing one long side, one short side and the diagonal.\n4. Since the rectangular farm has a square corner (i.e., right angle), the triangle formed by connecting the endpoints of these three sides will be a right triangle with its hypotenuse being this diagonal line segment.\n5. We can use Pythagoras\' theorem to find the length of this diagonal: long side^2 = short side^2 + diagonal^2 → diagonal^2 = 1200 - (30)^2\n6. Taking square root on both sides gives us diagonal = √(1200 - 900) = √300.\

In [42]:
from src.data_appending import convert_correct_pairs_to_conversations, append_conversations_to_jsonl

# Pass the accepted pairs gathered in correct_pairs to the project helper
# that builds conversation records from them.
new_conversations = convert_correct_pairs_to_conversations(correct_pairs)
    
# Append the converted conversations to the fine-tuning JSONL file
# (the same path is read back by the reformatting cell below).
append_conversations_to_jsonl(new_conversations, './data/finetuning_data_new.jsonl')

Successfully appended 2 conversations to './data/finetuning_data_new.jsonl'.


In [43]:
import json
import re

# Input and output file paths for the reformatting step.
# input_jsonl is read line by line; each line is parsed as JSON and its
# 'text' field is split into question and answer by parse_q_a().
input_jsonl = "data/finetuning_data_new.jsonl"       # Replace with your actual input JSONL file path
# output_json receives a single JSON object of the form {"train": [...]}.
output_json = "data/formatted_data.json"    # Desired output JSON file path

# Accumulates one {"instruction", "input", "output"} dict per usable input line
reformatted_entries = []

# Constant value stored under "instruction" in every reformatted entry
instruction_text = "Provide a detailed answer to the following question."

def parse_q_a(text):
    """
    Split a 'Q: ... A: ...' text into its question and answer parts.

    Each line is stripped. A line starting with 'Q:' or 'A:' switches to the
    question or answer section and contributes the remainder of the line.
    Any other line is added to the section currently open, and lines that
    appear before the first marker are dropped.

    Args:
        text (str): The input text containing Q and A.

    Returns:
        tuple: (question, answer). Each element is the newline-joined,
        stripped content of its section, or None when that section's marker
        never appeared. The two elements are independent of each other.
    """
    collected = {'question': [], 'answer': []}
    markers = (('Q:', 'question'), ('A:', 'answer'))
    active = None

    for raw_line in text.strip().split('\n'):
        stripped = raw_line.strip()
        for marker, section in markers:
            if stripped.startswith(marker):
                active = section
                collected[active].append(stripped[len(marker):].strip())
                break
        else:
            # Continuation line: belongs to whichever section is open, if any.
            if active is not None:
                collected[active].append(stripped)

    def _joined(parts):
        return '\n'.join(parts).strip() if parts else None

    return _joined(collected['question']), _joined(collected['answer'])

# Read the input JSONL file line by line and collect reformatted entries.
with open(input_jsonl, 'r', encoding='utf-8') as fin:
    for idx, line in enumerate(fin, 1):
        # Keep the try body to the single call that can raise JSONDecodeError.
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in line {idx}: {e}. Skipping.")
            continue

        # A line can be valid JSON without being an object that carries a
        # string 'text' (e.g. a bare list, or "text": null); skip such lines
        # instead of failing with AttributeError.
        text = data.get('text', '') if isinstance(data, dict) else None
        if not isinstance(text, str):
            print(f"Warning: Line {idx} is not a JSON object with a string 'text' field. Skipping.")
            continue

        text = text.strip()
        if not text:
            print(f"Warning: Empty 'text' field in line {idx}. Skipping.")
            continue

        # Parse the Question and Answer
        question, answer = parse_q_a(text)

        if not question or not answer:
            print(f"Warning: Missing Question or Answer in line {idx}. Skipping.")
            continue

        # Append the reformatted entry
        reformatted_entries.append({
            "instruction": instruction_text,
            "input": question,
            "output": answer
        })

# Assemble the entries into a dictionary with 'train' split
dataset_dict = {
    "train": reformatted_entries
}

# Save the dictionary as a single JSON file
with open(output_json, 'w', encoding='utf-8') as fout:
    json.dump(dataset_dict, fout, ensure_ascii=False, indent=4)

print(f"Reformatting complete. {len(reformatted_entries)} entries saved to '{output_json}'.")

Error decoding JSON in line 308: Expecting ',' delimiter: line 1 column 2649 (char 2648). Skipping.
Reformatting complete. 309 entries saved to 'data/formatted_data.json'.
