In [2]:
import re
import ollama
from src.data_loading import load_data
from src.parsing import parse_options
from src.rationale_generation import generate_rationale_and_answer
from src.utils import is_rationale_correct
from src.rationalization import rationalize
from src.prompt_generation import create_prompt_examples, create_prompt_set, clean_options, get_correct_answer_text

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
NUM_PROMPT_EXAMPLES = 5 # number of prompt examples
NUM_EXAMPLES_TO_PROCESS = 10 # number of examples to process

### Load Dataset

In [35]:
ds_train = load_data()

# Step 1: Select the desired range from ds_train
dataset_D = ds_train.select(range(NUM_PROMPT_EXAMPLES, len(ds_train)))

# Step 2: Shuffle the selected dataset
dataset_D_shuffled = dataset_D.shuffle()  # Use any seed you prefer

# Step 3: Select a subset of examples to process
dataset_D_subset = dataset_D_shuffled.select(range(NUM_EXAMPLES_TO_PROCESS))

print(dataset_D_subset)

Dataset({
    features: ['question', 'options', 'rationale', 'correct'],
    num_rows: 10
})


### Create prompt examples

In [36]:
prompt_examples = create_prompt_examples(ds_train, NUM_PROMPT_EXAMPLES)

prompt_examples

[{'question': "Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?",
  'rationale': 'If Q complete x kilometers, then P completes 1.15x kilometers.\nx + 1.15x = 43\n2.15x=43\nx = 43/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is E.',
  'answer': 'E'},
 {'question': 'In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?',
  'rationale': 'Line k passes through the origin and has slope 1/5 means that its equation is y=1/5*x.\nThus: (x, 1)=(5, 1) and (5, y) = (5,1) -->x=5 and y=1\nAnswer: C',
  'answer': 'C'},
 {'question': 'For all numbers p and q, the operation @ is defined by p@q = p^2 - pq. If xy ≠ 0, then which of the following can be equal to zero?\nI. x@y\nII. (xy)@y\nIII. x@

### Create Prompt Sets

In [37]:
prompt_set = create_prompt_set(ds_train, NUM_PROMPT_EXAMPLES)
print(prompt_set)

Question: Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?
Answer Explanation: If Q complete x kilometers, then P completes 1.15x kilometers.\nx + 1.15x = 43\n2.15x=43\nx = 43/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is E.
Answer: 23
###
Question: In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?
Answer Explanation: Line k passes through the origin and has slope 1/5 means that its equation is y=1/5*x.\nThus: (x, 1)=(5, 1) and (5, y) = (5,1) -->x=5 and y=1\nAnswer: C
Answer: 5 and 1
###
Question: For all numbers p and q, the operation @ is defined by p@q = p^2 - pq. If xy ≠ 0, then which of the following can be equal to zero?
I. x@y
II. (xy)@y
III. x@(x + y)
Answer Expl

### Initialize lists to hold correct and incorrect pairs

In [38]:
correct_pairs = []
incorrect_pairs = []
unanswered_paris = []

### Iterate over each example in the subset

In [39]:
def extract_answer_text(rationale):
    """
    Extracts the answer text from the rationale using regex.
    
    Args:
        rationale (str): Generated rationale string.
    
    Returns:
        str or None: Extracted answer text if found, else None.
    """
    # Attempt to extract the answer text after 'Answer:'
    match = re.search(r'Answer:\s*(.+)', rationale, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

In [40]:
for idx, example in enumerate(dataset_D_subset):
    question = example['question']
    # Map 'correct' label to answer text
    raw_options = example['options']
    cleaned_options = clean_options(raw_options)
    correct_label = example['correct'].strip().upper()
    correct_answer_text = get_correct_answer_text(cleaned_options, correct_label)

    print(f"correct_answer_text: {correct_answer_text}")
    print(f"clean options: {cleaned_options}")
    
    if correct_answer_text is None:
        print(f"Skipping example {idx} due to missing correct answer.")
        continue  # Skip this example
    
    # Generate rationale and answer
    generated_rationale = generate_rationale_and_answer(question, prompt_set)

    print(generated_rationale)
    
    # Extract the answer text from the rationale
    extracted_answer = extract_answer_text(generated_rationale)
    print(f"extracted answer: {extracted_answer}")

    response = ollama.chat(model="llama3.2:3b", messages=[
            {
                'role': 'user',
                'content':
                f"""
                    Your task is to compare two numerical answers and determine if they are the same answer, ignoring differences in units or formatting.\n\n
                    Comparison Rules:\n\n
                    - If the answers are the same, for example, First Answer: '90' and Second Answer: '90 miles' or ( km, %, sec, ml, etc) this is a match and return 'correct' in your response.\n
                    - If the answers are different, consider them NOT a match and return 'incorrect' in your response.\n\n
                    Ignore differences in formatting, such as trailing zeros.\n\n

                    Compare the Following Answers:\n
                    First answer: {extracted_answer}\n
                    Second answer: {correct_answer_text}\n\n
                    Respond with:\n
                    "correct" if the two answers are the same\n
                    "incorrect" if the two answers are not the same\n\n
                    Please respond with only one of the above options, without any explanations.
                """
            },
     ])
    
    decision = response['message']['content'].strip()
    
    # Categorize based on extracted answer
    if decision.lower() == "correct":
        correct_pairs.append({
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer
        })
        print('Correct:', {
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer
        })
    elif decision.lower() == "incorrect":
        incorrect_pairs.append({
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer if extracted_answer else "No Answer Extracted",
            'correct_answer': correct_answer_text
        })
        print("Incorrect:",  {
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer if extracted_answer else "No Answer Extracted",
            'correct_answer': correct_answer_text
        })
    else:
        unanswered_paris.append({
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer if extracted_answer else "No Answer Extracted"
        })
        print("Unanswered:",  {
            'question': question,
            'rationale': generated_rationale,
            'answer': extracted_answer if extracted_answer else "No Answer Extracted"
        })
        

    # Print progress every example
    print(f"Processed {idx + 1} questions.\n")

correct_answer_text: 1/6
clean options: ['A) 1/6', 'B) 3/10', 'C) 1/2', 'D) 5/6', 'E) 8/9']
To determine the fraction of the job that B completed, we need to calculate the total work done by both A and B.

A can complete the job in 3 hours, so his rate of work is 1/3 of the job per hour. After working for 1 hour, A has completed 1/3 of the job.

B joins after A has worked for 1 hour, so B starts when there is 2/3 of the job left to be done. Since B can complete the entire job in 3 hours, we need to find out how much work he does in the time remaining (2 hours).

Let's calculate B's rate of work: since B completes the entire job in 3 hours, his rate of work is 1/3 of the job per hour.

However, since A has already worked for 1 hour and completed 1/3 of the job, we need to consider only the remaining time (2 hours) when both A and B are working together. We can calculate their combined rate of work as follows:

Combined rate = Rate of A + Rate of B
Since they work together for 2 hours, l

In [30]:
correct_pairs

[{'question': 'A number when divided by 5, gives 45 as quotient and 0 as remainder. What will be the remainder when dividing the same number by 10',
  'rationale': "Step 1: Understand the problem statement.\nWe are given a number which when divided by 5 gives 45 as the quotient and 0 as the remainder. We need to find out what will be the remainder when this same number is divided by 10.\n\nStep 2: Find the original number\nFirst, let's determine the original number from the given information. Since the number divided by 5 gives a quotient of 45 and a remainder of 0, it means that the original number is exactly 5 times the quotient (since there is no remainder).\n\nNumber = Quotient * Divisor\n= 45 * 5\n= 225\n\nStep 3: Divide the original number by 10 to find the new remainder.\nNow we need to divide this original number (which we found to be 225) by 10.\n\nStep 4: Perform the division and identify the remainder.\nWhen we divide 225 by 10, the quotient is 22 with a remainder of 5. This

In [31]:
incorrect_pairs

[{'question': 'If a, b, and c are negative integers and 3a - 3b = -3c, then which of the following statements must be true?\nI. a> b\nII. a> b> c\nIII. a= c',
  'rationale': "To solve this problem, we need to start by manipulating the equation given and then analyzing each statement.\n\nStep 1: Manipulate the equation to isolate variables.\nGiven: 3a - 3b = -3c\n\nDivide both sides by 3 to simplify:\na - b = -c\n\nStep 2: Analyze Statement I (a > b).\nTo determine if a must be greater than b, we need to see if the equation provides any information that would lead us to conclude this.\n\nFrom the equation a - b = -c, since c is negative and a, b are integers, for the left side to equal a negative number, 'b' must be less than or equal to 'a'. This does not necessarily mean a > b, so Statement I is not necessarily true based on the given information.\n\nStep 3: Analyze Statement II (a > b > c).\nTo determine if both conditions of this statement are true, we need to see how they relate to

In [32]:
unanswered_paris

[]

In [33]:
total = len(correct_pairs) + len(incorrect_pairs) + len(unanswered_paris)
accuracy = len(correct_pairs) / total * 100
print(f"Total questions processed: {total}")
print(f"Correct answers: {len(correct_pairs)}")
print(f"Incorrect answers: {len(incorrect_pairs)}")
print(f"Unanswered answers: {len(unanswered_paris)}")
print(f"Accuracy: {accuracy:.2f}%")

Total questions processed: 10
Correct answers: 3
Incorrect answers: 7
Unanswered answers: 0
Accuracy: 30.00%


In [14]:
new_correct_pairs = []

In [19]:
import ollama
from src.utils import is_rationale_correct

def rationalize(question, correct_answer, model="llama3-1-reason-v01:latest"):
    """
    Generates a correct rationale for a question given the correct answer.

    Parameters:
        question (str): The question to be answered.
        options (dict): A dictionary mapping option letters to option texts.
        correct_answer (str): The correct answer text.
        prompt_set (str): The initial prompt set containing examples.

    Returns:
        str: The generated rationale.
    """
    # Prepare the options text
    # options_text = '\n'.join([f"{key}: {value}" for key, value in options.items()])

    # Construct the input prompt with explicit instructions
    input_text = (
        "Provide a detailed explanation for the following question, ensuring that the explanation clearly justifies why the correct answer is chosen.\n"
        f"Question: {question}\n"
        f"Correct Answer: {correct_answer}\n"
        "Explanation:"
    )

    try:
        # Use Ollama (or your LLM) to get the response
        response = ollama.chat(model=model, messages=[
            {
                'role': 'user',
                'content': input_text,
            },
        ])

        # Extract the generated rationale
        generated_rationale_full = response['message']['content'].strip()

        # Assign the entire response as the rationale
        generated_rationale = generated_rationale_full

        # Optionally, verify the rationale's correctness
        if is_rationale_correct(generated_rationale, correct_answer, question):
            return generated_rationale
        else:
            print(f"Generated rationale does not sufficiently explain the correct answer for question: {question}")
            return ''

    except Exception as e:
        print(f"Error during rationalization: {e}")
        return ''


# Process the incorrect answers
for pair in incorrect_pairs:
    question = pair['question']
    correct_answer = pair['correct_answer']

    # Generate the rationale with the correct answer as a hint
    generated_rationale = rationalize(question, correct_answer)

    # Add the rationalized example to correct_pairs
    new_correct_pairs.append({
        'question': question,
        'rationale': generated_rationale,
        'answer': correct_answer
    })

    print({
        'question': question,
        'rationale': generated_rationale,
        'answer': correct_answer
    })

Error during rationalization: llama runner process has terminated: error loading modelvocabulary: cannot find tokenizer merges in model file
{'question': 'Tim went to a shop and bought things worth Rs. 25, out of which 30 Paise went on sales tax on taxable purchases. If the tax rate was 6%, then what was the cost of the tax free items?', 'rationale': '', 'answer': '19.7'}
Error during rationalization: llama runner process has terminated: error loading modelvocabulary: cannot find tokenizer merges in model file
{'question': 'A palindrome is a number that reads the same forward and backward, such as 242. How many odd seven-digit numbers are palindromes?', 'rationale': '', 'answer': '5,000'}
Error during rationalization: llama runner process has terminated: error loading modelvocabulary: cannot find tokenizer merges in model file
{'question': 'Mixture W is 20% oil and 80% material B by weight. If 2 more kilograms of oil are added to the 8 kilograms mixture W, how many kilograms of mixture

### Converting our corrected pairs into a structure the Llama3.1 model will understand for fine tuning.

In [23]:
import json
import re

def extract_question(human_message):
    """
    Extracts the question from the human message.

    Args:
        human_message (str): The message content from the human.

    Returns:
        str: Extracted question text or the original message if extraction fails.
    """
    match = re.search(r'Question:\s*(.+)', human_message, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    return human_message.strip()

def extract_rationale(gpt_message):
    """
    Extracts the rationale from the GPT message by removing any initial 'A: ...' lines,
    removing 'Rationale:' labels, and excluding 'Answer: ...' sections.

    Args:
        gpt_message (str): The message content from GPT.

    Returns:
        str: Extracted rationale text or an empty string if extraction fails.
    """
    # Step 1: Remove any initial "A: ..." lines
    gpt_message = re.sub(r'^A:\s*.+\n', '', gpt_message, flags=re.IGNORECASE)

    # Step 2: Extract text after "Rationale:" up to "Answer:" or end of string
    match = re.search(r'Rationale:\s*(.*?)(?:\nAnswer:.*|$)', gpt_message, re.IGNORECASE | re.DOTALL)
    if match:
        rationale = match.group(1).strip()
        return rationale
    else:
        # If "Rationale:" is not found, remove any trailing "Answer: ..." and return the rest
        gpt_message = re.sub(r'\n*Answer:\s*.+$', '', gpt_message, flags=re.IGNORECASE | re.DOTALL).strip()
        return gpt_message

def convert_existing_jsonl(input_file, output_file):
    """
    Converts the existing finetuning_data.jsonl file to the new Q&A format.

    Args:
        input_file (str): Path to the existing JSONL file.
        output_file (str): Path to the new JSONL file to be created.

    Returns:
        None
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        for line_num, line in enumerate(infile, 1):
            try:
                # Each line is expected to be a JSON array of messages
                messages = json.loads(line)
                
                # Ensure there are at least two messages: human and gpt
                if len(messages) < 2:
                    print(f"Warning: Line {line_num} has less than 2 messages. Skipping.")
                    continue
                
                # Extract human and gpt messages
                human_message = ""
                gpt_message = ""
                for msg in messages:
                    if msg.get("from") == "human":
                        human_message = msg.get("value", "").strip()
                    elif msg.get("from") == "gpt":
                        gpt_message = msg.get("value", "").strip()
                
                if not human_message or not gpt_message:
                    print(f"Warning: Line {line_num} is missing human or gpt message. Skipping.")
                    continue
                
                # Extract question and rationale
                question = extract_question(human_message)
                rationale = extract_rationale(gpt_message)
                
                if not question or not rationale:
                    print(f"Warning: Line {line_num} could not extract question or rationale. Skipping.")
                    continue
                
                # Create the new format
                new_entry = {
                    "text": f"Q: {question}\nA: {rationale}"
                }
                
                # Write to the new JSONL file
                outfile.write(json.dumps(new_entry, ensure_ascii=False) + '\n')
            
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_num}: {e}. Skipping.")
                continue

    print(f"Conversion complete. New file saved as '{output_file}'.")

if __name__ == "__main__":
    input_jsonl = 'finetuning_data.jsonl'        # Existing file
    output_jsonl = 'finetuning_data_new.jsonl'   # New file
        
    # Perform the conversion
    convert_existing_jsonl(input_jsonl, output_jsonl)

Error decoding JSON on line 18: Extra data: line 1 column 1471 (char 1470). Skipping.
Error decoding JSON on line 84: Extra data: line 1 column 936 (char 935). Skipping.
Conversion complete. New file saved as 'finetuning_data_new.jsonl'.


### Appending New Data

In [520]:
import json
import re
import os

def convert_correct_pairs_to_conversations(correct_pairs):
    """
    Converts a list of correct_pairs to the new conversation format.
    
    Args:
        correct_pairs (list): List of dictionaries with keys 'question', 'rationale', and 'answer'.
    
    Returns:
        list: List of dictionaries in the format {"text": "Q: ...\nA: ..."}
    """
    new_conversations = []
    for pair in correct_pairs:
        question = pair.get('question', '').strip()
        rationale = pair.get('rationale', '').strip()
        
        if not question or not rationale:
            print(f"Warning: Missing question or rationale in pair: {pair}. Skipping.")
            continue
        
        # Remove any trailing "Answer: ..." lines from the rationale
        # This ensures that the rationale does not contain explicit "Answer:" labels
        rationale_cleaned = re.sub(r'\n*Answer:\s*.*$', '', rationale, flags=re.IGNORECASE).strip()
        
        # Additionally, remove any "Rationale:" labels if present
        rationale_cleaned = re.sub(r'^Rationale:\s*', '', rationale_cleaned, flags=re.IGNORECASE).strip()
        
        # Create the new conversation format
        convo = {
            "text": f"Q: {question}\nA: {rationale_cleaned}"
        }
        new_conversations.append(convo)
    return new_conversations

def append_conversations_to_jsonl(conversations, file_path='finetuning_data.jsonl'):
    """
    Appends a list of conversations to an existing JSONL file in the desired format.
    
    Each conversation is a dict with 'text': 'Q: ...\nA: ...'
    This function ensures that any residual 'Answer:' labels are removed from the 'A: ...' section.
    
    Args:
        conversations (list): List of conversations to append, each as {"text": "Q: ...\nA: ..."}.
        file_path (str): Path to the existing JSONL file.
    
    Returns:
        None
    """
    # Check if the file exists
    if os.path.exists(file_path):
        # Check if the file ends with a newline
        with open(file_path, 'rb') as f:
            try:
                f.seek(-1, os.SEEK_END)
                last_char = f.read(1)
                if last_char != b'\n':
                    newline_needed = True
                else:
                    newline_needed = False
            except OSError:
                # File is empty
                newline_needed = False
    else:
        # File does not exist; will be created
        newline_needed = False

    with open(file_path, 'a', encoding='utf-8') as f:
        # If a newline is needed before appending, add it
        if newline_needed:
            f.write('\n')
        
        for convo in conversations:
            text = convo.get('text', '').strip()
            
            # Ensure that the text starts with "Q: " and contains "A: "
            if not text.startswith("Q: ") or "\nA: " not in text:
                print(f"Warning: Conversation does not follow the 'Q: ...\nA: ...' format. Skipping: {convo}")
                continue
            
            # Split the text into Question and Answer parts
            try:
                q_part, a_part = text.split("\nA: ", 1)
            except ValueError:
                print(f"Warning: Unable to split 'Q' and 'A' parts. Skipping: {convo}")
                continue
            
            # Clean the Answer part by removing any "Answer:" labels if present
            # Although it should already be cleaned, this is an extra safeguard
            a_part_cleaned = re.sub(r'\n*Answer:\s*.*$', '', a_part, flags=re.IGNORECASE).strip()
            a_part_cleaned = re.sub(r'^Answer:\s*', '', a_part_cleaned, flags=re.IGNORECASE).strip()
            
            # Reconstruct the cleaned conversation text
            new_text = f"{q_part}\nA: {a_part_cleaned}"
            
            # Append the cleaned conversation to the list
            processed_convo = {"text": new_text}
            
            # Serialize and write to the file
            json_line = json.dumps(processed_convo, ensure_ascii=False)
            f.write(json_line + '\n')
    
    print(f"Successfully appended {len(conversations)} conversations to '{file_path}'.")

In [820]:
new_conversations = convert_correct_pairs_to_conversations(correct_pairs)
    
    # Append to the new JSONL file
append_conversations_to_jsonl(new_conversations, 'finetuning_data_new.jsonl')

Successfully appended 8 conversations to 'finetuning_data_new.jsonl'.


### Data split

In [821]:
import json
import random

def split_jsonl_file(input_file, output_train, output_valid, output_test):
    # Load the jsonl file
    with open(input_file, 'r') as f:
        data = [json.loads(line) for line in f]

    # Shuffle the data to ensure random distribution
    random.shuffle(data)

    # Calculate split indices for 90% train, 5% valid, 5% test
    train_split = int(0.90 * len(data))
    valid_split = int(0.95 * len(data))

    # Split the data
    train_data = data[:train_split]
    valid_data = data[train_split:valid_split]
    test_data = data[valid_split:]

    # Write the data into their respective jsonl files
    with open(output_train, 'w') as f:
        for item in train_data:
            f.write(json.dumps(item) + '\n')

    with open(output_valid, 'w') as f:
        for item in valid_data:
            f.write(json.dumps(item) + '\n')

    with open(output_test, 'w') as f:
        for item in test_data:
            f.write(json.dumps(item) + '\n')

# Example usage
split_jsonl_file('finetuning_data_new.jsonl', 'train.jsonl', 'valid.jsonl', 'test.jsonl')

In [12]:
import json
import re

# Define the input and output file paths
input_jsonl = "finetuning_data_new.jsonl"       # Replace with your actual input JSONL file path
output_json = "formatted_data.json"    # Desired output JSON file path

# Initialize a list to hold reformatted entries
reformatted_entries = []

# Define the instruction text
instruction_text = "Provide a detailed answer to the following question."

# Function to parse Q and A from the text
def parse_q_a(text):
    """
    Parses the input text to extract Question and Answer.

    Args:
        text (str): The input text containing Q and A.

    Returns:
        tuple: (question, answer) if both are found, else (None, None).
    """
    # Regular expressions to capture Q and A
    q_pattern = r'^Q:\s*(.*?)(?=\nA:)'
    a_pattern = r'^A:\s*(.*)$'

    question_match = re.search(q_pattern, text, re.DOTALL | re.MULTILINE)
    answer_match = re.search(a_pattern, text, re.DOTALL | re.MULTILINE)

    question = question_match.group(1).strip() if question_match else None
    answer = answer_match.group(1).strip() if answer_match else None

    return question, answer

# Open and read the input JSONL file
with open(input_jsonl, 'r', encoding='utf-8') as fin:
    for idx, line in enumerate(fin, 1):
        try:
            data = json.loads(line)
            text = data.get('text', '').strip()

            if not text:
                print(f"Warning: Empty 'text' field in line {idx}. Skipping.")
                continue

            # Parse the Question and Answer
            question, answer = parse_q_a(text)

            if not question or not answer:
                print(f"Warning: Missing Question or Answer in line {idx}. Skipping.")
                continue

            # Append the reformatted entry
            reformatted_entries.append({
                "instruction": instruction_text,
                "input": question,
                "output": answer
            })

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in line {idx}: {e}. Skipping.")
            continue

# Assemble the entries into a dictionary with 'train' split
dataset_dict = {
    "train": reformatted_entries
}

# Save the dictionary as a single JSON file
with open(output_json, 'w', encoding='utf-8') as fout:
    json.dump(dataset_dict, fout, ensure_ascii=False, indent=4)

print(f"Reformatting complete. {len(reformatted_entries)} entries saved to '{output_json}'.")

Reformatting complete. 232 entries saved to 'formatted_data.json'.


In [19]:
from datasets import load_dataset

ds = load_dataset("wzebrowski/ft_reasoning_data", split="train")

Generating train split: 232 examples [00:00, 16293.20 examples/s]


In [20]:
ds

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 232
})

KeyError: "Column train not in the dataset. Current columns in the dataset: ['instruction', 'input', 'output']"