In [2]:
import os
import json
import random
import asyncio
import aiohttp
import re
from aiohttp import ClientSession

from openai import OpenAI

os.environ.update({
    'OPENAI_API_KEY': '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'
})

client = OpenAI()

# Function to read JSONL file and return a list of values for a specific key
def read_jsonl(file_path, key):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            json_obj = json.loads(line)
            data.append(json_obj[key])
    return data

def process_generated_prompts(file_path):
    prompts = []
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
            for prompt_obj in data["prompts"]:
                prompts.append(prompt_obj)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {file_path}")
    return prompts

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
API_ENDPOINT = "https://api.openai.com/v1/chat/completions"
MODEL = os.getenv("MODEL_NAME", "gpt-3.5-turbo-1106")
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "3800"))

# Generate Subject, Concept and Mediums

In [2]:
def create_prompt(concept, medium, subject):
    return (f"Concept: [{concept}] "
            f"Subjects to cover in prompt: [{subject}] "
            f"Subject specific topics: Cover topics that are unique to ways of "
            f"thinking [{concept}] regarding that specific subject. Prompt must"
            f"include thinking process, be explicit and have substance."
            f"Add persona and context to the prompt to make it subject specific. "
            f"Explain why the `prompt` is explicitly related to the context."
            f"Explain why the `prompt` is tailored for the subject."
            f"Explain why the `prompt` is explicitly an instruction about"
            f"writing/drafting a [{medium}]. ")


 
def create_system_prompt(concept, subject, medium):
    return (f"Write me a diverse list of "
            f"subject-specific concise prompts or questions or queries that are "
            f"around [{concept}] in [{subject}]."
            f"Prompt should contain instructions to write/draft one of these "
            f"mediums: [{medium}] "
            f'Your response should be JSON in the shape of {{}} '
            f'where each prompt has the shape keys for "subject_specific_topic"'
            f', "subject", "why_prompt_tailored_for_subject", "medium_keyword",'
            f'"why_prompt_concept", "why_prompt_contains_instruction_keyword",'
            f'"prompt"')

def call_openai_api(prompt, system_prompt, model, max_tokens):
    try:
        response = client.chat.completions.create(
                        model=model, 
                        response_format={ "type": "json_object" },
                        messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt}
                            ],
                        max_tokens=max_tokens)
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error in API call: {e}")
        return None


def generate_data(concepts, mediums, subjects, output_file, model="gpt-3.5-turbo-1106", max_tokens=3000):
     # Randomly select one from each category
    concept = random.choice(concepts)
    medium = random.choice(mediums)
    subject = random.choice(subjects)
    
    prompt = create_prompt(concept, medium, subject)
    system_prompt = create_system_prompt(concept, subject, medium)

    json_string = call_openai_api(prompt, system_prompt, model, max_tokens)
    if json_string:
        with open(output_file, 'a') as f:
            f.write(json_string + '\n')

    print(f"Data generation complete. Saved to '{output_file}'.")


# Paths and data loading
concepts = read_jsonl(r'C:\Users\eyliw\code\finetune\concepts.jsonl', "concept")
mediums = read_jsonl(r'C:\Users\eyliw\code\finetune\mediums.jsonl', "medium")
subjects = read_jsonl(r'C:\Users\eyliw\code\finetune\subjects.jsonl', "subject")


# Shuffle lists to add more randomness
random.shuffle(concepts)
random.shuffle(mediums)
random.shuffle(subjects)

output_file_path = 'base.json'
number_of_prompts_to_generate = 3  # Set the number of prompts you want to generate

for _ in range(number_of_prompts_to_generate):
    generate_data(concepts, mediums, subjects, output_file_path)
    print(f"{_+1}/{number_of_prompts_to_generate}")


Data generation complete. Saved to 'base.json'.
1
Data generation complete. Saved to 'base.json'.
2


# LOGICAL PUZZLE

In [8]:
def create_prompt(theme):
    return (f"Create a logical arrangement puzzle. "
            f"Choose a unique {theme} and define a set of objects related to it. "
            f"Write statements that describe their order or relationships, ensuring logical consistency. " 
            f"Formulate a question that asks for the correct order or relationship, and provide multiple-choice answers. "
            f"Ensure the puzzle is clear, solvable, and has only one correct answer. "
            f"For example, 'The following statements each describe a set of five objects arranged in a fixed order. The statements are logically consistent.\n- In an antique car show, there are five vehicles: a truck, a station wagon, a motorcyle, a convertible, and a hatchback.\n- The convertible is newer than the truck.\n- The station wagon is newer than the hatchback.\n- The convertible is older than the hatchback.\n- The station wagon is the second-newest.\n\nQ: Given the above statements, which of the following is correct?\nOptions:\n(A) The truck is the third-newest.\n(B) The station wagon is the third-newest.\n(C) The motorcyle is the third-newest.\n(D) The convertible is the third-newest.\n(E) The hatchback is the third-newest.'")


def create_system_prompt():
    return (f"Generate a variety of prompts that require the construction of logical puzzles. "
            f"These prompts should cover a diverse range of themes and challenge the user to create puzzles "
            f"that are unique and engaging. Each puzzle should include a set of objects, logically consistent statements about their arrangement, "
            f"and a question that leads to a single correct answer. The aim is to test logical thinking and problem-solving skills."
            f"Your response should be JSON in the shape of {{}} with the following structure: "
            f'"prompt": "<theme description>\\n- <statement 1>\\n- <statement 2>\\n...\\n\\nQ: <question> '
            f'\\nOptions:\\n(A) <option A>\\n(B) <option B>\\n(C) <option C>\\n(D) <option D>\\n(E) <option E>", '
            f'... for each puzzle. '
            f'Include a unique theme, followed by a series of logically consistent statements about the arrangement of objects related to the theme. '
            f'After the statements, present a clear question with multiple choice answers. '
            f'Each part should be separated by newline characters (`\\n`) for clarity. '
            f'The fields "why_prompt_contains_instruction_keyword" and "why_options_choice" must be consistent with the example provided. '
            f'Regularly review your output against this template to ensure correct formatting and maintain the integrity of the puzzles.')

async def call_openai_api(prompt, system_prompt, model, max_tokens):
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(API_ENDPOINT, headers=headers, json=payload) as response:
            if response.status != 200:
                print(f"Error in API call: {response.status}")
                return None
            return await response.json()

async def generate_data(themes, output_file, model="gpt-3.5-turbo-1106", max_tokens=3800):
    theme = random.choice(themes)

    prompt = create_prompt(theme)
    system_prompt = create_system_prompt()

    response = await call_openai_api(prompt, system_prompt, model, max_tokens)
    if response and 'choices' in response and len(response['choices']) > 0:
        content = response['choices'][0]['message']['content']
        try:
            # Try to directly parse the content as JSON
            parsed_content = json.loads(content)
        except json.JSONDecodeError:
            # If direct parsing fails, try extracting JSON from code block
            try:
                json_part = re.search(r'```json\n(\{.*\})\n```', content, re.DOTALL)
                if json_part:
                    parsed_content = json.loads(json_part.group(1))
                else:
                    print(f"Unable to extract JSON from response: {content}")
                    return
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON response: {e}")
                return

        with open(output_file, 'a') as f:
            json_string = json.dumps(parsed_content, indent=2)
            f.write(json_string + '\n')
        print(f"Data generation complete. Saved to '{output_file}'.")
    else:
        print(f"Invalid response format or empty 'choices' in response.")



themes = read_jsonl(r'C:\Users\eyliw\code\finetune\themes.jsonl', "theme")

random.shuffle(themes)

async def main():
    output_file_path = 'logical_puzzle.json'
    number_of_prompts_to_generate = 20

    tasks = [generate_data(themes, output_file_path) for _ in range(number_of_prompts_to_generate)]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    if loop.is_running():
        loop.create_task(main())
    else:
        asyncio.run(main())


Data generation complete. Saved to 'logical_puzzle.json'.
Data generation complete. Saved to 'logical_puzzle.json'.
Unable to extract JSON from response: {
"prompt": "The following statements each describe a set of five objects arranged in a fixed order. The statements are logically consistent.\n- In a blockchain conference, there are five cryptocurrencies: Bitcoin, Ethereum, Ripple, Litecoin, and Cardano.\n- Bitcoin is the oldest cryptocurrency among the five.\n- Ripple is older than Litecoin.\n- Ethereum is newer than Ripple.\n- Cardano is the second-newest cryptocurrency.

Q: Given the above statements, which of the following is correct?
Options:
(A) Bitcoin is the third-oldest.
(B) Ethereum is the third-oldest.
(C) Ripple is the third-oldest.
(D) Litecoin is the third-oldest.
(E) Cardano is the third-oldest.",
"why_prompt_contains_instruction_keyword": "The prompt contains the instruction keyword by clearly providing the instructions for creating a logical arrangement puzzle.",
"wh

Task exception was never retrieved
future: <Task finished name='Task-160' coro=<main() done, defined at C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\650518064.py:85> exception=TimeoutError()>
Traceback (most recent call last):
  File "C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\650518064.py", line 90, in main
    await asyncio.gather(*tasks)
  File "C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\650518064.py", line 53, in generate_data
    response = await call_openai_api(prompt, system_prompt, model, max_tokens)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\650518064.py", line 41, in call_openai_api
    async with session.post(API_ENDPOINT, headers=headers, json=payload) as response:
  File "c:\Users\eyliw\anaconda3\Lib\site-packages\aiohttp\client.py", line 1141, in __aenter__
    self._resp = await self._coro
                 ^^^^^^^^^^^^^^^^
  File "c:\Users\eyliw\anaconda3\Lib\site

# Logical Puzzle 2

In [4]:
def create_prompt(scenario, attribute):
    return (f"Create a logical arrangement puzzle based on the scenario '{scenario}' and attribute '{attribute}'. "
            f"Formulate statements and a question that asks for the correct order or relationship of entities, "
            f"ensuring the puzzle is clear, solvable, and has only one correct answer."
            f"For example: 'To get into class, the headmistress of the boarding school requires the girls to line up in ascending order of height. Alphonsine is taller than Marie, but shorter than Camille. Madeleine is taller than Suzanne and Camille. Marie is taller than Suzanne, but shorter than Madeleine. \nIn what order will the girls be arranged?'")


def create_system_prompt():
    return (f"Each puzzle must involve a set of entities, logically consistent statements about their arrangement, "
            f"and a question that leads to a single correct answer. "
            f"The aim is to test logical thinking and problem-solving skills. "
            f"Your response should be JSON in the shape of {{}} with the following structure: "
            f'"prompt": "<scenario description>\\n- <statement 1>\\n- <statement 2>\\n...\\n\\nQ: <question> '
            f'\\nA) <option A>\\nB) <option B>\\nC) <option C>\\nD) <option D>\\nE) <option E>", '
            f'... for each puzzle. '
            f'Include a scenario description, followed by a series of logically consistent statements about the arrangement of objects or entities. '
            f'After the statements, present a clear question with multiple choice answers. '
            f'Each part should be separated by newline characters (`\\n`) for clarity. '
            f'Ensure each puzzle is unique and tests logical thinking and problem-solving skills.')

async def call_openai_api(prompt, system_prompt, model, max_tokens):
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(API_ENDPOINT, headers=headers, json=payload) as response:
            if response.status != 200:
                print(f"Error in API call: {response.status}")
                return None
            return await response.json()

async def generate_data(scenarios, attributes, output_file, model="gpt-3.5-turbo-1106", max_tokens=3800):
    scenario = random.choice(scenarios)
    attribute = random.choice(attributes)

    prompt = create_prompt(scenario, attribute)
    system_prompt = create_system_prompt()

    response = await call_openai_api(prompt, system_prompt, model, max_tokens)
    if response and 'choices' in response and len(response['choices']) > 0:
        content = response['choices'][0]['message']['content']
        try:
            # Try to directly parse the content as JSON
            parsed_content = json.loads(content)
        except json.JSONDecodeError:
            # If direct parsing fails, try extracting JSON from code block
            try:
                json_part = re.search(r'```json\n(\{.*\})\n```', content, re.DOTALL)
                if json_part:
                    parsed_content = json.loads(json_part.group(1))
                else:
                    print(f"Unable to extract JSON from response: {content}")
                    return
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON response: {e}")
                return

        with open(output_file, 'a') as f:
            json_string = json.dumps(parsed_content, indent=2)
            f.write(json_string + '\n')
        print(f"Data generation complete. Saved to '{output_file}'.")
    else:
        print(f"Invalid response format or empty 'choices' in response.")

# Paths and data loading
attributes = read_jsonl(r'C:\Users\eyliw\code\finetune\attributes.jsonl', "attribute")
scenarios = read_jsonl(r'C:\Users\eyliw\code\finetune\scenarios.jsonl', "scenario")
themes = read_jsonl(r'C:\Users\eyliw\code\finetune\themes.jsonl', "theme")
output_file_path = 'logical_puzzle.json'

# Shuffle lists to add more randomness
random.shuffle(attributes)
random.shuffle(scenarios)
random.shuffle(themes)

async def main():
    output_file_path = 'logical_puzzle.json'
    number_of_prompts_to_generate = 10

    tasks = [generate_data(scenarios, attributes, output_file_path) for _ in range(number_of_prompts_to_generate)]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    if loop.is_running():
        loop.create_task(main())
    else:
        asyncio.run(main())

Error parsing JSON response: Expecting property name enclosed in double quotes: line 3 column 1 (char 415)
Data generation complete. Saved to 'logical_puzzle.json'.
Data generation complete. Saved to 'logical_puzzle.json'.
Data generation complete. Saved to 'logical_puzzle.json'.
Error parsing JSON response: Expecting ':' delimiter: line 4 column 1 (char 373)
Data generation complete. Saved to 'logical_puzzle.json'.
Data generation complete. Saved to 'logical_puzzle.json'.
Data generation complete. Saved to 'logical_puzzle.json'.
Error parsing JSON response: Expecting ':' delimiter: line 8 column 51 (char 473)


# Simple Puzzle

In [6]:
def create_prompt():
    return (f"example : 'Germaine and Raymonde picked 90 potatoes. Germaine picked 14 less than Raymonde. How many did they each pick?'"
            f"Please create a series of math complexe puzzles similar to the Germaine and Raymonde potato picking puzzle. "
            f"Each puzzle should involve multiples characters engaging in an activity where they collectively achieve a quantifiable result. "
            f"The relationship between their individual contributions should be similar to the original puzzle. "
            f"Ensure the puzzle is clear, solvable, and has only one correct answer. "
            f"Vary the characters, activities, and numerical values for each puzzle. I'd like a large number of these puzzles, each unique in its scenario and numbers.")

def create_system_prompt():
    return (f'Your response should be JSON in the shape of {{}} with the following structure: '
            f'"input": "<puzzle description>", "output": "<puzzle solution>", '
            f'"justification": "<explanation of solution>", "thinking_process": "<detailed thought process>", '
            f'... for each puzzle. '
            f'The "input" should contain the text of the math puzzle, describing a scenario with two characters, their activity, and the numerical relationship of their contributions. '
            f'The "output" should hold the correct numerical answer. '
            f'The "justification" should explain the reasoning or method used to arrive at the solution. '
            f'The "thinking_process" should detail the step-by-step thought process behind solving the puzzle, including identifying variables, setting up equations, and the logical reasoning used.')

async def call_openai_api(prompt, system_prompt, model, max_tokens):
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens
    }

    async with ClientSession() as session:
        async with session.post(API_ENDPOINT, headers=headers, json=payload) as response:
            if response.status != 200:
                print(f"Error in API call: {response.status}")
                return None
            return await response.json()

async def generate_data(output_file, model="gpt-3.5-turbo-1106", max_tokens=3800):
    prompt = create_prompt()
    system_prompt = create_system_prompt()

    response = await call_openai_api(prompt, system_prompt, model, max_tokens)
    if response and 'choices' in response and len(response['choices']) > 0:
        content = response['choices'][0]['message']['content']
        try:
            # Try to directly parse the content as JSON
            parsed_content = json.loads(content)
        except json.JSONDecodeError:
            # If direct parsing fails, try extracting JSON from code block
            try:
                json_part = re.search(r'```json\n(\{.*\})\n```', content, re.DOTALL)
                if json_part:
                    parsed_content = json.loads(json_part.group(1))
                else:
                    print(f"Unable to extract JSON from response: {content}")
                    return
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON response: {e}")
                return

        with open(output_file, 'a') as f:
            json_string = json.dumps(parsed_content, indent=2)
            f.write(json_string + '\n')
        print(f"Data generation complete. Saved to '{output_file}'.")
    else:
        print(f"Invalid response format or empty 'choices' in response.")

async def main():
    output_file_path = 'P2.json'
    number_of_prompts_to_generate = 5

    tasks = [generate_data(output_file_path) for _ in range(number_of_prompts_to_generate)]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    if loop.is_running():
        loop.create_task(main())
    else:
        asyncio.run(main())

Data generation complete. Saved to 'P2.json'.
Data generation complete. Saved to 'P2.json'.
Data generation complete. Saved to 'P2.json'.
Error parsing JSON response: Extra data: line 6 column 2 (char 692)
Unable to extract JSON from response: {
  "input": "Bryan and Maria baked 72 cakes. Bryan baked 8 more cakes than Maria. How many cakes did each bake?",
  "output": "Bryan: 40 cakes, Maria: 32 cakes",
  "justification": "Bryan baked 8 more cakes than Maria, so if we let Maria's number of cakes be x, then Bryan's number of cakes would be x + 8. The total number of cakes they baked is 72, so x + (x + 8) = 72. Solving for x gives x = 32, so Maria baked 32 cakes and Bryan baked 40 cakes.",
  "thinking_process": "Let's set up the equation: x + (x + 8) = 72, where x is the number of cakes Maria baked and (x + 8) is the number of cakes Bryan baked. Solve for x to find the number of cakes each of them baked."
},
{
  "input": "Lucas and Sofia sold 140 concert tickets. Lucas sold 10 more ticke

Task exception was never retrieved
future: <Task finished name='Task-36' coro=<main() done, defined at C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\2902887193.py:87> exception=TimeoutError()>
Traceback (most recent call last):
  File "C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\2902887193.py", line 92, in main
    await asyncio.gather(*tasks)
  File "C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\2902887193.py", line 50, in generate_data
    response = await call_openai_api(prompt, system_prompt, model, max_tokens)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\2902887193.py", line 37, in call_openai_api
    async with session.post(API_ENDPOINT, headers=headers, json=payload) as response:
  File "c:\Users\eyliw\anaconda3\Lib\site-packages\aiohttp\client.py", line 1141, in __aenter__
    self._resp = await self._coro
                 ^^^^^^^^^^^^^^^^
  File "c:\Users\eyliw\anaconda3\Lib\s

# Shuffle_item

In [10]:
def create_prompt():
    return (f"Example: 'Alice, Bob, and Claire are friends and avid readers who occasionally trade books. "
            f"At the start of the semester, they each buy one new book: Alice gets The Great Gatsby, "
            f"Bob gets Ulysses, and Claire gets Moby Dick. As the semester proceeds, they start trading around the new books. "
            f"First, Alice and Claire swap books. Then, Bob and Claire swap books. Finally, Alice and Claire swap books. "
            f"At the end of the semester, Alice has\nOptions:\n(A) The Great Gatsby.\n(B) Ulysses.\n(C) Moby Dick.' "
            f"Please create a series of puzzles where characters trade or swap items in a sequence. "
            f"Each puzzle should involve multiple characters and items, with a series of trades leading to a final question about the ownership or position of a specific item. "
            f"Ensure the puzzle is clear, solvable, and has only one correct answer. "
            f"Vary the characters, items, and trading sequences for each puzzle. I'd like a large number of these puzzles, each unique in its scenario.")

def create_system_prompt():
    return (f'Your response should be JSON in the shape of {{}} with the following structure: '
            f'"input": "<puzzle description>", "output": "<puzzle solution>", '
            f'"justification": "<explanation of solution>", "thinking_process": "<detailed thought process>", '
            f'... for each puzzle. '
            f'The "input" should contain the text of the puzzle, describing a scenario with multiple characters, their items, and the sequence of trades. '
            f'The "output" should specify which item a particular character ends up with. '
            f'The "justification" should explain the logical steps to determine the final ownership of the item. '
            f'The "thinking_process" should detail the step-by-step thought process behind solving the puzzle, including tracking the trades, and the logical reasoning used.')

async def call_openai_api(prompt, system_prompt, model, max_tokens):
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens
    }

    async with ClientSession() as session:
        async with session.post(API_ENDPOINT, headers=headers, json=payload) as response:
            if response.status != 200:
                print(f"Error in API call: {response.status}")
                return None
            return await response.json()

async def generate_data(output_file, model="gpt-3.5-turbo-1106", max_tokens=3800):
    prompt = create_prompt()
    system_prompt = create_system_prompt()

    response = await call_openai_api(prompt, system_prompt, model, max_tokens)
    if response and 'choices' in response and len(response['choices']) > 0:
        content = response['choices'][0]['message']['content']
        try:
            # Try to directly parse the content as JSON
            parsed_content = json.loads(content)
        except json.JSONDecodeError:
            # If direct parsing fails, try extracting JSON from code block
            try:
                json_part = re.search(r'```json\n(\{.*\})\n```', content, re.DOTALL)
                if json_part:
                    parsed_content = json.loads(json_part.group(1))
                else:
                    print(f"Unable to extract JSON from response: {content}")
                    return
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON response: {e}")
                return

        with open(output_file, 'a') as f:
            json_string = json.dumps(parsed_content, indent=2)
            f.write(json_string + '\n')
        print(f"Data generation complete. Saved to '{output_file}'.")
    else:
        print(f"Invalid response format or empty 'choices' in response.")

async def main():
    output_file_path = 'P2.json'
    number_of_prompts_to_generate = 5

    tasks = [generate_data(output_file_path) for _ in range(number_of_prompts_to_generate)]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    if loop.is_running():
        loop.create_task(main())
    else:
        asyncio.run(main())

Data generation complete. Saved to 'P2.json'.
Data generation complete. Saved to 'P2.json'.
Unable to extract JSON from response: {
  "input": "Alice, Bob, and Claire are friends and passionate coffee drinkers. They each start with a different type of coffee: Alice has Espresso, Bob has Latte, and Claire has Cappuccino. They decide to trade their coffees in the following sequence: Alice and Bob swap coffees, then Bob and Claire swap coffees, and finally Alice and Claire swap coffees. At the end of the trading sequence, who has the Espresso?",
  "output": "Claire",
  "justification": "At the start, Alice has Espresso, Bob has Latte, and Claire has Cappuccino. After the sequence of trades, Alice gives her Espresso to Bob and receives Claire's Cappuccino. Then Bob gives the Espresso to Claire and receives her Cappuccino. Finally, Alice receives the Cappuccino from Bob, while Claire ends up with the Espresso.",
  "thinking_process": "At the beginning, the initial ownership of the coffees i

Task exception was never retrieved
future: <Task finished name='Task-221' coro=<main() done, defined at C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\4109212896.py:74> exception=TimeoutError()>
Traceback (most recent call last):
  File "C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\4109212896.py", line 79, in main
    await asyncio.gather(*tasks)
  File "C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\4109212896.py", line 48, in generate_data
    response = await call_openai_api(prompt, system_prompt, model, max_tokens)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\eyliw\AppData\Local\Temp\ipykernel_2172\4109212896.py", line 38, in call_openai_api
    async with session.post(API_ENDPOINT, headers=headers, json=payload) as response:
  File "c:\Users\eyliw\anaconda3\Lib\site-packages\aiohttp\client.py", line 1141, in __aenter__
    self._resp = await self._coro
                 ^^^^^^^^^^^^^^^^
  File "c:\Users\eyliw\anaconda3\Lib\

In [12]:
def create_math_expression():
    operators = ['+', '-', '*', '/']
    numbers = [str(random.randint(-10, 10)) for _ in range(3)]
    inner_expression = f"({numbers[0]} {random.choice(operators)} {numbers[1]} {random.choice(operators)} {numbers[2]})"
    
    numbers = [str(random.randint(-10, 10)) for _ in range(3)]
    outer_expression = f"({numbers[0]} {random.choice(operators)} {numbers[1]} {random.choice(operators)} {numbers[2]})"
    
    final_expression = f"(({inner_expression}) {random.choice(operators)} ({outer_expression})) ="
    return final_expression

def create_prompt():
    math_puzzle = create_math_expression()
    return (f"example : '{math_puzzle}'"
            f"Please create a series of complex math puzzles similar to the example. "
            f"Each puzzle should be a single line arithmetic expression involving a combination of operations (addition, subtraction, multiplication, division) "
            f"and grouping with parentheses. "
            f"Ensure the puzzle is clear, solvable, and each has a unique structure.")


async def call_openai_api(prompt, system_prompt, model, max_tokens):
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens
    }

    async with ClientSession() as session:
        async with session.post(API_ENDPOINT, headers=headers, json=payload) as response:
            if response.status != 200:
                print(f"Error in API call: {response.status}")
                return None
            return await response.json()

async def generate_data(output_file, model="gpt-3.5-turbo-1106", max_tokens=3800):
    prompt = create_prompt()
    system_prompt = create_system_prompt()

    response = await call_openai_api(prompt, system_prompt, model, max_tokens)
    if response and 'choices' in response and len(response['choices']) > 0:
        content = response['choices'][0]['message']['content']
        try:
            # Try to directly parse the content as JSON
            parsed_content = json.loads(content)
        except json.JSONDecodeError:
            # If direct parsing fails, try extracting JSON from code block
            try:
                json_part = re.search(r'```json\n(\{.*\})\n```', content, re.DOTALL)
                if json_part:
                    parsed_content = json.loads(json_part.group(1))
                else:
                    print(f"Unable to extract JSON from response: {content}")
                    return
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON response: {e}")
                return

        with open(output_file, 'a') as f:
            json_string = json.dumps(parsed_content, indent=2)
            f.write(json_string + '\n')
        print(f"Data generation complete. Saved to '{output_file}'.")
    else:
        print(f"Invalid response format or empty 'choices' in response.")

async def main():
    output_file_path = 'P2.json'
    number_of_prompts_to_generate = 5

    tasks = [generate_data(output_file_path) for _ in range(number_of_prompts_to_generate)]
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    if loop.is_running():
        loop.create_task(main())
    else:
        asyncio.run(main())

Unable to extract JSON from response: {
  "input": "(((8 / -4 / -2)) - ((-6 * -2 / -3))) =",
  "output": "2",
  "justification": "After solving the equation, the output shows up as 2",
  "thinking_process": "First solve the innermost parentheses. Begin by solving the equation within the parentheses, and then doing the operations in order from left to right."
},
{
  "input": "((((12 - 3) * 2) / (18 - 6)) + ((6 / 3) * 5)) =",
  "output": "11",
  "justification": "After solving the equation, the output shows up as 11",
  "thinking_process": "First solve the innermost parentheses. Division and multiplication should be resolved before addition and subtraction."
},
{
  "input": "(((7 / -10 / -10)) - ((-7 * -2 / -3))) =",
  "output": "-3",
  "justification": "After doing the calculation, the output shows up as -3",
  "thinking_process": "First solve the innermost parentheses. Division should perform before subtraction."
},
{
  "input": "((15 / (-2 * -3)) + (25 - (7 * 2))) =",
  "output": "13"

# Generate Q-A

In [20]:

def create_prompt(prompt_obj):
    return (f'input: {prompt_obj["prompt"]}\n\n'
            f'Task: Understand the input, identify its direct instruction. '
            f'Based on this understanding, use the full input to formulate a response. '
            f'This response is your output. '
            f'Justify your reasoning and explain the thinking process behind generating the output. '
            f'Format your response as JSON with the keys "input", "output", "justification", "thinking_process". '
            f'Ensure that the response adheres strictly to this structure, without any additional fields.')

def create_system_prompt():
    return (f'Your task is to analyze and respond to the provided prompt. '
            f'The response should be structured as JSON with the following keys: '
            f'1. "input": <introduction> \\n<statemtents> \\n<Q:> \\n<options>. '
            f'2. "output": Your direct response to the "input", formatted as text. '
            f'3. "justification": Explain the reasoning behind your response. '
            f'4. "thinking_process": Detail the thought process that led to the generation of the output. '
            f'Ensure that each of these fields is provided and that your response strictly adheres to this structure. '
            f'Do not include any additional fields.')

async def call_openai_api(prompt, system_prompt):
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": MAX_TOKENS
    }

    async with ClientSession() as session:
        async with session.post(API_ENDPOINT, headers=headers, json=payload) as response:
            if response.status != 200:
                # Implement retry logic with exponential backoff here if needed
                return None
            return await response.json()

async def generate_data_async(prompt_obj, output_file):
    prompt = create_prompt(prompt_obj)
    system_prompt = create_system_prompt()

    response = await call_openai_api(prompt, system_prompt)
    if response:
        # Check if the response contains the expected keys
        if 'choices' in response and len(response['choices']) > 0:
            content = response['choices'][0]['message']['content']
            try:
                # Parse the JSON string into a Python dictionary
                parsed_content = json.loads(content)
                with open(output_file, 'a') as f:
                    # Write the parsed content as a JSON string with proper formatting
                    json_string = json.dumps(parsed_content, indent=2)
                    f.write(json_string + '\n')

                print(f"Data generation complete. Saved to '{output_file}'.")
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON response: {e}")
        else:
            print(f"Invalid response format: {response}")

async def main():
    prompts = process_generated_prompts('logical_puzzle.json')
    output_file_path = 'P2.json'
    number_of_prompts_to_generate = 6

    tasks = [generate_data_async(random.choice(prompts), output_file_path) for _ in range(number_of_prompts_to_generate)]
    await asyncio.gather(*tasks)

# This checks if there's an existing event loop and uses it instead of creating a new one.
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    if loop.is_running():
        # If the loop is already running, just add the main coroutine to the loop.
        loop.create_task(main())
    else:
        # Otherwise, use asyncio.run (for standalone scripts)
        asyncio.run(main())



Data generation complete. Saved to 'P2.json'.
Error parsing JSON response: Expecting value: line 1 column 1 (char 0)
Error parsing JSON response: Expecting value: line 1 column 1 (char 0)
Error parsing JSON response: Expecting value: line 1 column 1 (char 0)
Data generation complete. Saved to 'P2.json'.
Data generation complete. Saved to 'P2.json'.


# Generating data from P2

In [6]:
import json

# Function to read JSON file and return data
def read_json_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def create_jsonl(json_data, output_file):
    with open(output_file, 'w') as file:
        for item in json_data.get('P2', []):  # Safely get 'P2' key
            output = item.get('output', '')  # Safely get 'output' key, default to empty string if not found
            thinking_process = item.get('thinking_process', '')  # Safely get 'thinking_process'

            if thinking_process:
                output += "\nThinking Process: " + thinking_process

            jsonl_obj = {
                "instruction": item.get('instruction', ''), 
                "input": item.get('input', ''),  # Safely get 'input'
                "output": output
            }

            file.write(json.dumps(jsonl_obj) + '\n')


# Path to the original JSON file and the output JSONL file
input_file_name = 'P2.json'
output_file_name = 'output.jsonl'

# Read data from the JSON file
json_data = read_json_file(input_file_name)

# Create the JSONL file
create_jsonl(json_data, output_file_name)
