In [2]:
import os
import json
from tqdm import tqdm
from openai import OpenAI
import random

# Setup
os.environ["OPENAI_API_KEY"] = ""  # Replace with your key
openai_client = OpenAI()

# Load data
with open('./data-collection/data/chapter-data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Prompt templates
prompt_templates = [
    lambda paragraph: f"Reflect on the following passage and ask a thought-provoking question:\n\n{paragraph}",
    lambda paragraph: f"Based on the passage below, generate a question that encourages philosophical reflection:\n\n{paragraph}",
    lambda paragraph: f"Read this historical excerpt and pose a meaningful question that would challenge a modern reader:\n\n{paragraph}",
    lambda paragraph: f"Analyze the text and craft a question a curious student might ask:\n\n{paragraph}",
    lambda paragraph: f"Given the following text, what is a deep question that could arise from it?\n\n{paragraph}"
]

# OpenAI call wrapper
def query_chatgpt(prompt, model_name="gpt-4o-mini", max_tokens=300, temperature=0.5):
    try:
        completion = openai_client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a thoughtful assistant that analyzes historical texts and generates reflective questions and answers."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"❌ API Error: {e}")
        return None


In [3]:
# Main dataset generation
dataset = []
LIMIT = 2  # Testing cap

for entry in tqdm(data):
    book_title = entry.get("book_title", "")
    chapter_name = entry.get("chapter_name", "")
    paragraphs = entry.get("paragraphs", [])

    for paragraph in paragraphs:
        if len(dataset) >= LIMIT:
            break

        if len(paragraph.strip()) < 100:
            continue

        instruction = random.choice(prompt_templates)(paragraph)
        question = query_chatgpt(instruction)

        if not question:
            continue

        answer_prompt = (
            f"Based on the following text, answer the question in a reflective and thoughtful manner, "
            f"as if by the author:\n\nText:\n{paragraph}\n\nQuestion:\n{question}\n\nAnswer:"
        )
        answer = query_chatgpt(answer_prompt, max_tokens=400)

        if not answer:
            continue

        dataset.append({
            "input": f"{instruction}",
            "output": f"{question}\n\n{answer}"
        })

# Save to file
with open('instruction_tuning_dataset.jsonl', 'w') as f:
    for item in dataset:
        f.write(json.dumps(item) + "\n")

# Optional preview
print("\n📄 Preview of the dataset:")
for i, item in enumerate(dataset[:3]):
    print(f"\n🔹 Example {i+1}")
    print("Input:\n", item["input"][:500], "...")
    print("Output:\n", item["output"][:300], "...\n")

100%|██████████| 2601/2601 [00:18<00:00, 141.64it/s]


📄 Preview of the dataset:

🔹 Example 1
Input:
 Reflect on the following passage and ask a thought-provoking question:

The persons who understand this subject, call the substance which forms the first foundation of their combs, commosis,<@1> the next, pissoceros,<@2> and the third propolis;<@3> which last is placed between the other layers and the wax, and is remarkable for its utility in medicine.<@4> The commosis forms the first crust or layer, and has a bitter taste; and upon it is laid the pissoceros, a kind of thin wax, which acts as a  ...
Output:
 Reflective Question: How does the intricate composition of bee products, as described in this passage, illustrate the interconnectedness of nature and its applications in both ecological systems and human medicine?

Reflective Answer: The passage highlights the complex layering and specific function ...


🔹 Example 2
Input:
 Analyze the text and craft a question a curious student might ask:

In addition to this, the bees form collecti




In [5]:
print("\n📄 Preview of the dataset:")
for i, item in enumerate(dataset[:3]):
    print(f"\n🔹 Example {i+1}")
    print("Input:\n", item["input"][:500], "...")
    print("Output:\n", item["output"], "...\n")


📄 Preview of the dataset:

🔹 Example 1
Input:
 Reflect on the following passage and ask a thought-provoking question:

The persons who understand this subject, call the substance which forms the first foundation of their combs, commosis,<@1> the next, pissoceros,<@2> and the third propolis;<@3> which last is placed between the other layers and the wax, and is remarkable for its utility in medicine.<@4> The commosis forms the first crust or layer, and has a bitter taste; and upon it is laid the pissoceros, a kind of thin wax, which acts as a  ...
Output:
 Reflective Question: How does the intricate composition of bee products, as described in this passage, illustrate the interconnectedness of nature and its applications in both ecological systems and human medicine?

Reflective Answer: The passage highlights the complex layering and specific functions of substances like commosis, pissoceros, and propolis within the structure of honeycombs. This complexity not only showcases the bees' r