In [None]:
from tqdm import tqdm
from datasets import load_dataset
import pandas as pd
from collections import deque
import pickle
import json
import os
import random

## Vocab seed

In [None]:
# Load the English Age-of-Acquisition norms from the Hugging Face Hub, keep the
# four columns used below, and leave one complete row per (lemma, AoA) pair.
aoa_columns = ["Dom_PoS_SUBTLEX", "Lemma_highest_PoS", "AoA_Kup_lem", "Perc_known_lem"]
aoa = (
    pd.DataFrame(load_dataset("StephanAkkerman/English-Age-of-Acquisition", split="train"))
    [aoa_columns]
    .drop_duplicates(subset=["Lemma_highest_PoS", "AoA_Kup_lem"])
    .dropna()
    .reset_index(drop=True)
)
# Number of rows with AoA_Kup_lem <= 14, i.e. the pool the ten stages are cut from.
int((aoa['AoA_Kup_lem'] <= 14).sum())

In [None]:
# Bucket the lemmas by AoA_Kup_lem: stage 0 takes everything <= 5, and stage k
# (k = 1..9) takes the half-open interval (k + 4, k + 5].
aoa_age = aoa['AoA_Kup_lem']
stage_0 = aoa[aoa_age <= 5].reset_index(drop=True)
stage_1, stage_2, stage_3, stage_4, stage_5, stage_6, stage_7, stage_8, stage_9 = (
    aoa[(aoa_age > upper - 1) & (aoa_age <= upper)].reset_index(drop=True)
    for upper in range(6, 15)
)

In [None]:
def get_stage_words(stage_df):
    """Select a stage's content words, best-known first.

    Keeps only rows whose ``Dom_PoS_SUBTLEX`` is Noun, Verb, Adjective or
    Adverb, orders them by ``Perc_known_lem`` in descending order and renumbers
    the index from 0.

    Args:
        stage_df: DataFrame with at least the columns ``Dom_PoS_SUBTLEX``,
            ``Lemma_highest_PoS`` and ``Perc_known_lem``.

    Returns:
        A ``(stage_words, filtered_df)`` pair: ``stage_words`` is a list of
        ``(lemma, part_of_speech)`` tuples in the same order as the rows of
        ``filtered_df``, the filtered and sorted frame.
    """
    content_pos = ['Noun', 'Verb', 'Adjective', 'Adverb']
    is_content_word = stage_df['Dom_PoS_SUBTLEX'].isin(content_pos)
    filtered_df = (
        stage_df[is_content_word]
        .sort_values(by=["Perc_known_lem"], ascending=False)
        .reset_index(drop=True)
    )
    stage_words = [
        (lemma, pos)
        for lemma, pos in zip(filtered_df['Lemma_highest_PoS'], filtered_df['Dom_PoS_SUBTLEX'])
    ]
    return stage_words, filtered_df

In [None]:
# Run every stage frame through get_stage_words once, then fan the results out
# into the per-stage names. words_<i> must exist at module level because
# save_seeds_and_leftovers looks them up via globals().
_stage_results = [
    get_stage_words(stage_df)
    for stage_df in (stage_0, stage_1, stage_2, stage_3, stage_4,
                     stage_5, stage_6, stage_7, stage_8, stage_9)
]
(words_0, words_1, words_2, words_3, words_4,
 words_5, words_6, words_7, words_8, words_9) = (words for words, _ in _stage_results)
(df_0, df_1, df_2, df_3, df_4,
 df_5, df_6, df_7, df_8, df_9) = (df for _, df in _stage_results)

In [None]:
# Create the {seed,raw}/{instruct,context} folders for every stage if missing.
# NOTE(review): this tree (data_unorganized) is not referenced by any other
# cell visible here — confirm it is still needed.
for i in range(10):
    stage_root = f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{i}"
    for phase in ("seed", "raw"):
        for task in ("instruct", "context"):
            os.makedirs(f"{stage_root}/{phase}/{task}/", exist_ok=True)


In [None]:
def save_seeds_and_leftovers(save_dir, num_seeds, max_words=1000):
    """Pickle a capped, reproducible sample of each stage's seed words.

    For every stage index ``i`` in ``range(num_seeds)`` the module-level list
    ``words_{i}`` is looked up, reduced to at most ``max_words`` entries and
    written to ``f"{save_dir}{i}/seed_words_limited.pkl"``. The target
    directory is created if it does not exist yet.

    Despite the function name, only the sampled seed words are written; the
    words left out of the sample are not saved anywhere.

    Args:
        save_dir: Path prefix the stage index is appended to, e.g.
            ``"./CurLL_data/stages/stage"``.
        num_seeds: Number of stages to process, starting at stage 0.
        max_words: Maximum number of words kept per stage. Lists that are
            already this short are saved unchanged, in their original order.

    Raises:
        NameError: If ``words_{i}`` is not defined for a requested stage.
    """
    for i in range(num_seeds):
        words = globals().get(f"words_{i}")
        if words is None:
            raise NameError(
                f"words_{i} is not defined; build the stage word lists before saving seeds"
            )
        if len(words) > max_words:
            # A private RNG seeded with the stage index draws exactly the same
            # sample as random.seed(i) + random.sample, but leaves the global
            # random state untouched for the rest of the notebook.
            words = random.Random(i).sample(words, max_words)
        else:
            words = list(words)
        out_path = f"{save_dir}{i}/seed_words_limited.pkl"
        # The stage folder may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path, "wb") as f:
            pickle.dump(words, f)

In [None]:
# Write the sampled seed words of stages 0-9 to <save_dir><i>/seed_words_limited.pkl.
# NOTE(review): this prefix is relative to the current working directory, while
# the "Final seed" section reads the same files from
# /datadrive/pavan/az_storage/CurLL_data — presumably the notebook is run from
# /datadrive/pavan/az_storage; confirm.
save_dir = "./CurLL_data/stages/stage"
save_seeds_and_leftovers(save_dir, num_seeds=10)

## Final seed

In [None]:
dir_name = "/datadrive/pavan/az_storage/CurLL_data"

# Skill graph: for each indicator it holds all metadata needed for generation,
# except the words for the age group (those are loaded separately below).
graph_path = f"{dir_name}/skill_graph/graph_final.pkl"
with open(graph_path, "rb") as f:
    DG = pickle.load(f)

# Seed words of every stage, keyed by stage index 0-9.
seed_words = {}
for i in range(10):
    words_path = f"{dir_name}/stages/stage{i}/seed_words_limited.pkl"
    with open(words_path, "rb") as f:
        seed_words[i] = pickle.load(f)

In [None]:
# Build the context seed records: one record per
# (graph node, context template, seed word of the node's stage).
seed_data = {stage_idx: [] for stage_idx in range(10)}

for node in tqdm(DG.nodes()):
    node_attrs = DG.nodes[node]
    stage = node_attrs['stage']
    seed_data[stage].extend(
        {
            "id": node,
            "indicator": node_attrs['label'],
            "skill": node_attrs['skill'],
            "subskill": node_attrs['subskill'],
            "goal": node_attrs['goal'],
            "age_group": node_attrs['age_group'],
            "stage": node_attrs['stage'],
            "context_template": context,
            "word_list": word,
        }
        for context in node_attrs['context_templates']
        for word in seed_words[stage]
    )

In [None]:
# For every stage: make <root>/stage<i>/context/{seed,raw}, split the stage's
# seed records into 4 pickled chunks under seed/, and store the chunk sizes in
# seed/metadata_chunks.json.

root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"

for i in tqdm(range(10)):
    stage_folder = os.path.join(root_dir, f'stage{i}')
    context_dir = os.path.join(stage_folder, 'context')
    seed_dir = os.path.join(context_dir, 'seed')
    raw_dir = os.path.join(context_dir, 'raw')
    for folder in (seed_dir, raw_dir):
        os.makedirs(folder, exist_ok=True)

    num_chunks = 4
    entries = seed_data[i]
    chunk_size = len(entries) // num_chunks
    # Equal-sized slices for all but the last chunk, which also takes the
    # remainder of the integer division.
    boundaries = [j * chunk_size for j in range(num_chunks)] + [len(entries)]
    raw_chunks = [entries[lo:hi] for lo, hi in zip(boundaries, boundaries[1:])]

    chunk_metadata = {}
    for j, chunk in enumerate(raw_chunks):
        chunk_path = os.path.join(seed_dir, f'chunk_{j}.pkl')
        with open(chunk_path, 'wb') as f:
            pickle.dump(chunk, f)
        chunk_metadata[f"c_{j}"] = {"size": len(chunk)}

    # Chunk sizes live next to the chunks themselves.
    metadata_file = os.path.join(seed_dir, "metadata_chunks.json")
    with open(metadata_file, "w") as f:
        json.dump(chunk_metadata, f, indent=2)

    print(f"Stage {i} - Seed data saved with {len(seed_data[i])} entries and {num_chunks} chunks.")


In [None]:
# Build the instruct seed records: one record per
# (graph node, instruct template, seed word of the node's stage).
# This replaces the context seed_data built earlier.
seed_data = {}
for i in range(10):
    seed_data[i] = []

for node in tqdm(DG.nodes()):
    info = DG.nodes[node]
    stage = info['stage']
    seed_data[stage] += [
        {
            "id": node,
            "indicator": info['label'],
            "skill": info['skill'],
            "subskill": info['subskill'],
            "goal": info['goal'],
            "age_group": info['age_group'],
            "stage": info['stage'],
            "instruct_template": template,
            "word_list": word,
        }
        for template in info['ins_templates']
        for word in seed_words[stage]
    ]

In [None]:
# For every stage: make <root>/stage<i>/instruct/{seed,raw}, split the stage's
# seed records into 4 pickled chunks under seed/, and store the chunk sizes in
# seed/metadata_chunks.json.

root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"

for i in tqdm(range(10)):
    stage_folder = os.path.join(root_dir, f'stage{i}')
    instruct_dir = os.path.join(stage_folder, 'instruct')
    seed_dir, raw_dir = (os.path.join(instruct_dir, sub) for sub in ('seed', 'raw'))
    os.makedirs(seed_dir, exist_ok=True)
    os.makedirs(raw_dir, exist_ok=True)

    num_chunks = 4
    chunk_size = len(seed_data[i]) // num_chunks
    # The first num_chunks - 1 chunks have exactly chunk_size records ...
    raw_chunks = [
        seed_data[i][j * chunk_size:(j + 1) * chunk_size]
        for j in range(num_chunks - 1)
    ]
    # ... and the last one takes whatever is left.
    raw_chunks.append(seed_data[i][(num_chunks - 1) * chunk_size:])

    # Dump each chunk as its own pickle file.
    for j, chunk in enumerate(raw_chunks):
        with open(os.path.join(seed_dir, f'chunk_{j}.pkl'), 'wb') as f:
            pickle.dump(chunk, f)

    chunk_metadata = {
        f"c_{j}": {"size": len(chunk)}
        for j, chunk in enumerate(raw_chunks)
    }

    # Save metadata
    metadata_file = os.path.join(seed_dir, "metadata_chunks.json")
    with open(metadata_file, "w") as f:
        json.dump(chunk_metadata, f, indent=2)

    print(f"Stage {i} - Seed data saved with {len(seed_data[i])} entries and {num_chunks} chunks.")


In [None]:
# Cleaned context data for stages 0 and 1 (Pavankalyan/stage<N>_context_cleaned).
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access these datasets
d0, d1 = (
    load_dataset(repo_id)['train'].to_list()
    for repo_id in (
        "Pavankalyan/stage0_context_cleaned",
        "Pavankalyan/stage1_context_cleaned",
    )
)

# Keyed by stage index; replaces any seed_data built by earlier cells.
seed_data = {0: d0, 1: d1}

In [None]:
root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"

# Split the cleaned context data of every loaded stage into 4 pickled chunks
# under <root>/stage<i>/cqa/seed and record the chunk sizes next to them.
# Iterating over the keys of seed_data (instead of range(len(seed_data)))
# keeps the loop correct even if the stage indices do not start at 0.
for i in tqdm(sorted(seed_data)):
    stage_folder = os.path.join(root_dir, f'stage{i}')
    cqa_dir = os.path.join(stage_folder, 'cqa')
    seed_dir = os.path.join(cqa_dir, 'seed')
    raw_dir = os.path.join(cqa_dir, 'raw')
    # Create the output folders up front, as the context/instruct cells do;
    # without this the open() calls below fail on a fresh directory tree.
    os.makedirs(seed_dir, exist_ok=True)
    os.makedirs(raw_dir, exist_ok=True)
    num_chunks = 4
    chunk_size = len(seed_data[i]) // num_chunks
    raw_chunks = []
    # First (num_chunks - 1) chunks
    for j in range(num_chunks - 1):
        start_idx = j * chunk_size
        end_idx = start_idx + chunk_size
        raw_chunks.append(seed_data[i][start_idx:end_idx])

    # Last chunk also takes the remainder of the integer division
    start_idx = (num_chunks - 1) * chunk_size
    raw_chunks.append(seed_data[i][start_idx:])

    chunk_metadata = {}

    # Dump each chunk as a pickle file and remember its size
    for j, chunk in enumerate(raw_chunks):
        with open(os.path.join(seed_dir, f'chunk_{j}.pkl'), 'wb') as f:
            pickle.dump(chunk, f)

        chunk_metadata[f"c_{j}"] = {
            "size": len(chunk)
        }

    # Save metadata
    metadata_file = os.path.join(seed_dir, "metadata_chunks.json")
    with open(metadata_file, "w") as f:
        json.dump(chunk_metadata, f, indent=2)

    print(f"Stage {i} - Seed data saved with {len(seed_data[i])} entries and {num_chunks} chunks.")


In [None]:
root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"

# Split the cleaned context data of every loaded stage into 4 pickled chunks
# under <root>/stage<i>/csqa/seed and record the chunk sizes next to them.
# Iterating over the keys of seed_data (instead of range(len(seed_data)))
# keeps the loop correct even if the stage indices do not start at 0.
for i in tqdm(sorted(seed_data)):
    stage_folder = os.path.join(root_dir, f'stage{i}')
    csqa_dir = os.path.join(stage_folder, 'csqa')
    seed_dir = os.path.join(csqa_dir, 'seed')
    raw_dir = os.path.join(csqa_dir, 'raw')
    # No other cell creates the csqa folders, so create them here; without
    # this the open() calls below fail on a fresh directory tree.
    os.makedirs(seed_dir, exist_ok=True)
    os.makedirs(raw_dir, exist_ok=True)
    num_chunks = 4
    chunk_size = len(seed_data[i]) // num_chunks
    raw_chunks = []
    # First (num_chunks - 1) chunks
    for j in range(num_chunks - 1):
        start_idx = j * chunk_size
        end_idx = start_idx + chunk_size
        raw_chunks.append(seed_data[i][start_idx:end_idx])

    # Last chunk also takes the remainder of the integer division
    start_idx = (num_chunks - 1) * chunk_size
    raw_chunks.append(seed_data[i][start_idx:])

    chunk_metadata = {}

    # Dump each chunk as a pickle file and remember its size
    for j, chunk in enumerate(raw_chunks):
        with open(os.path.join(seed_dir, f'chunk_{j}.pkl'), 'wb') as f:
            pickle.dump(chunk, f)

        chunk_metadata[f"c_{j}"] = {
            "size": len(chunk)
        }

    # Save metadata
    metadata_file = os.path.join(seed_dir, "metadata_chunks.json")
    with open(metadata_file, "w") as f:
        json.dump(chunk_metadata, f, indent=2)

    print(f"Stage {i} - Seed data saved with {len(seed_data[i])} entries and {num_chunks} chunks.")


In [None]:
# Cleaned context data for stages 2-4 (Pavankalyan/context_cleaned_stage<N>).
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access these datasets
d0, d1, d2 = (
    load_dataset(repo_id)['train'].to_list()
    for repo_id in (
        "Pavankalyan/context_cleaned_stage2",
        "Pavankalyan/context_cleaned_stage3",
        "Pavankalyan/context_cleaned_stage4",
    )
)

# d0/d1/d2 hold stages 2/3/4 respectively; seed_data is keyed by the real
# stage index and replaces any seed_data built by earlier cells.
seed_data = {2: d0, 3: d1, 4: d2}

In [None]:
root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"

# Split the cleaned context data of stages 2-4 into 4 pickled chunks under
# <root>/stage<i>/cqa/seed and record the chunk sizes next to them.
for i in tqdm(range(2,5)):
    stage_folder = os.path.join(root_dir, f'stage{i}')
    cqa_dir = os.path.join(stage_folder, 'cqa')
    seed_dir = os.path.join(cqa_dir, 'seed')
    raw_dir = os.path.join(cqa_dir, 'raw')
    # Create the output folders up front, as the context/instruct cells do;
    # without this the open() calls below fail on a fresh directory tree.
    os.makedirs(seed_dir, exist_ok=True)
    os.makedirs(raw_dir, exist_ok=True)
    num_chunks = 4
    chunk_size = len(seed_data[i]) // num_chunks
    raw_chunks = []
    # First (num_chunks - 1) chunks
    for j in range(num_chunks - 1):
        start_idx = j * chunk_size
        end_idx = start_idx + chunk_size
        raw_chunks.append(seed_data[i][start_idx:end_idx])

    # Last chunk also takes the remainder of the integer division
    start_idx = (num_chunks - 1) * chunk_size
    raw_chunks.append(seed_data[i][start_idx:])

    chunk_metadata = {}

    # Dump each chunk as a pickle file and remember its size
    for j, chunk in enumerate(raw_chunks):
        with open(os.path.join(seed_dir, f'chunk_{j}.pkl'), 'wb') as f:
            pickle.dump(chunk, f)

        chunk_metadata[f"c_{j}"] = {
            "size": len(chunk)
        }

    # Save metadata
    metadata_file = os.path.join(seed_dir, "metadata_chunks.json")
    with open(metadata_file, "w") as f:
        json.dump(chunk_metadata, f, indent=2)

    print(f"Stage {i} - Seed data saved with {len(seed_data[i])} entries and {num_chunks} chunks.")


In [None]:
root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"

# Split the cleaned context data of stages 2-4 into 4 pickled chunks under
# <root>/stage<i>/csqa/seed and record the chunk sizes next to them.
for i in tqdm(range(2,5)):
    stage_folder = os.path.join(root_dir, f'stage{i}')
    csqa_dir = os.path.join(stage_folder, 'csqa')
    seed_dir = os.path.join(csqa_dir, 'seed')
    raw_dir = os.path.join(csqa_dir, 'raw')
    # No other cell creates the csqa folders, so create them here; without
    # this the open() calls below fail on a fresh directory tree.
    os.makedirs(seed_dir, exist_ok=True)
    os.makedirs(raw_dir, exist_ok=True)
    num_chunks = 4
    chunk_size = len(seed_data[i]) // num_chunks
    raw_chunks = []
    # First (num_chunks - 1) chunks
    for j in range(num_chunks - 1):
        start_idx = j * chunk_size
        end_idx = start_idx + chunk_size
        raw_chunks.append(seed_data[i][start_idx:end_idx])

    # Last chunk also takes the remainder of the integer division
    start_idx = (num_chunks - 1) * chunk_size
    raw_chunks.append(seed_data[i][start_idx:])

    chunk_metadata = {}

    # Dump each chunk as a pickle file and remember its size
    for j, chunk in enumerate(raw_chunks):
        with open(os.path.join(seed_dir, f'chunk_{j}.pkl'), 'wb') as f:
            pickle.dump(chunk, f)

        chunk_metadata[f"c_{j}"] = {
            "size": len(chunk)
        }

    # Save metadata
    metadata_file = os.path.join(seed_dir, "metadata_chunks.json")
    with open(metadata_file, "w") as f:
        json.dump(chunk_metadata, f, indent=2)

    print(f"Stage {i} - Seed data saved with {len(seed_data[i])} entries and {num_chunks} chunks.")


## Prompts

### Context

In [None]:
prompt = {
    "system": "You are an AI model generating training data to help language models simulate human developmental skills at various stages from early childhood through early adolescence.\n\nYour task is to create engaging, developmentally appropriate texts based on provided developmental indicators, skills, and a tuple of word and its part of speech.\n\nStrictly follow these guidelines:\n\n1. **Developmental Appropriateness:**\n   - Stage 0 (Age 5): Use simple sentences, concrete concepts, familiar experiences, present tense focus\n   - Stages 1-3 (Ages 6-8): Introduce basic past/future concepts, simple cause-effect, familiar settings\n   - Stages 4-6 (Ages 9-11): Include more complex reasoning, abstract thinking, varied sentence structures\n   - Stages 7-9 (Ages 12-14): Incorporate hypothetical scenarios, multiple perspectives, sophisticated vocabulary\n\n2. **Context Generation:**\n   - Use the provided word and its part of speech to create a meaningful, developmentally appropriate topic\n   - **Ensure the selected word and expanded topic fit the required Text Type Template (context_template)**\n   - Expand the selected word into a more detailed, skill-aligned topic that resonates with the target age group\n   - Generate a rich, complete, and engaging text matching the provided context template\n   - The generated text must be **between 250 and 500 words regardless of developmental stage**\n   - The text must clearly align with the skill, subskill, goal, and indicator\n   - The selected word does not need to explicitly appear in the final text\n\n3. 
**Writing Style by Stage:**\n   - **Early Stages (0-3):** Simple vocabulary, short to medium sentences, concrete experiences, repetitive patterns for reinforcement\n   - **Middle Stages (4-6):** More varied vocabulary, complex sentences, introduction of abstract concepts, problem-solving scenarios\n   - **Later Stages (7-9):** Sophisticated vocabulary, complex sentence structures, abstract reasoning, multiple viewpoints\n\n4. **Content Enrichment:**\n   - Include age-appropriate actions, feelings, interactions, and sensory details\n   - Incorporate social situations relevant to the developmental stage\n   - Use scenarios that promote the specific skill being targeted\n   - Avoid overly abstract or culturally specific references unless appropriate for the age group\n\n5. **Output Format:** Strictly return the output in the following JSON structure:\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"generated_text\": \"<generated text between 250 and 500 words>\"\n}}\n```\nOnly output the JSON. No additional commentary.",
    "user": "Generate a rich and engaging context text based on the following input:\n\n- ID: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- (Word, Part of speech): {word_list}\n\nInstructions:\n- Consider the developmental stage ({stage}) and age group ({age_group}) when crafting vocabulary, sentence complexity, and content themes\n- Expand the selected word into a skill-relevant topic **that fits the Text Type Template**\n- Generate a detailed text of **250–500 words** following the context template\n- Enrich the text with developmentally appropriate actions, emotions, and interactions\n- Ensure the content promotes the specific skill and subskill being targeted\n\nOutput strictly in this format:\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"generated_text\": \"<generated text between 250 and 500 words>\"\n}}\n```"
}

# Write the same context prompt to <root>/stage<i>/context/prompt.json for
# every stage. The context folders must already exist.
root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"
for i in range(10):
    context_dir = os.path.join(root_dir, f'stage{i}', 'context')
    prompt_path = os.path.join(context_dir, "prompt.json")

    with open(prompt_path, "w") as f:
        json.dump(prompt, f, indent=2)

    print(f"Stage {i} - Prompt saved.")

### Instruct

In [None]:
#instruct prompt
ins_prompt = {"system": "You are an AI model generating training data to help language models simulate human developmental skills at various stages from early childhood through early adolescence.\n\nYour task is to create realistic instruction-response pairs between an educator and a child, based on developmental indicators, skills, and a tuple of word and its part of speech.\n\nStrictly follow these guidelines:\n\n1. **Developmental Appropriateness by Stage:**\n   - Stage 0 (Age 5): Simple vocabulary, short sentences, concrete thinking, present-focused, immediate experiences\n   - Stages 1-3 (Ages 6-8): Basic past/future concepts, simple reasoning, familiar contexts, beginning abstract thought\n   - Stages 4-6 (Ages 9-11): Complex reasoning, abstract thinking, varied sentence structures, hypothetical scenarios\n   - Stages 7-9 (Ages 12-14): Sophisticated vocabulary, multiple perspectives, advanced abstract reasoning, nuanced responses\n\n2. **Instruction Creation:**\n   - Use the provided word and its part of speech to meaningfully inspire the interaction topic\n   - **Ensure the topic aligns with the Text Type Template (instruct_template)**\n   - Craft prompts that naturally elicit demonstration of the specific indicator and skill\n   - Vary instruction starters - avoid overusing \"Imagine...\" or \"Tell me about...\"\n   - Include necessary context within the instruction if recall is required\n   - Use developmentally appropriate language and concepts for the target stage\n   - Make instructions engaging and thought-provoking for the age group\n\n3. 
**Response Generation:**\n   - Create authentic child responses that clearly demonstrate the target indicator\n   - Use vocabulary, sentence complexity, and reasoning appropriate to the developmental stage\n   - Include natural speech patterns and expressions typical of the age group\n   - Ensure responses are verifiable through either:\n     * Information provided in the instruction\n     * Common world knowledge appropriate for the child's developmental level\n     * Typical personal experiences for that age group\n   - Avoid arbitrary claims or purely imaginative details unless storytelling is explicitly encouraged\n\n4. **Content Guidelines:**\n   - **Purely verbal exchanges** - no references to physical objects, gestures, or non-verbal actions\n   - No formatting (bold, italics, markdown)\n   - Responses should sound natural and spontaneous, not rehearsed\n   - Include appropriate emotional expressions and personal connections when relevant\n   - Ensure logical consistency between instruction and response\n\n5. **Quality Standards:**\n   - The exchange must demonstrate clear alignment with the skill, subskill, goal, and indicator\n   - Both instruction and response should feel authentic to a real classroom or learning interaction\n   - Avoid overly abstract concepts for younger stages or overly simple concepts for older stages\n   - Ensure the selected word meaningfully influences the dialogue topic\n\n6. **Output Format:** Strictly return the output in the following JSON structure:\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\nOnly output the JSON. No additional commentary or explanations.",
 "user": "Generate a developmentally appropriate instruction-response pair based on the following input:\n\n- ID: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {instruct_template}\n- (Word, Part of speech): {word_list}\n\nInstructions:\n- Consider the developmental stage ({stage}) and age group ({age_group}) when crafting language complexity and content themes\n- Use the selected word to meaningfully inspire the interaction topic **that fits the Text Type Template**\n- Create an engaging instruction that naturally elicits demonstration of the target indicator\n- Generate an authentic child response that clearly shows mastery of the skill and subskill\n- Ensure the exchange feels natural and appropriate for a real educational interaction\n\nOutput strictly in this format:\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
# Show both prompt parts for a visual check before saving.
print(ins_prompt['system'], '-----', ins_prompt['user'], sep="\n")

# Write the same instruct prompt to <root>/stage<i>/instruct/prompt.json for
# every stage. The instruct folders must already exist.
root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"
for i in range(10):
    instruct_dir = os.path.join(root_dir, f'stage{i}', 'instruct')
    prompt_path = os.path.join(instruct_dir, "prompt.json")

    with open(prompt_path, "w") as f:
        json.dump(ins_prompt, f, indent=2)

    print(f"Stage {i} - Prompt saved.")

### CQA prompt

In [None]:
cqa_prompt = {
    "system": "You are an AI model generating training data to help language models simulate human reading comprehension skills at various stages from early childhood through early adolescence.\n\nYour task is to create 5 developmentally appropriate question-answer pairs based on a provided text, ensuring all questions test understanding of the given paragraph and can be answered directly from the text.\n\nStrictly follow these guidelines:\n\n1. **Developmental Appropriateness by Stage:**\n   - Stage 0 (Age 5): Simple \"what/who/where\" questions, literal comprehension, single-step reasoning\n   - Stages 1-3 (Ages 6-8): Basic \"why/how\" questions, simple cause-effect, sequence understanding, character feelings\n   - Stages 4-6 (Ages 9-11): Inference questions, comparing/contrasting, predicting outcomes, understanding motivations\n   - Stages 7-9 (Ages 12-14): Complex analysis, multiple perspectives, abstract concepts, theme identification\n\n2. **Question Creation Standards:**\n   - **All answers must be directly supported by information in the provided text**\n   - No questions requiring outside knowledge or information not present in the text\n   - Questions should test different types of comprehension appropriate to the developmental stage\n   - Vary question types to assess different reading skills (literal, inferential, evaluative)\n   - Use vocabulary and sentence complexity appropriate to the age group\n   - Ensure questions are engaging and relevant to the child's interests and experiences\n\n3. **Question Types by Stage:**\n   - **Early Stages (0-3):** Literal recall, identifying main characters/objects, simple sequence, basic emotions\n   - **Middle Stages (4-6):** Cause-effect relationships, character motivations, comparing details, simple predictions\n   - **Later Stages (7-9):** Drawing conclusions, analyzing relationships, evaluating actions, understanding themes\n\n4. 
**Answer Generation:**\n   - Create authentic child responses that demonstrate comprehension at the target developmental stage\n   - Use vocabulary and sentence structures appropriate to the age group\n   - Include natural speech patterns and expressions typical of the developmental stage\n   - Ensure answers are complete but not overly elaborate for the age group\n   - Answers should sound conversational and natural, not textbook-like\n\n5. **Content Guidelines:**\n   - **Purely verbal exchanges** - no references to physical gestures or non-verbal actions\n   - No formatting (bold, italics, markdown)\n   - Questions should flow naturally and cover different aspects of the text\n   - Ensure logical progression from simpler to more complex questions when appropriate\n   - Include a mix of question types (factual, inferential, personal connection when text-supported)\n\n6. **Quality Standards:**\n   - Every question must be answerable using only information provided in the text\n   - Questions should test genuine comprehension, not just memory of isolated facts\n   - Avoid questions with obvious or trivial answers\n   - Ensure questions are meaningful and help assess understanding of key text elements\n   - Create questions that feel natural in an educational setting\n\n7. 
**Output Format:** Strictly return the output in the following JSON structure:\n```json\n{{\n    \"question_answer_pairs\": [\n        {{\n            \"question\": \"<question 1>\",\n            \"answer\": \"<answer 1>\"\n        }},\n        {{\n            \"question\": \"<question 2>\",\n            \"answer\": \"<answer 2>\"\n        }},\n        {{\n            \"question\": \"<question 3>\",\n            \"answer\": \"<answer 3>\"\n        }},\n        {{\n            \"question\": \"<question 4>\",\n            \"answer\": \"<answer 4>\"\n        }},\n        {{\n            \"question\": \"<question 5>\",\n            \"answer\": \"<answer 5>\"\n        }}\n    ]\n}}\n```\nOnly output the JSON. No additional commentary or explanations.",
    "user": "Generate 5 developmentally appropriate reading comprehension question-answer pairs based on the following input:\n\n- Text: {output}\n- Age Group: {age_group}\n- Stage: {stage}\n\nInstructions:\n- Consider the developmental stage ({stage}) and age group ({age_group}) when crafting question complexity and answer expectations\n- Create questions that test different types of comprehension appropriate to the developmental level\n- **Ensure all questions can be answered directly from the provided text**\n- Generate authentic child responses that demonstrate comprehension at the target stage\n- Use vocabulary and sentence structures appropriate to the age group\n- Create a mix of question types that genuinely assess understanding of the text\n\nOutput strictly in this format:\n```json\n{{\n    \"question_answer_pairs\": [\n        {{\n            \"question\": \"<question 1>\",\n            \"answer\": \"<answer 1>\"\n        }},\n        {{\n            \"question\": \"<question 2>\",\n            \"answer\": \"<answer 2>\"\n        }},\n        {{\n            \"question\": \"<question 3>\",\n            \"answer\": \"<answer 3>\"\n        }},\n        {{\n            \"question\": \"<question 4>\",\n            \"answer\": \"<answer 4>\"\n        }},\n        {{\n            \"question\": \"<question 5>\",\n            \"answer\": \"<answer 5>\"\n        }}\n    ]\n}}\n```"
}

# Show both prompt parts for a visual check before saving.
print(cqa_prompt['system'], "-----", cqa_prompt['user'], sep="\n")

# Create <root>/stage<i>/cqa/{seed,raw} for every stage and write the same CQA
# prompt to <root>/stage<i>/cqa/prompt.json.
root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"
for i in range(10):
    cqa_dir = os.path.join(root_dir, f'stage{i}', 'cqa')
    cqa_seed_dir = os.path.join(cqa_dir, 'seed')
    for folder in (cqa_seed_dir, os.path.join(cqa_dir, 'raw')):
        os.makedirs(folder, exist_ok=True)

    prompt_path = os.path.join(cqa_dir, "prompt.json")
    with open(prompt_path, "w") as f:
        json.dump(cqa_prompt, f, indent=2)

    print(f"Stage {i} - Prompt saved.")

### CSQA prompt

In [None]:
csqa_prompt = {
    "system": "You are an AI model generating training data to help language models simulate human developmental skills at various stages from early childhood through early adolescence.\n\nYour task is to create 3 skill-based instruction-response pairs between an educator and a child that use a provided text as context to test specific developmental skills, rather than simple reading comprehension.\n\nStrictly follow these guidelines:\n\n1. **Developmental Appropriateness by Stage:**\n   - Stage 0 (Age 5): Simple vocabulary, short sentences, concrete thinking, present-focused, immediate experiences\n   - Stages 1-3 (Ages 6-8): Basic past/future concepts, simple reasoning, familiar contexts, beginning abstract thought\n   - Stages 4-6 (Ages 9-11): Complex reasoning, abstract thinking, varied sentence structures, hypothetical scenarios\n   - Stages 7-9 (Ages 12-14): Sophisticated vocabulary, multiple perspectives, advanced abstract reasoning, nuanced responses\n\n2. **Skill-Based Instruction Creation:**\n   - **Use the provided text as context, not as the primary focus**\n   - Create instructions that test the specific skill, subskill, goal, and indicator provided\n   - Instructions should prompt the child to demonstrate the target skill using elements from the text\n   - Avoid simple recall questions - focus on skill application, analysis, synthesis, or evaluation\n   - Vary instruction starters - avoid overusing \"Imagine...\" or \"Tell me about...\"\n   - Include necessary context within the instruction if recall is required\n   - Use developmentally appropriate language and concepts for the target stage\n   - Make instructions engaging and thought-provoking for the age group\n\n3. 
**Response Generation:**\n   - Create authentic child responses that clearly demonstrate the target indicator\n   - Use vocabulary, sentence complexity, and reasoning appropriate to the developmental stage\n   - Include natural speech patterns and expressions typical of the age group\n   - Ensure responses show genuine skill application, not just text recall\n   - Responses should be verifiable through either:\n     * Information provided in the instruction or text\n     * Common world knowledge appropriate for the child's developmental level\n     * Typical personal experiences for that age group\n   - Avoid arbitrary claims or purely imaginative details unless the skill explicitly encourages creativity\n\n4. **Context Integration:**\n   - Use the provided text as a springboard for skill demonstration\n   - Connect text elements to real-world applications of the skill\n   - Encourage children to apply their skills to analyze, extend, or relate to the text content\n   - Ensure the skill being tested is meaningfully connected to the text context\n\n5. **Content Guidelines:**\n   - **Purely verbal exchanges** - no references to physical objects, gestures, or non-verbal actions\n   - No formatting (bold, italics, markdown)\n   - Instructions should feel natural and appropriate for educational settings\n   - Responses should sound natural and spontaneous, not rehearsed\n   - Include appropriate emotional expressions and personal connections when relevant\n   - Ensure logical consistency between instruction and response\n   - Focus on the skill demonstration rather than text comprehension\n\n6. 
**Quality Standards:**\n   - The exchange must demonstrate clear alignment with the skill, subskill, goal, and indicator\n   - Each instruction must clearly target the specific developmental parameters provided\n   - Instructions should be distinct from each other, testing different aspects of the same skill\n   - Both instruction and response should feel authentic to a real classroom or learning interaction\n   - Responses must demonstrate clear mastery or development of the target skill\n   - The text should serve as meaningful context, not just background information\n   - Avoid overly abstract concepts for younger stages or overly simple concepts for older stages\n   - Ensure developmental appropriateness in both challenge level and expectations\n\n7. **Output Format:** Strictly return the output in the following JSON structure:\n```json\n{{\n    \"skill_based_pairs\": [\n        {{\n            \"instruction\": \"<instruction 1>\",\n            \"response\": \"<response 1>\"\n        }},\n        {{\n            \"instruction\": \"<instruction 2>\",\n            \"response\": \"<response 2>\"\n        }},\n        {{\n            \"instruction\": \"<instruction 3>\",\n            \"response\": \"<response 3>\"\n        }}\n    ]\n}}\n```\nOnly output the JSON. No additional commentary or explanations.",
    "user": "Generate 3 developmentally appropriate skill-based instruction-response pairs based on the following input:\n\n- Text: {output}\n- Age Group: {age_group}\n- Stage: {stage}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Indicator: {indicator}\n\nInstructions:\n- Consider the developmental stage ({stage}) and age group ({age_group}) when crafting instruction complexity and response expectations\n- Use the provided text as context to create instructions that test the specific skill ({skill}) and subskill ({subskill})\n- Create instructions that elicit demonstration of the goal ({goal}) and indicator ({indicator})\n- **Focus on skill application and demonstration, not text comprehension**\n- Generate authentic child responses that show clear mastery of the target skill at the developmental stage\n- Use vocabulary and sentence structures appropriate to the age group\n- Create 3 distinct instructions that test different aspects of the same skill\n\nOutput strictly in this format:\n```json\n{{\n    \"skill_based_pairs\": [\n        {{\n            \"instruction\": \"<instruction 1>\",\n            \"response\": \"<response 1>\"\n        }},\n        {{\n            \"instruction\": \"<instruction 2>\",\n            \"response\": \"<response 2>\"\n        }},\n        {{\n            \"instruction\": \"<instruction 3>\",\n            \"response\": \"<response 3>\"\n        }}\n    ]\n}}\n```"
}
# Show both prompt parts for a visual check before saving.
print(csqa_prompt['system'])
print("-----")
print(csqa_prompt['user'])


# Create <root>/stage<i>/csqa/{seed,raw} for every stage and write the same
# CSQA prompt to <root>/stage<i>/csqa/prompt.json.
root_dir = "/datadrive/pavan/az_storage/CurLL_data/stages"
for i in range(10):
    stage_folder = os.path.join(root_dir, f'stage{i}')
    csqa_dir = os.path.join(stage_folder, 'csqa')
    csqa_seed_dir = os.path.join(csqa_dir, 'seed')
    # Mirror the CQA cell: make sure the folders exist before writing. With
    # exist_ok=True this is a no-op when they are already there, and it keeps
    # the open() below from failing for stages whose csqa folder was never
    # created.
    os.makedirs(csqa_seed_dir, exist_ok=True)
    os.makedirs(os.path.join(csqa_dir, 'raw'), exist_ok=True)

    with open(os.path.join(csqa_dir, "prompt.json"), "w") as f:
        json.dump(csqa_prompt, f, indent=2)

    print(f"Stage {i} - Prompt saved.")