In [1]:
from tqdm import tqdm
from datasets import load_dataset
import pandas as pd
from collections import deque
import pickle
import json
import os
import random

  from .autonotebook import tqdm as notebook_tqdm


## Seed Words: Age of Acquistion

In [19]:
#load a huggingface dataset
aoa = load_dataset("StephanAkkerman/English-Age-of-Acquisition", split="train")

In [20]:
aoa = pd.DataFrame(aoa)
aoa.head()

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [21]:
aoa = aoa[["Dom_PoS_SUBTLEX", "Lemma_highest_PoS", "AoA_Kup_lem", "Perc_known_lem"]]
aoa.head()

Unnamed: 0,Dom_PoS_SUBTLEX,Lemma_highest_PoS,AoA_Kup_lem,Perc_known_lem
0,Article,a,2.89,1.0
1,Noun,aardvark,9.89,1.0
2,Noun,abacus,8.69,0.65
3,Noun,abacus,8.69,0.65
4,Verb,abalone,12.23,0.72


In [22]:
#remove duplicates
aoa = aoa.drop_duplicates(subset=["Lemma_highest_PoS", "AoA_Kup_lem"])
aoa = aoa.dropna()
aoa = aoa.reset_index(drop=True)

In [23]:
len(aoa[aoa['AoA_Kup_lem']<=14])

25652

In [24]:
stage_0 = aoa[aoa['AoA_Kup_lem']<=5].reset_index(drop=True)
stage_1 = aoa[(aoa['AoA_Kup_lem']>5) & (aoa['AoA_Kup_lem']<=6)].reset_index(drop=True)
stage_2 = aoa[(aoa['AoA_Kup_lem']>6) & (aoa['AoA_Kup_lem']<=7)].reset_index(drop=True)
stage_3 = aoa[(aoa['AoA_Kup_lem']>7) & (aoa['AoA_Kup_lem']<=8)].reset_index(drop=True)
stage_4 = aoa[(aoa['AoA_Kup_lem']>8) & (aoa['AoA_Kup_lem']<=9)].reset_index(drop=True)
stage_5 = aoa[(aoa['AoA_Kup_lem']>9) & (aoa['AoA_Kup_lem']<=10)].reset_index(drop=True)
stage_6 = aoa[(aoa['AoA_Kup_lem']>10) & (aoa['AoA_Kup_lem']<=11)].reset_index(drop=True)
stage_7 = aoa[(aoa['AoA_Kup_lem']>11) & (aoa['AoA_Kup_lem']<=12)].reset_index(drop=True)
stage_8 = aoa[(aoa['AoA_Kup_lem']>12) & (aoa['AoA_Kup_lem']<=13)].reset_index(drop=True)
stage_9 = aoa[(aoa['AoA_Kup_lem']>13) & (aoa['AoA_Kup_lem']<=14)].reset_index(drop=True)

In [25]:
def get_stage_words(stage_df):
    filtered_df = stage_df[stage_df['Dom_PoS_SUBTLEX'].isin(['Noun', 'Verb', 'Adjective', 'Adverb'])]
    filtered_df = filtered_df.sort_values(by=["Perc_known_lem"], ascending=False)
    filtered_df = filtered_df.reset_index(drop=True)
    stage_words = list(zip(filtered_df['Lemma_highest_PoS'], filtered_df['Dom_PoS_SUBTLEX']))
    return stage_words, filtered_df

In [26]:
words_0, df_0 = get_stage_words(stage_0)
words_1, df_1 = get_stage_words(stage_1)
words_2, df_2 = get_stage_words(stage_2)
words_3, df_3 = get_stage_words(stage_3)
words_4, df_4 = get_stage_words(stage_4)
words_5, df_5 = get_stage_words(stage_5)
words_6, df_6 = get_stage_words(stage_6)
words_7, df_7 = get_stage_words(stage_7)
words_8, df_8 = get_stage_words(stage_8)
words_9, df_9 = get_stage_words(stage_9)

In [28]:
print("Stage 0: ", len(words_0))
print("Stage 1: ", len(words_1))
print("Stage 2: ", len(words_2))
print("Stage 3: ", len(words_3))
print("Stage 4: ", len(words_4))
print("Stage 5: ", len(words_5))
print("Stage 6: ", len(words_6))
print("Stage 7: ", len(words_7))
print("Stage 8: ", len(words_8))
print("Stage 9: ", len(words_9))

Stage 0:  903
Stage 1:  964
Stage 2:  1386
Stage 3:  1859
Stage 4:  2579
Stage 5:  3147
Stage 6:  3605
Stage 7:  3821
Stage 8:  3768
Stage 9:  3143


In [31]:
def save_seeds_and_leftovers(save_dir, num_seeds):
    for i in range(num_seeds):
        words = globals().get(f"words_{i}")
        with open(f"{save_dir}{i}/seed/seed_words.pkl", "wb") as f:
            pickle.dump(words, f)
        df_var = globals().get(f"df_{i}")
        df_var.to_csv(f"{save_dir}{i}/seed/df_words.csv", index=False)

In [33]:
#Create directories if they do not exist
for i in range(10):
    os.makedirs(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{i}/seed/instruct/", exist_ok=True)
    os.makedirs(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{i}/seed/context/", exist_ok=True)
    os.makedirs(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{i}/raw/instruct/", exist_ok=True)
    os.makedirs(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{i}/raw/context/", exist_ok=True)


In [34]:
save_dir = "/datadrive/pavan/az_storage/data_unorganized/stages/stage"
save_seeds_and_leftovers(save_dir, num_seeds=10)

### Recheck the saved seed words

In [36]:
for i in range(10):
    #load the words
    with open(f"{save_dir}{i}/seed/seed_words.pkl", "rb") as f:
        words = pickle.load(f)

    print(f"Stage {i}: ", len(words))

Stage 0:  903
Stage 1:  964
Stage 2:  1386
Stage 3:  1859
Stage 4:  2579
Stage 5:  3147
Stage 6:  3605
Stage 7:  3821
Stage 8:  3768
Stage 9:  3143


## Stage-wise Seed

In [2]:
with open("/datadrive/pavan/az_storage/data_unorganized/skill_graph/version2/graph_final.pkl", "rb") as f:
    DG = pickle.load(f)

In [4]:
DG.nodes["i182"]

{'label': 'Know the name of each letter in the English alphabet and the most common sound (phoneme) associated with it. ',
 'age_group': '5-11',
 'skill': 'English',
 'subskill': 'Reading',
 'goal': 'Word structure (phonics) (Stages 1 to 4 only) Learners develop the decoding skills that form the foundation of reading for all stages.',
 'stage': 1,
 'modality_textual': None,
 'perspective': None,
 'require_multimodal_context': None,
 'embodied': None,
 'ins_templates': ['Identify and name',
  'Sound association practice',
  'Categorize and sort',
  'Pattern recognition task',
  'Fill in the blank',
  'Complete the sequence',
  'Describe the relationship',
  'What comes next?',
  'Find the example',
  'Define and illustrate',
  'Explain the connection',
  'Simple matching exercise',
  'Identify the odd one',
  'List and describe',
  'How does it work?',
  'Rewrite in different words',
  'Give an example',
  'Explain step-by-step'],
 'context_templates': ['Descriptive scene setting',
  'S

In [8]:
seed_words = {}
for i in range(10):
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{i}/seed/seed_words.pkl", "rb") as f:
        seed_words[i] = pickle.load(f)

In [9]:
len(seed_words)

10

In [10]:
for k in seed_words:
    print(f"Stage {k}: ", len(seed_words[k]))

Stage 0:  903
Stage 1:  964
Stage 2:  1386
Stage 3:  1859
Stage 4:  2579
Stage 5:  3147
Stage 6:  3605
Stage 7:  3821
Stage 8:  3768
Stage 9:  3143


In [11]:
c = 0
ir = 0
for node in DG.nodes():
    c+= len(DG.nodes[node]['context_templates'])
    ir+= len(DG.nodes[node]['ins_templates'])
print("Total number of context templates: ", c)
print("Total number of instruction templates: ", ir)

Total number of context templates:  52770
Total number of instruction templates:  53284


In [12]:
seed_data = {}
for i in range(10):
    seed_data[i] = {"context": [], "instruct": []}

for node in tqdm(DG.nodes()):
    stage = DG.nodes[node]['stage']
    for context in DG.nodes[node]['context_templates']:
        for word in seed_words[stage]:
            seed_data[stage]['context'].append({
                    "id": node,
                    "indicator": DG.nodes[node]['label'],
                    "skill": DG.nodes[node]['skill'],
                    "subskill": DG.nodes[node]['subskill'],
                    "goal": DG.nodes[node]['goal'],
                    "age_group": DG.nodes[node]['age_group'],
                    "stage": DG.nodes[node]['stage'],
                    "context_template": context,
                    "word_list": word
                })

    for instruct in DG.nodes[node]['ins_templates']:
        for word in seed_words[stage]:
            seed_data[stage]['instruct'].append({
                    "id": node,
                    "indicator": DG.nodes[node]['label'],
                    "skill": DG.nodes[node]['skill'],
                    "subskill": DG.nodes[node]['subskill'],
                    "goal": DG.nodes[node]['goal'],
                    "age_group": DG.nodes[node]['age_group'],
                    "stage": DG.nodes[node]['stage'],
                    "context_template": instruct,
                    "word_list": word
                })

  0%|          | 5/2776 [00:00<02:15, 20.42it/s]

100%|██████████| 2776/2776 [06:09<00:00,  7.50it/s]


In [13]:
for k in seed_data:
    print(f"Stage {k}: ", len(seed_data[k]['context']), len(seed_data[k]['instruct']))

Stage 0:  3006990 3320331
Stage 1:  4038196 4120136
Stage 2:  6514200 6668046
Stage 3:  11598301 11763752
Stage 4:  14158710 14310871
Stage 5:  15231480 15187422
Stage 6:  24658200 24625755
Stage 7:  19162315 18978907
Stage 8:  20305752 20090976
Stage 9:  21177534 21023527


In [14]:
skill_dict = {}
for i in tqdm(seed_data):
    if i not in skill_dict:
        skill_dict[i] = {}
    for j in seed_data[i]['context']:
        if j['skill'] not in skill_dict[i]:
            skill_dict[i][j['skill']] = []
        skill_dict[i][j['skill']].append(j['id'])

for stage in skill_dict:
    print(f"Stage {stage}:")
    for skill in skill_dict[stage]:
        print(f"  {skill}: {len(skill_dict[stage][skill])} instances, {len(set(skill_dict[stage][skill]))} unique instances")
        skill_dict[stage][skill] = list(set(skill_dict[stage][skill]))
    print("***************")

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:35<00:00,  3.57s/it]


Stage 0:
  Language and Communication: 394611 instances, 24 unique instances
  Literacy: 316050 instances, 20 unique instances
  Mathematics Development: 413574 instances, 26 unique instances
  Scientific Reasoning: 305214 instances, 18 unique instances
  Perceptual, Motor, and Physical Development: 297990 instances, 17 unique instances
  Approaches to Learning: 670026 instances, 41 unique instances
  Social and Emotional Development: 609525 instances, 36 unique instances
***************
Stage 1:
  English: 1648440 instances, 90 unique instances
  Mathematics: 650700 instances, 37 unique instances
  Science: 667088 instances, 37 unique instances
  Computing: 521524 instances, 28 unique instances
  Global Perspectives: 327760 instances, 18 unique instances
  Digital Literacy: 222684 instances, 11 unique instances
***************
Stage 2:
  English: 2562714 instances, 99 unique instances
  Mathematics: 1222452 instances, 48 unique instances
  Science: 1086624 instances, 42 unique instanc

In [15]:
skill_dict = {}
for i in tqdm(seed_data):
    if i not in skill_dict:
        skill_dict[i] = {}
    for j in seed_data[i]['instruct']:
        if j['skill'] not in skill_dict[i]:
            skill_dict[i][j['skill']] = []
        skill_dict[i][j['skill']].append(j['id'])

for stage in skill_dict:
    print(f"Stage {stage}:")
    for skill in skill_dict[stage]:
        print(f"  {skill}: {len(skill_dict[stage][skill])} instances, {len(set(skill_dict[stage][skill]))} unique instances")
        skill_dict[stage][skill] = list(set(skill_dict[stage][skill]))
    print("***************")

100%|██████████| 10/10 [00:35<00:00,  3.55s/it]


Stage 0:
  Language and Communication: 443373 instances, 24 unique instances
  Literacy: 423507 instances, 20 unique instances
  Mathematics Development: 459627 instances, 26 unique instances
  Scientific Reasoning: 317856 instances, 18 unique instances
  Perceptual, Motor, and Physical Development: 282639 instances, 17 unique instances
  Approaches to Learning: 742266 instances, 41 unique instances
  Social and Emotional Development: 651063 instances, 36 unique instances
***************
Stage 1:
  English: 1712064 instances, 90 unique instances
  Mathematics: 700828 instances, 37 unique instances
  Science: 681548 instances, 37 unique instances
  Computing: 508028 instances, 28 unique instances
  Global Perspectives: 315228 instances, 18 unique instances
  Digital Literacy: 202440 instances, 11 unique instances
***************
Stage 2:
  English: 2623698 instances, 99 unique instances
  Mathematics: 1226610 instances, 48 unique instances
  Science: 1124046 instances, 42 unique instanc

### Random 1000 for prompt checking

In [13]:
# save random 1000 samples for each stage
seed_data_sample = {}
for i in range(10):
    seed_data_sample[i] = {"context": [], "instruct": []}
for i in range(10):
    seed_data_sample[i]['context'] = random.sample(seed_data[i]['context'], 1000)
    seed_data_sample[i]['instruct'] = random.sample(seed_data[i]['instruct'], 1000)
for i in range(10):
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{i}/seed/instruct/random_1000.jsonl", "w") as f:
        json.dump(seed_data_sample[i]['instruct'], f, indent=4)
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{i}/seed/context/random_1000.jsonl", "w") as f:
        json.dump(seed_data_sample[i]['context'], f, indent=4)

### Stage 0

In [29]:
num_gpus = 11
stage = 0
type = "context"
chunk_size = len(seed_data[stage][type]) // num_gpus
chunks = [seed_data[stage][type][i:i + chunk_size] for i in range(0, len(seed_data[stage][type]), chunk_size)]
chunk_metadata = {}
for i, chunk in enumerate(chunks):
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/chunk_{i}.jsonl", "w") as f:
        json.dump(chunk, f)
    chunk_metadata[f"c_{i}"] = {
        "start": i * chunk_size,
        "end": (i + 1) * chunk_size,
        "size": len(chunk)
    }
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/metadata_chunks.jsonl", "w") as f:
    json.dump(chunk_metadata, f)

In [30]:
len(seed_data[stage][type]), len(chunks), chunk_size

(3006990, 12, 273362)

In [13]:
num_gpus = 19
stage = 0
type = "instruct"
chunk_size = len(seed_data[stage][type]) // num_gpus
chunks = [seed_data[stage][type][i:i + chunk_size] for i in range(0, len(seed_data[stage][type]), chunk_size)]
chunk_metadata = {}
for i, chunk in enumerate(chunks):
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/chunk_{i}.jsonl", "w") as f:
        json.dump(chunk, f)
    chunk_metadata[f"c_{i}"] = {
        "start": i * chunk_size,
        "end": (i + 1) * chunk_size,
        "size": len(chunk)
    }
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/metadata_chunks.jsonl", "w") as f:
    json.dump(chunk_metadata, f)

In [14]:
len(seed_data[stage][type]), len(chunks), chunk_size

(3320331, 20, 174754)

In [6]:
instruction_response_prompts = {
    "system": "You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.\n\nYour task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.\n\nStrictly follow these guidelines:\n\n1. Skill Alignment:\n   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.\n   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.\n   - The response should mimic the complexity of the language used by a child of that age group and stage.\n\n2. Using the Word, Part-of-speech tuple:\n   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.\n   - Expand the word into a topic relevant to the skill and age group.\n   - Ensure the selected word and expanded topic fit the required Text Type Template (instruct_template).\n   - Neither the instruction nor the response need not use the word, but the context should logically connect to it.\n\n3. Language Style:\n   - Keep vocabulary simple and concrete, matching the given age group and stage.\n\n4. Output Format:\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
    "user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- (Word, Part of speech): {word_list}\n\nInstructions:\n- Expand the selected word into a skill-relevant topic **that fits the Text Type Template**.\n- Generate an instruction based on the topic, targeting the skill/indicator.\n- Generate a response that demonstrates correct behavior aligned with the skill/indicator.\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
print(instruction_response_prompts['system'])
print("____________________________________")
print(instruction_response_prompts['user'])
type = "instruct"
stage = 0
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/prompt.json", "w") as f:
    json.dump(instruction_response_prompts, f, indent=4)

You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.

Your task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.

Strictly follow these guidelines:

1. Skill Alignment:
   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.
   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.
   - The response should mimic the complexity of the language used by a child of that age group and stage.

2. Using the Word, Part-of-speech tuple:
   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.
   - Expand the word into a topic relevant to the skill and age group.
   - Ensure the selected word and expanded to

In [6]:
instruction_response_prompts = {
    "system": "You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.\n\nYour task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.\n\nStrictly follow these guidelines:\n\n1. Skill Alignment:\n   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.\n   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.\n   - The response should mimic the complexity of the language used by a child of that age group and stage.\n\n2. Using the Word, Part-of-speech tuple:\n   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.\n   - Expand the word into a topic relevant to the skill and age group.\n   - Ensure the selected word and expanded topic fit the required Text Type Template (instruct_template).\n   - Neither the instruction nor the response need not use the word, but the context should logically connect to it.\n\n3. Language Style:\n   - Keep vocabulary simple and concrete, matching the given age group and stage.\n\n4. Output Format:\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
    "user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- (Word, Part of speech): {word_list}\n\nInstructions:\n- Expand the selected word into a skill-relevant topic **that fits the Text Type Template**.\n- Generate an instruction based on the topic, targeting the skill/indicator.\n- Generate a response that demonstrates correct behavior aligned with the skill/indicator.\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
print(instruction_response_prompts['system'])
print("____________________________________")
print(instruction_response_prompts['user'])
type = "instruct"
stage = 0
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/prompt.json", "w") as f:
    json.dump(instruction_response_prompts, f, indent=4)

You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.

Your task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.

Strictly follow these guidelines:

1. Skill Alignment:
   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.
   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.
   - The response should mimic the complexity of the language used by a child of that age group and stage.

2. Using the Word, Part-of-speech tuple:
   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.
   - Expand the word into a topic relevant to the skill and age group.
   - Ensure the selected word and expanded to

In [13]:
import json
new_ins = {"system": "You are an expert in early childhood education and developmental psychology. Your task is to generate realistic instruction–response pairs between a teacher and a 5-year-old child, based on early learning indicators and goals.\n\nYou will be given a dictionary with the following fields:\nindicator: A specific developmental behavior or learning objective the child should demonstrate.\nskill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).\nsubskill: A narrower focus within the skill.\ngoal: The learning goal for the child.\nage_group: Always 0–5.\nstage: A developmental stage (typically 0).\nText Type Template: A short phrase describing the activity type or context (e.g., \"Compare size/quantity\", \"Take turns\", \"Retell a story\").\nword_list: A word (and its part of speech) that must serve as a general inspiration for the exchange (e.g., [\"bee\", \"Noun\"] or [\"run\", \"Verb\"]). This word does not need to appear verbatim in the dialogue.\n\nYour task is to:\n1. Write a short teacher prompt that invites the child to think, reflect, observe, express, or act in a way that helps them demonstrate the given indicator.\n2. Write a natural-sounding 5-year-old child response that clearly shows the child demonstrating the indicator, goal, and skill through words.\n3. The entire exchange must be purely verbal – do not reference physical objects, gestures, or actions such as \"pretends to flip a switch\".\n4. Do not use any italic, bold, or markdown styling (*like this*).\n5. Use the word_list only as a soft prompt for variety – it is not required to appear in the final text.\n6. Do not include irrelevant or excessive character details (e.g., “Leo is friendly” or unrelated anecdotes).\n7. Use developmentally appropriate language, tone, and sentence length. The child’s answer should sound like something a real 5-year-old might say.\n8. Avoid meta-language or explanations. Just produce the conversational pair.\n\nNote:\n-   Entire exchange should utilize only text based cues - no pictures, physical objects or sensory materials\n-   The child’s response should be plausible and verifiable based on either the text given or common world knowledge that a 5 year old could reasonably have\n-   Avoid responses that make arbitrary or subjective claims unless they reflect typical personal experiences (e.g., \"I like red more\" is fine, \"red is better than yellow\" is not, unless justified in the prompt)\n-   Always use the word to generate the instruction-response pairs.\n-   Acceptable responses may include:\n    -   Facts stated or implied by the prompt\n    -   Inferences from general child-level world knowledge (e.g., waves break sandcastles, bees fly, a ball can bounce)\n    -   Personal experiences that are common for children (e.g. \"I felt happy when I played with my ball\")\n-   Avoid:\n    -   Imaginary details that require visuals or unspecified context to verify (e.g., \"the red bee is faster than the yellow bee\")\n    -   Fantastical or inconsistent events unless the prompt clearly allows or invites storytelling.\n    -   Descriptions of non-verbal actions, such as \"nods\", \"smiles\", or \"runs in place\".\n\nStrictly follow the Output Format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
"user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- word_list: {word_list}\n\nOutput strictly in this format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"}

print(new_ins['system'])
print("____________________________________")
print(new_ins['user'])

You are an expert in early childhood education and developmental psychology. Your task is to generate realistic instruction–response pairs between a teacher and a 5-year-old child, based on early learning indicators and goals.

You will be given a dictionary with the following fields:
indicator: A specific developmental behavior or learning objective the child should demonstrate.
skill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).
subskill: A narrower focus within the skill.
goal: The learning goal for the child.
age_group: Always 0–5.
stage: A developmental stage (typically 0).
Text Type Template: A short phrase describing the activity type or context (e.g., "Compare size/quantity", "Take turns", "Retell a story").
word_list: A word (and its part of speech) that must serve as a general inspiration for the exchange (e.g., ["bee", "Noun"] or ["run", "Verb"]). This word does not need to appear verbatim in the dialogue.

Your task is to:
1. Wri

In [11]:
type = "instruct"
stage = 0
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/prompt_v3.json", "w") as f:
    json.dump(new_ins, f, indent=4)

In [14]:
new_ins = {
    "system": "You are an expert in early childhood education and developmental psychology. Your task is to generate realistic instruction–response pairs between a teacher and a 5-year-old child, based on early learning indicators and goals.\n\nYou will be given a dictionary with the following fields:\nindicator: A specific developmental behavior or learning objective the child should demonstrate.\nskill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).\nsubskill: A narrower focus within the skill.\ngoal: The learning goal for the child.\nage_group: Always 0–5.\nstage: A developmental stage (typically 0).\nText Type Template: A short phrase describing the activity type or context (e.g., \"Compare size/quantity\", \"Take turns\", \"Retell a story\").\nword_list: A word (and its part of speech) that must clearly inspire the topic, imagery, or situation used in the exchange (e.g., [\"bee\", \"Noun\"] should lead to an interaction involving bees, even if the word \"bee\" itself is not spoken).\n\nYour task is to:\n1. Write a short teacher prompt that invites the child to think, reflect, observe, express, or act in a way that helps them demonstrate the given indicator.\n2. Write a natural-sounding 5-year-old child response that clearly shows the child demonstrating the indicator, goal, and skill through words.\n3. The entire exchange must be purely verbal – do not reference physical objects, gestures, or actions such as \"pretends to flip a switch\".\n4. Do not use any italic, bold, or markdown styling (*like this*).\n5. The word_list must meaningfully inspire the setting, situation, or imagery of the exchange, even if the word is not used exactly.\n6. Do not include irrelevant or excessive character details (e.g., “Leo is friendly” or unrelated anecdotes).\n7. Use developmentally appropriate language, tone, and sentence length. The child’s answer should sound like something a real 5-year-old might say.\n8. Avoid meta-language or explanations. Just produce the conversational pair.\n\nNote:\n-   Entire exchange should utilize only text based cues - no pictures, physical objects or sensory materials\n-   The child’s response should be plausible and verifiable based on either the text given or common world knowledge that a 5 year old could reasonably have\n-   Avoid responses that make arbitrary or subjective claims unless they reflect typical personal experiences (e.g., \"I like red more\" is fine, \"red is better than yellow\" is not, unless justified in the prompt)\n-   Acceptable responses may include:\n    -   Facts stated or implied by the prompt\n    -   Inferences from general child-level world knowledge (e.g., waves break sandcastles, bees fly, a ball can bounce)\n    -   Personal experiences that are common for children (e.g. \"I felt happy when I played with my ball\")\n-   Avoid:\n    -   Imaginary details that require visuals or unspecified context to verify (e.g., \"the red bee is faster than the yellow bee\")\n    -   Fantastical or inconsistent events unless the prompt clearly allows or invites storytelling.\n    -   Descriptions of non-verbal actions, such as \"nods\", \"smiles\", or \"runs in place\".\n\nStrictly follow the Output Format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
    "user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- word_list: {word_list}\n\nOutput strictly in this format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
print(new_ins['system'])
print("____________________________________")
print(new_ins['user'])

You are an expert in early childhood education and developmental psychology. Your task is to generate realistic instruction–response pairs between a teacher and a 5-year-old child, based on early learning indicators and goals.

You will be given a dictionary with the following fields:
indicator: A specific developmental behavior or learning objective the child should demonstrate.
skill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).
subskill: A narrower focus within the skill.
goal: The learning goal for the child.
age_group: Always 0–5.
stage: A developmental stage (typically 0).
Text Type Template: A short phrase describing the activity type or context (e.g., "Compare size/quantity", "Take turns", "Retell a story").
word_list: A word (and its part of speech) that must clearly inspire the topic, imagery, or situation used in the exchange (e.g., ["bee", "Noun"] should lead to an interaction involving bees, even if the word "bee" itself is not 

In [15]:
type = "instruct"
stage = 0
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/prompt_v4.json", "w") as f:
    json.dump(new_ins, f, indent=4)

In [16]:
new_ins = {
    "system": "You are an expert in early childhood education and developmental psychology. Your task is to generate realistic instruction–response pairs between a teacher and a 5-year-old child, based on early learning indicators and goals.\n\nYou will be given a dictionary with the following fields:\n- indicator: A specific developmental behavior or learning objective the child should demonstrate.\n- skill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).\n- subskill: A narrower focus within the skill.\n- goal: The learning goal for the child.\n- age_group: Always 0–5.\n- stage: A developmental stage (typically 0).\n- context_template: A short phrase describing the activity type or context (e.g., \"Compare size/quantity\", \"Take turns\", \"Retell a story\").\n- word_list: A word (and its part of speech) that must serve as a general inspiration for the exchange (e.g., [\"bee\", \"Noun\"] or [\"run\", \"Verb\"]). This word does not need to appear verbatim in the dialogue, but should meaningfully inspire the prompt or response.\n\nYour task is to:\n1. Write a short teacher prompt that invites the child to think, reflect, observe, express, or act in a way that helps them demonstrate the given indicator.\n2. Write a natural-sounding 5-year-old child response that clearly shows the child demonstrating the indicator, goal, and skill through words.\n3. The entire exchange must be purely verbal – do not reference physical objects, gestures, or actions such as \"pretends to flip a switch\".\n4. Do not use any italic, bold, or markdown styling (*like this*).\n5. Use the word_list as a soft prompt – it must meaningfully influence either the instruction or response (but not necessarily appear verbatim).\n6. Do not start all prompts with \"Imagine...\" – vary your phrasing. Use imagination only where natural, not by default.\n7. Avoid assumptions about prior events. If a task involves recalling something (e.g., \"What happened yesterday?\"), include that information in the prompt. Do not assume past events unless explicitly described.\n8. Use developmentally appropriate language, tone, and sentence length. The child’s answer should sound like something a real 5-year-old might say.\n9. Avoid meta-language or explanations. Just produce the conversational pair.\n\nNote:\n- Entire exchange should utilize only text-based cues – no pictures, physical objects, or sensory materials.\n- The child’s response should be plausible and verifiable based on either the text given or common world knowledge that a 5-year-old could reasonably have.\n- Avoid responses that make arbitrary or subjective claims unless they reflect typical personal experiences (e.g., \"I like red more\" is fine, \"red is better than yellow\" is not, unless justified in the prompt).\n\nAcceptable responses may include:\n- Facts stated or implied by the prompt.\n- Inferences from general child-level world knowledge (e.g., waves break sandcastles, bees fly, a ball can bounce).\n- Personal experiences that are common for children (e.g., \"I felt happy when I played with my ball\").\n\nAvoid:\n- Imaginary details that require visuals or unspecified context to verify (e.g., \"the red bee is faster than the yellow bee\").\n- Fantastical or inconsistent events unless the prompt clearly allows or invites storytelling.\n- Descriptions of non-verbal actions, such as \"nods\", \"smiles\", or \"runs in place\".\n\nStrictly follow the Output Format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
    "user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- word_list: {word_list}\n\nOutput strictly in this format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
print(new_ins['system'])
print("____________________________________")
print(new_ins['user'])

You are an expert in early childhood education and developmental psychology. Your task is to generate realistic instruction–response pairs between a teacher and a 5-year-old child, based on early learning indicators and goals.

You will be given a dictionary with the following fields:
- indicator: A specific developmental behavior or learning objective the child should demonstrate.
- skill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).
- subskill: A narrower focus within the skill.
- goal: The learning goal for the child.
- age_group: Always 0–5.
- stage: A developmental stage (typically 0).
- context_template: A short phrase describing the activity type or context (e.g., "Compare size/quantity", "Take turns", "Retell a story").
- word_list: A word (and its part of speech) that must serve as a general inspiration for the exchange (e.g., ["bee", "Noun"] or ["run", "Verb"]). This word does not need to appear verbatim in the dialogue, but should

In [17]:
type = "instruct"
stage = 0
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/prompt_v5.json", "w") as f:
    json.dump(new_ins, f, indent=4)

In [18]:
new_ins = {
    "system": "You are an expert in early childhood education and developmental psychology. Your task is to generate realistic instruction–response pairs between a teacher and a 5-year-old child, based on early learning indicators and goals.\n\nYou will be given a dictionary with the following fields:\n- indicator: A specific developmental behavior or learning objective the child should demonstrate.\n- skill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).\n- subskill: A narrower focus within the skill.\n- goal: The learning goal for the child.\n- age_group: Always 0–5.\n- stage: A developmental stage (typically 0).\n- context_template: A short phrase describing the activity type or context (e.g., \"Compare size/quantity\", \"Take turns\", \"Retell a story\").\n- word_list: A word (and its part of speech) that must always serve as a general inspiration for the exchange (e.g., [\"bee\", \"Noun\"] or [\"run\", \"Verb\"]). This must meaningfully inspire the prompt or response.\n\nYour task is to:\n1. Write a short teacher prompt that invites the child to think, reflect, observe, express, or act in a way that helps them demonstrate the given indicator.\n2. Write a natural-sounding 5-year-old child response that clearly shows the child demonstrating the indicator, goal, and skill through words.\n3. The entire exchange must be purely verbal – do not reference physical objects, gestures, or actions such as \"pretends to flip a switch\".\n4. Do not use any italic, bold, or markdown styling (*like this*).\n5. Use the word_list as a soft prompt – it must always meaningfully influence the topic of the dialogue (but not necessarily appear verbatim).\n6. Do not start all prompts with \"Imagine...\" – vary your phrasing. Use imagination only where natural, not by default.\n7. Avoid assumptions about prior events. If a task involves recalling something (e.g., \"What happened yesterday?\"), include that information in the instruction. Do not assume past events unless explicitly described.\n8. Use developmentally appropriate language, tone, and sentence length. The child’s answer should sound like something a real 5-year-old might say.\n9. Avoid meta-language or explanations. Just produce the conversational pair.\n\nNote:\n- Entire exchange should utilize only text-based cues – no pictures, physical objects, or sensory materials.\n- The child’s response should be plausible and verifiable based on either the text given or common world knowledge that a 5-year-old could reasonably have.\n- Avoid responses that make arbitrary or subjective claims unless they reflect typical personal experiences (e.g., \"I like red more\" is fine, \"red is better than yellow\" is not, unless justified in the prompt).\n\nAcceptable responses may include:\n- Facts stated or implied by the prompt.\n- Inferences from general child-level world knowledge (e.g., waves break sandcastles, bees fly, a ball can bounce).\n- Personal experiences that are common for children (e.g., \"I felt happy when I played with my ball\").\n\nAvoid:\n- Imaginary details that require visuals or unspecified context to verify (e.g., \"the red bee is faster than the yellow bee\").\n- Fantastical or inconsistent events unless the prompt clearly allows or invites storytelling.\n- Descriptions of non-verbal actions, such as \"nods\", \"smiles\", or \"runs in place\".\n\nStrictly follow the Output Format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
    "user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- word_list: {word_list}\n\nOutput strictly in this format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
print(new_ins['system'])
print("____________________________________")
print(new_ins['user'])

You are an expert in early childhood education and developmental psychology. Your task is to generate realistic instruction–response pairs between a teacher and a 5-year-old child, based on early learning indicators and goals.

You will be given a dictionary with the following fields:
- indicator: A specific developmental behavior or learning objective the child should demonstrate.
- skill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).
- subskill: A narrower focus within the skill.
- goal: The learning goal for the child.
- age_group: Always 0–5.
- stage: A developmental stage (typically 0).
- context_template: A short phrase describing the activity type or context (e.g., "Compare size/quantity", "Take turns", "Retell a story").
- word_list: A word (and its part of speech) that must always serve as a general inspiration for the exchange (e.g., ["bee", "Noun"] or ["run", "Verb"]). This must meaningfully inspire the prompt or response.

Your ta

In [19]:
type = "instruct"
stage = 0
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/prompt_v6.json", "w") as f:
    json.dump(new_ins, f, indent=4)

### Stage 1

In [12]:
num_gpus = 9
stage = 1
type = "context"
chunk_size = len(seed_data[stage][type]) // num_gpus
chunks = [seed_data[stage][type][i:i + chunk_size] for i in range(0, len(seed_data[stage][type]), chunk_size)]
chunk_metadata = {}
for i, chunk in enumerate(chunks):
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/chunk_{i}.jsonl", "w") as f:
        json.dump(chunk, f)
    chunk_metadata[f"c_{i}"] = {
        "start": i * chunk_size,
        "end": (i + 1) * chunk_size,
        "size": len(chunk)
    }
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/metadata_chunks.jsonl", "w") as f:
    json.dump(chunk_metadata, f)

In [13]:
len(seed_data[stage][type]), len(chunks), chunk_size

(4038196, 10, 448688)

In [16]:
num_gpus = 11
stage = 1
type = "instruct"
chunk_size = len(seed_data[stage][type]) // num_gpus
chunks = [seed_data[stage][type][i:i + chunk_size] for i in range(0, len(seed_data[stage][type]), chunk_size)]
chunk_metadata = {}
for i, chunk in enumerate(chunks):
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/chunk_{i}.jsonl", "w") as f:
        json.dump(chunk, f)
    chunk_metadata[f"c_{i}"] = {
        "start": i * chunk_size,
        "end": (i + 1) * chunk_size,
        "size": len(chunk)
    }
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/metadata_chunks.jsonl", "w") as f:
    json.dump(chunk_metadata, f)

In [17]:
len(seed_data[stage][type]), len(chunks), chunk_size

(4120136, 12, 374557)

In [14]:
simple_context_prompts = {
    "system": "You are an AI model generating training data to help language models simulate human developmental skills at various stages (especially for early childhood development).\n\nYour task is to create simple, engaging texts based on provided developmental indicators, skills, and a tuple of word and its part of speech.\n\nStrictly follow these guidelines:\n\n1. **Context Generation:**\n   - Use the provided word and its part of speech to create a meaningful, developmentally appropriate topic.\n   - **Ensure the selected word and expanded topic fit the required Text Type Template (context_template).**\n   - Expand the selected word into a more detailed, skill-aligned topic.\n   - Generate a rich, complete, and engaging text matching the provided context template (e.g., narrative retelling, descriptive explanation).\n   - The generated text should be **between 250 and 500 words**.\n   - The text must clearly align with the skill, subskill, goal, and indicator.\n   - The selected word does not need to explicitly appear in the final text.\n\n2. **Writing Style:**\n   - Use simple vocabulary and sentence structures appropriate for the developmental stage.\n   - Include actions, feelings, interactions, and details to make the text rich and lively.\n   - Avoid overly abstract or culturally specific references unless appropriate for the age group.\n\n3. **Output Format:** Strictly return the output in the following JSON structure:\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"generated_text\": \"<generated text between 250 and 500 words>\"\n}}\n```\nOnly output the JSON. No additional commentary.",
    "user": "Generate a rich and engaging context text based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- (Word, Part of speech): {word_list}\n\nInstructions:\n- Expand the selected word into a skill-relevant topic **that fits the Text Type Template**.\n- Generate a detailed text of **250–500 words** following the context template.\n- Enrich the text with actions, emotions, and interactions.\n\nOutput strictly in this format:\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"generated_text\": \"<generated text between 250 and 500 words>\"\n}}\n```"
}
print(simple_context_prompts['system'])
print("____________________________________")
print(simple_context_prompts['user'])

You are an AI model generating training data to help language models simulate human developmental skills at various stages (especially for early childhood development).

Your task is to create simple, engaging texts based on provided developmental indicators, skills, and a tuple of word and its part of speech.

Strictly follow these guidelines:

1. **Context Generation:**
   - Use the provided word and its part of speech to create a meaningful, developmentally appropriate topic.
   - **Ensure the selected word and expanded topic fit the required Text Type Template (context_template).**
   - Expand the selected word into a more detailed, skill-aligned topic.
   - Generate a rich, complete, and engaging text matching the provided context template (e.g., narrative retelling, descriptive explanation).
   - The generated text should be **between 250 and 500 words**.
   - The text must clearly align with the skill, subskill, goal, and indicator.
   - The selected word does not need to explic

In [15]:
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/prompt.json", "w") as f:
    json.dump(simple_context_prompts, f, indent=4)

In [7]:
instruction_response_prompts = {
    "system": "You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.\n\nYour task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.\n\nStrictly follow these guidelines:\n\n1. Skill Alignment:\n   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.\n   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.\n   - The response should mimic the complexity of the language used by a child of that age group and stage.\n\n2. Using the Word, Part-of-speech tuple:\n   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.\n   - Expand the word into a topic relevant to the skill and age group.\n   - Ensure the selected word and expanded topic fit the required Text Type Template (instruct_template).\n   - Neither the instruction nor the response need not use the word, but the context should logically connect to it.\n\n3. Language Style:\n   - Keep vocabulary simple and concrete, matching the given age group and stage.\n\n4. Output Format:\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
    "user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- (Word, Part of speech): {word_list}\n\nInstructions:\n- Expand the selected word into a skill-relevant topic **that fits the Text Type Template**.\n- Generate an instruction based on the topic, targeting the skill/indicator.\n- Generate a response that demonstrates correct behavior aligned with the skill/indicator.\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
print(instruction_response_prompts['system'])
print("____________________________________")
print(instruction_response_prompts['user'])
type = "instruct"
stage = 1
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/prompt.json", "w") as f:
    json.dump(instruction_response_prompts, f, indent=4)

You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.

Your task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.

Strictly follow these guidelines:

1. Skill Alignment:
   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.
   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.
   - The response should mimic the complexity of the language used by a child of that age group and stage.

2. Using the Word, Part-of-speech tuple:
   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.
   - Expand the word into a topic relevant to the skill and age group.
   - Ensure the selected word and expanded to

In [4]:
new_ins = {
    "system": "You are an expert in childhood education and developmental psychology. Your task is to generate realistic instruction\u2013response pairs between a teacher and an early elementary aged child (6 years old or stage 1 of 5-11 years old), based on early learning indicators and goals.\n\nYou will be given a dictionary with the following fields:\n- indicator: A specific developmental behavior or learning objective the child should demonstrate.\n- skill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).\n- subskill: A narrower focus within the skill.\n- goal: The learning goal for the child.\n- age_group: Always 5-11.\n- stage: A developmental stage (typically 1).\n- context_template: A short phrase describing the activity type or context (e.g., \"Compare size/quantity\", \"Take turns\", \"Retell a story\").\n- word_list: A word (and its part of speech) that must always serve as a general inspiration for the exchange (e.g., [\"bee\", \"Noun\"] or [\"run\", \"Verb\"]). This must meaningfully inspire the prompt or response.\n\nYour task is to:\n1. Write a short teacher prompt that invites the child to think, reflect, observe, express, or act in a way that helps them demonstrate the given indicator.\n2. Write a natural-sounding 6-year-old child response that clearly shows the child demonstrating the indicator, goal, and skill through words.\n3. The entire exchange must be purely verbal \u2013 do not reference physical objects, gestures, or actions such as \"pretends to flip a switch\".\n4. Do not use any italic, bold, or markdown styling (*like this*).\n5. Use the word_list as a soft prompt \u2013 it must always meaningfully influence the topic of the dialogue (but not necessarily appear verbatim).\n6. Do not start all prompts with \"Imagine...\" \u2013 vary your phrasing. Use imagination only where natural, not by default.\n7. Avoid assumptions about prior events. If a task involves recalling something (e.g., \"What happened yesterday?\"), include that information in the instruction. Do not assume past events unless explicitly described.\n8. Use developmentally appropriate language, tone, and sentence length. The child\u2019s answer should sound like something a real 6-year-old might say.\n9. Avoid meta-language or explanations. Just produce the conversational pair.\n\nNote:\n- Entire exchange should utilize only text-based cues \u2013 no pictures, physical objects, or sensory materials.\n- The child\u2019s response should be plausible and verifiable based on either the text given or common world knowledge that a 6-year-old could reasonably have.\n- Avoid responses that make arbitrary or subjective claims unless they reflect typical personal experiences (e.g., \"I like red more\" is fine, \"red is better than yellow\" is not, unless justified in the prompt).\n\nAcceptable responses may include:\n- Facts stated or implied by the prompt.\n- Inferences from general child-level world knowledge (e.g., waves break sandcastles, bees fly, a ball can bounce).\n- Personal experiences that are common for children (e.g., \"I felt happy when I played with my ball\").\n\nAvoid:\n- Imaginary details that require visuals or unspecified context to verify (e.g., \"the red bee is faster than the yellow bee\").\n- Fantastical or inconsistent events unless the prompt clearly allows or invites storytelling.\n- Descriptions of non-verbal actions, such as \"nods\", \"smiles\", or \"runs in place\".\n\nStrictly follow the Output Format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
    "user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- word_list: {word_list}\n\nOutput strictly in this format:\n\n```json\n{{\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
print(new_ins['system'])
print("____________________________________")
print(new_ins['user'])

You are an expert in childhood education and developmental psychology. Your task is to generate realistic instruction–response pairs between a teacher and an early elementary aged child (6 years old or stage 1 of 5-11 years old), based on early learning indicators and goals.

You will be given a dictionary with the following fields:
- indicator: A specific developmental behavior or learning objective the child should demonstrate.
- skill: The broader area of development (e.g., Social-Emotional, Language, Scientific Reasoning).
- subskill: A narrower focus within the skill.
- goal: The learning goal for the child.
- age_group: Always 5-11.
- stage: A developmental stage (typically 1).
- context_template: A short phrase describing the activity type or context (e.g., "Compare size/quantity", "Take turns", "Retell a story").
- word_list: A word (and its part of speech) that must always serve as a general inspiration for the exchange (e.g., ["bee", "Noun"] or ["run", "Verb"]). This must mea

In [5]:
type = "instruct"
stage = 1
import json
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/prompt_v1.json", "w") as f:
    json.dump(new_ins, f, indent=4)

In [None]:
#dnn4
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /datadrive/pavan/az_storage/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /datadrive/pavan/az_storage/data_unorganized/stages/stage1/seed/instruct/chunk_0.jsonl -o /datadrive/pavan/CurLL/data/temp_stages/ins/chunk_0.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=2,3 python general_generation.py -p /datadrive/pavan/az_storage/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /datadrive/pavan/az_storage/data_unorganized/stages/stage1/seed/instruct/chunk_1.jsonl -o /datadrive/pavan/CurLL/data/temp_stages/ins/chunk_1.jsonl -t 1.0
#dnn3
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /datadrive/pavan/az_storage/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /datadrive/pavan/az_storage/data_unorganized/stages/stage1/seed/instruct/chunk_2.jsonl -o /datadrive/pavan/CurLL/data/temp_stages/ins/chunk_2.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=2,3 python general_generation.py -p /datadrive/pavan/az_storage/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /datadrive/pavan/az_storage/data_unorganized/stages/stage1/seed/instruct/chunk_3.jsonl -o /datadrive/pavan/CurLL/data/temp_stages/ins/chunk_3.jsonl -t 1.0
#octovc
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/chunk_4.jsonl -o /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/raw/instruct/chunk_4.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=2,3 python general_generation.py -p /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/chunk_5.jsonl -o /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/raw/instruct/chunk_5.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=4,5 python general_generation.py -p /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/chunk_6.jsonl -o /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/raw/instruct/chunk_6.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=6,7 python general_generation.py -p /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/chunk_7.jsonl -o /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/raw/instruct/chunk_7.jsonl -t 1.0
#4a100
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /scratch/azureml/cr/j/2f3ba0f218724c749be90a644b6b4033/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /scratch/azureml/cr/j/2f3ba0f218724c749be90a644b6b4033/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/chunk_8.jsonl -o /scratch/azureml/cr/j/2f3ba0f218724c749be90a644b6b4033/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/raw/instruct/chunk_8.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=2,3 python general_generation.py -p /scratch/azureml/cr/j/2f3ba0f218724c749be90a644b6b4033/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /scratch/azureml/cr/j/2f3ba0f218724c749be90a644b6b4033/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/chunk_9.jsonl -o /scratch/azureml/cr/j/2f3ba0f218724c749be90a644b6b4033/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/raw/instruct/chunk_9.jsonl -t 1.0
#2a100s
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /scratch/azureml/cr/j/c55299a810684176805f69ce4d35795f/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/prompt_v1.json -s /scratch/azureml/cr/j/c55299a810684176805f69ce4d35795f/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/seed/instruct/chunk_10.jsonl -o /scratch/azureml/cr/j/c55299a810684176805f69ce4d35795f/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage1/raw/instruct/chunk_10.jsonl -t 1.0


### Stage2

In [None]:
num_gpus = 6
stage = 2
chunk_size = len(seed_data_c[stage]) // num_gpus
chunks = [seed_data_c[stage][i:i + chunk_size] for i in range(0, len(seed_data_c[stage]), chunk_size)]
chunk_metadata = {}
for i, chunk in enumerate(chunks):
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/context/c_{i}_seed.jsonl", "w") as f:
        json.dump(chunk, f)
    chunk_metadata[f"c_{i}"] = {
        "start": i * chunk_size,
        "end": (i + 1) * chunk_size,
        "size": len(chunk)
    }
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/context/c_seed_metadata.json", "w") as f:
    json.dump(chunk_metadata, f)

In [80]:
num_gpus = 6
stage = 2
chunk_size = len(seed_data_ins[stage]) // num_gpus
chunks = [seed_data_ins[stage][i:i + chunk_size] for i in range(0, len(seed_data_ins[stage]), chunk_size)]
chunk_metadata = {}
for i, chunk in enumerate(chunks):
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/instruct/ins_{i}_seed.jsonl", "w") as f:
        json.dump(chunk, f)
    chunk_metadata[f"c_{i}"] = {
        "start": i * chunk_size,
        "end": (i + 1) * chunk_size,
        "size": len(chunk)
    }
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/instruct/ins_seed_metadata.json", "w") as f:
    json.dump(chunk_metadata, f)

In [None]:
#dnn4 (working)
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/instruct/chunk_0.jsonl -o /datadrive/pavan/CurLL/data/temp_stages/ins/chunk_0.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=2,3 python general_generation.py -p /datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/instruct/chunk_1.jsonl -o /datadrive/pavan/CurLL/data/temp_stages/ins/chunk_1.jsonl -t 1.0
#dnn3 (working)
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/instruct/chunk_2.jsonl -o /datadrive/pavan/CurLL/data/temp_stages/ins/chunk_2.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=2,3 python general_generation.py -p /datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/instruct/chunk_3.jsonl -o /datadrive/pavan/CurLL/data/temp_stages/ins/chunk_3.jsonl -t 1.0
#quickdevvc (do again) -- got destroyed
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_4.jsonl -o /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_4.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=2,3 python general_generation.py -p /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_5.jsonl -o /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_5.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=4,5 python general_generation.py -p /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_6.jsonl -o /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_6.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=6,7 python general_generation.py -p /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_7.jsonl -o /scratch/azureml/cr/j/ef717686936a41448973873b81f1a807/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_7.jsonl -t 1.0
#octovc1 (working)
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_8.jsonl -o /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_8.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=2,3 python general_generation.py -p /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_9.jsonl -o /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_9.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=4,5 python general_generation.py -p /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_10.jsonl -o /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_10.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=6,7 python general_generation.py -p /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_11.jsonl -o /scratch/azureml/cr/j/4b1df45904d0482bb3c3b778f6ce12ff/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_11.jsonl -t 1.0
#octovc2 (do again) -- not able to run inference
CUDA_VISIBLE_DEVICES=0,1 python general_generation.py -p /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_12.jsonl -o /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_12.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=2,3 python general_generation.py -p /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_13.jsonl -o /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_13.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=4,5 python general_generation.py -p /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_14.jsonl -o /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_14.jsonl -t 1.0
CUDA_VISIBLE_DEVICES=6,7 python general_generation.py -p /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/prompt_v6.json -s /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/seed/instruct/chunk_15.jsonl -o /scratch/azureml/cr/j/baf9810c2cc14bbaa8b98056b303e9e3/cap/data-capability/wd/INPUT_asdf/data_unorganized/stages/stage0/raw/instruct/chunk_15.jsonl -t 1.0
#2a100s (working)
#4a100s (working)


## Prompts

In [27]:
simple_context_prompts = {
    "system": "You are an AI model generating training data to help language models simulate human developmental skills at various stages (especially for early childhood development).\n\nYour task is to create simple, engaging texts based on provided developmental indicators, skills, and a tuple of word and its part of speech.\n\nStrictly follow these guidelines:\n\n1. **Context Generation:**\n   - Use the provided word and its part of speech to create a meaningful, developmentally appropriate topic.\n   - **Ensure the selected word and expanded topic fit the required Text Type Template (context_template).**\n   - Expand the selected word into a more detailed, skill-aligned topic.\n   - Generate a rich, complete, and engaging text matching the provided context template (e.g., narrative retelling, descriptive explanation).\n   - The generated text should be **between 250 and 500 words**.\n   - The text must clearly align with the skill, subskill, goal, and indicator.\n   - The selected word does not need to explicitly appear in the final text.\n\n2. **Writing Style:**\n   - Use simple vocabulary and sentence structures appropriate for the developmental stage.\n   - Include actions, feelings, interactions, and details to make the text rich and lively.\n   - Avoid overly abstract or culturally specific references unless appropriate for the age group.\n\n3. **Output Format:** Strictly return the output in the following JSON structure:\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"generated_text\": \"<generated text between 250 and 500 words>\"\n}}\n```\nOnly output the JSON. No additional commentary.",
    "user": "Generate a rich and engaging context text based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- (Word, Part of speech): {word_list}\n\nInstructions:\n- Expand the selected word into a skill-relevant topic **that fits the Text Type Template**.\n- Generate a detailed text of **250–500 words** following the context template.\n- Enrich the text with actions, emotions, and interactions.\n\nOutput strictly in this format:\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"generated_text\": \"<generated text between 250 and 500 words>\"\n}}\n```"
}
print(simple_context_prompts['system'])
print("____________________________________")
print(simple_context_prompts['user'])

You are an AI model generating training data to help language models simulate human developmental skills at various stages (especially for early childhood development).

Your task is to create simple, engaging texts based on provided developmental indicators, skills, and a tuple of word and its part of speech.

Strictly follow these guidelines:

1. **Context Generation:**
   - Use the provided word and its part of speech to create a meaningful, developmentally appropriate topic.
   - **Ensure the selected word and expanded topic fit the required Text Type Template (context_template).**
   - Expand the selected word into a more detailed, skill-aligned topic.
   - Generate a rich, complete, and engaging text matching the provided context template (e.g., narrative retelling, descriptive explanation).
   - The generated text should be **between 250 and 500 words**.
   - The text must clearly align with the skill, subskill, goal, and indicator.
   - The selected word does not need to explic

In [28]:
with open("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/context/prompt.json", "w") as f:
    json.dump(simple_context_prompts, f, indent=4)

In [3]:
instruction_response_prompts = {
    "system": "You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.\n\nYour task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.\n\nGuidelines:\n\n1. Skill Alignment:\n   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.\n   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.\n   - The response should not pretend to be a child; it should simply model the appropriate behavior.\n\n2. Using the Word, Part-of-speech tuple:\n   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.\n   - Expand the word into a topic relevant to the skill and age group.\n   - Neither the instruction nor the response must explicitly use the word, but the context should logically connect to it.\n\n3. Language Style:\n   - Keep vocabulary simple and concrete, matching the given age group and stage.\n   - Instructions should feel natural and actionable for the developmental level, even if a model is responding.\n\n4. Output Format:\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
    "user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- (Word, Part of speech): {word_list}\n\nInstructions:\n- Expand the word into a skill-relevant topic.\n- Generate an instruction based on the topic, targeting the skill/indicator.\n- Generate a response that demonstrates correct behavior aligned with the skill/indicator.\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
print(instruction_response_prompts['system'])
print("____________________________________")
print(instruction_response_prompts['user'])

You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.

Your task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.

Guidelines:

1. Skill Alignment:
   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.
   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.
   - The response should not pretend to be a child; it should simply model the appropriate behavior.

2. Using the Word, Part-of-speech tuple:
   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.
   - Expand the word into a topic relevant to the skill and age group.
   - Neither the instruction nor the response must explicitly use the wor

In [6]:
with open("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/instruct/prompt.json", "w") as f:
    json.dump(instruction_response_prompts, f, indent=4)

In [1]:
import json

In [2]:
with open("/datadrive/pavan/CurLL/data/raw/instruct/ins_0_seed.jsonl", "r") as f:
    data = json.load(f)

print(data[0])

{'output': {'selected_word': 'rainstorm', 'selected_word_pos': 'Noun', 'expanded_topic': 'Discussing a loud and startling weather event', 'instruction': "I'm telling you about the big storm we had last night. It was very loud, and there were bright flashes. What do you think about that?", 'response': 'Oh wow! That sounds... surprising. I think I would want to be close to someone if I heard that.'}, 'id': 'i0', 'indicator': 'Uses verbal and non-verbal signals appropriately to acknowledge the comments or questions of others.', 'skill': 'Language and Communication', 'subskill': 'Attending and Understanding', 'goal': 'Child attends to communication and language from others.', 'age_group': '0-5', 'stage': 0, 'context_template': 'Describe feeling outcomes', 'word_list': [['scared', 'Adjective'], ['guy', 'Noun'], ['rainstorm', 'Noun'], ['cake', 'Noun'], ['jelly', 'Noun'], ['library', 'Noun'], ['lightning', 'Noun'], ['whisper', 'Verb'], ['walk', 'Verb'], ['cheer', 'Verb']]}


In [6]:
i = 100
print("Instruction: ", data[i]['output']["instruction"])
print("Response: ", data[i]['output']["response"])

Instruction:  I'm telling you about the dog I saw at the park. He was very fluffy and brown! What color did I say the dog was?
Response:  Brown!


In [1]:
instruction_response_prompts = {
    "system": "You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.\n\nYour task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.\n\nStrictly follow these guidelines:\n\n1. Skill Alignment:\n   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.\n   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.\n   - The response should mimic the complexity of the language used by a child of that age group and stage.\n\n2. Using the Word, Part-of-speech tuple:\n   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.\n   - Expand the word into a topic relevant to the skill and age group.\n   - Ensure the selected word and expanded topic fit the required Text Type Template (instruct_template).\n   - Neither the instruction nor the response need not use the word, but the context should logically connect to it.\n\n3. Language Style:\n   - Keep vocabulary simple and concrete, matching the given age group and stage.\n\n4. Output Format:\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```\n\nDo not add any commentary or explanations. Only output the JSON.",
    "user": "Generate an instruction-response pair based on the following input:\n\n- id: {id}\n- Indicator: {indicator}\n- Skill: {skill}\n- Sub-skill: {subskill}\n- Goal: {goal}\n- Age Group: {age_group}\n- Stage: {stage}\n- Text Type Template: {context_template}\n- (Word, Part of speech): {word_list}\n\nInstructions:\n- Expand the selected word into a skill-relevant topic **that fits the Text Type Template**.\n- Generate an instruction based on the topic, targeting the skill/indicator.\n- Generate a response that demonstrates correct behavior aligned with the skill/indicator.\n\nOutput strictly in this format:\n\n```json\n{{\n    \"expanded_topic\": \"<expanded topic>\",\n    \"instruction\": \"<instruction>\",\n    \"response\": \"<response>\"\n}}\n```"
}
print(instruction_response_prompts['system'])
print("____________________________________")
print(instruction_response_prompts['user'])

You are an AI model generating training instruction-response pairs to help language models simulate human developmental skills across different stages.

Your task is to create high-quality instruction-response pairs based on a provided developmental indicator, skill, and a tuple of word and its part of speech.

Strictly follow these guidelines:

1. Skill Alignment:
   - The instruction must be a realistic situation that directly invites the model to demonstrate the specified skill and indicator.
   - The response must fully demonstrate the expected behavior according to the skill and indicator, in a simple and clear way.
   - The response should mimic the complexity of the language used by a child of that age group and stage.

2. Using the Word, Part-of-speech tuple:
   - Use the word and its part of speech tag to build a realistic, developmentally appropriate situation.
   - Expand the word into a topic relevant to the skill and age group.
   - Ensure the selected word and expanded to