In [1]:
import json
import re
from tqdm import tqdm

In [2]:
def load_data(metadata_path, seed=False):
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
    num_files = len(metadata)
    data = []
    for i in tqdm(range(num_files)):
        file_name = f"chunk_{i}.jsonl"
        chunk_path = "/".join(metadata_path.split("/")[:-1] + [file_name]).replace("seed", "raw")
        # print(chunk_path)
        with open(chunk_path, "r") as f:
            chunk_data = json.load(f)
            # print(len(chunk_data))
        data.extend(chunk_data)
    print("Total samples: ",len(data))
    return data

def output_dict_check(data_list):
    good_list = [o for o in data_list if isinstance(o['output'], dict)]
    bad_list = [o for o in data_list if not isinstance(o['output'], dict)]
    print("Total samples: ",len(data_list))
    print("Good samples: ",len(good_list))
    print("Bad samples: ",len(bad_list))
    return good_list, bad_list

def extract_json_from_string(text):
    # Use regex to extract the JSON block between ```json and ```
    match = re.search(r"```json\n({.*?})\n```", text, re.DOTALL)
    if not match:
        raise ValueError("No valid JSON block found in the input string.")
    
    json_str = match.group(1)
    json_str = json_str.replace("{{", "{")
    json_str = json_str.replace("}}", "}")

    # Parse the JSON string
    try:
        data = json.loads(json_str)
        return data
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format: {e}")

def validate_instruction_response(dict_instance, output_format):
    required_keys = {}
    for k in output_format:
        required_keys[k] = type(output_format[k])
    for key, expected_type in required_keys.items():
        if key not in dict_instance:
            return False
    return True

def check_json_struct(data_list, prompt_path):
    with open(prompt_path, "r") as f:
        prompt = json.load(f)
    output_format = extract_json_from_string(prompt['user'])
    print(output_format)
    good_list = [o for o in data_list if validate_instruction_response(o['output'], output_format)]
    bad_list = [o for o in data_list if not validate_instruction_response(o['output'], output_format)]
    print("Total samples: ",len(data_list))
    print("Good samples: ",len(good_list))
    print("Bad samples: ",len(bad_list))
    return good_list, bad_list

def make_correction_prompt(prompt_path):
    with open(prompt_path, "r") as f:
        prompt = json.load(f)
    match = re.search(r"```json\n({.*?})\n```", prompt['user'], re.DOTALL)
    if not match:
        raise ValueError("No valid JSON block found in the input string.")
    json_str = match.group(1)
    print(json_str)
    return json_str.strip()


## Instructions

In [64]:
stage0 = load_data("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/context/metadata_chunks.jsonl")

  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 12/12 [00:31<00:00,  2.60s/it]

Total samples:  3006990





In [65]:
s0_g0, s0_b0 = output_dict_check(stage0)

Total samples:  3006990
Good samples:  2929377
Bad samples:  77613


In [66]:
s0_g1, s0_b1 = check_json_struct(s0_g0, "/datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/context/prompt.json")

{'expanded_topic': '<expanded topic>', 'generated_text': '<generated text between 250 and 500 words>'}


Total samples:  2929377
Good samples:  2929087
Bad samples:  290


In [67]:
s0_bad_list = s0_b0 + s0_b1

In [32]:
s0_bad_list = s0_b0 + s0_b1
with open("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/raw/context/bad_list.jsonl", "w") as f:
    json.dump(s0_bad_list, f, indent=4)
with open("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/raw/context/good_list.jsonl", "w") as f:
    json.dump(s0_g1, f, indent=4)

In [33]:
len(s0_g1), len(s0_bad_list), len(s0_g1) + len(s0_bad_list)

(2929087, 77903, 3006990)

In [43]:
correct_format = make_correction_prompt("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/context/prompt.json")

{{
    "expanded_topic": "<expanded topic>",
    "generated_text": "<generated text between 250 and 500 words>"
}}


In [37]:
prompt = {"system": "You are a JSON repair assistant. The input will always be invalid JSON with possible issues like: unescaped characters, missing or extra quotes, incorrect key-value syntax, mismatched brackets, or trailing commas. Your task is to fix all such errors and return a valid JSON. Do not explain anything. Just return the corrected JSON in the format specified by the user.",
          "user": "You are a helpful assistant. Please correct the JSON below to make it valid JSON. The JSON is always invalid and may have issues like escaping errors, incorrect or missing key names, extra or missing brackets or quotes, or malformed structure. Fix all these issues completely.\n\nInput:\n\n{output_str}\n\nYour response must follow this exact format — nothing more, nothing less:\n\n```json\n{correct_format}\n```"}
print(prompt["system"])
print("____________________________")
print(prompt["user"])

You are a JSON repair assistant. The input will always be invalid JSON with possible issues like: unescaped characters, missing or extra quotes, incorrect key-value syntax, mismatched brackets, or trailing commas. Your task is to fix all such errors and return a valid JSON. Do not explain anything. Just return the corrected JSON in the format specified by the user.
____________________________
You are a helpful assistant. Please correct the JSON below to make it valid JSON. The JSON is always invalid and may have issues like escaping errors, incorrect or missing key names, extra or missing brackets or quotes, or malformed structure. Fix all these issues completely.

Input:

{output_str}

Your response must follow this exact format — nothing more, nothing less:

```json
{correct_format}
```


In [68]:
s0_bad_list[0]

{'output': '```json\n{\n    "expanded_topic": "A little bear cub learns about listening and responding to his mother\'s directions while exploring the forest.",\n    "generated_text": "Barnaby the bear cub was a very busy little bear! He loved exploring. One sunny morning, Mama Bear said, “Barnaby, please stay close while we look for berries. And Barnaby, *listen* when I call your name!”\\n\\nBarnaby nodded, but soon, a bright blue butterfly fluttered past. “Ooh!” he squealed, and ran after it. He didn\'t look back. He didn’t hear Mama Bear calling, “Barnaby! Barnaby, where are you?”\\n\\nHe chased the butterfly deeper and deeper into the woods. He bumped into a big tree and then splashed through a puddle! It was very fun, but he was all alone. Barnaby started to feel a little bit scared.\\n\\n“Barnaby!” Mama Bear’s voice sounded far away. Barnaby stopped. He tilted his head, listening very carefully. He heard her again, “Barnaby! This way!”\\n\\nHe turned and ran towards the sound of 

In [69]:
bad_seed_list = []
for i in range(len(s0_bad_list)):
    #change the key output to output_prev
    s0_bad_list[i]['output_prev'] = s0_bad_list[i]['output']
    s0_bad_list[i].pop('output')
    bad_seed_list.append({"output_str": s0_bad_list[i]['output_prev'],
                          "correct_format": correct_format,
                          **s0_bad_list[i]})

In [70]:
bad_seed_list[0]

{'output_str': '```json\n{\n    "expanded_topic": "A little bear cub learns about listening and responding to his mother\'s directions while exploring the forest.",\n    "generated_text": "Barnaby the bear cub was a very busy little bear! He loved exploring. One sunny morning, Mama Bear said, “Barnaby, please stay close while we look for berries. And Barnaby, *listen* when I call your name!”\\n\\nBarnaby nodded, but soon, a bright blue butterfly fluttered past. “Ooh!” he squealed, and ran after it. He didn\'t look back. He didn’t hear Mama Bear calling, “Barnaby! Barnaby, where are you?”\\n\\nHe chased the butterfly deeper and deeper into the woods. He bumped into a big tree and then splashed through a puddle! It was very fun, but he was all alone. Barnaby started to feel a little bit scared.\\n\\n“Barnaby!” Mama Bear’s voice sounded far away. Barnaby stopped. He tilted his head, listening very carefully. He heard her again, “Barnaby! This way!”\\n\\nHe turned and ran towards the sound

In [72]:
num_gpus = 4
stage = 0
type = "context"
chunk_size = len(bad_seed_list) // num_gpus
chunks = [bad_seed_list[i:i + chunk_size] for i in range(0, len(bad_seed_list), chunk_size)]
chunk_metadata = {}
for i, chunk in enumerate(chunks):
    with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/bad_chunk_{i}.jsonl", "w") as f:
        json.dump(chunk, f)
    chunk_metadata[f"c_{i}"] = {
        "start": i * chunk_size,
        "end": (i + 1) * chunk_size,
        "size": len(chunk)
    }
with open(f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/{type}/bad_metadata_chunks.jsonl", "w") as f:
    json.dump(chunk_metadata, f)
# with open("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/context/bad_seed_list.jsonl", "w") as f:
#     json.dump(bad_seed_list, f, indent=4)
# with open("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/context/bad_list_prompt.json", "w") as f:
#     json.dump(prompt, f, indent=4)

## Clean corrected

In [4]:
def load_data_corrected(metadata_path, seed=False):
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
    num_files = len(metadata)
    data = []
    for i in tqdm(range(num_files)):
        file_name = f"bad_chunk_{i}.jsonl"
        chunk_path = "/".join(metadata_path.split("/")[:-1] + [file_name]).replace("seed", "raw")
        # print(chunk_path)
        with open(chunk_path, "r") as f:
            chunk_data = json.load(f)
            # print(len(chunk_data))
        data.extend(chunk_data)
    print("Total samples: ",len(data))
    return data

In [30]:
stage0 = load_data_corrected("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/seed/context/bad_metadata_chunks.jsonl")

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:03<00:00,  1.55it/s]

Total samples:  77903





In [31]:
s0_g0, s0_b0 = output_dict_check(stage0)

Total samples:  77903
Good samples:  49594
Bad samples:  28309


In [32]:
import re

def extract_key_values(s, keys):
    entry = {}
    for i, key in enumerate(keys):
        # Regex pattern: look for "key": and capture everything until the next key or end
        pattern = rf'"{key}"\s*:\s*(.*?)\s*(?="{keys[i+1]}"\s*:|$)' if i + 1 < len(keys) else rf'"{key}"\s*:\s*(.*)'
        match = re.search(pattern, s, re.DOTALL)
        if match:
            value = match.group(1).strip().strip('",')
            entry[key] = value.strip("```").strip("\n").strip("}").strip("\n")
        else:
            entry[key] = None
    return entry

In [33]:
s0_b0[0]

{'output': '```json\n{\n    "expanded_topic": "Getting Ready for a Walk with Mommy",\n    "generated_text": "Mommy was getting ready to go for a walk! Leo watched her. First, Mommy put on her shoes. \\"Are you ready to go, Leo?\\" she asked. Leo bounced up and down and clapped his hands. He loves walks! \\n\\nMommy smiled. “Okay, let’s find your hat.” She held up a blue hat. “Will this one work?” Leo shook his head. He pointed to the red hat on the chair. “Red!” he said happily. Mommy put the red hat on his head. It was a little big, but Leo didn’t mind. He giggled.\\n\\nNext, Mommy looked for her scarf. “It’s chilly today,” she said. She wrapped the scarf around her neck. Then she reached for something else. It was her coat!  She held it up. “Hmm, is this the right color?” she asked, showing Leo the purple coat. Leo looked carefully. He tilted his head and then pointed to a different coat. It was blue! Mommy laughed. “You are right! The blue coat is perfect.”\\n\\nShe put on the blue 

In [34]:
for i in range(len(s0_b0)):
    s0_b0[i]['output'] = extract_key_values(s0_b0[i]['output'], ["expanded_topic", "generated_text"])

In [35]:
s0_b0[0]

{'output': {'expanded_topic': 'Getting Ready for a Walk with Mommy',
  'generated_text': 'Mommy was getting ready to go for a walk! Leo watched her. First, Mommy put on her shoes. \\"Are you ready to go, Leo?\\" she asked. Leo bounced up and down and clapped his hands. He loves walks! \\n\\nMommy smiled. “Okay, let’s find your hat.” She held up a blue hat. “Will this one work?” Leo shook his head. He pointed to the red hat on the chair. “Red!” he said happily. Mommy put the red hat on his head. It was a little big, but Leo didn’t mind. He giggled.\\n\\nNext, Mommy looked for her scarf. “It’s chilly today,” she said. She wrapped the scarf around her neck. Then she reached for something else. It was her coat!  She held it up. “Hmm, is this the right color?” she asked, showing Leo the purple coat. Leo looked carefully. He tilted his head and then pointed to a different coat. It was blue! Mommy laughed. “You are right! The blue coat is perfect.”\\n\\nShe put on the blue coat and zipped it 

In [36]:
c = 0
for i in range(len(s0_b0)):
    if s0_b0[i]['output']["expanded_topic"] is None or s0_b0[i]['output']["generated_text"] is None:
        c+=1

In [37]:
print(c)

0


In [38]:
corrected_list = s0_g0 + s0_b0
with open("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/raw/context/good_list.jsonl", "r") as f:
    good_list = json.load(f)
all_data = good_list + corrected_list
print(len(all_data))

3006990


In [40]:
with open("/datadrive/pavan/az_storage/data_unorganized/stages/stage0/raw/context/all_data.jsonl", "w") as f:
    json.dump(all_data, f, indent=4)

In [1]:
print("#"*40)

########################################


## Context

In [111]:
with open('/datadrive/pavan/az_storage/data_unorganized/seed_data/raw/c_2_seed.jsonl', 'r') as file:
    data = json.load(file)

In [112]:
# verify the number of samples in the data
len(data)

527700

In [113]:
data =[o for o in data if isinstance(o, dict)]

In [114]:
good_data = [o for o in data if isinstance(o['output'], dict)]
bad_data = [o for o in data if not isinstance(o['output'], dict)]
print(len(good_data), len(bad_data))

503696 24004


In [115]:
bad_data[0]

{'output': '```json\n{\n    "selected_word": "megaphone",\n    "selected_word_pos": "Noun",\n    "expanded_topic": "Describing the items needed for a school fair and how they are used, focusing on clear descriptions and punctuation to aid understanding.",\n    "generated_text": "Our school is having a fair! It’s going to be so much fun. We need lots of things to make it work. Let’s make a list!\n\nFirst, we need tables. Big, strong tables to sell all our yummy treats and cool toys. They need to be covered with bright, colorful cloths! Without tables, where would we put everything?\n\nNext, we need a **megaphone**! Mr. Peterson, the principal, will use it to tell everyone important things, like when the raffle will start. He’ll say, “Attention, everyone! The raffle will begin in five minutes!” A megaphone helps his voice travel far. It\'s red and a little bit scratchy when he speaks into it, but everyone can hear!\n\nThen, we’ll need balloons. Lots and lots of balloons! Red balloons, bl

In [116]:
import re

# Define the patterns for each field
PAT_SELECTED_WORD = re.compile(r'"selected_word":\s*"([^"]*)"')
PAT_SELECTED_POS = re.compile(r'"selected_word_pos":\s*"([^"]*)"')
PAT_TOPIC = re.compile(r'"expanded_topic":\s*"([^"]*)"')

def extract_fields_from_bad_string(text):
    try:
        # Extract the fields with regex
        fields = {
            "selected_word": PAT_SELECTED_WORD.search(text).group(1),
            "selected_word_pos": PAT_SELECTED_POS.search(text).group(1),
            "expanded_topic": PAT_TOPIC.search(text).group(1),
        }
        
        # Simple approach: Find the position after "generated_text": " 
        start_marker = '"generated_text": "'
        start_index = text.find(start_marker)
        
        if start_index != -1:
            # Position after the opening quote
            content_start = start_index + len(start_marker)
            
            
            # Look for the last quote before the closing brace of the JSON
            end_index = text.rfind('"', content_start, text.rfind('}'))
            
            if end_index != -1:
                generated_text = text[content_start:end_index]
                # Replace escaped characters
                fields["generated_text"] = generated_text.replace('\\n', '\n').replace('\\"', '"').replace('\\\'', "'")
                return fields
            else:
                end_index = text.rfind('}')
                generated_text = text[content_start:end_index]
                fields["generated_text"] = generated_text.replace('\\n', '\n').replace('\\"', '"').replace('\\\'', "'")
                return fields

        
        print("Could not extract generated_text field")
        return None
            
    except AttributeError as e:
        print(f"Error extracting fields: {e}")
        return None


In [120]:
unparse_data = []
parse_data = []

for s in tqdm(bad_data):
    entry = extract_fields_from_bad_string(s['output'])
    if entry:
        s['output'] = entry
        parse_data.append(s)
    else:
        unparse_data.append(s)

100%|██████████| 24004/24004 [00:00<00:00, 119525.40it/s]

Error extracting fields: 'NoneType' object has no attribute 'group'
Error extracting fields: 'NoneType' object has no attribute 'group'
Error extracting fields: 'NoneType' object has no attribute 'group'
Error extracting fields: 'NoneType' object has no attribute 'group'
Error extracting fields: 'NoneType' object has no attribute 'group'
Error extracting fields: 'NoneType' object has no attribute 'group'





In [140]:
v_data = parse_data + good_data
va_data = [o for o in v_data if isinstance(o['output'], dict)]
inva_data = [o for o in v_data if not isinstance(o['output'], dict)]
print(len(va_data), len(inva_data))

527694 0


In [141]:
def get_words(tuple_list):
    return [item[0] for item in tuple_list]

def get_pos(tuple_list):
    return [item[1] for item in tuple_list]

valid_data = [o for o in va_data if 'selected_word' in o['output']]
invalid_data = [o for o in va_data if 'selected_word' not in o['output']]
print(len(valid_data), len(invalid_data))
va_data = valid_data
valid_data = [o for o in va_data if o['output']['selected_word'] in get_words(o['word_list'])]
invalid_data = [o for o in va_data if o['output']['selected_word'] not in get_words(o['word_list'])]
print(len(valid_data), len(invalid_data))
va_data = valid_data
valid_data = [o for o in va_data if 'selected_word_pos' in o['output']]
invalid_data = [o for o in va_data if 'selected_word_pos' not in o['output']]
print(len(valid_data), len(invalid_data))
va_data = valid_data
valid_data = [o for o in va_data if o['output']['selected_word_pos'] in get_pos(o['word_list'])]
invalid_data = [o for o in va_data if o['output']['selected_word_pos'] not in get_pos(o['word_list'])]
print(len(valid_data), len(invalid_data))
va_data = valid_data

527626 68
527291 335
527241 50
526821 420


In [139]:
invalid_data[1]

{'output': {'selected_word': 'copy',
  'selected_word_pos': ' ',
  'expanded_topic': 'Comparing how different animals learn by copying others to help them survive – focusing on birds and their song.',
  'generated_text': "Have you ever heard a bird sing a beautiful song? Sometimes, those songs aren’t made up! Some birds *learn* songs by listening to their parents and other birds. It's like a game of copycat!\n\nOld Man Fitzwilliam, the birdwatcher, showed us two kinds of birds today. First, we saw a robin. He explained robins usually sing the same songs, just like their dads. Baby robins listen very carefully and try to copy the sounds. If they don’t quite get it right at first, they keep practicing! It’s how they learn to talk to other robins. Old Man Fitzwilliam said it’s like learning to tie your shoes – you copy what someone else does, and then you practice and practice.\n\nThen, we saw a mockingbird. These birds are *amazing*! They don't just copy their parents. They copy *lots* o

## Stage 1

In [1]:
import json
import re
from tqdm import tqdm

def load_data(metadata_path, seed=False):
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
    num_files = len(metadata)
    data = []
    for i in tqdm(range(num_files)):
        file_name = f"chunk_{i}.jsonl"
        chunk_path = "/".join(metadata_path.split("/")[:-1] + [file_name]).replace("seed", "raw")
        # print(chunk_path)
        with open(chunk_path, "r") as f:
            chunk_data = json.load(f)
            # print(len(chunk_data))
        data.extend(chunk_data)
    print("Total samples: ",len(data))
    return data

def output_dict_check(data_list):
    good_list = [o for o in data_list if isinstance(o['output'], dict)]
    bad_list = [o for o in data_list if not isinstance(o['output'], dict)]
    print("Total samples: ",len(data_list))
    print("Good samples: ",len(good_list))
    print("Bad samples: ",len(bad_list))
    return good_list, bad_list

def extract_json_from_string(text):
    # Use regex to extract the JSON block between ```json and ```
    match = re.search(r"```json\n({.*?})\n```", text, re.DOTALL)
    if not match:
        raise ValueError("No valid JSON block found in the input string.")
    
    json_str = match.group(1)
    json_str = json_str.replace("{{", "{")
    json_str = json_str.replace("}}", "}")

    # Parse the JSON string
    try:
        data = json.loads(json_str)
        return data
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format: {e}")

def validate_instruction_response(dict_instance, output_format):
    required_keys = {}
    for k in output_format:
        required_keys[k] = type(output_format[k])
    for key, expected_type in required_keys.items():
        if key not in dict_instance:
            return False
    return True

def check_json_struct(data_list, prompt_path):
    with open(prompt_path, "r") as f:
        prompt = json.load(f)
    output_format = extract_json_from_string(prompt['user'])
    print(output_format)
    good_list = [o for o in data_list if validate_instruction_response(o['output'], output_format)]
    bad_list = [o for o in data_list if not validate_instruction_response(o['output'], output_format)]
    print("Total samples: ",len(data_list))
    print("Good samples: ",len(good_list))
    print("Bad samples: ",len(bad_list))
    return good_list, bad_list

def extract_key_values(s, keys):
    entry = {}
    for i, key in enumerate(keys):
        # Regex pattern: look for "key": and capture everything until the next key or end
        pattern = rf'"{key}"\s*:\s*(.*?)\s*(?="{keys[i+1]}"\s*:|$)' if i + 1 < len(keys) else rf'"{key}"\s*:\s*(.*)'
        match = re.search(pattern, s, re.DOTALL)
        if match:
            value = match.group(1).strip().strip('",')
            entry[key] = value.strip("```").strip("\n").strip("}").strip("\n")
        else:
            entry[key] = None
    return entry

def fix_text(bad_list):
    for i in range(len(bad_list)):
        bad_list[i]['output'] = extract_key_values(bad_list[i]['output'], ["expanded_topic", "generated_text"])
    return bad_list

stage = 1
metadata_path = f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/context/metadata_chunks.jsonl"
prompt_path = f"/datadrive/pavan/az_storage/data_unorganized/stages/stage{stage}/seed/context/prompt.json"

data = load_data(metadata_path)
print("Total number of text snippets: ", len(data))

g0, b0 = output_dict_check(data)

print(b0[0])

g1, b1 = check_json_struct(g0, prompt_path)

bad_list = b0
print("Total number of bad samples: ", len(bad_list))

bad_list = fix_text(bad_list)


100%|██████████| 10/10 [00:51<00:00,  5.15s/it]


Total samples:  4038196
Total number of text snippets:  4038196
Total samples:  4038196
Good samples:  3890507
Bad samples:  147689
{'output': '```json\n{\n    "expanded_topic": "Describing a beach scene and identifying objects starting with different letters of the alphabet.",\n    "generated_text": "The beach was sunny and warm! Leo and Mia were building a big sandcastle. The sand felt soft and cool between their toes. They looked all around at everything they could see. \\n\\n\\"Look!\\" shouted Leo, pointing. \\"An **a**nc**h**or! I see an anchor on that boat way out in the water!\\" Mia giggled. \\n\\nShe pointed too. “And there’s a **b**ucket! We need a bucket to carry more water for our castle!” \\n\\nThey walked along the shore, their eyes scanning the sand. Leo picked up a shiny **c**onch shell. “Wow! This shell is pink inside!”\\n\\nMia found a beautiful, smooth **d**riftwood piece. \\"This will be the tower of our castle!\\" she said, carefully placing it on top of the wet s

In [3]:
for i in range(len(b1)):
    output_new = {}
    for k in b1[i]['output'].keys():
        key_name = k.lower()
        if "expanded" in key_name or "topic" in key_name:
            output_new["expanded_topic"] = b1[i]['output'][k]
        if "generated" in key_name or "text" in key_name:
            output_new["generated_text"] = b1[i]['output'][k]
    b1[i]['output'] = output_new

In [4]:
a, b = output_dict_check(b1)

Total samples:  572
Good samples:  572
Bad samples:  0


In [5]:
a, b = check_json_struct(a, prompt_path)

{'expanded_topic': '<expanded topic>', 'generated_text': '<generated text between 250 and 500 words>'}
Total samples:  572
Good samples:  572
Bad samples:  0
