# Read files

In [2]:
import json
import os
import pandas as pd
import random

file_path = "/Users/bytedance/Workspace/rlmf/data/RLMF 16052025_5_V1_105/RLMF 16052025_5_V1_105.json"

with open(file_path, 'r') as f:
    data = json.loads(f.read())

fields_to_extract = [
    'task id', 
    "model name",
    'user prompt', 
    # 'system prompt', 
    # 'intervene_system_prompt',
    # 'intervene prompt', 
    # 'ground_truth_answer', 
    'abandon_prompt',
    'abandon_prompt_reason', 
    # 'response', 
    # 'final_answer', 
    'temperature',
    'total_tokens',
    "intervention rounds", 
    'CoT quality', 
    'Model performance classification', 
    'remarks', 
    'level',
    'codeforces_submission_id',
    'programming_language',
]

all_processed_items = [] # To store data for the DataFrame

# sampled_data = random.sample(data, int(len(data) * 0.1))  # Sample 10% of the data randomly
# sampled_data = data[:10]  # Sample the first 10 items
# for d_item in sampled_data:  # Loop through the sampled items
for d_item in data:  # Loop through the sampled items
    processed_item_data = {}
    for field_name in fields_to_extract:
        value = d_item.get(field_name) # Use .get() for safe access

        processed_item_data[field_name] = value
        if field_name == "response":
            # replace all instances of <abandon>, <reason>, and <intervene> with demarcations for easier reading
            if isinstance(value, str):
                # Start with the original string for this field
                modified_value = value 
                
                # Replace <abandon> tags
                modified_value = modified_value.replace("<abandon>", f"\n\n{'='*5}ABANDON_START{'='*15}\n")
                modified_value = modified_value.replace("</abandon>", f"\n{'='*15}ABANDON_END{'='*5}\n\n")

                # Replace <reason> tags
                modified_value = modified_value.replace("<reason>", f"\n\n{'='*5}REASON_START{'='*15}\n")
                modified_value = modified_value.replace("</reason>", f"\n{'='*15}REASON_END{'='*5}\n\n")
                
                # Replace <intervene> tags
                modified_value = modified_value.replace("<intervene>", f"\n\n{'='*5}INTERVENE_START{'='*15}\n")
                modified_value = modified_value.replace("</intervene>", f"\n{'='*15}INTERVENE_END{'='*5}\n\n")

                # Update the dictionary with the fully modified string for "response"
                processed_item_data[field_name] = modified_value

        # if field_name == "remarks":
        #     if isinstance(value, str):
        #         try:
        #             # Attempt to fix single quotes and parse as JSON
        #             parsed_remark = json.loads(value.replace("'", '"'))
        #             processed_item_data[field_name] = parsed_remark
        #         except json.JSONDecodeError:
        #             # If parsing fails, store the original string
        #             processed_item_data[field_name] = value
        #         except Exception: # Catch any other potential errors during remark processing
        #             processed_item_data[field_name] = value # Fallback to original value
        #     else:
        #         # If remarks is not a string (e.g., None, or already a dict/list), store as is
        #         processed_item_data[field_name] = value
        # else:
        #     processed_item_data[field_name] = value
            
    all_processed_items.append(processed_item_data)

# Create DataFrame
df = pd.DataFrame(all_processed_items, columns=fields_to_extract)

# # Display the first 5 rows of the DataFrame
# print("DataFrame Head:")
# print(df.head())
# print("\nDataFrame Info:")
# df.info()

output_path = os.path.join(os.path.dirname(file_path), f"{os.path.splitext(os.path.basename(file_path))[0]}_processed.xlsx")
df.to_excel(output_path, index=False)

# The original printing logic for the first item (if still needed for debugging):
if all_processed_items:
    first_item = all_processed_items[0]
    print("Details of the first item:")
    for k, v in first_item.items():
        if k in fields_to_extract:
            if k == "remarks":
                try:
                    # Ensure v is a string before calling .replace()
                    if isinstance(v, str):
                        parsed_remarks = json.loads(v.replace("'", '"'))
                        print(f"# {k}:\n{json.dumps(parsed_remarks, indent=4)}")
                    else: # If v is not a string (e.g. None, or already parsed)
                        print(f"# {k}:\n{json.dumps(v, indent=4) if v is not None else 'None'}")
                except json.JSONDecodeError:
                    print(f"# {k}:\n{v} (Could not parse as JSON after replacing quotes)")
                except Exception as e:
                    print(f"# {k}:\n{v} (Error processing remarks: {e})")
            else:
                print(f"# {k}:\n{v}")
            print('-'*50)
    print('='*100)

Details of the first item:
# task id:
55376
--------------------------------------------------
# model name:
ep-20250312111804-p4w5h
--------------------------------------------------
# user prompt:
Vasya is sitting on an extremely boring math class. To have fun, he took a piece of paper and wrote out \\( n \\) numbers on a single line. After that, Vasya began to write out different ways to put pluses ("+") in the line between certain digits in the line so that the result was a correct arithmetic expression; formally, no two pluses in such a partition can stand together (between any two adjacent pluses there must be at least one digit), and no plus can stand at the beginning or the end of a line. For example, in the string 100500, ways 100500 (add no pluses), 1+00+500 or 10050+0 are correct, and ways 100++500, +1+0+0+5+0+0 or 100500+ are incorrect.\n\nThe lesson was long, and Vasya has written all the correct ways to place exactly \\( k \\) pluses in a string of digits. At this point, 

# Get metadata of the batch

In [None]:
import csv
import json
import pandas as pd
import os

# Define the new column mapping dictionary
# Keys are the (lowercase) first original names, values are the standardized names.
COLUMN_MAPPING = {
    'task id': 'task_id',
    'trainer id': 'trainer_id',
    'model name': 'model_name',
    'question_id': 'question_id',
    # 'user prompt': 'user_prompt',
    'level': 'level',
    'ACC': 'acc',
    # 'intervene prompt': 'intervene_prompt',
    # 'ground_truth_answer': 'ground_truth_answer',
    # 'initial_response': 'initial_response',
    # 'response': 'response',
    # 'final_answer': 'final_answer',
    'abandon_prompt': 'abandon_prompt',
    'abandon_prompt_reason': 'abandon_prompt_reason',
    # 'temperature': 'temperature',
    'total_latency (ms)': 'total_latency_ms',
    'total_tokens': 'total_tokens',
    'intervention rounds': 'intervention_rounds',
    'CoT quality': 'cot_quality',
    'remarks': 'remarks',
    'Model performance classification': 'model_performance_classification',
    'codeforces_submission_id':'codeforces_submission_id',
    'programming_language': 'programming_language',
}

def parse_csv_file(file_path):
    extracted_data = []
    try:
        with open(file_path, mode='r', encoding='utf-8-sig') as csvfile: # utf-8-sig to handle BOM
            reader = csv.DictReader(csvfile)
            
            header_to_standard_map = {}
            original_fieldnames = reader.fieldnames
            if not original_fieldnames:
                print(f"Warning: CSV file {file_path} is empty or has no headers.")
                return extracted_data

            # print original fieldnames for reference
            print(f"Original fieldnames in {file_path}:\n{json.dumps(original_fieldnames, indent=4)}")

            normalized_fieldnames = [name.lower().strip() for name in original_fieldnames]

            for i, norm_header in enumerate(normalized_fieldnames):
                if norm_header in COLUMN_MAPPING:
                    header_to_standard_map[original_fieldnames[i]] = COLUMN_MAPPING[norm_header]

            if not header_to_standard_map:
                print(f"Warning: No mappable headers found in {file_path} based on COLUMN_MAPPING. Fields looked for: {list(COLUMN_MAPPING.keys())}")
                # If no headers map, we might still want to process if other files have these columns.
                # The DataFrame creation will handle potentially different sets of columns across files.

            for row in reader:
                extracted_fields = {} # Initialize as empty dict
                has_data = False
                for original_header, standard_name in header_to_standard_map.items():
                    if original_header in row and row[original_header] is not None and row[original_header] != '':
                        extracted_fields[standard_name] = row[original_header]
                        has_data = True
                
                if has_data: # Only add if we actually extracted some mapped data
                    extracted_data.append(extracted_fields)
                # If a row has no data for any of the mapped columns, it's skipped.

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except Exception as e:
        print(f"Error parsing CSV file {file_path}: {e}")
    return extracted_data

def parse_json_file(file_path):
    extracted_data = []
    
    try:
        with open(file_path, mode='r', encoding='utf-8') as jsonfile:
            data = json.load(jsonfile)
            
            items_to_parse = []
            if isinstance(data, list):
                items_to_parse = data
            elif isinstance(data, dict):
                found_list = False
                for _, value in data.items():
                    if isinstance(value, list) and all(isinstance(i, dict) for i in value):
                        items_to_parse = value
                        found_list = True
                        break
                if not found_list:
                    items_to_parse = [data]
            else:
                print(f"Warning: JSON file {file_path} does not contain a list or dictionary at the root.")
                return extracted_data
            
            # Print original fieldnames for reference
            if items_to_parse: # Check if the list is not empty
                if isinstance(items_to_parse[0], dict):
                    original_fieldnames = list(items_to_parse[0].keys())
                    # json module is imported at the cell level, so json.dumps is available
                    print(f"Original fieldnames in {file_path} (from first item):\n{json.dumps(original_fieldnames, indent=4)}")
                else:
                    print(f"Warning: First item in {file_path} is not a dictionary (type: {type(items_to_parse[0])}). Cannot print original fieldnames from it.")
            else:
                print(f"Warning: No items found to parse in {file_path}. Cannot print original fieldnames.")

            for item in items_to_parse:
                if isinstance(item, dict):
                    extracted_fields = {} # Initialize as empty dict
                    has_data = False
                    for original_key, value in item.items():
                        normalized_key = original_key.lower().strip()
                        if normalized_key in COLUMN_MAPPING:
                            standard_name = COLUMN_MAPPING[normalized_key]
                            if value is not None and value != '':
                                extracted_fields[standard_name] = value
                                has_data = True
                    
                    if has_data: # Only add if we actually extracted some mapped data
                        extracted_data.append(extracted_fields)
                else:
                    print(f"Warning: Skipping non-dictionary item in JSON file {file_path}: {type(item)}")
                    
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in file {file_path}. Trying to parse as JSONL...")
        print("Fallback to JSONL parsing not implemented in this snippet.")
    except Exception as e:
        print(f"Error parsing JSON file {file_path}: {e}")
    return extracted_data


file_path = "/Users/bytedance/Workspace/project_rlmf/data/Batch15_RLMF 16062025_15_V1_60/RLMF 16062025_15_V1_60.json"
file_extension = os.path.basename(file_path).split('.')[-1].lower()

try:
    if file_extension == 'csv':
        extracted_data_list = parse_csv_file(file_path)
    elif file_extension in ['json', 'jsonl']:
        extracted_data_list = parse_json_file(file_path)
    else:
        print(f"Unsupported file type: {file_extension}")
        extracted_data_list = []
    
    df = pd.DataFrame(extracted_data_list)

    # Define the desired column order based on COLUMN_MAPPING values
    # Only include columns that are actually present in the DataFrame to avoid errors
    desired_column_order = [col for col in COLUMN_MAPPING.values() if col in df.columns]
    
    # Reindex the DataFrame to ensure the desired column order
    if desired_column_order:
        df = df[desired_column_order]
    
    # Save the DataFrame to the dynamically generated CSV file path
    input_dir = os.path.dirname(file_path)
    input_basename_no_ext = os.path.splitext(os.path.basename(file_path))[0]
    output_filename = f"{input_basename_no_ext}_metadata.csv"
    output_csv_path = os.path.join(input_dir, output_filename)
    df.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"\nDataFrame successfully saved to: {output_csv_path}")

except Exception as e:
    print(f"\nError in processing or saving DataFrame: {e}")

Original fieldnames in /Users/bytedance/Workspace/project_rlmf/data/Batch15_RLMF 16062025_15_V1_60/RLMF 16062025_15_V1_60.json (from first item):
[
    "task id",
    "trainer id",
    "model name",
    "user prompt",
    "system prompt",
    "intervene_system_prompt",
    "intervene prompt",
    "ground_truth_answer",
    "abandon_prompt",
    "abandon_prompt_reason",
    "initial_reasoning",
    "response",
    "final_answer",
    "temperature",
    "total_tokens",
    "intervention rounds",
    "CoT quality",
    "Model performance classification",
    "remarks",
    "question_id",
    "level",
    "codeforces_submission_id",
    "programming_language",
    "ACC",
    "total_latency (ms)"
]

DataFrame successfully saved to: /Users/bytedance/Workspace/project_rlmf/data/Batch15_RLMF 16062025_15_V1_60/RLMF 16062025_15_V1_60_metadata.csv


# Get metadata of ALL batches

In [None]:
import json
import os
import glob
import pandas as pd

COLUMN_MAPPING = {
    'task id': 'task_id',
    'trainer id': 'trainer_id',
    'model name': 'model_name',
    'question_id': 'question_id',
    'system prompt': 'system_prompt',
    'user prompt': 'user_prompt',
    'level': 'level',
    'ACC': 'acc',
    'intervene_system_prompt': 'intervene_system_prompt',
    'intervene prompt': 'intervene_prompt',
    'ground_truth_answer': 'ground_truth_answer',
    'temperature': 'temperature',
    'total_tokens': 'total_tokens',
    'total_latency (ms)': 'total_latency_ms',
    'initial_reasoning': 'initial_reasoning',
    'response': 'response',
    # 'initial_response': 'initial_response',
    'final_answer': 'final_answer',
    'abandon_prompt': 'abandon_prompt',
    'abandon_prompt_reason': 'abandon_prompt_reason',
    'intervention rounds': 'intervention_rounds',
    'CoT quality': 'cot_quality',
    'remarks': 'remarks',
    'Model performance classification': 'model_performance_classification',
    'codeforces_submission_id':'codeforces_submission_id',
    'programming_language': 'programming_language',
}

DATA_DIRECTORY = '/Users/bytedance/Workspace/project_rlmf/data/ALL'

def process_json_files(data_dir):
    all_processed_items = []
    all_column_names = set(COLUMN_MAPPING.values()) # Start with mapped names

    # Recursively find all .json and .jsonl files
    json_files = glob.glob(os.path.join(data_dir, '**', '*.json'), recursive=True)
    jsonl_files = glob.glob(os.path.join(data_dir, '**', '*.jsonl'), recursive=True)
    all_input_files = json_files + jsonl_files

    for file_path in all_input_files:
        print(f"Processing file: {file_path}")
        # Extract batch_id from filename
        basename = os.path.basename(file_path)
        batch_id = os.path.splitext(basename)[0]
        
        items = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                if file_path.endswith('.jsonl'):
                    for line in f:
                        try:
                            items.append(json.loads(line))
                        except json.JSONDecodeError as e_line:
                            print(f"Skipping line in {file_path} due to JSON decode error: {e_line}")
                elif file_path.endswith('.json'):
                    try:
                        content = json.load(f)
                        if isinstance(content, list):
                            items = content
                        elif isinstance(content, dict): # Handle case where a .json file might contain a single object
                            items = [content]
                        else:
                            print(f"Skipping file {file_path}: content is not a list or dict of JSON objects.")
                    except json.JSONDecodeError as e_file:
                        print(f"Skipping file {file_path} due to JSON decode error: {e_file}")
                        # As a fallback for .json, try to read as JSONL if initial parse fails
                        # This might happen if a .json file is actually in JSONL format
                        f.seek(0) # Reset file pointer to the beginning
                        try:
                            print(f"Attempting to read {file_path} as JSONL...")
                            current_items = []
                            for line in f:
                                try:
                                    current_items.append(json.loads(line))
                                except json.JSONDecodeError as e_line_fallback:
                                    print(f"Skipping line in {file_path} (fallback JSONL) due to JSON decode error: {e_line_fallback}")
                            items = current_items
                        except Exception as e_fallback:
                            print(f"Failed to read {file_path} as JSONL fallback: {e_fallback}")

        except Exception as e:
            print(f"Could not read or process file {file_path}: {e}")
            continue

        for item in items:
            if not isinstance(item, dict):
                print(f"Skipping non-dictionary item in {file_path}: {item}")
                continue
            
            processed_item = {}
            # Apply column mapping and collect all original column names
            for original_key, value in item.items():
                if original_key in COLUMN_MAPPING:
                    new_key = COLUMN_MAPPING.get(original_key, original_key) # Use original key if not in mapping
                    processed_item[new_key] = value
                    all_column_names.add(new_key) # Add the key used (either mapped or original)
            
            # Add batch_id
            processed_item['batch_id'] = batch_id
            all_column_names.add('batch_id')
            
            all_processed_items.append(processed_item)

    # # Filter items where abandon_prompt == "Yes"
    # filtered_items = [
    #     item for item in all_processed_items 
    #     if isinstance(item.get('abandon_prompt'), str) and item.get('abandon_prompt').lower() == 'yes'
    # ]

    # No filter
    filtered_items = all_processed_items

    # Ensure all filtered items have all collected columns
    final_output_items = []
    for item in filtered_items:
        standardized_item = {}
        for col_name in all_column_names:
            standardized_item[col_name] = item.get(col_name) # Defaults to None if key is missing
        final_output_items.append(standardized_item)
        
    return final_output_items

# Ensure the DATA_DIRECTORY exists
if not os.path.isdir(DATA_DIRECTORY):
    raise ValueError(f"Error: Data directory '{DATA_DIRECTORY}' not found.")
else:
    output_data = process_json_files(DATA_DIRECTORY)

if output_data:
    # print(f"\nFound {len(output_data)} items with 'abandon_prompt' == 'Yes'.")
    print(f"\nFound {len(output_data)} items.")
    print("Details of the first item (if any):")
    print(json.dumps(output_data[0], indent=4))
    
    # # Save the output_data to 'abandoned.json'
    # output_file_path = os.path.join(DATA_DIRECTORY, 'abandoned.json')
    # try:
    #     with open(output_file_path, 'w', encoding='utf-8') as outfile:
    #         json.dump(output_data, outfile, indent=4)
    #     print(f"\nOutput successfully written to {output_file_path}")
    # except IOError as e:
    #     print(f"\nError writing output to {output_file_path}: {e}")

    # Save the output_data to 'metadata.json'
    df = pd.DataFrame(output_data)

    # Reorder the DataFrame columns
    final_ordered_columns = list(COLUMN_MAPPING.values()) + ['batch_id']
    df = df[final_ordered_columns]

    output_file_path = os.path.join(DATA_DIRECTORY, 'metadata.csv')
    try:
        df.to_csv(output_file_path, index=False, encoding='utf-8')
        print(f"\nOutput successfully written to {output_file_path}")
    except IOError as e:
        print(f"\nError writing output to {output_file_path}: {e}")
    
else:
    # print("No items found with 'abandon_prompt' == 'Yes'.")
    print("No items found.")

Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 13062025_14_V2_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 10062025_11_V2_61.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 02052025_3_V2_36.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 05062025_9_V2_100.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 17062025_16_V1_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 20062025_20_V1_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 12062025_13_V2_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 09062025_10_V2_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 04062025_8_V2_100.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 16052025_5_V2_105.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data

# Extract abandoned cases

In [None]:
import json
import os
import glob

COLUMN_MAPPING = {
    'task id': 'task_id',
    'trainer id': 'trainer_id',
    'model name': 'model_name',
    'question_id': 'question_id',
    'system prompt': 'system_prompt',
    'user prompt': 'user_prompt',
    'level': 'level',
    'ACC': 'acc',
    'intervene_system_prompt': 'intervene_system_prompt',
    'intervene prompt': 'intervene_prompt',
    'ground_truth_answer': 'ground_truth_answer',
    'temperature': 'temperature',
    'total_tokens': 'total_tokens',
    'total_latency (ms)': 'total_latency_ms',
    'initial_reasoning': 'initial_reasoning',
    'response': 'response',
    # 'initial_response': 'initial_response',
    'final_answer': 'final_answer',
    'abandon_prompt': 'abandon_prompt',
    'abandon_prompt_reason': 'abandon_prompt_reason',
    'intervention rounds': 'intervention_rounds',
    'CoT quality': 'cot_quality',
    'remarks': 'remarks',
    'Model performance classification': 'model_performance_classification',
    'codeforces_submission_id':'codeforces_submission_id',
    'programming_language': 'programming_language',
}

DATA_DIRECTORY = '/Users/bytedance/Workspace/project_rlmf/data/ALL'

def process_json_files(data_dir):
    all_processed_items = []
    all_column_names = set(COLUMN_MAPPING.values()) # Start with mapped names

    # Recursively find all .json and .jsonl files
    json_files = glob.glob(os.path.join(data_dir, '**', '*.json'), recursive=True)
    jsonl_files = glob.glob(os.path.join(data_dir, '**', '*.jsonl'), recursive=True)
    all_input_files = json_files + jsonl_files

    for file_path in all_input_files:
        print(f"Processing file: {file_path}")
        # Extract batch_id from filename
        basename = os.path.basename(file_path)
        batch_id = os.path.splitext(basename)[0]
        
        items = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                if file_path.endswith('.jsonl'):
                    for line in f:
                        try:
                            items.append(json.loads(line))
                        except json.JSONDecodeError as e_line:
                            print(f"Skipping line in {file_path} due to JSON decode error: {e_line}")
                elif file_path.endswith('.json'):
                    try:
                        content = json.load(f)
                        if isinstance(content, list):
                            items = content
                        elif isinstance(content, dict): # Handle case where a .json file might contain a single object
                            items = [content]
                        else:
                            print(f"Skipping file {file_path}: content is not a list or dict of JSON objects.")
                    except json.JSONDecodeError as e_file:
                        print(f"Skipping file {file_path} due to JSON decode error: {e_file}")
                        # As a fallback for .json, try to read as JSONL if initial parse fails
                        # This might happen if a .json file is actually in JSONL format
                        f.seek(0) # Reset file pointer to the beginning
                        try:
                            print(f"Attempting to read {file_path} as JSONL...")
                            current_items = []
                            for line in f:
                                try:
                                    current_items.append(json.loads(line))
                                except json.JSONDecodeError as e_line_fallback:
                                    print(f"Skipping line in {file_path} (fallback JSONL) due to JSON decode error: {e_line_fallback}")
                            items = current_items
                        except Exception as e_fallback:
                            print(f"Failed to read {file_path} as JSONL fallback: {e_fallback}")

        except Exception as e:
            print(f"Could not read or process file {file_path}: {e}")
            continue

        for item in items:
            if not isinstance(item, dict):
                print(f"Skipping non-dictionary item in {file_path}: {item}")
                continue
            
            processed_item = {}
            # Apply column mapping and collect all original column names
            for original_key, value in item.items():
                new_key = COLUMN_MAPPING.get(original_key, original_key) # Use original key if not in mapping
                processed_item[new_key] = value
                all_column_names.add(new_key) # Add the key used (either mapped or original)
            
            # Add batch_id
            # processed_item = {'batch_id': batch_id, **processed_item}  # Add `batch_id` as the first key/column
            processed_item['batch_id'] = batch_id
            all_column_names.add('batch_id')
            
            all_processed_items.append(processed_item)

    # Filter items where abandon_prompt == "Yes"
    filtered_items = [
        item for item in all_processed_items 
        if isinstance(item.get('abandon_prompt'), str) and item.get('abandon_prompt').lower() == 'yes'
    ]

    # Ensure all filtered items have all collected columns
    final_output_items = []
    for item in filtered_items:
        standardized_item = {}
        for col_name in all_column_names:
            standardized_item[col_name] = item.get(col_name) # Defaults to None if key is missing
        final_output_items.append(standardized_item)
        
    return final_output_items

if __name__ == '__main__':
    # Ensure the DATA_DIRECTORY exists
    if not os.path.isdir(DATA_DIRECTORY):
        raise ValueError(f"Error: Data directory '{DATA_DIRECTORY}' not found.")
    else:
        output_data = process_json_files(DATA_DIRECTORY)

    if output_data:
        print(f"\nFound {len(output_data)} items with 'abandon_prompt' == 'Yes'.")
        print("Details of the first item (if any):")
        print(json.dumps(output_data[0], indent=4))
        
        # Example: Print all task_ids of abandoned prompts
        # print("\nTask IDs of abandoned prompts:")
        # for item in output_data:
        #     print(item.get('task_id'))

        # Save the output_data to 'abandoned.json'
        output_file_path = os.path.join(DATA_DIRECTORY, 'abandoned.json')
        try:
            with open(output_file_path, 'w', encoding='utf-8') as outfile:
                json.dump(output_data, outfile, indent=4)
            print(f"\nOutput successfully written to {output_file_path}")
        except IOError as e:
            print(f"\nError writing output to {output_file_path}: {e}")
    else:
        print("No items found with 'abandon_prompt' == 'Yes'.")


Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 13062025_14_V2_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 10062025_11_V2_61.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 02052025_3_V2_36.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 05062025_9_V2_100.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 17062025_16_V1_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 12062025_13_V2_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 09062025_10_V2_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 04062025_8_V2_100.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 16052025_5_V2_105.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data/ALL/RLMF 11062025_12_V2_60.json
Processing file: /Users/bytedance/Workspace/project_rlmf/data