In [1]:
import pandas as pd
import numpy as np

In [6]:
import os # To check if the file exists
import json
# Path to the dataset JSON file. The file is expected to hold the top-level
# keys listed in `section_names`, each mapping to a list of record dicts.
file_path = 'final_output.json'

# Names of the embedding fields to check
embedding_fields = ['face_embedding', 'pose_embedding']

# Top-level keys of the JSON file that are inspected, in reporting order
section_names = ['train', 'test', 'val']


def count_missing_embeddings(df_section, field):
    """Count unusable values of one embedding column.

    Args:
        df_section: DataFrame built from a list of record dicts. pandas fills
            in NaN where a record lacks a key, so "key missing" and "value is
            null" both show up as NA here.
        field: Name of the column to inspect; must be present in `df_section`.

    Returns:
        dict with three int entries:
            'null_or_missing' - rows where the value is None/NaN,
            'empty_list'      - rows where the value is a list of length 0,
            'null_or_empty'   - rows matching either condition.
    """
    column = df_section[field]
    is_null_or_missing = column.isna()
    # An empty list is a real (non-NA) object, so isna() never counts it; the
    # two masks are disjoint and must not be subtracted from one another.
    is_empty_list = column.apply(
        lambda value: isinstance(value, list) and len(value) == 0
    ).astype(bool)

    return {
        'null_or_missing': int(is_null_or_missing.sum()),
        'empty_list': int(is_empty_list.sum()),
        'null_or_empty': int((is_null_or_missing | is_empty_list).sum()),
    }


# Load the full JSON data and report, per section and per embedding field,
# how many records carry no usable embedding.
try:
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        full_data = json.load(f)

    print(f"Successfully loaded data from {file_path}\n")

    for section_name in section_names:
        if section_name not in full_data:
            print(f"Warning: Section '{section_name}' not found in the JSON data.\n")
            continue

        section_data = full_data[section_name]
        if not isinstance(section_data, list):
            print(f"Warning: Section '{section_name}' in JSON is not a list of records (type: {type(section_data).__name__}). Skipping.\n")
            continue

        print(f"--- Processing '{section_name}' section ---")

        if not section_data:
            print(f"'{section_name}' section is empty. No records to check.\n")
            continue

        # Convert the list of records to a DataFrame
        df_section = pd.DataFrame(section_data)
        print(f"Loaded {len(df_section)} records into a DataFrame.")

        # Check each embedding field
        for field in embedding_fields:
            print(f"\nChecking field: '{field}'")

            if field not in df_section.columns:
                print(f"  Field '{field}' not found in this section's DataFrame.")
                # Confirm against the raw records that the key really is
                # absent everywhere, rather than lost during conversion.
                all_missing = all(field not in record for record in section_data)
                if all_missing:
                    print(f"  Note: The field '{field}' appears to be missing from ALL records in the '{section_name}' section in the original JSON.")
                continue

            counts = count_missing_embeddings(df_section, field)
            print(f"  Records with explicit null/missing key ({field} is None/NaN): {counts['null_or_missing']}")
            print(f"  Records with empty list ({field} is []): {counts['empty_list']}")
            print(f"  Records with {field} being null/missing OR empty: {counts['null_or_empty']} out of {len(df_section)}")

        # Closing rule, sized from the section header text without its quotes
        separator = '-' * len(f'--- Processing {section_name} section ---')
        print(f"\n{separator}\n")

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from '{file_path}'. Please check the file format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully loaded data from final_output.json

--- Processing 'train' section ---
Loaded 16659 records into a DataFrame.

Checking field: 'face_embedding'
  Records with explicit null/missing key (face_embedding is None/NaN): 0
  Records with empty list (face_embedding is []): 0
  Records with face_embedding being null/missing OR empty: 0 out of 16659

Checking field: 'pose_embedding'
  Records with explicit null/missing key (pose_embedding is None/NaN): 9
  Records with empty list (pose_embedding is []): 0
  Records with pose_embedding being null/missing OR empty: 9 out of 16659

--------------------------------

--- Processing 'test' section ---
Loaded 3544 records into a DataFrame.

Checking field: 'face_embedding'
  Records with explicit null/missing key (face_embedding is None/NaN): 0
  Records with empty list (face_embedding is []): 0
  Records with face_embedding being null/missing OR empty: 0 out of 3544

Checking field: 'pose_embedding'
  Records with explicit null/missing k

In [7]:
import pandas as pd
import json
import os

# --- Configuration ---
# Path to your input JSON file
input_file_path = 'final_output.json'

# Path where the cleaned output JSON file will be saved
output_file_path = 'check.json'

# Names of the embedding fields that, if missing/empty, should cause the record to be eliminated
embedding_fields_for_elimination = ['face_embedding', 'pose_embedding']

# Expected top-level sections in your JSON file (adjust if you have different keys)
sections_to_process = ['train', 'test', 'val']

# --- End Configuration ---


def valid_embedding_mask(df_section, fields):
    """Return a boolean Series marking the rows that should be kept.

    A row is kept when, for every name in `fields` that is a column of
    `df_section`, its value is neither None/NaN nor an empty list. Names that
    are not columns are skipped, so they do not filter anything.

    Args:
        df_section: DataFrame built from one section's list of record dicts.
        fields: Iterable of column names to validate.

    Returns:
        pandas.Series of bool, indexed like `df_section`.
    """
    keep = pd.Series(True, index=df_section.index)
    for field in fields:
        if field not in df_section.columns:
            continue
        column = df_section[field]
        is_empty_list = column.apply(
            lambda value: isinstance(value, list) and len(value) == 0
        ).astype(bool)
        keep = keep & column.notna() & ~is_empty_list
    return keep


def filter_section_records(section_data, fields):
    """Drop the records of one section that lack a usable embedding.

    Args:
        section_data: Non-empty list of record dicts.
        fields: Embedding field names to validate (see `valid_embedding_mask`).

    Returns:
        (kept_records, kept_df): `kept_records` holds the surviving original
        dicts, untouched and in their original order; `kept_df` is the
        matching filtered DataFrame copy.
    """
    df_section = pd.DataFrame(section_data)
    keep = valid_embedding_mask(df_section, fields)
    # Select from the ORIGINAL dicts rather than converting the DataFrame back:
    # DataFrame.to_dict would add a NaN entry for every key that only some
    # records have, and json.dump would then emit the non-standard NaN token.
    kept_records = [record for record, ok in zip(section_data, keep) if ok]
    return kept_records, df_section[keep].copy()


# Dictionary to store the cleaned dataframes for each section
cleaned_dataframes = {}
# Cleaned record lists per section; this is what gets written to disk
cleaned_records = {}
full_data = {} # Dictionary to hold the data loaded from the input JSON

# Load the full JSON data
try:
    if not os.path.exists(input_file_path):
        raise FileNotFoundError(f"Input file not found at: {input_file_path}")

    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(input_file_path, 'r', encoding='utf-8') as f:
        full_data = json.load(f)

    print(f"Successfully loaded data from {input_file_path}\n")

    # Process each specified section
    for section_name in sections_to_process:
        if section_name not in full_data:
            print(f"Warning: Section '{section_name}' not found in the JSON data. Skipping.\n")
            continue

        section_data = full_data[section_name]
        if not isinstance(section_data, list):
            print(f"Warning: Section '{section_name}' in JSON is not a list of records (type: {type(section_data).__name__}). Skipping.\n")
            continue

        print(f"--- Processing and Cleaning '{section_name}' section ---")

        if not section_data:
            print(f"'{section_name}' section is empty. No records to process.\n")
            cleaned_dataframes[section_name] = pd.DataFrame() # Store an empty DataFrame
            cleaned_records[section_name] = []
            continue

        initial_rows = len(section_data)
        print(f"'{section_name}': Initial records = {initial_rows}")

        df_section_cleaned_records, df_section_cleaned = filter_section_records(
            section_data, embedding_fields_for_elimination
        )

        # A field that no record in this section carries cannot be used for
        # filtering; it is reported and skipped rather than dropping every row.
        for field in embedding_fields_for_elimination:
            if not any(isinstance(record, dict) and field in record for record in section_data):
                print(f"  - Warning: Field '{field}' not found in '{section_name}' section DataFrame. Records cannot be filtered based on '{field}' in this section.")

        removed_rows = initial_rows - len(df_section_cleaned_records)
        print(f"'{section_name}': Removed {removed_rows} records with missing/empty embeddings. Remaining records = {len(df_section_cleaned_records)}")

        # Store the cleaned results
        cleaned_dataframes[section_name] = df_section_cleaned
        cleaned_records[section_name] = df_section_cleaned_records

        print(f"\n{'-'*50}\n")

    # --- Assemble the final output JSON structure ---
    final_output_data = {}
    for section_name in sections_to_process:
        if section_name in cleaned_records:
            final_output_data[section_name] = cleaned_records[section_name]
        elif section_name in full_data:
            # A section that exists but is not a list is passed through unchanged
            final_output_data[section_name] = full_data[section_name]
        # If a section wasn't in full_data at all, it won't be added to final_output_data

    # --- Save the cleaned data to a new JSON file ---
    print(f"Saving cleaned data to {output_file_path}...")
    with open(output_file_path, 'w') as f:
        json.dump(final_output_data, f, indent=4) # Use indent for pretty printing

    print("Saving complete.")
    print(f"Cleaned data saved to '{output_file_path}'.")


except FileNotFoundError as e:
    print(f"Error: {e}")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from '{input_file_path}'. Please check the file format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully loaded data from final_output.json

--- Processing and Cleaning 'train' section ---
'train': Initial records = 16659
'train': Removed 9 records with missing/empty embeddings. Remaining records = 16650

--------------------------------------------------

--- Processing and Cleaning 'test' section ---
'test': Initial records = 3544
'test': Removed 2 records with missing/empty embeddings. Remaining records = 3542

--------------------------------------------------

--- Processing and Cleaning 'val' section ---
'val': Initial records = 3503
'val': Removed 2 records with missing/empty embeddings. Remaining records = 3501

--------------------------------------------------

Saving cleaned data to check.json...
Saving complete.
Cleaned data saved to 'check.json'.
