# JSON Explorer Notebook

This notebook uses the my_json_explorer.py script to analyze JSON files.

## Import Script Functions

First, let's import the functions from our script:

In [None]:
# Import functions from my_json_explorer.py
import sys
import os
sys.path.append(os.getcwd())
from my_json_explorer import explore_json, extract_text_values

## Load and Explore a JSON File

Now we can load and explore a JSON file:

In [None]:
import json
import pandas as pd

def load_json_file(file_path):
    """Load a JSON file and return its parsed contents.

    Args:
        file_path: Path to the JSON file (anything accepted by ``open``).

    Returns:
        The decoded JSON value (typically a ``dict`` or ``list``).

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is exchanged as UTF-8; decode explicitly instead of relying on the
    # platform's default encoding, which is not UTF-8 on every system.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Point this at the JSON document you want to inspect
json_file_path = "path/to/your/file.json"
data = load_json_file(json_file_path)

print(f"Loaded JSON file: {json_file_path}")
print(f"Top level type: {type(data).__name__}")

# Summarise the top-level container: its length for an array,
# its key names for an object. Any other top-level type prints nothing more.
if isinstance(data, list):
    print(f"List length: {len(data)}")
elif isinstance(data, dict):
    # Iterating a dict yields its keys in insertion order.
    print(f"Keys: {', '.join(data)}")

## Explore the Structure

Explore the structure of the JSON data:

In [None]:
# Print the structure of the loaded JSON, descending at most 3 levels (max_depth=3).
# The explore_json function prints directly rather than returning values,
# so its result is not captured here.
print("Structure exploration:")
explore_json(data, max_depth=3)

## Extract Text Values

Extract text values from the "extracted_lines" field:

In [None]:
# Collect text values with the helper imported from my_json_explorer.
text_values = extract_text_values(data)

if text_values:
    print(f"Found {len(text_values)} text values:")
    # Preview at most the first ten entries, numbered from 1.
    for i, text in enumerate(text_values[:10]):
        print(f"{i+1}. {text}")
    if len(text_values) > 10:
        print(f"... and {len(text_values) - 10} more")

    # Create a DataFrame for better visualization
    df = pd.DataFrame({"text": text_values})
    # An expression nested inside `if` is not the cell's final top-level
    # expression, so Jupyter would not render it; display it explicitly.
    display(df.head(10))
else:
    # Make the empty case visible instead of producing no output at all.
    print("No text values found.")

## Call my_json_explorer.py Directly

You can also run the script directly as a command-line program using IPython's `!` shell escape:

In [None]:
# Replace with your JSON file path
file_path = "path/to/your/file.json"
!python my_json_explorer.py {file_path} --extract-text --depth 3

## Custom JSON Path Query

Extract data using a custom dot-notation path, with `[n]` for list indices (for example `extracted_lines[0].text`):

In [None]:
def query_json(data, path_str):
    """Extract data from a nested JSON structure using a dot-notation path.

    Path segments are separated by ``.``. Each segment may be:

    * a plain key, e.g. ``extracted_lines``;
    * a key followed by one or more list indices, e.g. ``rows[0]`` or
      ``grid[1][0]``;
    * indices only, e.g. ``[2]``, applied to the current value;
    * a bare integer, e.g. ``0``, which is used as a list index when the
      current value is a list or tuple (for a dict it stays a string key).

    Args:
        data: The parsed JSON value (nested dicts / lists) to search.
        path_str: The path to follow, e.g. ``"extracted_lines[0].text"``.

    Returns:
        The value found at the end of the path.

    Raises:
        KeyError: A key is missing from a dict. This includes segments with
            malformed brackets, which are looked up as literal keys.
        IndexError: A list index is out of range.
        TypeError: A key or index is applied to a value that does not support it.
    """
    import re  # local import so the function does not depend on another cell

    def step(container, key):
        """Descend one level: integer index for sequences, plain key otherwise."""
        if isinstance(container, (list, tuple)) and re.fullmatch(r'[+-]?\d+', key):
            return container[int(key)]
        return container[key]

    # Optional base key followed by at least one "[n]" index.
    indexed_segment = re.compile(r'([^\[\]]*)((?:\[\s*[+-]?\d+\s*\])+)')

    result = data
    for segment in path_str.split('.'):
        match = indexed_segment.fullmatch(segment)
        if match is None:
            # No well-formed index part: treat the whole segment as a key
            # (or as a list position when it is a bare integer).
            result = step(result, segment)
            continue

        base_key, index_part = match.groups()
        if base_key:
            result = step(result, base_key)
        # Apply every bracketed index in order, so "a[0][1]" works.
        for idx_str in re.findall(r'[+-]?\d+', index_part):
            result = result[int(idx_str)]

    return result

# Example: query_json(data, "extracted_lines[0].text")
path_to_query = "extracted_lines"
try:
    result = query_json(data, path_to_query)
    if isinstance(result, (dict, list)):
        from pprint import pprint
        # Limit nesting so large containers stay readable.
        pprint(result, depth=2)
    else:
        print(result)
# Report lookup failures instead of raising; ValueError covers index text
# that cannot be converted to an integer.
except (KeyError, IndexError, TypeError, ValueError) as e:
    print(f"Error: {e}")

## Analyze the Structure of df_check.csv Files

Inspecting the columns and `image_id` values of these files can help diagnose issues with the `prepare_annotations.py` script:

In [ ]:
import pandas as pd
from pathlib import Path

# Replace with your actual path - make sure it's a Path object
# NOTE(review): this is a machine-specific absolute path; consider moving it
# to a configurable constant near the top of the notebook.
cases_dir = Path("/Users/tod/Desktop/LayoutLM_annotation_ms/du_cases")

# Collect the case sub-directories. Guard against a missing directory so the
# cell reports the problem instead of raising, and sort because iterdir()
# yields entries in arbitrary order (keeps "the first file" reproducible).
if cases_dir.is_dir():
    case_dirs = sorted(d for d in cases_dir.iterdir() if d.is_dir())
else:
    print(f"Cases directory not found: {cases_dir}")
    case_dirs = []

# Pair each case name with its df_check.csv, when that file exists.
csv_files = []
for case_path in case_dirs:
    csv_path = case_path / "processing" / "form-recogniser" / "df_check.csv"
    if csv_path.exists():
        csv_files.append((case_path.name, csv_path))

print(f"Found {len(csv_files)} df_check.csv files")

# Analyze the first file to understand its structure
if csv_files:
    case_id, csv_path = csv_files[0]
    print(f"\nAnalyzing file for case: {case_id}")
    df = pd.read_csv(csv_path)

    # Column labels are converted to str so join() cannot fail on non-string labels.
    print(f"Columns: {', '.join(map(str, df.columns))}")
    print(f"Number of rows: {len(df)}")

    # Check what values are in the image_id column
    if 'image_id' in df.columns:
        unique_ids = df['image_id'].unique()
        print(f"\nUnique image_id values ({len(unique_ids)}):\n{unique_ids}")

    # Display the first few rows. An expression nested inside `if` is not
    # rendered by Jupyter automatically, so display it explicitly.
    print("\nFirst 5 rows:")
    display(df.head())