# JSON Explorer Notebook

This notebook provides interactive tools for exploring and extracting data from JSON files.

In [None]:
import json
import pandas as pd
from IPython.display import display, HTML
import ipywidgets as widgets
from pprint import pprint

## Load JSON File

First, let's define a function to load a JSON file:

In [None]:
def load_json_file(file_path):
    """Load and parse a JSON file.

    The file is decoded as UTF-8 instead of the platform's locale encoding, so
    the same file loads identically on every machine. A leading byte-order
    mark (as written by some Windows editors) is accepted and stripped.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or None when the file is missing, cannot be
        read/decoded, or does not contain valid JSON. An error message is
        printed in each of those cases. A file whose content is the JSON
        literal ``null`` also yields None.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 as well as UTF-8 with a BOM; with plain
        # 'utf-8' json.load() rejects the BOM character as invalid JSON.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found")
        return None
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in '{file_path}': {e}")
        return None
    except Exception as e:
        # Deliberate best-effort catch-all (permissions, decoding errors, ...):
        # the interactive callers expect a printed message and None, not a traceback.
        print(f"Error: {e}")
        return None

## Explore JSON Structure

Let's create a function to explore the structure of a JSON object:

In [None]:
def explore_json(data, path="", max_depth=3, current_depth=0, max_keys=5):
    """Recursively describe the structure of a parsed JSON value.

    Args:
        data: The value to describe (dict, list or primitive).
        path: Path label of ``data`` within the document; empty for the root.
        max_depth: Nesting level at which exploration stops.
        current_depth: Nesting level of ``data``; callers normally leave this at 0.
        max_keys: Maximum number of keys described per dict (expected to be
            non-negative). Remaining keys are summarised in one line.

    Returns:
        A list of human-readable lines. Dicts contribute a header line, up to
        ``max_keys`` children and a "more keys" line; lists contribute a header
        line, the structure of their first item and a "more items" line;
        primitives contribute their type name and a value truncated to 50
        characters.
    """
    if current_depth >= max_depth:
        return [f"{path} ... (max depth reached)"]

    result = []

    if isinstance(data, dict):
        result.append(f"{path} (dict with {len(data)} keys)")
        # Stop after max_keys entries instead of copying every item into a list first.
        for position, (key, value) in enumerate(data.items()):
            if position >= max_keys:
                break
            # str() keeps the path a string even if a non-JSON dict with
            # non-string keys is passed in.
            new_path = f"{path}.{key}" if path else str(key)
            result.extend(
                explore_json(value, new_path, max_depth, current_depth + 1, max_keys)
            )
        if len(data) > max_keys:
            result.append(f"{path} ... ({len(data) - max_keys} more keys)")
    elif isinstance(data, list):
        result.append(f"{path} (list with {len(data)} items)")
        if data:
            # Only the first item is explored; it stands in for the rest of the list.
            result.extend(
                explore_json(data[0], f"{path}[0]", max_depth, current_depth + 1, max_keys)
            )
            if len(data) > 1:
                result.append(f"{path} ... ({len(data) - 1} more items)")
    else:
        # For primitive values, show type and truncated content
        value_str = str(data)
        if len(value_str) > 50:
            value_str = value_str[:47] + "..."
        result.append(f"{path} = {type(data).__name__}: {value_str}")

    return result

## Extract Text Values

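Let's create a function that collects the `text` value of every entry stored under the top-level `extracted_lines` key (which may be either a list or a dictionary of entries):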

In [None]:
def extract_text_values(data):
    """Collect the 'text' fields of the entries under ``data["extracted_lines"]``.

    Args:
        data: Parsed JSON value. Anything other than a dict containing an
            'extracted_lines' key is reported as an error.

    Returns:
        A list with the 'text' value of every dict entry that has one, taken
        from the list items or the dictionary values of 'extracted_lines'.
        The list is empty (and a warning is printed) when no entry has a
        'text' field. Returns None (after printing an error) when the key is
        missing or 'extracted_lines' is neither a list nor a dictionary.
    """
    # Top-level JSON may be a list, string, number or null; testing membership
    # or subscripting those would raise or silently do a substring match.
    if not isinstance(data, dict) or "extracted_lines" not in data:
        print("Error: No 'extracted_lines' key found in JSON")
        return None

    extracted_lines = data["extracted_lines"]

    if isinstance(extracted_lines, list):
        entries, container = extracted_lines, "list items"
    elif isinstance(extracted_lines, dict):
        entries, container = extracted_lines.values(), "dictionary values"
    else:
        print(f"Error: 'extracted_lines' is neither a list nor a dictionary (type: {type(extracted_lines).__name__})")
        return None

    # Entries that are not dicts, or have no 'text' key, are skipped silently.
    text_values = [
        entry["text"]
        for entry in entries
        if isinstance(entry, dict) and "text" in entry
    ]

    if not text_values:
        print(f"Warning: No 'text' fields found in 'extracted_lines' {container}")

    return text_values

## Interactive File Selection

Create an interactive file selector:

In [None]:
# Free-text box holding the path of the JSON file to open.
file_path = widgets.Text(
    description="File path:",
    placeholder="Enter the path to your JSON file",
    value="",
    style={"description_width": "initial"},
    disabled=False,
)

# Slider choosing how many nesting levels the structure exploration descends.
max_depth = widgets.IntSlider(
    description="Exploration depth:",
    value=3,
    min=1,
    max=10,
    step=1,
    orientation="horizontal",
    readout=True,
    readout_format="d",
    continuous_update=False,
    style={"description_width": "initial"},
    disabled=False,
)

# Opt-in switch for additionally listing the 'text' values of 'extracted_lines'.
extract_text = widgets.Checkbox(
    description="Extract text from extracted_lines",
    value=False,
    style={"description_width": "initial"},
    disabled=False,
)

# Button that triggers loading and exploring the selected file.
load_button = widgets.Button(
    description="Load and Explore",
    tooltip="Click to load and explore the JSON file",
    icon="search",
    button_style="primary",
    disabled=False,
)

# Output area that receives everything the load handler prints or displays.
output = widgets.Output()

display(file_path, max_depth, extract_text, load_button, output)

In [None]:
def on_load_button_clicked(b):
    """Click handler: load the selected file and report on it inside ``output``.

    Prints the top-level type, the keys or length, and the structure lines
    produced by ``explore_json``. When the checkbox is ticked it also lists
    the first ten extracted text values and shows them as a DataFrame.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    with output:
        output.clear_output()

        selected_path = file_path.value
        if not selected_path:
            print("Please enter a file path.")
            return

        loaded = load_json_file(selected_path)
        if loaded is None:
            # load_json_file has already printed the reason.
            return

        print(f"\nExploring JSON file: {selected_path}")
        print(f"Top level type: {type(loaded).__name__}")

        if isinstance(loaded, dict):
            print(f"Keys: {', '.join(loaded)}")
        elif isinstance(loaded, list):
            print(f"List length: {len(loaded)}")

        print("\nStructure exploration:")
        for structure_line in explore_json(loaded, max_depth=max_depth.value):
            print(structure_line)

        if not extract_text.value:
            return

        print("\nExtracting text values from 'extracted_lines':")
        texts = extract_text_values(loaded)
        if not texts:
            # Covers both None (error) and an empty list (nothing found).
            return

        print(f"Found {len(texts)} text values:")
        for position, text in enumerate(texts[:10], start=1):
            print(f"  {position}. {text}")
        hidden_count = len(texts) - 10
        if hidden_count > 0:
            print(f"  ... and {hidden_count} more")

        # Tabular preview of the same values for easier reading.
        preview = pd.DataFrame({"text": texts})
        display(preview.head(10))

load_button.on_click(on_load_button_clicked)

## Advanced JSON Path Query

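Use a simplified JSONPath-like syntax to query specific paths in the JSON: keys are separated by dots and list items are selected with a numeric index in square brackets (for example, `extracted_lines[0].text`):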

In [None]:
def _split_key_and_indices(part):
    """Split one path segment such as ``key[0][1]`` into ``("key", [0, 1])``.

    Raises:
        ValueError: If the bracket syntax is malformed or an index is not an integer.
    """
    key, _, remainder = part.partition('[')
    remainder = '[' + remainder
    indices = []
    while remainder:
        closing = remainder.find(']')
        if not remainder.startswith('[') or closing == -1:
            raise ValueError(f"malformed index in '{part}'")
        # int() raises ValueError itself for non-numeric or empty indices.
        indices.append(int(remainder[1:closing]))
        remainder = remainder[closing + 1:]
    return key, indices


def query_json(data, query):
    """Query a JSON object using a simplified JSONPath-like syntax.

    Segments are separated by dots; a segment may carry one or more integer
    indices in square brackets, e.g. ``items[0].text``, ``matrix[1][0]`` or
    ``[2]`` for a top-level list. Negative indices follow Python semantics.

    Args:
        data: Parsed JSON value to navigate.
        query: Path expression. An empty query returns ``data`` unchanged.

    Returns:
        The value found at the path, or None (after printing an error) when a
        key or index does not exist, a step is applied to the wrong type, or
        an index is not a valid integer. A path that resolves to JSON ``null``
        also yields None.
    """
    if not query:
        return data

    # Split the query on dots, except for dots that appear inside brackets.
    # Empty segments (leading, trailing or doubled dots) are ignored.
    parts = []
    segment = ""
    in_brackets = False
    for char in query:
        if char == '.' and not in_brackets:
            if segment:
                parts.append(segment)
                segment = ""
            continue
        if char == '[':
            in_brackets = True
        elif char == ']':
            in_brackets = False
        segment += char
    if segment:
        parts.append(segment)

    # Navigate the JSON object one segment at a time.
    result = data
    try:
        for part in parts:
            if '[' in part and ']' in part:
                key, indices = _split_key_and_indices(part)
                if key:
                    result = result[key]
                for idx in indices:
                    result = result[idx]
            else:
                # No complete bracket pair: treat the whole segment as a key.
                result = result[part]
        return result
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # ValueError covers malformed or non-integer indices, which would
        # otherwise escape as a traceback.
        print(f"Error: Could not find path '{query}': {e}")
        return None

In [None]:
# Free-text box for the path expression to evaluate against the loaded file.
query_path = widgets.Text(
    description="JSON Path:",
    placeholder="Enter a JSON path (e.g., extracted_lines[0].text)",
    value="",
    style={"description_width": "initial"},
    disabled=False,
)

# Button that runs the query.
query_button = widgets.Button(
    description="Query JSON",
    tooltip="Click to query the JSON using the provided path",
    icon="filter",
    button_style="info",
    disabled=False,
)

# Output area that receives everything the query handler prints or displays.
query_output = widgets.Output()

display(query_path, query_button, query_output)

In [None]:
def on_query_button_clicked(b):
    """Click handler: re-read the selected file and show the queried value.

    Small results are pretty-printed in full. Container results whose text
    form exceeds 1000 characters are pretty-printed to depth 2 and, when they
    are a dict or a list starting with a dict, previewed as a DataFrame.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    with query_output:
        query_output.clear_output()

        if not file_path.value:
            print("Please load a JSON file first.")
            return

        if not query_path.value:
            print("Please enter a JSON path to query.")
            return

        # The file is read again on every query; nothing is cached between clicks.
        data = load_json_file(file_path.value)
        if data is None:
            return

        result = query_json(data, query_path.value)
        if result is None:
            # Either the lookup failed (already reported) or the value is null.
            return

        print(f"Query result for '{query_path.value}':")

        if not isinstance(result, (dict, list)):
            print(result)
            return

        text_length = len(str(result))
        if text_length <= 1000:
            pprint(result)
            return

        print(f"Result is large ({text_length} characters). Showing first part:")
        pprint(result, depth=2)

        # Tabular preview: a list of dicts becomes rows, a single dict one row.
        if isinstance(result, list):
            rows = result if result and isinstance(result[0], dict) else None
        else:
            rows = [result]

        if rows is not None:
            print("\nAs DataFrame:")
            df = pd.DataFrame(rows)
            display(df.head(10))

query_button.on_click(on_query_button_clicked)

## Custom Data Analysis

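Add your own analysis code below. The interactive widgets above do not keep the loaded JSON in a notebook variable, so load your file explicitly with `load_json_file` first, as shown in the commented example.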

In [None]:
# Your custom analysis code here
# Example:
# 
# # Load your JSON file
# my_data = load_json_file('path/to/your/file.json')
# 
# # Analyze extracted_lines if they exist
# if my_data and 'extracted_lines' in my_data:
#     extracted_lines = my_data['extracted_lines']
#     
#     # If extracted_lines is a dictionary
#     if isinstance(extracted_lines, dict):
#         print(f"Found {len(extracted_lines)} items in extracted_lines dictionary")
#         
#         # Count items that have a 'text' field
#         text_count = sum(1 for item in extracted_lines.values() if isinstance(item, dict) and 'text' in item)
#         print(f"{text_count} items have a 'text' field")
#     
#     # If extracted_lines is a list
#     elif isinstance(extracted_lines, list):
#         print(f"Found {len(extracted_lines)} items in extracted_lines list")
#         
#         # Count items that have a 'text' field
#         text_count = sum(1 for item in extracted_lines if isinstance(item, dict) and 'text' in item)
#         print(f"{text_count} items have a 'text' field")