In [2]:
import json
from difflib import SequenceMatcher

def _leaf_values_match(value1, value2, similarity_threshold, error_margin):
    """Return True when two JSON leaf values count as a match.

    Rules, applied in order:
      * If either value is a bool, both must be bools and equal.
      * Values that compare equal match.
      * Two strings match when their SequenceMatcher ratio is at least
        ``similarity_threshold``.
      * Two numbers match when their relative error (or the absolute
        difference, if one of them is zero) is below ``error_margin``.
      * Anything else (different or unsupported types) is a mismatch.
    """
    # bool is a subclass of int in Python, so without this guard JSON `true`
    # would equal 1 and booleans would take part in the numeric tolerance check.
    if isinstance(value1, bool) or isinstance(value2, bool):
        return isinstance(value1, bool) and isinstance(value2, bool) and value1 == value2

    if value1 == value2:
        return True

    if isinstance(value1, str) and isinstance(value2, str):
        return SequenceMatcher(None, value1, value2).ratio() >= similarity_threshold

    if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
        difference = abs(value1 - value2)
        if value1 == 0 or value2 == 0:
            # Relative error against zero is always 1, so fall back to the
            # absolute difference for values close to zero.
            return difference < error_margin
        return difference / max(abs(value1), abs(value2)) < error_margin

    return False


def _compare_child(value1, value2, path, similarity_threshold, error_margin):
    """Compare one pair of child values and return (matches, total) for it."""
    child_matches, child_total = compare_json_recursively(
        value1, value2, path, 0, 0, similarity_threshold, error_margin
    )
    if child_total == 0:
        # Only a pair of containers with nothing comparable inside (e.g. {} vs {}
        # or dicts holding only nulls) yields zero comparisons; such a pair is
        # counted as a single data point that matches when the two are equal.
        return (1 if value1 == value2 else 0), 1
    return child_matches, child_total


def compare_json_recursively(json1, json2, path="", matches=0, total_comparisons=0, SIMILARITY_THRESHOLD=0.9, ERROR_MARGIN=0.05):
    """
    Recursively compares two JSON-like structures and returns the number of
    matches and the total number of comparable data points.

    Counting rules:
      * dict vs dict: every key from either side is visited. A key whose value
        is missing/None on both sides is ignored; missing/None on exactly one
        side counts as one mismatch; otherwise the two values are compared
        recursively.
      * list vs list: elements are compared pairwise by index; every surplus
        element of the longer list counts as one mismatch.
      * anything else (scalars, or two values of different container types) is
        one data point. Strings match on similarity, numbers match within the
        error margin, and booleans only ever match booleans.

    Args:
        json1: The first JSON value to compare (dict, list or scalar).
        json2: The second JSON value to compare (dict, list or scalar).
        path (str): The current path in the JSON structure (for debugging/tracking).
        matches (int): Count of matching data points accumulated so far.
        total_comparisons (int): Count of comparable data points accumulated so far.
        SIMILARITY_THRESHOLD (float): Minimum similarity ratio for strings to be considered a match.
        ERROR_MARGIN (float): Maximum relative error for numeric values to be considered a match.

    Returns:
        tuple: A tuple containing (matches, total_comparisons).
    """
    if isinstance(json1, dict) and isinstance(json2, dict):
        for key in json1.keys() | json2.keys():
            value1 = json1.get(key)
            value2 = json2.get(key)

            if value1 is None and value2 is None:
                # Absent or null on both sides: nothing to compare.
                continue
            if value1 is None or value2 is None:
                # Present on only one side: a mismatch.
                total_comparisons += 1
                continue

            child_path = f"{path}.{key}" if path else key
            child_matches, child_total = _compare_child(
                value1, value2, child_path, SIMILARITY_THRESHOLD, ERROR_MARGIN
            )
            matches += child_matches
            total_comparisons += child_total

    elif isinstance(json1, list) and isinstance(json2, list):
        # Surplus elements of the longer list have no counterpart: mismatches.
        total_comparisons += abs(len(json1) - len(json2))

        # zip() stops at the shorter list, so only index-aligned pairs are compared.
        for index, (item1, item2) in enumerate(zip(json1, json2)):
            child_matches, child_total = _compare_child(
                item1, item2, f"{path}[{index}]", SIMILARITY_THRESHOLD, ERROR_MARGIN
            )
            matches += child_matches
            total_comparisons += child_total

    else:
        # Leaf node (or two values of different container types): one data point.
        # Counting it here means root-level scalars are counted as well.
        total_comparisons += 1
        if _leaf_values_match(json1, json2, SIMILARITY_THRESHOLD, ERROR_MARGIN):
            matches += 1

    return matches, total_comparisons

# --- Main execution ---

# Absolute locations of the two model outputs being compared. These are
# machine-specific; adjust them when running in another environment.
API_OUTPUT_PATH = r"C:\Users\CEL\Desktop\verbal_autopsy\Open AI\api_outputs\VA_Case_1121585_OpenAI API.json"
CHATGPT_OUTPUT_PATH = r"C:\Users\CEL\Desktop\verbal_autopsy\Open AI\chatgpt_outputs\VA_Case_1121585_ChatGPT.json"

try:
    # Open in binary mode so json.load detects the encoding (UTF-8/16/32)
    # itself instead of relying on the platform's locale-dependent default
    # text encoding.
    with open(API_OUTPUT_PATH, "rb") as f1, open(CHATGPT_OUTPUT_PATH, "rb") as f2:
        api_data = json.load(f1)
        chatgpt_data = json.load(f2)

    # Define similarity threshold for strings and error margin for numeric values
    SIMILARITY_THRESHOLD = 0.9
    ERROR_MARGIN = 0.05

    # Perform the recursive comparison
    matches, total_comparisons = compare_json_recursively(
        api_data,
        chatgpt_data,
        SIMILARITY_THRESHOLD=SIMILARITY_THRESHOLD,
        ERROR_MARGIN=ERROR_MARGIN,
    )

    # Calculate and print the accuracy (guarding against division by zero
    # when there was nothing to compare)
    accuracy = (matches / total_comparisons) * 100 if total_comparisons > 0 else 0
    print(f"Total comparable data points (including nested keys): {total_comparisons}")
    print(f"Matching data points: {matches}")
    print(f"Overall matching accuracy (including nested keys): {accuracy:.2f}%")

except FileNotFoundError as e:
    print("Error: One of the files was not found. Please ensure the filenames are correct and the files are in the expected location.")
    print(f"Missing file: {e.filename}")
except json.JSONDecodeError:
    print("Error: Could not decode one of the JSON files. Please check if they are valid JSON.")
except Exception as e:
    # Top-level boundary of the script: report anything unexpected instead of
    # surfacing a raw traceback.
    print(f"An unexpected error occurred: {e}")


Total comparable data points (including nested keys): 138
Matching data points: 39
Overall matching accuracy (including nested keys): 28.26%
