In [1]:
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# ---------------- Configuration -----------------------------

# Run-time settings read by main(); edit the values here rather than in the code.
CONFIG = {
    # Root folder that is scanned recursively for JSON files
    "root_dir": r"G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT",

    # Output files (the same table is written in both formats)
    "output_csv": r"G:\MAS_CAD\Shaft_json_eval_recursive_metrics.csv",
    "output_excel": r"G:\MAS_CAD\Shaft_json_eval_recursive_metrics.xlsx",

    # File scanning options
    "glob_pattern": "**/*.json",    # glob pattern matched below root_dir (recursive)
    "max_size_mb": 25,              # files larger than this many MiB are skipped (to avoid huge logs)
    "deduplicate_ids": True,        # keep only one file per filename stem (name without extension)
    "dedup_policy": "latest_mtime", # "first" = smallest path in sort order; any other value = newest mtime

    # Include short (truncated) error strings in the table. When False, invalid
    # files get the fixed placeholder "parse_error" and no consistency sample.
    "include_error_samples": True,
}

# ---------------- Utilities ---------------------------------

def iter_paths(d: Any, path: Tuple = ()) -> List[Tuple[Tuple, Any]]:
    """Generate ``(path, value)`` pairs for every leaf of a nested structure.

    Dicts are descended by key and lists by index; anything else is treated
    as a leaf. ``path`` is the tuple of keys/indices leading to the leaf,
    prefixed by the ``path`` argument. Empty dicts and empty lists produce
    no pairs at all. This is a generator, so results are produced lazily
    in document order.
    """
    if isinstance(d, dict):
        children = d.items()
    elif isinstance(d, list):
        children = enumerate(d)
    else:
        # Leaf value: report it together with the route that led here.
        yield (path, d)
        return

    for step, child in children:
        yield from iter_paths(child, path + (step,))

def dict_stats(d: Any) -> Dict[str, int]:
    """Compute structural statistics for a parsed JSON value in one pass.

    Returns a dict with:

    * ``max_depth``   - number of levels on the deepest branch. A scalar or
      an empty container counts as one level, so ``{"a": 1}`` has depth 2.
    * ``total_nodes`` - number of leaf (non-dict, non-list) values. Empty
      containers contribute no nodes.
    * ``total_keys``  - number of leaf values that sit directly under a
      string key. Keys whose value is itself a dict or list are not counted,
      and neither are leaves that are direct list elements.

    Keys are counted regardless of whether the root is a dict or a list, so
    a top-level list of objects reports the keys of those objects.
    """
    leaf_keys = 0
    leaf_nodes = 0

    def _walk(node: Any, depth: int, last_step: Any) -> int:
        """Count leaves below ``node`` and return the deepest level reached."""
        nonlocal leaf_keys, leaf_nodes
        if isinstance(node, dict):
            children = node.items()
        elif isinstance(node, list):
            children = enumerate(node)
        else:
            leaf_nodes += 1
            # Only a leaf reached through a dict key (a str) counts as a key;
            # list indices are ints and the root has no step at all.
            if isinstance(last_step, str):
                leaf_keys += 1
            return depth + 1
        if not node:
            # Empty container: one level deep, nothing to count.
            return depth + 1
        return max(_walk(child, depth + 1, step) for step, child in children)

    deepest = _walk(d, 0, None)
    return {"max_depth": deepest, "total_keys": leaf_keys, "total_nodes": leaf_nodes}

# ---------------- Consistency rules (edit for your domain) ---

def rule_min_le_max(data: Any, base: Tuple[str, ...], min_key: str, max_key: str) -> Optional[str]:
    """Check that ``min_key`` does not exceed ``max_key`` in the dict at ``base``.

    ``base`` is a sequence of dict keys to follow from ``data``; the special
    segment ``"[]"`` steps into the first element of a non-empty list.

    Returns a human-readable message when both values exist, convert to
    ``float`` and the minimum is strictly greater than the maximum.
    Returns ``None`` in every other case: the path cannot be followed, the
    target is not a dict, a key is missing, or a value is not numeric.
    """
    node = data
    for step in base:
        if step == "[]":
            # "[]" means "look at the first list item"; only that item is inspected.
            if not (isinstance(node, list) and node):
                return None
            node = node[0]
            continue
        if not (isinstance(node, dict) and step in node):
            return None
        node = node[step]

    if not isinstance(node, dict):
        return None
    if min_key not in node or max_key not in node:
        return None

    try:
        low = float(node[min_key])
        high = float(node[max_key])
        message = (
            f"{'.'.join(base) or '<root>'}: {min_key}({low}) > {max_key}({high})"
            if low > high
            else None
        )
    except Exception:
        # Best effort: values that cannot be compared numerically are not
        # reported by this rule.
        return None
    return message

def rule_enum_membership(data: Any, path: Tuple[str, ...], allowed: List[Any]) -> Optional[str]:
    """Check that the scalar found at ``path`` is one of ``allowed``.

    ``path`` is a sequence of dict keys to follow from ``data``; the special
    segment ``"[]"`` steps into the first element of a non-empty list.

    Returns a message when the value is a ``str``/``int``/``float`` that is
    not in ``allowed``. Returns ``None`` when the path cannot be followed,
    when the value is of another type, or when the value is allowed.
    """
    cursor = data
    for segment in path:
        first_item_requested = segment == "[]"
        if first_item_requested and isinstance(cursor, list) and cursor:
            cursor = cursor[0]
        elif not first_item_requested and isinstance(cursor, dict) and segment in cursor:
            cursor = cursor[segment]
        else:
            # Path does not exist in this document: nothing to validate.
            return None

    is_scalar = isinstance(cursor, (str, int, float))
    if not is_scalar or cursor in allowed:
        return None
    return f"{'.'.join(path)}='{cursor}' not in allowed {allowed}"

# <<< ADD/EDIT YOUR DOMAIN RULES HERE >>>
# Each rule is a callable that takes the parsed JSON document and returns
# either None (no violation, or rule not applicable) or a message string
# describing the violation. main() runs every rule against every valid file.
CONSISTENCY_RULES = [
    # min_diameter must not exceed max_diameter at the document root ...
    lambda d: rule_min_le_max(d, base=tuple(),      min_key="min_diameter", max_key="max_diameter"),
    # ... nor inside the top-level "specs" object.
    lambda d: rule_min_le_max(d, base=("specs",),   min_key="min_diameter", max_key="max_diameter"),
    # A top-level scalar "material", when present, must be one of the listed names.
    lambda d: rule_enum_membership(d, path=("material",), allowed=["PLA", "ABS", "PETG", "Nylon", "Aluminum", "Steel"]),
]

# ---------------- File discovery & dedup ---------------------

def discover_json_files(root_dir: Path, pattern: str, max_size_mb: int) -> List[Path]:
    """Collect regular files below ``root_dir`` that match ``pattern``.

    The search is always recursive: a pattern that already starts with
    ``"**/"`` is used as-is, any other pattern is matched at every depth.

    Args:
        root_dir: Directory to search.
        pattern: Glob pattern, e.g. ``"**/*.json"`` or ``"*.json"``.
        max_size_mb: Size limit in MiB; larger files are left out.

    Returns:
        Matching paths in the order the filesystem walk produced them.
        Entries that cannot be inspected (e.g. permission errors, files
        removed during the scan) are skipped.
    """
    max_bytes = max_size_mb * 1024 * 1024

    # rglob(p) is glob("**/" + p), so a pattern that is already recursive can
    # go straight to glob(). This keeps any later "**/" segments intact
    # instead of stripping them out of the pattern.
    if pattern.startswith("**/"):
        candidates = root_dir.glob(pattern)
    else:
        candidates = root_dir.rglob(pattern)

    files = []
    for fp in candidates:
        try:
            if fp.is_file() and fp.stat().st_size <= max_bytes:
                files.append(fp)
        except OSError:
            # Unreadable or vanished entry: leave it out of the scan.
            continue
    return files

def deduplicate_by_stem(paths: List[Path], policy: str = "latest_mtime") -> List[Path]:
    """Keep exactly one path per filename stem.

    Paths are grouped by ``Path.stem`` (file name without its last suffix).
    A group with a single member is kept unchanged. For larger groups,
    ``policy == "first"`` keeps the smallest path in sort order; any other
    policy value keeps the file with the most recent modification time
    (which reads the file's metadata from disk).

    The result follows the order in which each stem was first seen.
    """
    groups: Dict[str, List[Path]] = {}
    for candidate in paths:
        if candidate.stem not in groups:
            groups[candidate.stem] = []
        groups[candidate.stem].append(candidate)

    survivors = []
    for same_stem in groups.values():
        if len(same_stem) == 1:
            survivors.append(same_stem[0])
        elif policy == "first":
            survivors.append(min(same_stem))
        else:
            # Newest file wins; on equal timestamps the earliest-listed one is kept.
            survivors.append(max(same_stem, key=lambda item: item.stat().st_mtime))
    return survivors

# ---------------- Main evaluation ---------------------------

def _truncate_sample(message: str, limit: int) -> str:
    """Return ``message`` cut to ``limit`` characters; '…' marks an actual cut."""
    if len(message) <= limit:
        return message
    return message[:limit] + "…"


def _show_table(df: pd.DataFrame) -> None:
    """Render the table richly under IPython/Jupyter, or print it otherwise."""
    try:
        from IPython.display import display as show
    except ImportError:
        show = print
    show(df)


def _evaluate_json_file(fp: Path) -> Dict[str, Any]:
    """Build one result row (validity, size, consistency, structure) for ``fp``."""
    include_samples = CONFIG["include_error_samples"]

    # A file that cannot be read is reported as invalid instead of aborting
    # the whole batch.
    parse_error = ""
    try:
        text = fp.read_text(encoding="utf-8", errors="ignore")
    except OSError as exc:
        text = ""
        parse_error = f"ReadError: {exc}"

    # NOTE: size is measured on the decoded text (undecodable bytes dropped,
    # newlines normalised by text mode), so it can differ from the on-disk size.
    file_bytes = len(text.encode("utf-8"))

    # Parse validity
    data = None
    valid = False
    if not parse_error:
        try:
            data = json.loads(text)
            valid = True
        except (ValueError, RecursionError) as exc:
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically deep nesting.
            parse_error = str(exc)

    # Compactness: size of the whitespace-free form relative to the original.
    try:
        if valid:
            minified = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
        else:
            minified = re.sub(r"\s+", "", text)
        minified_bytes = len(minified.encode("utf-8"))
        compactness_ratio = round(minified_bytes / file_bytes, 6) if file_bytes > 0 else float("nan")
    except Exception:
        # Best effort (e.g. strings that cannot be encoded as UTF-8): leave
        # the metric blank rather than failing the file.
        minified_bytes = None
        compactness_ratio = float("nan")

    # Structure stats
    if valid:
        stats = dict_stats(data)
    else:
        stats = {"max_depth": float("nan"), "total_keys": float("nan"), "total_nodes": float("nan")}

    # Consistency rules: a rule that crashes is recorded as an error, not raised.
    consistency_errors = []
    if valid:
        for rule in CONSISTENCY_RULES:
            try:
                msg = rule(data)
            except Exception as exc:
                consistency_errors.append(f"RuleError: {exc}")
            else:
                if msg:
                    consistency_errors.append(msg)

    if parse_error and include_samples:
        parse_error_cell = _truncate_sample(parse_error, 160)
    elif valid:
        parse_error_cell = ""
    else:
        parse_error_cell = "parse_error"

    if consistency_errors and include_samples:
        consistency_sample = _truncate_sample(consistency_errors[0], 200)
    else:
        consistency_sample = ""

    return {
        "id": fp.stem,  # filename without extension
        "file_path": str(fp),
        "valid_json": bool(valid),
        "parse_error": parse_error_cell,
        "bytes": file_bytes,
        "minified_bytes": minified_bytes,
        "compactness_ratio": compactness_ratio,   # ~1.0 = compact; lower = more whitespace
        "consistency_ok": (len(consistency_errors) == 0) if valid else None,
        "num_consistency_errors": len(consistency_errors) if valid else None,
        "consistency_error_sample": consistency_sample,
        "max_depth": stats["max_depth"],
        "total_keys": stats["total_keys"],
        "total_nodes": stats["total_nodes"],
    }


def main():
    """Evaluate every JSON file below CONFIG["root_dir"] and save a metrics table.

    Discovers files (optionally de-duplicated by stem), computes one row of
    metrics per file, shows the table, writes it to the CSV and Excel paths
    from CONFIG, and prints a short summary.

    Raises:
        FileNotFoundError: if the configured root directory does not exist.
    """
    root = Path(CONFIG["root_dir"])
    # Explicit check rather than `assert`, which is removed under `python -O`.
    if not root.exists():
        raise FileNotFoundError(f"Root dir not found: {root}")

    # Discover files recursively
    print("🔎 Scanning for JSON files (recursive)…")
    files = discover_json_files(root, CONFIG["glob_pattern"], CONFIG["max_size_mb"])
    if CONFIG["deduplicate_ids"]:
        files = deduplicate_by_stem(files, CONFIG["dedup_policy"])
    files = sorted(files)

    if not files:
        print("⚠️ No JSON files found with current settings.")
        return

    print(f"📦 Found {len(files)} JSON file(s) to evaluate.")

    rows = [_evaluate_json_file(fp) for fp in tqdm(files, desc="Evaluating JSON files")]

    # Column order
    cols = [
        "id", "file_path",
        "valid_json", "parse_error",
        "bytes", "minified_bytes", "compactness_ratio",
        "consistency_ok", "num_consistency_errors", "consistency_error_sample",
        "max_depth", "total_keys", "total_nodes",
    ]
    df = pd.DataFrame(rows)[cols]

    # Clean presentation (no None/NaN strings for text columns)
    for c in ["parse_error", "consistency_error_sample"]:
        df[c] = df[c].fillna("")

    # Display and save
    pd.set_option("display.max_colwidth", 140)
    _show_table(df)

    out_csv = Path(CONFIG["output_csv"])
    out_xlsx = Path(CONFIG["output_excel"])
    out_csv.parent.mkdir(parents=True, exist_ok=True)
    out_xlsx.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_csv, index=False)
    df.to_excel(out_xlsx, index=False)

    # Lightweight summary (useful for your dissertation)
    total = len(df)
    valid_n = int(df["valid_json"].sum())
    invalid_n = total - valid_n
    avg_depth = float(df["max_depth"].replace([np.inf, -np.inf], np.nan).mean())
    avg_compact = float(df["compactness_ratio"].replace([np.inf, -np.inf], np.nan).mean())

    print("\n✅ Saved JSON evaluation to:")
    print(f"  - {out_csv}")
    print(f"  - {out_xlsx}")
    print("\n📈 Summary:")
    print(f"  Files evaluated:       {total}")
    print(f"  Valid JSON:            {valid_n}")
    print(f"  Invalid JSON:          {invalid_n}")
    print(f"  Avg max depth:         {avg_depth:.2f}")
    print(f"  Avg compactness ratio: {avg_compact:.3f}")

# Entry point: run the evaluation when this file is executed directly (this
# also holds inside a notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


🔎 Scanning for JSON files (recursive)…
📦 Found 50 JSON file(s) to evaluate.


Evaluating JSON files: 100%|██████████| 50/50 [00:00<00:00, 62.55it/s]


Unnamed: 0,id,file_path,valid_json,parse_error,bytes,minified_bytes,compactness_ratio,consistency_ok,num_consistency_errors,consistency_error_sample,max_depth,total_keys,total_nodes
0,CS_0001,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\CS_0001\CS_0001.json,True,,2305,1723,0.747505,True,0,,5,34,47
1,CS_0002,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\CS_0002\CS_0002.json,True,,2401,1853,0.771762,True,0,,5,32,44
2,CYL_0001,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\CYL_0001\CYL_0001.json,True,,2153,1665,0.77334,True,0,,5,30,41
3,CYL_0002,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\CYL_0002\CYL_0002.json,True,,2152,1582,0.73513,True,0,,6,30,45
4,CYL_0003,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\CYL_0003\CYL_0003.json,True,,2093,1515,0.723841,True,0,,6,30,45
5,PT_0001,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\PT_0001\PT_0001.json,True,,1982,1449,0.73108,True,0,,5,32,43
6,SFT_0001,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\SFT_0001\SFT_0001.json,True,,1892,1422,0.751586,True,0,,5,30,39
7,SFT_0002,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\SFT_0002\SFT_0002.json,True,,1826,1360,0.744797,True,0,,5,30,39
8,SFT_0003,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\SFT_0003\SFT_0003.json,True,,2202,1670,0.758401,True,0,,6,30,44
9,SFT_0004,G:\MAS_CAD\FINAL_OUTPUT\4_FINAL_SHAFT\SFT_0004\SFT_0004.json,True,,1947,1474,0.757062,True,0,,5,30,40



✅ Saved JSON evaluation to:
  - G:\MAS_CAD\Shaft_json_eval_recursive_metrics.csv
  - G:\MAS_CAD\Shaft_json_eval_recursive_metrics.xlsx

📈 Summary:
  Files evaluated:       50
  Valid JSON:            50
  Invalid JSON:          0
  Avg max depth:         5.44
  Avg compactness ratio: 0.745
