# SQuAD / CUAD quick inspector (Notebook)

- Paste & run cells in order.
- Set `DATA_PATH` to your JSON file path in the example cell.
- For very large files, install `ijson` (`pip install ijson`) and use the streaming option (comment included).


In [3]:
!pip install pandas

Collecting pandas
  Using cached pandas-2.3.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.3.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.2-cp313-cp313-macosx_11_0_arm64.whl (10.7 MB)
Using cached numpy-2.3.3-cp313-cp313-macosx_14_0_arm64.whl (5.1 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [pandas]2m3/4[0m [pandas]
[1A[2KSuccessfully installed numpy-2.3.3 pandas-2.3.2 pytz-2025.2 tzdata-2025.2


In [5]:
# Cell 2: imports + helper functions
import json
import random
from pathlib import Path
from typing import List, Dict, Any, Iterable, Tuple
from IPython.display import display
import pandas as pd

def load_squad_flat(path: str) -> List[Dict[str, Any]]:
    """
    Load a SQuAD-style JSON and return a flattened list of records:
    {title, paragraph_id, context, qa_id, question, answers, is_impossible}
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"{path} not found")
    with p.open("r", encoding="utf-8") as f:
        doc = json.load(f)
    flat = []
    for article in doc.get("data", []):
        title = article.get("title", "")
        for pid, para in enumerate(article.get("paragraphs", [])):
            context = para.get("context", "")
            for qa in para.get("qas", []):
                flat.append({
                    "title": title,
                    "paragraph_id": pid,
                    "context": context,
                    "qa_id": qa.get("id"),
                    "question": qa.get("question"),
                    "answers": qa.get("answers", []),
                    "is_impossible": qa.get("is_impossible", False)
                })
    return flat

def counts(flat: Iterable[Dict[str, Any]]) -> Dict[str,int]:
    flat = list(flat)
    num_qas = len(flat)
    num_paras = len({(r["title"], r["paragraph_id"]) for r in flat})
    num_titles = len({r["title"] for r in flat})
    return {"titles": num_titles, "paragraphs": num_paras, "qas": num_qas}

def sample_random(flat: List[Dict[str, Any]], n: int = 5) -> List[Dict[str, Any]]:
    n = min(n, len(flat))
    return random.sample(flat, n)

def sample_first(flat: List[Dict[str, Any]], n: int = 5) -> List[Dict[str, Any]]:
    return flat[:n]

def search_by_keyword(flat: List[Dict[str, Any]], keyword: str, field: str = "context", max_results: int = 20):
    """
    field: 'context'|'question'|'answer'
    If 'answer', it will search inside each answer['text'].
    """
    out = []
    keyword_l = keyword.lower()
    for r in flat:
        if field == "answer":
            if any(keyword_l in (a.get("text","").lower()) for a in r["answers"]):
                out.append(r)
        else:
            if keyword_l in (r.get(field,"") or "").lower():
                out.append(r)
        if len(out) >= max_results:
            break
    return out

def validate_answer_offsets(flat: List[Dict[str, Any]], limit: int = 200) -> List[Tuple[int,str,int,str]]:
    """
    Validate that each answer's answer_start points to the exact answer substring.
    Returns up to `limit` mismatches as tuples:
     (index_in_flat, qa_id, answer_start, message)
    """
    mismatches = []
    for idx, r in enumerate(flat):
        ctx = r.get("context","") or ""
        for a in r.get("answers", []):
            text = a.get("text","")
            start = a.get("answer_start")
            if start is None:
                mismatches.append((idx, r.get("qa_id"), -1, f"missing answer_start for '{text}'"))
                continue
            # basic type/bounds checks
            try:
                start_int = int(start)
            except Exception:
                mismatches.append((idx, r.get("qa_id"), start, f"non-integer answer_start for '{text}'"))
                continue

            if start_int < 0 or start_int + len(text) > len(ctx):
                mismatches.append((idx, r.get("qa_id"), start_int, f"out-of-bounds (len ctx={len(ctx)})"))
            else:
                extracted = ctx[start_int:start_int+len(text)]
                if extracted != text:
                    mismatches.append((idx, r.get("qa_id"), start_int, f"expected='{text}' got='{extracted}'"))
            if len(mismatches) >= limit:
                return mismatches
    return mismatches

def save_samples_jsonl(samples: Iterable[Dict[str, Any]], outpath: str):
    p = Path(outpath)
    with p.open("w", encoding="utf-8") as f:
        for s in samples:
            f.write(json.dumps(s, ensure_ascii=False) + "\n")


In [12]:
# Cell 3: set path and load
DATA_PATH = "/Users/nileshmishra/LegalAgenticRag/data/RawData/CUAD_v1.json" 
flat = load_squad_flat(DATA_PATH)

# show counts
c = counts(flat)
display(pd.DataFrame([c], index=["counts"]).T)

# show first 3 QA records as a DataFrame for quick view
df_first = pd.DataFrame(sample_first(flat, 3))
# truncate long context for display
df_first["context_short"] = df_first["context"].str.slice(0,180).str.replace("\n"," ")
display(df_first[["qa_id","question","context_short","answers","is_impossible"]])


Unnamed: 0,counts
titles,510
paragraphs,510
qas,20910


Unnamed: 0,qa_id,question,context_short,answers,is_impossible
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,EXHIBIT 10.6 DI...,"[{'text': 'DISTRIBUTOR AGREEMENT', 'answer_sta...",False
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,EXHIBIT 10.6 DI...,"[{'text': 'Distributor', 'answer_start': 244},...",False
2,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,Highlight the parts (if any) of this contract ...,EXHIBIT 10.6 DI...,"[{'text': '7th day of September, 1999.', 'answ...",False


In [13]:
# Cell 4: sampling & search examples
print("Random sample IDs & questions:")
for r in sample_random(flat, 5):
    print("-", r["qa_id"], ":", r["question"])

# Search example (context)
keyword_results = search_by_keyword(flat, "rent", field="context", max_results=5)
print(f"\nFound {len(keyword_results)} paragraphs mentioning 'rent' (showing up to 5).")
if keyword_results:
    display(pd.DataFrame([
        {"qa_id": r["qa_id"], "title": r["title"], "paragraph_id": r["paragraph_id"],
         "question": r["question"], "context_short": r["context"][:200].replace('\n',' ')}
        for r in keyword_results
    ]))


Random sample IDs & questions:
- MFAFINANCIAL,INC_07_06_2020-EX-99.D-JOINT FILING AGREEMENT__Audit Rights : Highlight the parts (if any) of this contract related to "Audit Rights" that should be reviewed by a lawyer. Details: Does a party have the right to  audit the books, records, or physical locations of the counterparty to ensure compliance with the contract?
- TURNKEYCAPITAL,INC_07_20_2017-EX-1.1-Strategic Alliance Agreement__Affiliate License-Licensor : Highlight the parts (if any) of this contract related to "Affiliate License-Licensor" that should be reviewed by a lawyer. Details: Does the contract contain a license grant by affiliates of the licensor or that includes intellectual property of affiliates of the licensor? 
- ALLISONTRANSMISSIONHOLDINGSINC_12_15_2014-EX-99.1-COOPERATION AGREEMENT__Revenue/Profit Sharing : Highlight the parts (if any) of this contract related to "Revenue/Profit Sharing" that should be reviewed by a lawyer. Details: Is one party required to share re

Unnamed: 0,qa_id,title,paragraph_id,question,context_short
0,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,0,Highlight the parts (if any) of this contract ...,EXHIBIT 10.6 DI...
1,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,0,Highlight the parts (if any) of this contract ...,EXHIBIT 10.6 DI...
2,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,0,Highlight the parts (if any) of this contract ...,EXHIBIT 10.6 DI...
3,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,0,Highlight the parts (if any) of this contract ...,EXHIBIT 10.6 DI...
4,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGRE...,0,Highlight the parts (if any) of this contract ...,EXHIBIT 10.6 DI...


In [14]:
# Cell 5: validate answer offsets (may take a while on very large datasets)
mismatches = validate_answer_offsets(flat, limit=50)
print("Mismatch count (preview limit=50):", len(mismatches))
if mismatches:
    df_mm = pd.DataFrame(mismatches, columns=["flat_index","qa_id","answer_start","message"])
    display(df_mm)
    # show the offending record example
    idx0 = mismatches[0][0]
    print("\nExample context (short) and expected answer:")
    print(flat[idx0]["context"][:400].replace("\n"," "))
    print("Answers in record:", flat[idx0]["answers"])


Mismatch count (preview limit=50): 0


In [15]:
# Cell 6: save 100 random QAs to sample.jsonl
sample_100 = sample_random(flat, min(100, len(flat)))
save_samples_jsonl(sample_100, "sample_qa_100.jsonl")
print("Saved sample_qa_100.jsonl")


Saved sample_qa_100.jsonl


In [16]:
# Debug: run this cell in the notebook
from pathlib import Path
import os, sys, traceback

P = Path("./data/RawData/CUAD_v1.json")
print("Notebook cwd:", Path.cwd())
print("Resolved path:", P.resolve())
print("Exists:", P.exists())
print("Is file:", P.is_file())
try:
    print("File size (MB):", round(P.stat().st_size / (1024*1024), 2) if P.exists() else "n/a")
except Exception as e:
    print("Could not stat file:", e)
print("Python executable:", sys.executable)
print("Python version:", sys.version.splitlines()[0])

# Try a small read to ensure file is readable (won't load entire JSON)
if P.exists():
    try:
        with P.open("r", encoding="utf-8") as f:
            head = f.read(1024)
        print("First 200 chars of file:\n", head[:200].replace("\n","\\n"))
    except Exception:
        print("Error reading file:")
        traceback.print_exc()


Notebook cwd: /Users/nileshmishra/LegalAgenticRag/data/dataPreprocessing
Resolved path: /Users/nileshmishra/LegalAgenticRag/data/dataPreprocessing/data/RawData/CUAD_v1.json
Exists: False
Is file: False
File size (MB): n/a
Python executable: /Users/nileshmishra/LegalAgenticRag/.venv/bin/python
Python version: 3.13.7 (main, Aug 14 2025, 11:12:11) [Clang 17.0.0 (clang-1700.3.19.1)]
