In [None]:
# ============================================================
# Yumna Hussain - Lexis Court Opinion Parser & Descriptive Analytics
# ------------------------------------------------------------
# - Installs libraries quietly
# - Mounts Google Drive
# - Points to input TXT directory and output folder
# ============================================================

%pip install -q pandas pyarrow charset-normalizer python-dateutil tqdm

# Mount Google Drive under /content/drive (force_remount=True is passed so an
# existing mount is requested again rather than reused).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

from pathlib import Path
from itertools import islice

# Input directory holding the TXT files, and the folder that receives all outputs.
TXT_DIR = Path("/content/drive/MyDrive/Stigler Center DT/case_text/case_text")
OUT_DIR = Path("/content/drive/MyDrive/Stigler Center DT/out")

# Create the output folder (and missing parents); no error if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Sanity check: recursively collect every *.txt under TXT_DIR, report the count,
# and print the first five paths.
txts = list(TXT_DIR.rglob("*.txt"))
print("Found", len(txts), "txt files")
for p in islice(txts, 5):
    print(" -", p)


In [None]:
# ============================================================
# Lexis parser
# ------------------------------------------------------------
# - Handles non-strict JSON-like text (arrays with ']' inside quotes)
# - Extracts core variables used downstream
# - Lightly normalizes court names and derives jurisdiction/level/state
# - Builds best-available opinion text
# ============================================================

import re, csv, sys
import pandas as pd
from tqdm.auto import tqdm
from dateutil import parser as dateparser
from charset_normalizer import from_path

def read_text(p: Path) -> str:
    """Read *p* as text, tolerating files that are not valid UTF-8.

    The file is first decoded as strict UTF-8. Only when that fails is the
    encoding detected with charset-normalizer; if detection yields nothing the
    bytes are decoded as latin-1, which cannot fail.

    The previous version passed ``errors="replace"`` to the UTF-8 read, which
    never raises, so the detection fallback was unreachable and non-UTF-8
    files silently came back full of U+FFFD replacement characters.

    Args:
        p: Path of the file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    try:
        return p.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        pass  # not UTF-8: fall through to encoding detection
    best = from_path(str(p)).best()
    if best is not None:
        return str(best)
    return p.read_bytes().decode("latin-1", errors="replace")

def _extract_array_strings(text: str, key: str):
    # pull strings from `"key": [ "…", "…" ]` while honoring quotes/escapes
    m = re.search(rf'"{re.escape(key)}"\s*:', text, flags=re.I)
    if not m: return []
    pos = m.end()
    i = text.find('[', pos)
    if i == -1: return []
    i += 1
    depth, in_str, esc = 1, False, False
    buf = []
    for ch in text[i:]:
        buf.append(ch)
        if in_str:
            if esc: esc = False
            elif ch == '\\': esc = True
            elif ch == '"': in_str = False
        else:
            if ch == '"': in_str = True
            elif ch == '[': depth += 1
            elif ch == ']':
                depth -= 1
                if depth == 0:
                    break
    segment = ''.join(buf)
    vals, cur, in_str, esc = [], "", False, False
    for ch in segment:
        if in_str:
            if esc:
                cur += ch; esc = False
            elif ch == '\\':
                esc = True
            elif ch == '"':
                in_str = False; vals.append(cur); cur = ""
            else:
                cur += ch
        else:
            if ch == '"': in_str = True
            elif ch == ']': break
    return [v.strip() for v in vals if v is not None]

def grab_list(text: str, key: str):
    """Return every string in the array stored under *key*, or ``[]`` if none."""
    return _extract_array_strings(text, key) or []

def grab_one(text: str, key: str):
    """Return a single value for *key*.

    The first element of an array value is preferred; otherwise the first
    scalar ``"key": "value"`` occurrence is used. Returns ``None`` when neither
    form is found.
    """
    from_array = grab_list(text, key)
    if from_array:
        return from_array[0]

    scalar_pattern = rf'"{re.escape(key)}"\s*:\s*"([^"]+)"'
    hit = re.search(scalar_pattern, text, flags=re.I|re.S)
    if hit is None:
        return None
    return hit.group(1).strip()

def smart_space_fix(s: str|None) -> str|None:
    if not s: return s
    s = re.sub(r'([a-z])([A-Z])', r'\1 \2', s)
    s = (s.replace("UnitedStates", "United States")
           .replace("Court ofAppeals", "Court of Appeals")
           .replace("Appealsfor", "Appeals for")
           .replace("DistrictCourt", "District Court")
           .replace("Courtfor", "Court for"))
    return re.sub(r'\s+', ' ', s).strip()

# Names that derive_state() looks for in a court name: the fifty U.S. states
# plus the District of Columbia. Matching is case-sensitive and tries longer
# names first so "West Virginia" wins over "Virginia".
STATE_NAMES = {
    "Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware","Florida","Georgia",
    "Hawaii","Idaho","Illinois","Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
    "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana","Nebraska","Nevada",
    "New Hampshire","New Jersey","New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma",
    "Oregon","Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
    "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming","District of Columbia"
}

def parse_decision_date(text: str):
    """Find the decision date of a document.

    Several date-bearing keys are tried in order. For each value only the
    first two comma-separated pieces are handed to dateutil's fuzzy parser.
    If no key yields a parseable date, the first 19xx/20xx number anywhere in
    *text* is used as the year.

    Returns:
        ``(iso_date, year)``; ``iso_date`` is ``None`` when only a year was
        found, and both are ``None`` when nothing was found.
    """
    candidate_keys = ("decisionDate","filedDate","dateText","decisionDates","filedDates")
    for key in candidate_keys:
        value = grab_one(text, key)
        if not value:
            continue
        head = ", ".join(value.split(",")[:2]).strip()
        try:
            parsed = dateparser.parse(head, fuzzy=True)
            return parsed.date().isoformat(), parsed.year
        except Exception:
            continue  # unparseable value: try the next key

    year_hit = re.search(r'\b(19|20)\d{2}\b', text)
    if year_hit:
        return None, int(year_hit.group(0))
    return None, None

def extract_opinion_text(raw: str) -> str:
    """Return the best available opinion text for a raw document.

    Known body keys are tried in priority order. An array value is joined with
    blank lines; a scalar value is used as-is. If no key matches, everything
    after the first line that starts with "OPINION" or "Per Curiam"
    (case-insensitive) is returned, or ``""`` when there is no such line.
    """
    body_keys = ("courtCaseDocBody","caseOpinions","opinion","opinionText","bodyText","p")
    for key in body_keys:
        paragraphs = grab_list(raw, key)
        if paragraphs:
            return "\n\n".join(paragraphs).strip()
        scalar = grab_one(raw, key)
        if scalar:
            return scalar.strip()

    heading = re.search(r'(?:^|\n)\s*(OPINION|Per Curiam|Opinion)\b(.*)$', raw, flags=re.I|re.S)
    if heading is None:
        return ""
    return heading.group(2).strip()

OFFICIAL_REPORTER_RE = re.compile(
    r'\b(U\.S\.|S\.\s*Ct\.|L\.\s*Ed\.|F\.\d+d?|F\.Supp\.|F\.App\'?x|P\.\d+d?|N\.[EW]\.\d+d?|So\.\d+d?|Cal\.\s*App\.|A\.\d+d?)\b'
)
def infer_publication_status(citations: str|None) -> str|None:
    if not citations: return None
    if OFFICIAL_REPORTER_RE.search(citations): return "Published"
    if re.search(r'\b(LEXIS|WL)\b', citations): return "Unpublished"
    return None

def derive_jurisdiction(court_name: str|None) -> str|None:
    if not court_name: return None
    s = court_name.lower()
    if "united states" in s or "u.s." in s or "federal" in s: return "federal"
    return "state"

def derive_court_level(court_name: str|None, jurisdiction: str|None) -> str|None:
    if not court_name: return None
    s = court_name.lower()
    if jurisdiction == "federal":
        if "supreme court" in s: return "federal_supreme"
        if "court of appeals" in s or "circuit" in s: return "federal_appellate"
        if "district court" in s: return "federal_district"
    if "supreme judicial court" in s or "supreme court" in s: return "state_supreme"
    if "court of criminal appeals" in s or "court of appeals" in s or "appellate" in s: return "intermediate_appellate"
    if "district court" in s or "superior court" in s or "trial court" in s: return "trial"
    return None

def derive_state(court_name: str|None, jurisdiction: str|None) -> str|None:
    """Derive the state a (non-federal) court belongs to from its name.

    A known state name is searched for anywhere in the court name, longest
    names first so "West Virginia" wins over "Virginia". Previously only the
    text after "of" was inspected, and only when that text ran to the end of
    the name. That gave "Appeals" for "Texas Court of Appeals" and nothing for
    "Court of Appeal of California, Second District".

    If no known state name is present, the old fallback is kept: the text
    following "of" at the end of the name is returned verbatim.

    Args:
        court_name: Court name, or ``None``.
        jurisdiction: ``"federal"`` short-circuits to ``None``.

    Returns:
        A state name, the raw "of …" tail, or ``None``.
    """
    if not court_name or jurisdiction == "federal":
        return None
    for st in sorted(STATE_NAMES, key=len, reverse=True):
        if re.search(rf'\b{re.escape(st)}\b', court_name):
            return st
    m = re.search(r'\bof\s+([A-Za-z\s\.]+)$', court_name)
    if m:
        return m.group(1).strip().rstrip('.')
    return None

def split_citations(text: str) -> str|None:
    """Return the document's citations as one ``"; "``-joined string.

    The ``citeForThisResource`` array is preferred. Otherwise the scalar
    ``citations`` value is split heuristically in front of each
    "Reporter 123" or "2015 WL/LEXIS" pattern. Duplicates are removed while
    keeping first-seen order. Returns ``None`` when neither key has a value.
    """
    resource_cites = grab_list(text, "citeForThisResource")
    if resource_cites:
        return "; ".join(dict.fromkeys(resource_cites))

    raw_cites = grab_one(text, "citations")
    if not raw_cites:
        return None

    boundary = r'(?=(?:\b[A-Z][\w\.]*\s+\d+\b)|(?:\b\d{4}\s+(?:WL|LEXIS)\b))'
    trimmed = (piece.strip() for piece in re.split(boundary, raw_cites))
    return "; ".join(dict.fromkeys(piece for piece in trimmed if piece))

def safe_short_case(case_name: str|None) -> str|None:
    if not case_name: return None
    m = re.search(r'((?:In re|In the matter of)[^,\n]+)', case_name, flags=re.I)
    if m: return m.group(1).strip()
    m = re.search(r'([^,\n]*?\s+v\.\s+[^,\n]+)', case_name)
    return m.group(1).strip() if m else case_name

def parse_file(p: Path) -> dict:
    """Parse one TXT file into a flat record of case metadata plus opinion text.

    Returns a dict with the keys doc_id, case_name, short_case_name,
    docket_number, court_name, jurisdiction, court_level, state, decided_date,
    year, citations, publication_status, disposition and text.
    """
    raw = read_text(p)

    def first_of(*keys):
        # Same as `grab_one(a) or grab_one(b)`: the first truthy hit wins,
        # otherwise the result of the last lookup is returned.
        found = None
        for k in keys:
            found = grab_one(raw, k)
            if found:
                break
        return found

    # --- identifying metadata ---
    case_name = first_of("fullCaseName", "caseName")
    short_case = grab_one(raw, "shortCaseName") or safe_short_case(case_name)
    docket = grab_one(raw, "docketNumber")
    court_name = smart_space_fix(first_of("courtName", "courtInfo"))
    decided_date, year = parse_decision_date(raw)
    citations = split_citations(raw)
    publication_status = (
        first_of("publicationStatus", "statusNotice")
        or infer_publication_status(citations)
    )

    # --- disposition: first matching phrase in the body (or in the raw text) ---
    dispo_re = re.compile(
        "|".join((
            r'\bAffirmed\.?', r'\bReversed\.?', r'\bVacated\.?', r'\bRemanded\.?',
            r'\bDenied\.?', r'\bDismissed\.?', r'\bPetition for review denied\.?',
            r'\bJudgment (?:is )?affirmed\.?', r'\bJudgment (?:is )?reversed\.?',
        )),
        flags=re.I,
    )
    body_parts = []
    for body_key in ("p", "courtCaseDocBody", "bodyText"):
        body_parts.extend(grab_list(raw, body_key))
    haystack = " ".join(body_parts) or raw
    dispo_hit = dispo_re.search(haystack)
    disposition = None
    if dispo_hit:
        disposition = dispo_hit.group(0).rstrip('.').strip().capitalize()

    # --- court-derived fields ---
    jurisdiction = derive_jurisdiction(court_name)
    court_level = derive_court_level(court_name, jurisdiction)
    state = derive_state(court_name, jurisdiction)

    # --- docket fallback: the first pattern that matches anywhere in the file ---
    if not docket:
        docket = None
        docket_patterns = (
            r'\bS\d{5,6}\b',
            r'\b[A-Z]{1,3}-\d{2}-\d{4,6}\b',
            r'\b\d{2,4}[A-Z]{0,3}\d{2,6}-?[A-Z]{0,3}\b',
        )
        for pattern in docket_patterns:
            docket_hit = re.search(pattern, raw)
            if docket_hit:
                docket = docket_hit.group(0)
                break

    text = extract_opinion_text(raw)

    record = {
        "doc_id": p.stem,
        "case_name": case_name,
        "short_case_name": short_case,
        "docket_number": docket,
        "court_name": court_name,
        "jurisdiction": jurisdiction,
        "court_level": court_level,
        "state": state,
        "decided_date": decided_date,
        "year": year,
        "citations": citations,
        "publication_status": publication_status,
        "disposition": disposition,
        "text": text,
    }
    return record



In [None]:
# ============================================================
# Parse all TXT files, tidy DataFrame, save TSV/CSV/Parquet
# ------------------------------------------------------------
# - Iterates through TXT_DIR
# - Catches errors but preserves row with _error note
# - Quick QC print for suspicious rows
# - Writes cases_clean.(tsv|csv|parquet) to OUT_DIR
# ============================================================

# Re-import so `csv` is the module even if another cell rebound the name
# (the Q1 cell assigns a Path to `csv`).
import csv

cols = ["doc_id","case_name","short_case_name","docket_number","court_name","jurisdiction",
        "court_level","state","decided_date","year","citations","publication_status",
        "disposition","text"]

# Parse every TXT file. A failure keeps a placeholder row carrying the error
# message so the file is not silently dropped.
paths = sorted(Path(TXT_DIR).rglob("*.txt"))
rows = []
for p in tqdm(paths, desc="Parsing"):
    try:
        rows.append(parse_file(p))
    except Exception as e:
        placeholder = dict.fromkeys(cols)
        placeholder.update({"doc_id": p.stem, "text": "", "_error": str(e)})
        rows.append(placeholder)

df = pd.DataFrame(rows)

# Guarantee the expected columns BEFORE the QC step. With zero input files the
# frame has no columns at all and df["text"] below used to raise KeyError.
for c in cols:
    if c not in df.columns:
        df[c] = None
df = df[cols + [c for c in df.columns if c not in cols]]

# Quick QC: rows with no extracted text, or whose text still looks like raw JSON.
bad_mask = (df["text"].fillna("") == "") | (df["text"].str.startswith("{", na=False))
print("Potentially bad rows:", bad_mask.sum())
print(df.loc[bad_mask, ["doc_id","court_name","decided_date","citations"]].head(10))

tsv_path  = OUT_DIR / "cases_clean.tsv"
csv_path  = OUT_DIR / "cases_clean.csv"
parq_path = OUT_DIR / "cases_clean.parquet"

df.to_csv(tsv_path, sep="\t", index=False)
df.to_csv(csv_path, index=False, quoting=csv.QUOTE_ALL, escapechar='\\')

# Track success explicitly: parq_path.exists() is also true for a stale file
# left by an earlier run, which made the summary claim a write that was skipped.
parquet_written = False
try:
    df.to_parquet(parq_path, index=False)
    parquet_written = True
except Exception as e:
    print("Parquet write skipped:", e)

print("Wrote:\n -", tsv_path, "\n -", csv_path, "\n -", parq_path if parquet_written else "(parquet skipped)")


In [None]:
# ============================================================
# (Q1): Case distribution over time
# ------------------------------------------------------------
# - Builds yearly counts (and 3-year MA + share) from cases_clean
# - Saves summary table to OUT_DIR/T1_cases_by_year.csv
# - Saves figure PNG/PDF to OUT_DIR/figures
# ============================================================

import pandas as pd, re
from dateutil import parser as dateparser
import matplotlib.pyplot as plt

FIG_DIR = OUT_DIR / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)
by_year_path = OUT_DIR / "T1_cases_by_year.csv"

# load cases (TSV preferred, CSV as fallback)
# The paths were previously bound to `tsv` and `csv`. Rebinding `csv` hid the
# csv module in the shared notebook namespace, so re-running the parse cell
# afterwards failed on `csv.QUOTE_ALL`. The names now match the Q2/Q3 cells.
READ_TSV = OUT_DIR / "cases_clean.tsv"
READ_CSV = OUT_DIR / "cases_clean.csv"
if READ_TSV.exists():
    df_cases = pd.read_csv(READ_TSV, sep="\t", dtype=str, keep_default_na=False)
else:
    df_cases = pd.read_csv(READ_CSV, dtype=str, keep_default_na=False)

# coerce year (explicit year > decided_date > scan text)
def coerce_year(row):
    """Return the decision year for one case row, or ``None`` if unknown.

    Priority: the explicit ``year`` field, then the year of ``decided_date``,
    then the first 19xx/20xx number in ``text``.

    ``year`` may arrive float-formatted ("2015.0"), because a numeric column
    with missing values is stored as float before it is written to disk. Such
    values used to fail ``isdigit()`` and fall through to the less reliable
    text scan; they are now accepted.

    Args:
        row: Mapping-like row (e.g. a pandas Series) supporting ``.get``.
    """
    y = row.get("year")
    if isinstance(y, str):
        y_clean = y.strip()
        if y_clean.isdigit():
            return int(y_clean)
        if re.fullmatch(r'\d+\.0*', y_clean):  # "2015.0" / "2015."
            return int(float(y_clean))
    elif isinstance(y, (int, float)) and not isinstance(y, bool) and y == y:
        # already numeric; `y == y` is False only for NaN
        return int(y)
    d = row.get("decided_date")
    if isinstance(d, str) and d.strip():
        try:
            return dateparser.parse(d, fuzzy=True).year
        except Exception:
            pass  # unparseable date: fall back to scanning the text
    txt = row.get("text") or ""
    m = re.search(r'\b(19|20)\d{2}\b', txt)
    return int(m.group(0)) if m else None

# Yearly counts; rows without a resolvable year are dropped.
df_cases["_year_coerced"] = df_cases.apply(coerce_year, axis=1)
by_year = (df_cases.dropna(subset=["_year_coerced"])
            .assign(_year_coerced=lambda x: x["_year_coerced"].astype(int))
            .groupby("_year_coerced").size().reset_index(name="n_cases")
            .sort_values("_year_coerced"))

total = by_year["n_cases"].sum()
by_year["share_of_total"] = (by_year["n_cases"] / total).round(4)

# 3-year moving average over CALENDAR years. Rolling over the table rows
# (as before) averaged across gaps, e.g. 2001/2005/2006 counted as one
# "3-year" window. Years with no cases are counted as 0 for the average, but
# no extra rows are added to the output table.
if by_year.empty:
    by_year["n_cases_3yr_ma"] = by_year["n_cases"].astype("float64")
else:
    per_year = by_year.set_index("_year_coerced")["n_cases"]
    calendar = per_year.reindex(
        range(int(per_year.index.min()), int(per_year.index.max()) + 1),
        fill_value=0,
    )
    moving_avg = calendar.rolling(window=3, min_periods=1).mean().round(1)
    by_year["n_cases_3yr_ma"] = by_year["_year_coerced"].map(moving_avg)

by_year.to_csv(by_year_path, index=False)
print("Saved:", by_year_path)

# line chart (no explicit colors per requirement)
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(by_year["_year_coerced"], by_year["n_cases"])
ax.set_title("Case Distribution by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of cases")
fig.tight_layout()

png_path = FIG_DIR / "cases_by_year.png"
pdf_path = FIG_DIR / "cases_by_year.pdf"
fig.savefig(png_path, dpi=300, bbox_inches="tight")
fig.savefig(pdf_path, bbox_inches="tight")
plt.close(fig)
print("Saved figure:\n -", png_path, "\n -", pdf_path)


In [None]:
# ============================================================
# (Q2): Top-5 judges — Supreme Court of Florida
# ------------------------------------------------------------
# - Filters to Florida Supreme Court
# - Cleans judge tokens (drops titles like CJ/J./Chief Justice)
# - Merges small variants by last name (canonical key)
# - Counts unique cases per judge and saves table
# ============================================================

import pandas as pd, numpy as np, re

# Load the cleaned cases written by the parse cell: TSV if present, otherwise CSV.
# All columns are read as strings; pandas' default NA handling is left on here,
# so empty cells come back as NaN rather than "".
READ_TSV = OUT_DIR / "cases_clean.tsv"
READ_CSV = OUT_DIR / "cases_clean.csv"
fl_df = pd.read_csv(READ_TSV, sep="\t", dtype=str) if READ_TSV.exists() else pd.read_csv(READ_CSV, dtype=str)

def is_fl_sct(cname: str|None) -> bool:
    if not isinstance(cname, str): return False
    s = cname.lower().strip()
    return ("supreme court of florida" in s) or (s == "florida supreme court")

# Keep only Florida Supreme Court rows (copy so the column added below does not
# write into a view of fl_df).
fl = fl_df[fl_df["court_name"].map(is_fl_sct)].copy()
# Use the "judges" column if the file has one, else an all-empty column.
# NOTE(review): the record built by parse_file() has no "judges" key, so unless
# that column is added elsewhere this is always empty and the cell reports
# "No judge names found" - confirm where judge names are meant to come from.
fl["judges"] = fl.get("judges", pd.Series(index=fl.index, dtype="object")).fillna("")

TITLE_TOKENS = re.compile(
    r'\b(?:C\.?J\.?|P\.?J\.?|J\.|JJ\.|Chief Justice|Presiding Justice|Justice|Acting|Temporarily Assigned|For the Court)\b',
    re.I
)
DROP_SET = {"", "cj", "j", "jj", "chief justice", "justice", "per curiam", "the court"}
SUFFIXES = {"Jr","Jr.","Sr","Sr.","II","III","IV"}

def clean_token(tok: str) -> str|None:
    if not isinstance(tok, str): return None
    s = tok.strip()
    if not s: return None
    s = re.sub(r'\([^)]*\)', '', s)                 # remove (dissenting), etc.
    s = TITLE_TOKENS.sub('', s)                     # drop titles
    s = re.sub(r'^(?:and|the|hon\.?|honorable)\s+', '', s, flags=re.I)
    s = re.sub(r'[;:]+', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip(' ,;:')
    low = s.lower()
    if low in DROP_SET or 'per curiam' in low: return None
    if len(re.sub(r'[^A-Za-z]', '', s)) < 2: return None
    if ',' in s and not re.search(r'\b(?:Jr\.?|Sr\.?|II|III|IV)\b$', s):
        left, right = [p.strip() for p in s.split(',', 1)]
        if len(left.split()) == 1 and len(right) >= 2:
            s = (right + ' ' + left).strip()
    if s.isupper(): s = s.title()
    return s

def canonical_last(name: str) -> str|None:
    """Return the upper-cased last name used as the merge key for a judge.

    Generational suffixes (Jr., III, ...) are ignored. Commas and semicolons
    attached to a word are stripped first, so "John Smith, Jr." gives "SMITH"
    rather than "SMITH," (which would not merge with plain "Smith").

    Args:
        name: Cleaned judge name.

    Returns:
        The key, or ``None`` for empty input or a name made only of suffixes.
    """
    if not name: return None
    parts = [tok.strip(",;") for tok in name.split()]
    parts = [tok for tok in parts if tok and tok not in SUFFIXES]
    if not parts: return None
    return parts[-1].upper()

def split_judges(val: str):
    """Split a pipe-delimited judges field into trimmed, non-empty tokens.

    Non-string or blank input gives an empty list.
    """
    if not isinstance(val, str) or not val.strip():
        return []
    trimmed = (chunk.strip() for chunk in val.split("|"))
    return [chunk for chunk in trimmed if chunk]

# Build a long table with one row per (cleaned judge name, case).
# NOTE: `rows` reuses the name of the list built in the parse cell.
rows = []
for _, r in fl.iterrows():
    for j in split_judges(r["judges"]):
        cj = clean_token(j)
        if cj:
            rows.append({"judge_full": cj, "judge_key": canonical_last(cj), "doc_id": r["doc_id"]})
panel_df = pd.DataFrame(rows)

if panel_df.empty:
    print("No judge names found in FL Supreme Court entries.")
else:
    # For each last-name key, pick the most frequent spelling as the display name.
    name_choice = (panel_df.groupby(["judge_key","judge_full"]).size()
                     .reset_index(name="n")
                     .sort_values(["judge_key","n"], ascending=[True, False])
                     .drop_duplicates("judge_key")[["judge_key","judge_full"]])

    # Count distinct cases per key (a judge listed twice on one case counts once).
    counts = (panel_df.drop_duplicates(["judge_key","doc_id"])
                        .groupby("judge_key").size()
                        .reset_index(name="n_cases"))

    # Attach display names and keep the five judges with the most cases
    # (ties are broken alphabetically by name).
    out = (counts.merge(name_choice, on="judge_key", how="left")
                 .rename(columns={"judge_full":"judge"})
                 .sort_values(["n_cases","judge"], ascending=[False, True])
                 .head(5))

    # `display` is provided by the notebook environment, not imported here.
    display(out[["judge","n_cases"]])
    out[["judge","n_cases"]].to_csv(OUT_DIR / "fl_supreme_top5_judges_by_cases.csv", index=False)
    print("Saved:", OUT_DIR / "fl_supreme_top5_judges_by_cases.csv")


No judge names found in FL Supreme Court entries.


In [None]:
# ============================================================
# (Q3): Opinion-length distributions — courts of last resort
# ------------------------------------------------------------
# - Computes opinion length (words) from text if missing
# - Identifies state courts of last resort (with NY/MA/WV exceptions)
# - Summarizes by state: N, median, p25, p75, mean, min, max
# - Saves two figures:
#     A) heatmap of binned shares (appendix)
#     B) ranked medians with IQR for top 15 states (main text)
# - Writes summary tables to OUT_DIR
# ============================================================

import numpy as np
import matplotlib.pyplot as plt

# load cleaned cases
# Load the cleaned cases (TSV if present, else CSV) with every column as a
# string and empty cells kept as "" rather than NaN.
# NOTE: `pd` (and `re`, used further down) are not imported in this cell; they
# come from earlier cells.
READ_TSV = OUT_DIR / "cases_clean.tsv"
READ_CSV = OUT_DIR / "cases_clean.csv"
cases = pd.read_csv(READ_TSV, sep="\t", dtype=str, keep_default_na=False) if READ_TSV.exists() else pd.read_csv(READ_CSV, dtype=str, keep_default_na=False)

# opinion length (words)
# Use an existing op_length_words column when present (non-numeric -> NaN) ...
cases["op_length_words"] = pd.to_numeric(cases.get("op_length_words", pd.Series(index=cases.index)), errors="coerce")
# ... and fill whatever is missing with the whitespace-delimited word count of `text`.
if cases["op_length_words"].isna().any():
    cases["op_length_words"] = cases["op_length_words"].fillna(
        cases.get("text", pd.Series(index=cases.index, dtype="object")).fillna("").map(lambda s: len(s.split()))
    )
# Anything still missing counts as length 0 (such rows are filtered out later).
cases["op_length_words"] = pd.to_numeric(cases["op_length_words"], errors="coerce").fillna(0)

# infer state from court_name if missing
STATE_LIST = {
    "Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware","Florida","Georgia",
    "Hawaii","Idaho","Illinois","Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
    "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana","Nebraska","Nevada",
    "New Hampshire","New Jersey","New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma",
    "Oregon","Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
    "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"
}
def infer_state_from_court(court_name: str) -> str|None:
    if not isinstance(court_name, str): return None
    for st in sorted(STATE_LIST, key=len, reverse=True):
        if re.search(rf"\b{re.escape(st)}\b", court_name): return st
    return None

# Make sure every row has a state: create the column from court_name when it
# does not exist, otherwise fill only the rows whose state is NaN or blank.
# Court names without a recognisable state get None.
if "state" not in cases.columns:
    cases["state"] = cases["court_name"].map(infer_state_from_court)
else:
    missing = cases["state"].isna() | (cases["state"].str.strip() == "")
    cases.loc[missing, "state"] = cases.loc[missing, "court_name"].map(infer_state_from_court)

# identify last-resort courts (default Supreme Court, plus exceptions)
def is_last_resort(court_name: str, state_val: str) -> bool:
    """Tell whether *court_name* is the court of last resort of *state_val*.

    New York, Massachusetts and West Virginia are special-cased because their
    highest court is not simply called "Supreme Court". For every other state
    the name must mention both "supreme court" and the state. As a last
    fallback, the ``court_level`` recorded in the global ``cases`` frame for
    this court name is consulted.

    Fixes over the previous version:
      * a missing state (``""``, ``None``, NaN) no longer matches every name
        containing "supreme court"; with ``st == ""`` both the regex
        ``\\b\\b.*\\bsupreme court\\b`` and ``"" in s`` were always true;
      * a non-string state such as NaN no longer raises on ``.lower()``.
    The two regex checks were dropped: with a non-empty state they could only
    succeed when the plain substring test already had.

    Args:
        court_name: Court name; non-strings give ``False``.
        state_val: State name for the row, possibly missing.
    """
    if not isinstance(court_name, str):
        return False
    s = court_name.lower().strip()
    st = state_val.lower().strip() if isinstance(state_val, str) else ""
    if st == "new york":        # Court of Appeals of New York
        return ("court of appeals" in s) and ("new york" in s)
    if st == "massachusetts":   # Supreme Judicial Court
        return "supreme judicial court" in s
    if st == "west virginia":   # Supreme Court of Appeals of West Virginia
        return "supreme court of appeals" in s and "west virginia" in s
    if st and "supreme court" in s and st in s:
        return True
    # Fallback: trust the parser's court_level for the first row with this name.
    # NOTE: this scans `cases` once per call; precompute a name -> level mapping
    # if it becomes slow on large inputs.
    if "court_level" in cases.columns:
        try:
            cl = cases.loc[cases["court_name"] == court_name, "court_level"].head(1).iloc[0]
            if cl == "state_supreme":
                return True
        except (IndexError, KeyError):
            pass  # court name not present in `cases`
    return False

# Flag last-resort courts row by row, then keep only rows that have a positive
# word count and a non-null state.
# NOTE(review): notna() does not exclude an empty-string state - confirm that
# blank states cannot reach this point.
mask_lr = cases.apply(lambda r: is_last_resort(r.get("court_name",""), r.get("state","")), axis=1)
last_resort = cases[mask_lr].copy()
last_resort = last_resort[(last_resort["op_length_words"] > 0) & last_resort["state"].notna()]

# summaries by state
def summarize(g):
    """Summarise opinion lengths for one group of cases.

    Returns a Series with n_cases, median, p25, p75, mean, min and max of
    ``op_length_words``. For an empty group every statistic except n_cases
    is NaN.
    """
    lengths = g["op_length_words"].astype(float)
    has_rows = len(lengths) > 0

    def stat(fn, *extra):
        # Guard each statistic so an empty group gives NaN instead of an error.
        return float(fn(lengths, *extra)) if has_rows else np.nan

    stats = {
        "n_cases": int(lengths.count()),
        "median": stat(np.median),
        "p25": stat(np.percentile, 25),
        "p75": stat(np.percentile, 75),
        "mean": stat(np.mean),
        "min": stat(np.min),
        "max": stat(np.max),
    }
    return pd.Series(stats)
# One summary row per state, ordered by number of cases, then by median length
# (both descending).
summary_by_state = (last_resort.groupby("state", as_index=True)
                    .apply(summarize)
                    .sort_values(["n_cases","median"], ascending=[False, False]))

summary_by_state.to_csv(OUT_DIR / "opinion_length_summary_by_last_resort_state.csv")
# `display` is provided by the notebook environment, not imported here.
display(summary_by_state.head(10))

# binned shares (appendix heatmap)
# Left-closed word-count bins: [0, 1000), [1000, 2500), ... [20000, inf).
bins = [0, 1000, 2500, 5000, 10000, 20000, np.inf]
labels = ["0–1k","1k–2.5k","2.5k–5k","5k–10k","10k–20k","20k+"]
tmp = last_resort.copy()
tmp["len_bin"] = pd.cut(tmp["op_length_words"], bins=bins, labels=labels, right=False)

# Count cases per (state, bin).
# observed=False is stated explicitly: the categorical grouper must keep empty
# bins, and relying on the default is deprecated.
bin_counts = (tmp.groupby(["state","len_bin"], observed=False).size()
                .reset_index(name="n")
                .pivot(index="state", columns="len_bin", values="n"))
# Force plain string column labels covering every bin in declared order, so the
# later `[labels]` selection cannot raise KeyError when a bin has no cases.
bin_counts.columns = [str(col) for col in bin_counts.columns]
bin_counts = bin_counts.reindex(columns=labels, fill_value=0).fillna(0).astype(int)

# Row-normalise to shares; a state with no counted cases gets 0, not NaN.
bin_share = bin_counts.div(bin_counts.sum(axis=1), axis=0).fillna(0).round(3)
bin_counts.to_csv(OUT_DIR / "opinion_length_bins_by_last_resort_state_counts.csv")
bin_share.to_csv(OUT_DIR / "opinion_length_bins_by_last_resort_state_shares.csv")

# figures
# Figures go to OUT_DIR/figures (created if missing).
FIG_DIR = (OUT_DIR / "figures"); FIG_DIR.mkdir(parents=True, exist_ok=True)

# A) heatmap (appendix) — show states with at least MIN_CASES to avoid noise
MIN_CASES = 5
states_heat = summary_by_state.index[summary_by_state["n_cases"] >= MIN_CASES].tolist()
# If no state reaches MIN_CASES, fall back to plotting every state.
heat = bin_share.loc[states_heat, labels] if len(states_heat) else bin_share[labels]

# Figure height grows with the number of plotted states, with a floor of 4.
# NOTE(review): in the fallback case len(states_heat) is 0, so the height stays
# at 4 however many rows `heat` has.
fig_h, ax_h = plt.subplots(figsize=(10, max(4, 0.35*len(states_heat))))
im = ax_h.imshow(heat.values, aspect='auto')
ax_h.set_yticks(range(heat.shape[0]))
ax_h.set_yticklabels(list(heat.index))
ax_h.set_xticks(range(len(labels)))
ax_h.set_xticklabels(labels)
ax_h.set_xlabel("Opinion length (word bins)")
# NOTE(review): the title says "all states" but the rows are normally limited
# to states with >= MIN_CASES cases.
ax_h.set_title("Opinion-length distribution shares — courts of last resort (all states)")
fig_h.colorbar(im, ax=ax_h, label="Share")
fig_h.tight_layout()
heat_path = FIG_DIR / "opinion_length_heatmap_last_resort_states.png"
fig_h.savefig(heat_path, dpi=300, bbox_inches="tight")
plt.close(fig_h)
print("Saved heatmap:", heat_path)

# B) ranked medians with IQR (main text) — top 15 by N
# summary_by_state is already sorted by n_cases (descending), so head(TOP_N)
# keeps the states with the most cases; they are then ordered by median.
TOP_N = 15
top = summary_by_state.head(TOP_N).copy()
order = top.sort_values("median", ascending=True).index.tolist()

# NOTE(review): the figure height is 0.5 * number of states, which is 0 when no
# state qualifies - confirm the empty case cannot occur or add a floor.
fig_d, ax_d = plt.subplots(figsize=(10, 0.5*len(order)))
for i, st in enumerate(order, start=1):
    p25, p50, p75 = top.loc[st, "p25"], top.loc[st, "median"], top.loc[st, "p75"]
    ax_d.hlines(y=i, xmin=p25, xmax=p75)     # IQR line
    ax_d.plot(p50, i, marker='o')            # median point
ax_d.set_yticks(range(1, len(order)+1))
ax_d.set_yticklabels(order)
ax_d.set_xlabel("Opinion length (words)")
ax_d.set_title(f"Opinion-length medians with IQR — top {TOP_N} states by cases")
fig_d.tight_layout()
# NOTE: the file name hard-codes "top15" and does not follow TOP_N.
rank_path = FIG_DIR / "opinion_length_ranked_median_iqr_top15.png"
fig_d.savefig(rank_path, dpi=300, bbox_inches="tight")
plt.close(fig_d)
print("Saved ranked plot:", rank_path)
