# Code Snippet 1. Harmonization script

# In [2]:
from __future__ import annotations

import json
import re
from ast import literal_eval
from typing import Any, List, Optional, Tuple

import numpy as np
import pandas as pd


# ---------------------------
# 0) File paths (project local)
# ---------------------------
# Input tables consumed by the run step: S1 and S2 are read with pd.read_csv,
# S3 with pd.read_excel.
# NOTE(review): these are absolute paths under /content (looks like a
# hosted-notebook working directory — confirm) and must be adjusted to run
# the script anywhere else.
S1_PATH = "/content/S1_NYCLGBT Historic Sites Project.csv"
S2_PATH = "/content/S2_Addresses project.csv"
S3_PATH = "/content/S3_Gwendolyn Stegall data.xlsx"

# Output CSV for the concatenated harmonized table. The path is relative, so
# the file is written to the current working directory.
OUT_HARMONIZED = "harmonized_core.csv"


# ---------------------------
# 1) Minimal string helpers
# ---------------------------
def norm_ws(x: Any) -> Optional[str]:
    """Return ``x`` as a whitespace-normalized string, or ``None`` if missing.

    Args:
        x: Any cell value. ``None``, ``pd.NA``, ``pd.NaT`` and floating-point
            NaN (Python ``float`` or any NumPy floating type) count as missing.

    Returns:
        ``str(x)`` stripped, with every run of whitespace collapsed to a single
        space; ``None`` when the value is missing or the result is empty.
    """
    # pd.NA / pd.NaT are singletons, so identity checks are enough. Without
    # them the function would return the literal strings "<NA>" / "NaT".
    if x is None or x is pd.NA or x is pd.NaT:
        return None
    # np.floating covers float32 & co., which are not subclasses of float.
    if isinstance(x, (float, np.floating)) and np.isnan(x):
        return None
    s = str(x).strip()
    s = re.sub(r"\s+", " ", s)
    return s if s else None


# Typographic punctuation folded to its ASCII equivalent so that the same name
# typed with different dashes/quotes canonicalizes to the same string.
_NAME_PUNCT_TABLE = str.maketrans(
    {
        "‐": "-",
        "–": "-",
        "—": "-",
        "“": '"',
        "”": '"',
        "‘": "'",  # left single quote: handled like its right-hand partner
        "’": "'",
    }
)


def canonicalize_name(name_raw: Optional[str]) -> Optional[str]:
    """Return a punctuation- and whitespace-normalized form of ``name_raw``.

    Hyphen/dash variants become ``-``, curly double quotes become ``"``, curly
    single quotes (left and right) become ``'``, and whitespace runs collapse
    to a single space.

    Args:
        name_raw: The raw name, or ``None`` / empty string.

    Returns:
        The canonical string, or ``None`` when the input is empty or reduces
        to an empty string.
    """
    if not name_raw:
        return None
    s = name_raw.translate(_NAME_PUNCT_TABLE)
    s = re.sub(r"\s+", " ", s).strip()
    return s if s else None


def parse_pylist_str(x: Any) -> List[Any]:
    """Turn a cell holding a Python list literal into an actual list.

    Args:
        x: A list (returned unchanged), a missing value (``None`` or float
            NaN), or any value whose ``str()`` form may be a list literal.

    Returns:
        The parsed list. An empty list is returned for missing values, for
        text that cannot be evaluated as a literal, and for literals that are
        not lists.
    """
    if isinstance(x, list):
        return x

    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    if is_missing:
        return []

    text = str(x).strip()
    try:
        parsed = literal_eval(text)
    except Exception:
        # Best effort: anything that is not a valid literal counts as "no list".
        return []

    if isinstance(parsed, list):
        return parsed
    return []


# ---------------------------
# 2) Temporalization helpers
# ---------------------------
def decade_token_to_int(dec: str) -> Optional[int]:
    """Return the four-digit number of a decade token such as ``"1950s"``.

    Args:
        dec: Token of the form ``<4 digits>s``; surrounding whitespace is allowed.

    Returns:
        The digits as an ``int`` (``"1950s"`` -> ``1950``), or ``None`` when
        the token is empty or does not have that form.
    """
    if dec:
        hit = re.match(r"^\s*(\d{4})s\s*$", dec)
        if hit:
            return int(hit.group(1))
    return None


def normalize_decade_tokens(tokens: List[str]) -> Tuple[List[str], Optional[str]]:
    """Normalize raw decade tokens into a sorted, de-duplicated decade list.

    Accepted forms, after dash variants are folded to ``-`` and any ``?`` is
    removed:

    * a single decade, ``"1950s"``;
    * an inclusive range, ``"1950s-1970s"``, optionally with whitespace around
      the dash (``"1950s - 1970s"``), expanded in 10-year steps.

    Args:
        tokens: Raw tokens. Falsy and whitespace-only entries are skipped.

    Returns:
        ``(decades, audit)``. ``decades`` is the sorted unique list of
        ``"<year>s"`` strings. ``audit`` is a ``"; "``-joined note, or ``None``
        when nothing was flagged, with one entry per problem:
        ``uncertain_decade:<tok>`` (token contained ``?``),
        ``bad_decade_range:<tok>`` (range start after range end) and
        ``unparsed_decade:<tok>`` (unrecognized form).
    """
    clean: List[str] = []
    audit: List[str] = []

    for raw in tokens:
        if not raw:
            continue
        t = str(raw).strip()
        if not t:
            # Whitespace-only cell: nothing to parse and nothing worth auditing.
            continue
        t = t.replace("‐", "-").replace("–", "-").replace("—", "-")

        if "?" in t:
            # Keep the decade but record that the source marked it uncertain.
            audit.append(f"uncertain_decade:{t}")
            t = t.replace("?", "").strip()

        if re.match(r"^\d{4}s$", t):
            clean.append(t)
            continue

        # Spreadsheet ranges are often typed with spaces around the dash.
        span = re.match(r"^(\d{4}s)\s*-\s*(\d{4}s)$", t)
        if span is None:
            audit.append(f"unparsed_decade:{t}")
            continue

        ai = decade_token_to_int(span.group(1))
        bi = decade_token_to_int(span.group(2))
        if ai is None or bi is None or ai > bi:
            audit.append(f"bad_decade_range:{t}")
        else:
            clean.extend(f"{y}s" for y in range(ai, bi + 1, 10))

    # Chronological order; tokens that fail to convert sort last.
    clean = sorted(set(clean), key=lambda d: decade_token_to_int(d) or 99999)
    return clean, ("; ".join(audit) if audit else None)


def approx_start_end_from_decades(decades: List[str]) -> Tuple[Optional[int], Optional[int]]:
    """Approximate a year span from a list of decade tokens.

    Args:
        decades: Decade tokens such as ``"1950s"``; tokens that
            ``decade_token_to_int`` cannot convert are ignored.

    Returns:
        ``(first decade, last decade + 9)``, e.g. ``["1950s", "1970s"]`` ->
        ``(1950, 1979)``; ``(None, None)`` when no usable decade is present.
    """
    decade_starts = [
        value
        for value in (decade_token_to_int(tok) for tok in (decades or []))
        if value is not None
    ]
    if not decade_starts:
        return None, None
    return min(decade_starts), max(decade_starts) + 9


def decades_from_free_text(text: Optional[str]) -> Tuple[List[str], Optional[int], Optional[int], Optional[str]]:
    """Extract decades and explicit years from a free-text activity note.

    Args:
        text: Free text such as ``"1969-1975"`` or ``"1950s-present"``;
            ``None`` or empty yields an all-empty result.

    Returns:
        ``(decades_norm, start_year, end_year, audit)``:

        * ``decades_norm`` - normalized decade tokens found in the text, or,
          when none are written out, the decades of the explicit years;
        * ``start_year`` / ``end_year`` - earliest / latest *explicit* year
          (1800-2099), ``None`` if the text has none. Decade tokens are not
          explicit years: ``"1950s"`` gives no year, leaving it to the caller
          to derive bounds from ``decades_norm``. ``end_year`` is also
          ``None`` when the text contains today/present/current/ongoing;
        * ``audit`` - ``"; "``-joined notes (decade parsing notes and
          ``relative_time_token``), or ``None``.
    """
    if not text:
        return [], None, None, None

    s = text.replace("‐", "-").replace("–", "-").replace("—", "-")
    audit: List[str] = []

    # The trailing `(?!s\b)` stops "1950s" from being read as the exact year
    # 1950, which would otherwise pin start/end to the first year of a decade.
    years = [
        int(y)
        for y in re.findall(r"(?<!\d)(18\d{2}|19\d{2}|20\d{2})(?!\d)(?!s\b)", s)
    ]
    start_year = min(years) if years else None
    end_year = max(years) if years else None

    dec_tokens = re.findall(r"(?<!\d)(18\d0s|19\d0s|20\d0s)(?!\d)", s)
    decades_norm, dec_audit = normalize_decade_tokens(dec_tokens)
    if dec_audit:
        audit.append(dec_audit)

    # No decade written out: fall back to the decades the explicit years fall in.
    if not decades_norm and years:
        decades_norm = sorted(set(f"{(y // 10) * 10}s" for y in years), key=decade_token_to_int)

    # A relative word means the latest explicit year is not an end date.
    if re.search(r"\b(today|present|current|ongoing)\b", s, flags=re.I):
        end_year = None
        audit.append("relative_time_token")

    return decades_norm, start_year, end_year, ("; ".join(audit) if audit else None)


def to_json_list(x: List[str]) -> str:
    """Serialize ``x`` as a JSON array string, leaving non-ASCII text unescaped."""
    encoded = json.dumps(x, ensure_ascii=False)
    return encoded


def enforce_schema_types(out: pd.DataFrame) -> pd.DataFrame:
    """Cast the coordinate and year columns to pandas nullable numeric dtypes.

    ``lat`` / ``lon`` become ``Float64`` and ``start_year`` / ``end_year``
    become ``Int64``; values that cannot be read as numbers become missing.

    Args:
        out: Frame containing the four columns. It is modified in place.

    Returns:
        The same frame object, for call chaining.
    """
    nullable_dtypes = {
        "lat": "Float64",
        "lon": "Float64",
        "start_year": "Int64",
        "end_year": "Int64",
    }
    for column, dtype in nullable_dtypes.items():
        numeric = pd.to_numeric(out[column], errors="coerce")
        out[column] = numeric.astype(dtype)
    return out


# ---------------------------
# 3) Source adapters → harmonized schema
# ---------------------------
# Column names and order of the harmonized table. Every harmonize_* adapter
# reindexes its output to this list, and the run step compares the columns of
# the concatenated frame against it.
HARMONIZED_COLS = [
    # --- identity ---
    "entry_id",  # "<source_dataset>:<source_record_id>"
    "source_dataset",  # "S1" / "S2" / "S3"
    "source_record_id",
    # --- naming / typing ---
    "name_raw",
    "name_canonical",
    "site_type_tags",  # JSON list string
    # --- provenance ---
    "evidence_mode",
    "primary_source",
    "corroboration_status",
    "corroborating_sources",  # JSON list string
    # --- time ---
    "active_raw",
    "start_year",  # cast to Int64 by enforce_schema_types
    "end_year",  # cast to Int64 by enforce_schema_types
    "decades_norm",  # JSON list string of "<year>s" tokens
    "time_basis",
    # --- space ---
    "address_raw",
    "lat",  # cast to Float64 by enforce_schema_types
    "lon",  # cast to Float64 by enforce_schema_types
    "space_basis",
    "spatial_precision",
    # --- processing notes ---
    "audit_note",
]

def harmonize_s1(df: pd.DataFrame) -> pd.DataFrame:
    """Map the S1 table onto the harmonized schema.

    Time is kept at decade level only: ``decades_norm`` comes from the
    ``decades`` column, while ``start_year`` / ``end_year`` are left empty so
    that no continuous occupation is implied.

    Args:
        df: Raw S1 table with columns ``source_id``, ``name``,
            ``address_label``, ``latitude``, ``longitude`` and ``decades``.

    Returns:
        A frame with exactly ``HARMONIZED_COLS``, passed through
        ``enforce_schema_types``.
    """
    n_rows = len(df)
    empty_json = to_json_list([])
    out = pd.DataFrame(index=df.index)

    # Identity
    record_ids = df["source_id"].astype(str)
    out["source_dataset"] = "S1"
    out["source_record_id"] = record_ids
    out["entry_id"] = "S1:" + record_ids

    # Names and address
    raw_names = df["name"].map(norm_ws)
    out["name_raw"] = raw_names
    out["name_canonical"] = raw_names.map(canonicalize_name)
    out["address_raw"] = df["address_label"].map(norm_ws)

    # Spatial: a row counts as a point only when both coordinates parse
    out["lat"] = pd.to_numeric(df["latitude"], errors="coerce")
    out["lon"] = pd.to_numeric(df["longitude"], errors="coerce")
    has_coords = out["lat"].notna() & out["lon"].notna()
    out["space_basis"] = np.where(has_coords, "source_coordinates", "address_only")
    out["spatial_precision"] = np.where(has_coords, "point", None)

    # Temporal (decade-level only; NO year derivation)
    parsed = [
        normalize_decade_tokens([str(tok) for tok in tokens])
        for tokens in df["decades"].map(parse_pylist_str)
    ]
    decade_lists = [decs for decs, _ in parsed]
    out["decades_norm"] = [to_json_list(decs) for decs in decade_lists]
    out["audit_note"] = [note for _, note in parsed]

    # Year fields stay empty for S1 (avoid implying continuous occupation)
    for year_col in ("start_year", "end_year"):
        out[year_col] = pd.Series([pd.NA] * n_rows, dtype="Int64")

    # Basis label is set only for rows that actually carry decades
    out["time_basis"] = np.where(
        np.array([len(decs) > 0 for decs in decade_lists], dtype=bool),
        "S1.decades_significance",
        None,
    )

    # Constant fields for this source
    out["site_type_tags"] = [empty_json] * n_rows
    out["evidence_mode"] = "institutional_inventory"
    out["primary_source"] = None
    out["active_raw"] = None
    out["corroboration_status"] = "single_source"
    out["corroborating_sources"] = empty_json

    return enforce_schema_types(out.reindex(columns=HARMONIZED_COLS))



def harmonize_s2(df: pd.DataFrame) -> pd.DataFrame:
    """Map the S2 table onto the harmonized schema.

    Record ids are zero-padded 1-based row positions. Years come from the
    explicit years in the free-text ``Active`` field; a missing bound is
    backfilled from the normalized decades (first decade / last decade + 9).
    When the text carries a relative-time word (audit token
    ``relative_time_token``), ``end_year`` is left empty rather than
    backfilled, so an entry described as ongoing is not given a closing year.

    Args:
        df: Raw S2 table. ``Name``, ``Location`` and ``Active`` are read
            directly; ``Latitude`` / ``Longitude`` are read via ``df.get``.

    Returns:
        A frame with exactly ``HARMONIZED_COLS``, passed through
        ``enforce_schema_types``.
    """
    df = df.reset_index(drop=True)

    out = pd.DataFrame(index=df.index)
    out["source_dataset"] = "S2"

    # Positional ids, padded so that they sort lexicographically.
    width = len(str(len(df)))
    out["source_record_id"] = [f"{i+1:0{width}d}" for i in range(len(df))]
    out["entry_id"] = [f"S2:{rid}" for rid in out["source_record_id"]]

    out["name_raw"] = df["Name"].map(norm_ws)
    out["name_canonical"] = out["name_raw"].map(canonicalize_name)
    out["address_raw"] = df["Location"].map(norm_ws)
    out["active_raw"] = df["Active"].map(norm_ws)

    out["lat"] = pd.to_numeric(df.get("Latitude"), errors="coerce")
    out["lon"] = pd.to_numeric(df.get("Longitude"), errors="coerce")
    out["space_basis"] = np.where(out["lat"].notna() & out["lon"].notna(), "source_coordinates", "address_only")
    out["spatial_precision"] = np.where(out["space_basis"].eq("source_coordinates"), "point", None)

    dec_norm: List[List[str]] = []
    start_y: List[Optional[int]] = []
    end_y: List[Optional[int]] = []
    audit: List[Optional[str]] = []

    for t in out["active_raw"].tolist():
        d, sy, ey, a = decades_from_free_text(t)
        approx_start, approx_end = approx_start_end_from_decades(d)

        if sy is None:
            sy = approx_start
        # The parser clears end_year for "present"/"ongoing"-style text;
        # refilling it from the last decade would invent a closing year.
        open_ended = a is not None and "relative_time_token" in a
        if ey is None and not open_ended:
            ey = approx_end

        dec_norm.append(d)
        start_y.append(sy)
        end_y.append(ey)
        audit.append(a)

    out["decades_norm"] = [to_json_list(x) for x in dec_norm]
    out["start_year"] = start_y
    out["end_year"] = end_y
    out["time_basis"] = np.where(out["active_raw"].notna(), "S2.active_raw", None)
    out["audit_note"] = audit

    out["site_type_tags"] = [to_json_list([]) for _ in range(len(out))]
    out["evidence_mode"] = "community_directory"
    out["primary_source"] = None
    out["corroboration_status"] = "single_source"
    out["corroborating_sources"] = to_json_list([])

    out = out.reindex(columns=HARMONIZED_COLS)
    return enforce_schema_types(out)


def harmonize_s3(df: pd.DataFrame) -> pd.DataFrame:
    """Map the S3 table onto the harmonized schema.

    Record ids are zero-padded 1-based row positions. ``start_year`` /
    ``end_year`` use the numeric ``Open`` / ``Closed`` values when they parse
    and otherwise fall back to the bounds of the normalized ``Decade(s)``
    (first decade / last decade + 9). ``time_basis`` reflects what was
    actually usable: ``"S3.open_closed"`` when at least one of Open/Closed
    parsed as a number, ``"S3.decades"`` when only decades are available, and
    ``None`` when the row has no temporal information at all.

    Args:
        df: Raw S3 table with columns ``Bar Name``, ``Address``, ``Category``,
            ``Open``, ``Closed`` and ``Decade(s)``; ``Primary Source`` is
            optional.

    Returns:
        A frame with exactly ``HARMONIZED_COLS``, passed through
        ``enforce_schema_types``.
    """
    df = df.reset_index(drop=True)

    out = pd.DataFrame(index=df.index)
    out["source_dataset"] = "S3"

    # Positional ids, padded so that they sort lexicographically.
    width = len(str(len(df)))
    out["source_record_id"] = [f"{i+1:0{width}d}" for i in range(len(df))]
    out["entry_id"] = [f"S3:{rid}" for rid in out["source_record_id"]]

    out["name_raw"] = df["Bar Name"].map(norm_ws)
    out["name_canonical"] = out["name_raw"].map(canonicalize_name)
    out["address_raw"] = df["Address"].map(norm_ws)

    def map_tags(cat: Any) -> List[str]:
        """Derive site-type tags from keywords in the free-text category."""
        c = (norm_ws(cat) or "").lower()
        tags: List[str] = []
        if "bar" in c or "club" in c:
            tags.append("bar_club")
        if "after hours" in c:
            tags.append("after_hours")
        if "party" in c or "night" in c or "disco" in c:
            tags.append("nightlife_event")
        return sorted(set(tags))

    out["site_type_tags"] = df["Category"].map(map_tags).map(to_json_list)

    # Non-numeric Open/Closed cells become NaN and are backfilled from decades.
    open_y = pd.to_numeric(df["Open"], errors="coerce")
    closed_y = pd.to_numeric(df["Closed"], errors="coerce")

    dec_field = df["Decade(s)"].map(norm_ws)
    dec_norm: List[List[str]] = []
    audit: List[Optional[str]] = []

    for x in dec_field.tolist():
        if not x:
            dec_norm.append([])
            audit.append(None)
            continue
        toks = re.split(r"[;,]| and | & ", x)
        toks = [t.strip() for t in toks if t and t.strip()]
        clean, a = normalize_decade_tokens(toks)
        dec_norm.append(clean)
        audit.append(a)

    out["decades_norm"] = [to_json_list(x) for x in dec_norm]

    approx = [approx_start_end_from_decades(x) for x in dec_norm]
    out["start_year"] = [
        int(sy) if pd.notna(sy) else ap[0]
        for sy, ap in zip(open_y.tolist(), approx)
    ]
    out["end_year"] = [
        int(ey) if pd.notna(ey) else ap[1]
        for ey, ap in zip(closed_y.tolist(), approx)
    ]

    # Label the basis from the parsed values, not from raw non-null cells:
    # text such as "c. 1970" in Open is non-null but contributes no year.
    has_years = (open_y.notna() | closed_y.notna()).to_numpy()
    has_decades = np.array([bool(d) for d in dec_norm], dtype=bool)
    out["time_basis"] = np.where(
        has_years,
        "S3.open_closed",
        np.where(has_decades, "S3.decades", None),
    )
    out["audit_note"] = audit

    out["lat"] = pd.Series([pd.NA] * len(out), dtype="Float64")
    out["lon"] = pd.Series([pd.NA] * len(out), dtype="Float64")
    out["space_basis"] = "address_only"
    out["spatial_precision"] = None

    out["evidence_mode"] = "scholarly_reconstruction"
    out["primary_source"] = df.get("Primary Source", pd.Series([None] * len(df))).map(norm_ws)
    out["corroboration_status"] = "single_source"
    out["corroborating_sources"] = to_json_list([])
    out["active_raw"] = None

    out = out.reindex(columns=HARMONIZED_COLS)
    return enforce_schema_types(out)


# ---------------------------
# 4) Run harmonization (no cross-source merging here)
# ---------------------------
# Load the three raw tables.
s1 = pd.read_csv(S1_PATH)
s2 = pd.read_csv(S2_PATH)
s3 = pd.read_excel(S3_PATH)

# Harmonize each source independently, then stack the results.
h1 = harmonize_s1(s1)
h2 = harmonize_s2(s2)
h3 = harmonize_s3(s3)

harm = pd.concat([h1, h2, h3], ignore_index=True)

# Explicit checks instead of `assert`, which is skipped under `python -O`;
# the error also names the colliding ids so the offending rows can be found.
duplicate_ids = harm.loc[harm["entry_id"].duplicated(keep=False), "entry_id"].unique().tolist()
if duplicate_ids:
    raise ValueError(
        f"entry_id must be unique at this stage; duplicated (first 10): {duplicate_ids[:10]}"
    )
if list(harm.columns) != HARMONIZED_COLS:
    raise ValueError("harmonized schema mismatch")

# utf-8-sig writes a BOM at the start of the file.
harm.to_csv(OUT_HARMONIZED, index=False, encoding="utf-8-sig")
print(f"Wrote: {OUT_HARMONIZED} (rows={len(harm)})")


# Output: Wrote: harmonized_core.csv (rows=542)
