In [None]:
# ============================================================
# ONE CELL (PAPER-READY, NO-TRANSFORMERS, WITH UPLOAD):
# Uses your uploaded files:
#   - asrs_narratives_clean(.csv/.xlsx)
#   - asrs_paper_dataset(.csv/.xlsx)          [full]
#   - asrs_paper_dataset_sample(.csv/.xlsx)   [sample]
#
# Runs: Rule baseline + TF-IDF(LogReg) baseline
# Outputs: metrics + figures + debug CSV + zip + auto-download
# ============================================================

import os, re, json, random, zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------- Install deps (safe, minimal) ----------
import sys, subprocess
def pip_install(pkgs):
    """Install ``pkgs`` with pip into the interpreter that runs this notebook.

    Args:
        pkgs: Requirement strings, e.g. ``"numpy==1.26.4"``.

    Returns:
        pip's exit code (``0`` on success). An empty ``pkgs`` is a no-op
        that returns ``0`` without starting pip.
    """
    pkgs = list(pkgs)
    if not pkgs:
        # pip exits with an error when given no requirement at all.
        return 0
    proc = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "--no-cache-dir", *pkgs],
        check=False,
    )
    if proc.returncode != 0:
        # Keep the best-effort behaviour (no exception) but make the failure visible.
        print(f"[WARN] pip install exited with code {proc.returncode} for: {' '.join(pkgs)}")
    return proc.returncode

pip_install([
    "numpy==1.26.4",
    "pandas==2.2.2",
    "scikit-learn==1.4.2",
    "matplotlib",
    "tqdm",
    "openpyxl",
])

# numpy and pandas were already imported at the top of this cell, so the
# imports below return the cached modules: a version that pip just changed on
# disk is NOT loaded until the runtime restarts. Running the old in-memory
# build against new files on disk leads to confusing failures such as
# "No module named 'numpy.strings'", so detect the mismatch and stop early.
from importlib.metadata import PackageNotFoundError, version as _installed_version

for _module, _dist_name in ((np, "numpy"), (pd, "pandas")):
    try:
        _on_disk = _installed_version(_dist_name)
    except PackageNotFoundError:
        continue  # no installed metadata to compare against
    if _on_disk != _module.__version__:
        raise RuntimeError(
            f"{_dist_name} {_module.__version__} is loaded but {_on_disk} is now installed. "
            "Restart the runtime (Runtime -> Restart runtime) and run this cell again."
        )

# These imports bind names only; already-loaded modules are not reloaded.
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print("ENV OK:",
      "numpy", np.__version__,
      "pandas", pd.__version__)

# ---------- Upload ----------
# Colab-only module; `files` is also used at the end of the cell to download artifacts.
from google.colab import files
print("\n[UPLOAD] Please upload these 3 files now:")
print(" - asrs_narratives_clean.csv (or .xlsx)")
print(" - asrs_paper_dataset.csv (or .xlsx)")
print(" - asrs_paper_dataset_sample.csv (or .xlsx)\n")
# `uploaded` is a mapping; its keys are treated as file names/paths further down.
uploaded = files.upload()
# Stop immediately when nothing was uploaded.
assert len(uploaded) > 0, "No files uploaded."

# ---------- Helper: load csv/xlsx ----------
def load_table(path):
    """Read a CSV or Excel file into a DataFrame, chosen by file extension.

    The extension check is case-insensitive.

    Raises:
        ValueError: if the path does not end in .csv, .xlsx or .xls.
    """
    lowered = path.lower()
    if lowered.endswith((".xlsx", ".xls")):
        return pd.read_excel(path)
    if not lowered.endswith(".csv"):
        raise ValueError(f"Unsupported file type: {path}")
    return pd.read_csv(path, low_memory=False)

# ---------- Find files by fuzzy name ----------
def pick_file(keys, contains_any):
    """Return the first entry of ``keys`` whose lower-cased form contains a given substring.

    The substrings in ``contains_any`` are compared as given against the
    lower-cased names, so they should be lower-case themselves.
    Returns None when nothing matches.
    """
    for name in list(keys):
        lowered = name.lower()
        for needle in contains_any:
            if needle in lowered:
                return name
    return None

# Names of the uploaded files; detection below works on these keys only.
keys = list(uploaded.keys())

f_narr = pick_file(keys, ["narratives_clean", "narrative_clean"])
f_samp = pick_file(keys, ["paper_dataset_sample", "dataset_sample", "sample"])

# "asrs_paper_dataset" is also a substring of "asrs_paper_dataset_sample...", so
# searching every upload could report the sample file as the full dataset.
# Look for the full dataset only among names that do not contain "sample".
non_sample_keys = [k for k in keys if "sample" not in k.lower()]
f_full = pick_file(non_sample_keys, ["paper_dataset.csv", "paper_dataset.xlsx", "asrs_paper_dataset"])

assert f_samp is not None or f_full is not None, "I can't find sample/full dataset file among uploads."

print("\nDetected files:")
print(" - narratives:", f_narr)
print(" - full:", f_full)
print(" - sample:", f_samp)

# ---------- Choose dataset to run (sample first for speed) ----------
USE_FULL = False   # <-- set True if you want to run full dataset (will take longer)

if USE_FULL and f_full is not None:
    DATA_FILE = f_full
elif f_samp is not None:
    if USE_FULL:
        print("[WARN] USE_FULL=True but no full dataset was uploaded; using the sample instead.")
    DATA_FILE = f_samp
else:
    # Only the full dataset was uploaded: run on it instead of failing.
    DATA_FILE = f_full

df = load_table(DATA_FILE)
print(f"\nUsing dataset: {DATA_FILE} | shape={df.shape}")

# ---------- Validate required columns ----------
# Required columns are matched case-insensitively and renamed to these exact names.
need_cols = {"incident_id", "narrative", "labels"}
colmap = {c.lower(): c for c in df.columns}
missing = need_cols - set(colmap)
assert not missing, f"Dataset missing columns: {missing}. Columns found: {list(df.columns)}"
df.rename(columns={colmap[req]: req for req in need_cols}, inplace=True)

# Missing narratives become empty text; a bare astype(str) would turn NaN into
# the literal string "nan", which then reaches the TF-IDF vocabulary.
df["narrative"] = df["narrative"].fillna("").astype(str)

# ---------- Config ----------
TOPK = 50          # number of most frequent labels kept as the label space
SEED = 42          # seed for random/numpy and for the train/test split
TEST_SIZE = 0.2    # fraction of eval rows held out for the TF-IDF baseline
random.seed(SEED)
np.random.seed(SEED)

# ---------- Parse & clean labels ----------
def parse_labels(s):
    """Split a ';'-separated label string into stripped, non-empty labels.

    Anything that is not a string, or is blank, yields an empty list.
    """
    if isinstance(s, str) and s.strip():
        pieces = (piece.strip() for piece in s.split(";"))
        return [piece for piece in pieces if piece]
    return []

def clean_label(l):
    """Normalise whitespace in a label; return "" for empty or placeholder values.

    Non-breaking spaces (raw or as the HTML entity, with or without the
    trailing ';') count as spaces. "none", "nan" and "null" in any letter
    case are treated as no label.
    """
    text = str(l)
    for junk in ("\xa0", "&nbsp;", "&nbsp"):
        text = text.replace(junk, " ")
    text = re.sub(r"\s+", " ", text).strip()
    is_placeholder = text.lower() in {"none", "nan", "null"}
    return "" if (text == "" or is_placeholder) else text

def _clean_gold(raw):
    """Parse one raw `labels` cell and keep only the non-empty cleaned labels."""
    cleaned = [clean_label(piece) for piece in parse_labels(raw)]
    return [lab for lab in cleaned if lab]


df["gold"] = df["labels"].apply(_clean_gold)

# Build TopK after cleaning: count every cleaned label over the whole dataset.
all_gold = []
for labs in df["gold"]:
    all_gold.extend(labs)
freq = pd.Series(all_gold).value_counts()
label_space = freq.index[:TOPK].tolist()
label_set = set(label_space)

# Keep only TopK labels per row and drop rows left without any label.
df["gold_topk"] = df["gold"].apply(lambda labs: [lab for lab in labs if lab in label_set])
row_has_topk = df["gold_topk"].map(len) > 0
df_eval = df[row_has_topk].copy()

print(f"\nEval rows after TopK filter: {len(df_eval)} / {len(df)} | TOPK={TOPK}")
print("Top 10 labels:", label_space[:10])

# ---------- Metrics ----------
# The binarizer's column order is fixed to label_space.
mlb = MultiLabelBinarizer(classes=label_space)
Y_gold = df_eval["gold_topk"].tolist()

def compute_metrics(true_list, pred_list):
    """Score predicted label lists against gold label lists.

    Both arguments are sequences of label lists. They are binarised with the
    module-level ``mlb`` and compared with scikit-learn's averaged metrics.

    Returns:
        dict of floats under "micro_f1", "macro_f1", "micro_precision"
        and "micro_recall".
    """
    y_true = mlb.fit_transform(true_list)
    y_hat = mlb.transform(pred_list)
    scorers = (
        ("micro_f1", f1_score, "micro"),
        ("macro_f1", f1_score, "macro"),
        ("micro_precision", precision_score, "micro"),
        ("micro_recall", recall_score, "micro"),
    )
    return {
        key: float(scorer(y_true, y_hat, average=avg, zero_division=0))
        for key, scorer, avg in scorers
    }

# One dict per evaluated method; collected into the metrics table.
rows = []

# ---------- Baseline 1: Rule substring ----------
def normalize(s):
    """Lower-case ``str(s)``, collapse whitespace runs to one space and trim both ends."""
    lowered = str(s).lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    return collapsed.strip()

# Search key per label: the normalised text after the first ":" (the whole label if it has none).
label_value = dict((lab, normalize(lab.split(":", 1)[-1])) for lab in label_space)


def rule_pred(text):
    """Predict every label whose search key occurs as a substring of the normalised text.

    Keys shorter than 4 characters are never matched. The result is sorted.
    """
    haystack = normalize(text)
    hits = {
        lab
        for lab in label_space
        if len(label_value[lab]) >= 4 and label_value[lab] in haystack
    }
    return sorted(hits)


# The rule baseline is scored on every eval row.
df_eval["pred_rule"] = df_eval["narrative"].apply(rule_pred)
m_rule = compute_metrics(Y_gold, df_eval["pred_rule"].tolist())
rule_row = {"method": "Rule baseline (substring)"}
rule_row.update(m_rule)
rows.append(rule_row)

# ---------- Baseline 2: TF-IDF + OvR Logistic Regression ----------
X = df_eval["narrative"].values
Ybin = mlb.fit_transform(Y_gold)

# Positional row indices are split with the data so test rows can be mapped back to df_eval.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, Ybin, np.arange(len(df_eval)), test_size=TEST_SIZE, random_state=SEED
)

# The vocabulary is fitted on the training split only.
vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=80000)
Xtr = vec.fit_transform(X_train)
Xte = vec.transform(X_test)

# One probability column per label; columns of skipped labels stay at 0.0.
probs = np.zeros((Xte.shape[0], len(label_space)), dtype=float)

print("\nTraining TF-IDF + LogReg (OvR over TopK labels)...")
n_train = y_train.shape[0]
for j in range(len(label_space)):
    yj = y_train[:, j]
    n_pos = int(yj.sum())
    if n_pos < 5:
        # Too few positives to train on; the label is never predicted.
        continue
    if n_pos == n_train:
        # Every training row carries this label. LogisticRegression cannot be
        # fitted on a single class, so predict the label everywhere instead.
        probs[:, j] = 1.0
        continue
    clf = LogisticRegression(max_iter=2000, class_weight="balanced")
    clf.fit(Xtr, yj)
    probs[:, j] = clf.predict_proba(Xte)[:, 1]

y_pred = (probs >= 0.5).astype(int)
pred_tfidf = [[label_space[j] for j in np.where(row == 1)[0]] for row in y_pred]

# NOTE(review): this row is scored on the test split only, while the rule
# baseline row was scored on all eval rows, so the two are not directly comparable.
gold_test = [Y_gold[i] for i in idx_test]
m_tfidf = compute_metrics(gold_test, pred_tfidf)
rows.append({"method": "TF-IDF + LogReg (test split)", **m_tfidf})

# ---------- Save metrics ----------
metrics_df = pd.DataFrame(rows)
print("\n=== PAPER METRICS ===")
print(metrics_df)
metrics_df.to_csv("paper_results_metrics.csv", index=False)

# ---------- Figures (Top-20 support + TF-IDF recall on test split) ----------
yt = mlb.fit_transform(gold_test)
yp = mlb.transform(pred_tfidf)

# Per-label gold count on the test split.
support = np.asarray(yt.sum(axis=0)).ravel()
# The binarizer was built without sparse output, so yt/yp are dense NumPy
# arrays and have no .multiply() method. The element-wise product of the two
# 0/1 matrices marks the true positives.
tp = np.asarray(np.multiply(yt, yp).sum(axis=0)).ravel()
# Recall per label; labels without gold support keep recall 0 instead of dividing by zero.
rec = np.divide(tp, support, out=np.zeros_like(tp, dtype=float), where=support > 0)

# The 20 labels with the largest test-split support, most frequent first.
top_idx = np.argsort(-support)[:20]
top_labels = [label_space[i] for i in top_idx]

# NOTE(review): tick labels use the text BEFORE the first ":" while the rule
# baseline matches on the text after it; confirm this is the intended part.
plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), support[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Support (count in gold, test split)")
plt.title("Top-20 Label Support (ASRS TopK)")
plt.tight_layout()
plt.savefig("figure_top_labels_support.png", dpi=300)
plt.show()

plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), rec[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Recall (TF-IDF)")
plt.ylim(0, 1)
plt.title("TF-IDF Recall on Top-20 Labels (test split)")
plt.tight_layout()
plt.savefig("figure_top_labels_tfidf_recall.png", dpi=300)
plt.show()

# ---------- Debug predictions (paper error analysis) ----------
def _joined(labels):
    """Render a label list as one '; '-separated string."""
    return "; ".join(labels)


debug = df_eval[["incident_id", "narrative", "gold_topk"]].copy()
debug["gold_topk"] = debug["gold_topk"].apply(_joined)

# Rule predictions exist for every eval row.
debug["pred_rule"] = df_eval["pred_rule"].apply(_joined)

# TF-IDF predictions exist only for test-split rows; all other rows stay blank.
# idx_test holds row positions within df_eval, in the same order as pred_tfidf.
pred_tfidf_full = [""] * len(df_eval)
for position, labels in zip(idx_test, pred_tfidf):
    pred_tfidf_full[position] = _joined(labels)
debug["pred_tfidf_testonly"] = pred_tfidf_full

debug.to_csv("paper_predictions_debug.csv", index=False)

# ---------- Zip all artifacts ----------
# Files this cell is expected to have written to the working directory.
artifacts = [
    "paper_results_metrics.csv",
    "paper_predictions_debug.csv",
    "figure_top_labels_support.png",
    "figure_top_labels_tfidf_recall.png",
]
zip_name = "paper_artifacts.zip"
# Pack whichever artifacts exist; missing ones are skipped silently.
with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as z:
    for f in artifacts:
        if os.path.exists(f):
            z.write(f)

print("\nSaved artifacts:")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        print(" -", f)

# ---------- Auto-download ----------
# Each existing artifact is sent individually, followed by the zip itself.
print("\nDownloading artifacts...")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        files.download(f)

print("\nALL DONE.")


ModuleNotFoundError: No module named 'numpy.strings'

In [None]:
# ============================================================
# ONE CELL (SELF-HEAL + UPLOAD + PAPER RESULTS + DOWNLOAD)
# - Fixes broken numpy ("numpy.char"/"numpy.strings" missing) by hard reset
# - After reconnect, re-run the SAME cell to execute experiment
# - No transformers (robust): Rule baseline + TF-IDF/LogReg baseline
# - Outputs + auto-download: metrics, figures, debug csv, zip
# ============================================================

import os, sys, subprocess, textwrap, zipfile, re, random, signal, time

# Marker file: while it does not exist the cell runs the Phase 1 repair first.
PHASE_FLAG = "/content/.PHASE2_OK"


def sh(cmd):
    """Run ``cmd`` through the shell and return its combined stdout/stderr as text.

    A non-zero exit status does not raise.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        check=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    return completed.stdout

# -------------------------
# PHASE 1: Fix environment + hard restart
# -------------------------
# Phase 1 runs only while the marker file is absent. It reinstalls a pinned
# stack and then kills the kernel so that the next run starts from clean imports.
if not os.path.exists(PHASE_FLAG):
    print("PHASE 1/2: Repairing broken numpy environment, then restarting runtime...\n")

    # NOTE: despite the message, this step only LISTS the numpy* entries found
    # in site-packages; nothing is deleted here (the pip uninstall below does that).
    print("Removing suspicious numpy folders...")
    print(sh("python - <<'PY'\n"
             "import site, glob, os\n"
             "paths=[]\n"
             "for p in site.getsitepackages():\n"
             "  paths += glob.glob(os.path.join(p,'numpy*'))\n"
             "for p in paths:\n"
             "  print(' -', p)\n"
             "PY"))

    # Force uninstall and reinstall clean stack with no cache
    print("Uninstalling potentially broken packages...")
    print(sh("pip -q uninstall -y numpy pandas scikit-learn scipy || true"))

    # Reinstall pinned versions (Colab-safe). scipy included to avoid sklearn issues.
    print("Installing clean pinned stack...")
    print(sh("pip -q install --no-cache-dir numpy==1.26.4 pandas==2.2.2 scipy==1.11.4 scikit-learn==1.4.2 "
             "matplotlib tqdm openpyxl"))

    # Sanity check numpy core modules in a fresh interpreter (not this kernel).
    print("Sanity-checking numpy modules...")
    out = sh("python - <<'PY'\n"
             "import numpy as np\n"
             "import numpy.char\n"
             "print('numpy OK:', np.__version__)\n"
             "print('numpy file:', np.__file__)\n"
             "PY")
    print(out)

    if "numpy OK" not in out:
        # No marker is written, so the next run repeats Phase 1.
        print("\nStill broken. Forcing runtime kill now; reconnect and run this cell again.\n")
    else:
        # Mark phase 2 ready. The context manager closes (and flushes) the file
        # before the process is killed below.
        with open(PHASE_FLAG, "w") as flag_file:
            flag_file.write("ok")
        print("\nEnvironment fixed. Restarting runtime now (required)...\n")

    # Hard kill kernel to force clean reload of site-packages
    time.sleep(1)
    os.kill(os.getpid(), signal.SIGKILL)

# -------------------------
# PHASE 2: Upload + run experiment + download artifacts
# -------------------------
# Reached only when the Phase 1 block above was skipped (marker file present).
print("PHASE 2/2: Running paper-ready baselines...\n")

# Heavy imports are deferred to here, after the environment repair.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print("ENV OK:",
      "numpy", np.__version__,
      "pandas", pd.__version__)

# ---------- Upload ----------
# Colab-only module for browser upload/download.
from google.colab import files
print("\n[UPLOAD] Upload these files (csv OR xlsx):")
print(" - asrs_narratives_clean")
print(" - asrs_paper_dataset")
print(" - asrs_paper_dataset_sample\n")
# `uploaded` is a mapping; its keys are treated as file names/paths below.
uploaded = files.upload()
assert len(uploaded) > 0, "No files uploaded."

# Uploaded file names, used for dataset detection.
keys = list(uploaded.keys())

def load_table(path):
    """Load ``path`` as a DataFrame using the reader that matches its extension.

    The match is case-insensitive: .csv goes to ``pd.read_csv``, .xlsx/.xls
    go to ``pd.read_excel``.

    Raises:
        ValueError: for any other extension.
    """
    name = path.lower()
    readers = (
        ((".csv",), lambda p: pd.read_csv(p, low_memory=False)),
        ((".xlsx", ".xls"), lambda p: pd.read_excel(p)),
    )
    for suffixes, reader in readers:
        if name.endswith(suffixes):
            return reader(path)
    raise ValueError(f"Unsupported file type: {path}")

def pick(keys, patterns):
    """Return the first key whose lower-cased form contains any of ``patterns``.

    Patterns are compared as given, so they should be lower-case.
    Returns None when no key matches.
    """
    lowered = [k.lower() for k in keys]
    hit = next(
        (i for i, name in enumerate(lowered) if any(p in name for p in patterns)),
        None,
    )
    return None if hit is None else keys[hit]

f_samp = pick(keys, ["paper_dataset_sample", "dataset_sample", "sample"])
# "asrs_paper_dataset" is also a substring of "asrs_paper_dataset_sample...", so
# the full dataset is searched only among names that do not contain "sample".
non_sample_keys = [k for k in keys if "sample" not in k.lower()]
f_full = pick(non_sample_keys, ["paper_dataset.csv", "paper_dataset.xlsx", "asrs_paper_dataset"])
f_narr = pick(keys, ["narratives_clean", "narrative_clean"])

assert f_samp is not None or f_full is not None, "Cannot find sample/full dataset in uploads."

print("\nDetected:")
print(" - narratives:", f_narr)
print(" - full:", f_full)
print(" - sample:", f_samp)

# Choose dataset: sample first for speed
USE_FULL = False   # set True if you want full
if USE_FULL and f_full is not None:
    DATA_FILE = f_full
elif f_samp is not None:
    if USE_FULL:
        print("[WARN] USE_FULL=True but no full dataset was uploaded; using the sample instead.")
    DATA_FILE = f_samp
else:
    # Only the full dataset was uploaded: run on it instead of failing.
    DATA_FILE = f_full

df = load_table(DATA_FILE)
print(f"\nUsing dataset: {DATA_FILE} | shape={df.shape}")

# ---------- Normalize column names ----------
# Map lower-cased column names to the names actually present, so the required
# columns are found regardless of letter case.
lower = {c.lower(): c for c in df.columns}
need = ["incident_id", "narrative", "labels"]
for n in need:
    assert n in lower, f"Missing column '{n}' in dataset. Found columns: {list(df.columns)}"
df = df.rename(columns={lower["incident_id"]:"incident_id",
                        lower["narrative"]:"narrative",
                        lower["labels"]:"labels"})
# Missing narratives become empty text; a bare astype(str) would turn NaN into
# the literal string "nan", which then reaches the TF-IDF vocabulary.
df["narrative"] = df["narrative"].fillna("").astype(str)

# ---------- Config ----------
TOPK = 50          # number of most frequent labels kept as the label space
SEED = 42          # seed for random/numpy and for the train/test split
TEST_SIZE = 0.2    # fraction of eval rows held out for the TF-IDF baseline
random.seed(SEED)
np.random.seed(SEED)

# ---------- Labels parse & clean ----------
def parse_labels(s):
    """Return the non-empty, stripped parts of a ';'-separated label string.

    Returns an empty list for non-string or blank input.
    """
    if not isinstance(s, str):
        return []
    if not s.strip():
        return []
    stripped_parts = [part.strip() for part in s.split(";")]
    return [part for part in stripped_parts if part]

def clean_label(l):
    """Return the label with whitespace normalised, or "" when it carries no value.

    Non-breaking spaces (raw or as "&nbsp;"/"&nbsp") are treated as spaces.
    "none", "nan" and "null" in any letter case count as empty.
    """
    label = str(l)
    label = label.replace("\xa0", " ")
    label = label.replace("&nbsp;", " ").replace("&nbsp", " ")
    label = re.sub(r"\s+", " ", label).strip()
    if not label:
        return ""
    if label.lower() in {"none", "nan", "null"}:
        return ""
    return label

def _clean_gold(raw):
    """Parse one raw `labels` cell and keep only the non-empty cleaned labels."""
    cleaned = [clean_label(piece) for piece in parse_labels(raw)]
    return [lab for lab in cleaned if lab]


df["gold"] = df["labels"].apply(_clean_gold)

# Label space: the TOPK most frequent cleaned labels over the whole dataset.
all_gold = []
for labs in df["gold"]:
    all_gold.extend(labs)
freq = pd.Series(all_gold).value_counts()
label_space = freq.index[:TOPK].tolist()
label_set = set(label_space)

# Keep only TopK labels per row and drop rows left without any label.
df["gold_topk"] = df["gold"].apply(lambda labs: [lab for lab in labs if lab in label_set])
row_has_topk = df["gold_topk"].map(len) > 0
df_eval = df[row_has_topk].copy()

print(f"\nEval rows after TopK filter: {len(df_eval)} / {len(df)} | TOPK={TOPK}")
print("Top 10 labels:", label_space[:10])

# The binarizer's column order is fixed to label_space.
mlb = MultiLabelBinarizer(classes=label_space)
Y_gold_all = df_eval["gold_topk"].tolist()

def compute_metrics(true_list, pred_list):
    """Compute micro/macro scores for predicted versus gold label lists.

    Both inputs are sequences of label lists, binarised with the module-level ``mlb``.

    Returns:
        dict of floats keyed "micro_f1", "macro_f1", "micro_precision", "micro_recall".
    """
    gold_matrix = mlb.fit_transform(true_list)
    pred_matrix = mlb.transform(pred_list)

    def _score(scorer, average):
        return float(scorer(gold_matrix, pred_matrix, average=average, zero_division=0))

    metrics = {}
    metrics["micro_f1"] = _score(f1_score, "micro")
    metrics["macro_f1"] = _score(f1_score, "macro")
    metrics["micro_precision"] = _score(precision_score, "micro")
    metrics["micro_recall"] = _score(recall_score, "micro")
    return metrics

# One dict per evaluated method; collected into the metrics table.
rows = []

# ---------- Baseline 1: Rule substring ----------
def normalize(s):
    """Return ``str(s)`` lower-cased, with whitespace runs collapsed and the ends trimmed."""
    text = str(s).lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Search key per label: the normalised text after the first ":" (the whole label if it has none).
label_value = dict((lab, normalize(lab.split(":", 1)[-1])) for lab in label_space)


def rule_pred(text):
    """Return, sorted, the labels whose search key is a substring of the normalised text.

    Labels whose key is shorter than 4 characters are never predicted.
    """
    haystack = normalize(text)
    matched = set()
    for lab in label_space:
        needle = label_value[lab]
        if len(needle) >= 4 and needle in haystack:
            matched.add(lab)
    return sorted(matched)


# The rule baseline is scored on every eval row.
df_eval["pred_rule"] = df_eval["narrative"].apply(rule_pred)
m_rule = compute_metrics(Y_gold_all, df_eval["pred_rule"].tolist())
rule_row = {"method": "Rule baseline (substring)"}
rule_row.update(m_rule)
rows.append(rule_row)

# ---------- Baseline 2: TF-IDF + OvR Logistic Regression ----------
X = df_eval["narrative"].values
Ybin = mlb.fit_transform(Y_gold_all)

# Positional row indices are split with the data so test rows can be mapped back to df_eval.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, Ybin, np.arange(len(df_eval)), test_size=TEST_SIZE, random_state=SEED
)

# The vocabulary is fitted on the training split only.
vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=80000)
Xtr = vec.fit_transform(X_train)
Xte = vec.transform(X_test)

# One probability column per label; columns of skipped labels stay at 0.0.
probs = np.zeros((Xte.shape[0], len(label_space)), dtype=float)

print("\nTraining TF-IDF + LogReg (OvR over TopK labels)...")
n_train = y_train.shape[0]
for j in range(len(label_space)):
    yj = y_train[:, j]
    n_pos = int(yj.sum())
    if n_pos < 5:
        # Too few positives to train on; the label is never predicted.
        continue
    if n_pos == n_train:
        # Every training row carries this label. LogisticRegression cannot be
        # fitted on a single class, so predict the label everywhere instead.
        probs[:, j] = 1.0
        continue
    clf = LogisticRegression(max_iter=2000, class_weight="balanced")
    clf.fit(Xtr, yj)
    probs[:, j] = clf.predict_proba(Xte)[:, 1]

y_pred = (probs >= 0.5).astype(int)
pred_tfidf = [[label_space[j] for j in np.where(row == 1)[0]] for row in y_pred]

# NOTE(review): this row is scored on the test split only, while the rule
# baseline row was scored on all eval rows, so the two are not directly comparable.
gold_test = [Y_gold_all[i] for i in idx_test]
m_tfidf = compute_metrics(gold_test, pred_tfidf)
rows.append({"method": "TF-IDF + LogReg (test split)", **m_tfidf})

# ---------- Save metrics ----------
metrics_df = pd.DataFrame(rows)
print("\n=== PAPER METRICS ===")
print(metrics_df)
metrics_df.to_csv("paper_results_metrics.csv", index=False)

# ---------- Figures ----------
yt = mlb.fit_transform(gold_test)
yp = mlb.transform(pred_tfidf)

# Per-label gold count on the test split.
support = np.asarray(yt.sum(axis=0)).ravel()
# The binarizer was built without sparse output, so yt/yp are dense NumPy
# arrays and have no .multiply() method. The element-wise product of the two
# 0/1 matrices marks the true positives.
tp = np.asarray(np.multiply(yt, yp).sum(axis=0)).ravel()
# Recall per label; labels without gold support keep recall 0 instead of dividing by zero.
rec = np.divide(tp, support, out=np.zeros_like(tp, dtype=float), where=support > 0)

# The 20 labels with the largest test-split support, most frequent first.
top_idx = np.argsort(-support)[:20]
top_labels = [label_space[i] for i in top_idx]

# NOTE(review): tick labels use the text BEFORE the first ":" while the rule
# baseline matches on the text after it; confirm this is the intended part.
plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), support[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Support (count in gold, test split)")
plt.title("Top-20 Label Support (ASRS TopK)")
plt.tight_layout()
plt.savefig("figure_top_labels_support.png", dpi=300)
plt.show()

plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), rec[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Recall (TF-IDF)")
plt.ylim(0, 1)
plt.title("TF-IDF Recall on Top-20 Labels (test split)")
plt.tight_layout()
plt.savefig("figure_top_labels_tfidf_recall.png", dpi=300)
plt.show()

# ---------- Debug predictions ----------
def _joined(labels):
    """Render a label list as one '; '-separated string."""
    return "; ".join(labels)


debug = df_eval[["incident_id", "narrative", "gold_topk"]].copy()
debug["gold_topk"] = debug["gold_topk"].apply(_joined)
debug["pred_rule"] = df_eval["pred_rule"].apply(_joined)

# TF-IDF predictions exist only for test-split rows; all other rows stay blank.
# idx_test holds row positions within df_eval, in the same order as pred_tfidf.
pred_tfidf_full = [""] * len(df_eval)
for position, labels in zip(idx_test, pred_tfidf):
    pred_tfidf_full[position] = _joined(labels)
debug["pred_tfidf_testonly"] = pred_tfidf_full

debug.to_csv("paper_predictions_debug.csv", index=False)

# ---------- Zip ----------
# Files this cell is expected to have written to the working directory.
artifacts = [
    "paper_results_metrics.csv",
    "paper_predictions_debug.csv",
    "figure_top_labels_support.png",
    "figure_top_labels_tfidf_recall.png",
]
zip_name = "paper_artifacts.zip"
# Pack whichever artifacts exist; missing ones are skipped silently.
with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as z:
    for f in artifacts:
        if os.path.exists(f):
            z.write(f)

print("\nSaved artifacts:")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        print(" -", f)

# ---------- Auto-download ----------
# Repeats the import made before the upload step.
from google.colab import files
# Each existing artifact is sent individually, followed by the zip itself.
print("\nDownloading artifacts...")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        files.download(f)

print("\nALL DONE.")


PHASE 1/2: Repairing broken numpy environment, then restarting runtime...

Removing suspicious numpy folders...
 - /usr/local/lib/python3.12/dist-packages/numpy
 - /usr/local/lib/python3.12/dist-packages/numpy-1.26.4.dist-info
 - /usr/local/lib/python3.12/dist-packages/numpy.libs

Uninstalling potentially broken packages...

Installing clean pinned stack...
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.0/61.0 kB 67.4 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.4/60.4 kB 254.7 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.0/18.0 MB 244.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.7/12.7 MB 221.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.8/35.8 MB 144.9 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.2/12.2 MB 146.7 MB/s eta 0:00:00
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following depen

In [None]:
# ============================================================
# ONE CELL (CLEAN RESET + UPLOAD + PAPER BASELINES + DOWNLOAD)
# Purpose:
#   - Fix broken numpy installs in Colab (numpy.char / numpy.strings missing)
#   - After reconnect, upload your files and run paper-ready baselines
#
# Inputs (upload in Phase 2):
#   - asrs_narratives_clean(.csv/.xlsx)      [optional]
#   - asrs_paper_dataset(.csv/.xlsx)         [optional]
#   - asrs_paper_dataset_sample(.csv/.xlsx)  [recommended for speed]
#
# Outputs:
#   - paper_results_metrics.csv
#   - paper_predictions_debug.csv
#   - figure_top_labels_support.png
#   - figure_top_labels_tfidf_recall.png
#   - paper_artifacts.zip   (all above packed)
#
# Notes:
#   - This cell intentionally kills the process after fixing env to force restart.
#     Colab shows "session crashed" -> that's expected.
# ============================================================

import os, sys, subprocess, time, signal, zipfile, re, random

# Marker file: while it does not exist the cell runs the Phase 1 repair first.
PHASE_FLAG = "/content/.ASRS_PHASE2_READY"


def run(cmd):
    """Run ``cmd`` through the shell and capture its output.

    Returns:
        ``(returncode, output)`` where ``output`` is stdout and stderr
        combined as text. Nothing is printed and a non-zero exit status
        does not raise.
    """
    options = dict(
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    finished = subprocess.run(cmd, **options)
    return (finished.returncode, finished.stdout)

def pip_install(pkgs):
    """Install ``pkgs`` with pip for the interpreter running this notebook.

    Args:
        pkgs: Requirement strings, e.g. ``"numpy==1.26.4"``.

    Returns:
        ``(returncode, output)`` as produced by ``run``.
    """
    import shlex  # local import: only needed to build the command line

    # Requirements are shell-quoted because the command goes through a shell;
    # an unquoted specifier such as "numpy<2" would be parsed as a redirection.
    python = shlex.quote(sys.executable or "python")
    requirements = " ".join(shlex.quote(p) for p in pkgs)
    cmd = f"{python} -m pip -q install --no-cache-dir {requirements}"
    return run(cmd)

def pip_uninstall(pkgs):
    """Uninstall ``pkgs`` with pip for the interpreter running this notebook.

    The command ends in ``|| true``, so the reported return code is 0 even
    when a package was not installed.

    Returns:
        ``(returncode, output)`` as produced by ``run``.
    """
    import shlex  # local import: only needed to build the command line

    # Names are shell-quoted because the command goes through a shell.
    python = shlex.quote(sys.executable or "python")
    targets = " ".join(shlex.quote(p) for p in pkgs)
    cmd = f"{python} -m pip -q uninstall -y {targets} || true"
    return run(cmd)

# -------------------------
# PHASE 1: Hard repair numpy + restart runtime
# -------------------------
# Phase 1 runs only while the marker file is absent. It reinstalls a pinned
# stack, verifies numpy in a fresh interpreter, writes the marker and then
# kills this process so the next run starts from clean imports.
if not os.path.exists(PHASE_FLAG):
    print("PHASE 1/2: Repairing Python environment (numpy/pandas/sklearn/scipy)...\n")

    # 1) Uninstall common conflicting packages
    print("Uninstalling conflicting packages...")
    _, out = pip_uninstall(["numpy","pandas","scikit-learn","scipy","transformers","tokenizers","accelerate"])
    print(out)

    # 2) Reinstall pinned stable stack (CPU/GPU both OK)
    print("Installing pinned clean stack (Colab-safe)...")
    # NOTE: the return code is not checked here; a failed install is only
    # caught indirectly by the numpy sanity check below.
    code, out = pip_install([
        "numpy==1.26.4",
        "pandas==2.2.2",
        "scipy==1.11.4",
        "scikit-learn==1.4.2",
        "matplotlib",
        "tqdm",
        "openpyxl"
    ])
    print(out)

    # 3) Sanity check for numpy critical modules
    # Runs in a separate interpreter; the last print is the success marker tested below.
    print("\nSanity checking numpy import health...")
    code, out = run(
        "python - <<'PY'\n"
        "import numpy as np\n"
        "import numpy.char\n"
        "print('numpy_version', np.__version__)\n"
        "print('numpy_file', np.__file__)\n"
        "print('numpy_char_ok')\n"
        "PY"
    )
    print(out)

    # Without the success marker, stop here: no flag is written, so the next run repeats Phase 1.
    if "numpy_char_ok" not in out:
        print("\n[FAILED] numpy still broken after reinstall.\n"
              "Do: Runtime -> Factory reset runtime, then run this cell again.\n")
        raise RuntimeError("numpy still broken")

    # Mark phase 2 ready and force a hard restart
    os.makedirs("/content", exist_ok=True)
    with open(PHASE_FLAG, "w") as f:
        f.write("ok")

    print("\nEnvironment repaired. Forcing runtime restart now...\n"
          "(Colab may say 'session crashed' — that's expected.)\n")

    # Brief pause before the process kills itself.
    time.sleep(1)
    os.kill(os.getpid(), signal.SIGKILL)

# -------------------------
# PHASE 2: Upload files + run paper-ready baselines + download
# -------------------------
# Reached only when the Phase 1 block above was skipped (marker file present).
print("PHASE 2/2: Upload -> Run paper baselines -> Export artifacts\n")

# Heavy imports are deferred to here, after the environment repair.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print("ENV OK:",
      "numpy", np.__version__,
      "| pandas", pd.__version__)

# -------------------------
# Upload
# -------------------------
# Colab-only module for browser upload/download.
from google.colab import files
print("\n[UPLOAD] Please upload your files now (csv OR xlsx).")
print("Expected names contain keywords like:")
print(" - narratives_clean")
print(" - paper_dataset")
print(" - paper_dataset_sample\n")

# `uploaded` is a mapping; its keys are treated as file names/paths below.
uploaded = files.upload()
assert len(uploaded) > 0, "No files uploaded."

# Echo the uploaded names so a mis-detected file can be spotted.
names = list(uploaded.keys())
print("\nUploaded:")
for n in names:
    print(" -", n)

def pick_file(names, must_have_any):
    """Return the first name whose lower-cased form contains any given keyword.

    Keywords are compared as given, so they should be lower-case.
    Returns None when no name matches.
    """
    lowered_names = [n.lower() for n in names]
    for original, lowered in zip(names, lowered_names):
        for keyword in must_have_any:
            if keyword in lowered:
                return original
    return None

f_sample = pick_file(names, ["paper_dataset_sample", "dataset_sample", "sample"])
# "asrs_paper_dataset" is also a substring of "asrs_paper_dataset_sample...", so
# the full dataset is searched only among names that do not contain "sample".
# Otherwise the sample file could be reported (and used) as the full dataset.
non_sample_names = [n for n in names if "sample" not in n.lower()]
f_full   = pick_file(non_sample_names, ["paper_dataset.csv", "paper_dataset.xlsx", "asrs_paper_dataset"])
f_narr   = pick_file(names, ["narratives_clean", "narrative_clean"])

print("\nDetected:")
print(" - narratives_clean:", f_narr)
print(" - paper_dataset(full):", f_full)
print(" - paper_dataset_sample:", f_sample)

# Prefer sample for speed, fallback to full if no sample
DATA_FILE = f_sample if f_sample is not None else f_full
assert DATA_FILE is not None, "Cannot find sample or full dataset in your uploads."

def load_table(path):
    """Read a .csv, .xlsx or .xls file (case-insensitive extension) into a DataFrame.

    Raises:
        ValueError: when the extension is none of the supported ones.
    """
    lowered = path.lower()
    is_csv = lowered.endswith(".csv")
    is_excel = lowered.endswith((".xlsx", ".xls"))
    if not (is_csv or is_excel):
        raise ValueError(f"Unsupported file type: {path}")
    return pd.read_csv(path, low_memory=False) if is_csv else pd.read_excel(path)

df = load_table(DATA_FILE)
print(f"\nUsing dataset: {DATA_FILE} | shape={df.shape}")

# -------------------------
# Normalize columns
# -------------------------
# Map lower-cased column names to the names actually present, so the required
# columns are found regardless of letter case.
cols_map = {c.lower(): c for c in df.columns}

for need in ["incident_id", "narrative", "labels"]:
    assert need in cols_map, f"Missing column '{need}'. Found: {list(df.columns)}"

df = df.rename(columns={
    cols_map["incident_id"]:"incident_id",
    cols_map["narrative"]:"narrative",
    cols_map["labels"]:"labels"
})

# Missing narratives become empty text; a bare astype(str) would turn NaN into
# the literal string "nan", which then reaches the TF-IDF vocabulary.
df["narrative"] = df["narrative"].fillna("").astype(str)

# -------------------------
# Config
# -------------------------
TOPK = 50          # number of most frequent labels kept as the label space
SEED = 42          # seed for random/numpy and for the train/test split
TEST_SIZE = 0.2    # fraction of eval rows held out for the TF-IDF baseline

random.seed(SEED)
np.random.seed(SEED)

# -------------------------
# Labels parse + clean
# -------------------------
def parse_labels(s):
    """Split a ';'-separated label string into its stripped, non-empty parts.

    Non-string and blank inputs produce an empty list.
    """
    usable = isinstance(s, str) and bool(s.strip())
    if not usable:
        return []
    return list(filter(None, (chunk.strip() for chunk in s.split(";"))))

def clean_label(l):
    """Collapse whitespace in a label and map empty or placeholder values to "".

    Non-breaking spaces (raw or as "&nbsp;"/"&nbsp") count as whitespace.
    "none", "nan" and "null" in any letter case count as placeholders.
    """
    raw = str(l).replace("\xa0", " ")
    raw = raw.replace("&nbsp;", " ")
    raw = raw.replace("&nbsp", " ")
    # Splitting on whitespace runs and re-joining the non-empty parts collapses
    # the runs and trims both ends in one step.
    squeezed = " ".join(part for part in re.split(r"\s+", raw) if part)
    if squeezed == "":
        return ""
    return "" if squeezed.lower() in {"none","nan","null"} else squeezed

def _clean_gold(raw):
    """Parse one raw `labels` cell and keep only the non-empty cleaned labels."""
    cleaned = [clean_label(piece) for piece in parse_labels(raw)]
    return [lab for lab in cleaned if lab]


df["gold"] = df["labels"].apply(_clean_gold)

# Label space: the TOPK most frequent cleaned labels over the whole dataset.
all_gold = []
for labs in df["gold"]:
    all_gold.extend(labs)
freq = pd.Series(all_gold).value_counts()
label_space = freq.index[:TOPK].tolist()
label_set = set(label_space)

# Keep only TopK labels per row and drop rows left without any label.
df["gold_topk"] = df["gold"].apply(lambda labs: [lab for lab in labs if lab in label_set])
row_has_topk = df["gold_topk"].map(len) > 0
df_eval = df[row_has_topk].copy()

print(f"\nTopK={TOPK} | Eval rows={len(df_eval)}/{len(df)}")
print("Top 10 labels:", label_space[:10])

# The binarizer's column order is fixed to label_space.
mlb = MultiLabelBinarizer(classes=label_space)
Y_gold_all = df_eval["gold_topk"].tolist()

def compute_metrics(true_list, pred_list):
    """Return micro/macro scores for predicted versus gold label lists.

    Both inputs are sequences of label lists, binarised with the module-level ``mlb``.

    Returns:
        dict of floats keyed "micro_f1", "macro_f1", "micro_precision", "micro_recall".
    """
    encoded_true = mlb.fit_transform(true_list)
    encoded_pred = mlb.transform(pred_list)
    shared = {"zero_division": 0}
    micro_f1 = f1_score(encoded_true, encoded_pred, average="micro", **shared)
    macro_f1 = f1_score(encoded_true, encoded_pred, average="macro", **shared)
    micro_p = precision_score(encoded_true, encoded_pred, average="micro", **shared)
    micro_r = recall_score(encoded_true, encoded_pred, average="micro", **shared)
    return {
        "micro_f1": float(micro_f1),
        "macro_f1": float(macro_f1),
        "micro_precision": float(micro_p),
        "micro_recall": float(micro_r),
    }

# One dict per evaluated method; collected into the metrics table.
rows = []

# -------------------------
# Baseline A: Rule substring match
# -------------------------
def normalize(s):
    """Lower-case ``str(s)``, squeeze whitespace runs into single spaces and strip the ends."""
    as_lower = str(s).lower()
    single_spaced = re.sub(r"\s+", " ", as_lower)
    return single_spaced.strip()

# Search key per label: the normalised text after the first ":" (the whole label if it has none).
label_value = dict((lab, normalize(lab.split(":", 1)[-1])) for lab in label_space)


def rule_pred(text):
    """Return, sorted, the labels whose search key occurs in the normalised text.

    Labels whose key is shorter than 4 characters are never predicted.
    """
    haystack = normalize(text)
    usable = (lab for lab in label_space if len(label_value[lab]) >= 4)
    found = {lab for lab in usable if label_value[lab] in haystack}
    return sorted(found)


# The rule baseline is scored on every eval row.
df_eval["pred_rule"] = df_eval["narrative"].apply(rule_pred)
m_rule = compute_metrics(Y_gold_all, df_eval["pred_rule"].tolist())
rule_row = {"method":"Rule baseline (substring)"}
rule_row.update(m_rule)
rows.append(rule_row)

# -------------------------
# Baseline B: TF-IDF + OvR Logistic Regression
# -------------------------
# Raw narratives and the 0/1 indicator matrix (one column per label_space entry).
X = df_eval["narrative"].values
Ybin = mlb.fit_transform(Y_gold_all)

# Positional row indices go through the split as well, so test predictions can
# be mapped back to rows of df_eval later.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, Ybin, np.arange(len(df_eval)),
    test_size=TEST_SIZE, random_state=SEED
)

# Vocabulary and IDF weights are fitted on the training narratives only.
vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=80000)
Xtr = vec.fit_transform(X_train)
Xte = vec.transform(X_test)

# Columns of labels skipped below stay at probability 0 (never predicted).
probs = np.zeros((Xte.shape[0], len(label_space)), dtype=float)

print("\nTraining TF-IDF + LogisticRegression (OvR) ...")
n_train = y_train.shape[0]
for j in range(len(label_space)):
    yj = y_train[:, j]
    n_pos = int(yj.sum())
    if n_pos < 5:
        # Too few positives to fit a meaningful classifier.
        continue
    if n_pos == n_train:
        # Label present in every training row: LogisticRegression.fit() would
        # raise on a single-class target, so predict it unconditionally.
        probs[:, j] = 1.0
        continue
    clf = LogisticRegression(max_iter=2000, class_weight="balanced")
    clf.fit(Xtr, yj)
    probs[:, j] = clf.predict_proba(Xte)[:, 1]

# Independent 0.5 threshold per label, then back to label-name lists.
y_pred = (probs >= 0.5).astype(int)
pred_tfidf = [[label_space[j] for j in np.where(row == 1)[0]] for row in y_pred]

gold_test = [Y_gold_all[i] for i in idx_test]
m_tfidf = compute_metrics(gold_test, pred_tfidf)
rows.append({"method":"TF-IDF + LogReg (test split)", **m_tfidf})

# -------------------------
# Save metrics
# -------------------------
metrics_df = pd.DataFrame(rows)
print("\n=== PAPER METRICS ===")
print(metrics_df)
metrics_df.to_csv("paper_results_metrics.csv", index=False)

# -------------------------
# Figures (Top-20 support + recall on test split)
# -------------------------
yt = mlb.fit_transform(gold_test)
yp = mlb.transform(pred_tfidf)

# mlb returns dense ndarrays (sparse_output defaults to False), which have no
# .multiply(); the elementwise product `*` gives the per-cell true positives.
support = np.asarray(yt.sum(axis=0)).ravel()
tp = np.asarray((yt * yp).sum(axis=0)).ravel()
# Per-label recall; labels with no gold support in the test split get 0.
rec = np.divide(tp, support, out=np.zeros_like(tp, dtype=float), where=support > 0)

# The 20 labels with the highest test-split support, most frequent first.
top_idx = np.argsort(-support)[:20]
top_labels = [label_space[i] for i in top_idx]

# NOTE(review): tick text is only the part before ':', so two labels sharing a
# field name get identical ticks — confirm this is intended for the paper.
plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), support[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Support (count in gold, test split)")
plt.title("Top-20 Label Support (ASRS TopK)")
plt.tight_layout()
plt.savefig("figure_top_labels_support.png", dpi=300)
plt.show()

plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), rec[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Recall (TF-IDF)")
plt.ylim(0, 1)
plt.title("TF-IDF Recall on Top-20 Labels (test split)")
plt.tight_layout()
plt.savefig("figure_top_labels_tfidf_recall.png", dpi=300)
plt.show()

# -------------------------
# Debug predictions
# -------------------------
# Per-row debug table: gold and predicted labels flattened to "; "-joined strings.
debug = df_eval[["incident_id","narrative","gold_topk"]].copy()
debug["gold_topk"] = debug["gold_topk"].apply(lambda x: "; ".join(x))
debug["pred_rule"] = df_eval["pred_rule"].apply(lambda x: "; ".join(x))

# Only fill TF-IDF predictions for test split indices
# (idx_test holds positions within df_eval; all other rows stay "").
pred_tfidf_full = [""] * len(df_eval)
for local_i, global_i in enumerate(idx_test):
    pred_tfidf_full[global_i] = "; ".join(pred_tfidf[local_i])
debug["pred_tfidf_testonly"] = pred_tfidf_full

debug.to_csv("paper_predictions_debug.csv", index=False)

# -------------------------
# Zip + download
# -------------------------
artifacts = [
    "paper_results_metrics.csv",
    "paper_predictions_debug.csv",
    "figure_top_labels_support.png",
    "figure_top_labels_tfidf_recall.png",
]
zip_name = "paper_artifacts.zip"

# Bundle whichever artifacts exist; missing files are skipped silently.
with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as z:
    for f in artifacts:
        if os.path.exists(f):
            z.write(f)

print("\nSaved artifacts:")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        print(" -", f)

# Trigger a download for each individual artifact and for the zip.
from google.colab import files
print("\nDownloading artifacts...")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        files.download(f)

print("\nALL DONE.")


PHASE 1/2: Repairing Python environment (numpy/pandas/sklearn/scipy)...

Uninstalling conflicting packages...

Installing pinned clean stack (Colab-safe)...
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.0/61.0 kB 74.8 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.4/60.4 kB 127.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.0/18.0 MB 108.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.7/12.7 MB 77.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.8/35.8 MB 43.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.2/12.2 MB 57.5 MB/s eta 0:00:00
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.2.2 requires transformers<6.0.0,>=4.41.0, which is not installed.
torchtune 0.6.1 requires tokenizers, which is not installed.
peft 0.18.1 requires accelerate>=0.21.

RuntimeError: numpy still broken

In [None]:
# ============================================================
# ONE CELL (MOST STABLE): Fix broken numpy -> Upload -> Paper baselines -> Download
# - Fix numpy "numpy.char"/"numpy.strings" missing by reinstalling a clean stack
# - No transformers/torch: avoids GPU/LLM dependency conflicts
# Input upload: asrs_paper_dataset_sample.(csv/xlsx)  [preferred]
#               asrs_paper_dataset.(csv/xlsx)         [fallback]
#               asrs_narratives_clean.(csv/xlsx)      [optional, not required]
# Output: metrics + figures + debug CSV + zip + auto-download
# ============================================================

import os, sys, subprocess, time, signal, zipfile, re, random

PHASE_FLAG = "/content/.ASRS_BASELINE_PHASE2_READY"

def sh(cmd):
    """Run *cmd* through the shell and return ``(exit_code, output)``.

    stderr is merged into stdout and the output is decoded as text.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    return (completed.returncode, completed.stdout)

def pip_install(pkgs):
    """Install *pkgs* (a list of requirement strings) with pip.

    pip is run as ``sys.executable -m pip`` so packages land in the interpreter
    that runs this notebook rather than in whichever ``python`` is first on
    PATH. Arguments are passed as a list, so specifiers containing shell
    metacharacters (e.g. ``pkg>=1.0``) are not reinterpreted by a shell.

    Returns ``(returncode, output)`` with stderr merged into the output text.
    """
    cmd = [sys.executable, "-m", "pip", "-q", "install", "--no-cache-dir", *pkgs]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    return proc.returncode, proc.stdout

def pip_uninstall(pkgs):
    """Best-effort uninstall of *pkgs* (a list of package names) with pip.

    pip is run as ``sys.executable -m pip`` with an argument list, so it targets
    the interpreter running this notebook and needs no shell.

    Returns ``(0, output)``: the exit code is always reported as 0, preserving
    the original ``|| true`` contract that uninstall failures are ignored.
    stderr is merged into the output text.
    """
    cmd = [sys.executable, "-m", "pip", "-q", "uninstall", "-y", *pkgs]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    return 0, proc.stdout

# -------------------------
# PHASE 1: repair numpy stack and restart (only once)
# -------------------------
# Phase 1 runs only while the flag file is absent; it is created at the end of
# a successful repair so the re-run after the restart goes straight to phase 2.
if not os.path.exists(PHASE_FLAG):
    print("PHASE 1/2: Repairing environment (clean numpy stack)...\n")

    # Remove possibly broken core libs
    print("Uninstalling numpy/pandas/scipy/sklearn (ignore errors)...")
    _, out = pip_uninstall(["numpy","pandas","scipy","scikit-learn","openpyxl"])
    print(out)

    # NOTE(review): the install return code is captured but never checked;
    # only the sanity check below decides whether the repair worked.
    print("Installing pinned clean stack (for paper baselines)...")
    code, out = pip_install([
        "numpy==1.26.4",
        "pandas==2.2.2",
        "scipy==1.11.4",
        "scikit-learn==1.4.2",
        "matplotlib",
        "tqdm",
        "openpyxl"
    ])
    print(out)

    # Import numpy in a fresh child interpreter (shell heredoc) and look for a
    # marker line in its output.
    print("\nSanity check numpy core modules...")
    code, out = sh(
        "python - <<'PY'\n"
        "import numpy as np\n"
        "import numpy.char\n"
        "print('numpy_version', np.__version__)\n"
        "print('numpy_char_ok')\n"
        "PY"
    )
    print(out)

    # Marker missing => the child import failed; abort without writing the flag.
    if "numpy_char_ok" not in out:
        print("\n[FAILED] numpy still broken.\n"
              "Do: Runtime -> Factory reset runtime, then run this cell again.\n")
        raise RuntimeError("numpy still broken")

    with open(PHASE_FLAG, "w") as f:
        f.write("ok")

    # Kill the current process so the freshly installed packages get imported
    # by a new interpreter on the next run of this cell.
    print("\nEnvironment fixed. Forcing restart now (Colab may show 'crashed' — normal).")
    time.sleep(1)
    os.kill(os.getpid(), signal.SIGKILL)

# -------------------------
# PHASE 2: upload + run paper baselines
# -------------------------
print("PHASE 2/2: Upload files -> Run baselines -> Export artifacts\n")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print("ENV OK:", "numpy", np.__version__, "| pandas", pd.__version__)

# -------------------------
# Upload
# -------------------------
from google.colab import files
# Prompt for the upload; abort when nothing was uploaded.
print("\n[UPLOAD] Upload your dataset files now (csv OR xlsx).")
print("Recommended: asrs_paper_dataset_sample.csv (or .xlsx)\n")
uploaded = files.upload()
assert len(uploaded) > 0, "No files uploaded."

# Keys of `uploaded` are the file names used for selection below.
names = list(uploaded.keys())
print("\nUploaded:")
for n in names:
    print(" -", n)

def pick(names, keywords):
    """Return the first name containing any keyword (case-insensitive on the name).

    Keywords are matched as-is against the lower-cased name, so they are
    expected in lower case. Returns None when nothing matches.
    """
    lowered_names = [name.lower() for name in names]
    for original, lowered in zip(names, lowered_names):
        for keyword in keywords:
            if keyword in lowered:
                return original
    return None

# Prefer a sample file; fall back to the full dataset only when no sample was
# uploaded. ("paper_dataset" also matches sample file names, which is harmless
# because f_full is consulted only when f_sample is None.)
f_sample = pick(names, ["paper_dataset_sample", "dataset_sample", "sample"])
f_full   = pick(names, ["paper_dataset", "asrs_paper_dataset"])
DATA_FILE = f_sample if f_sample else f_full
assert DATA_FILE is not None, "Didn't find paper_dataset_sample or paper_dataset in your uploads."

def load_table(path):
    """Load *path* into a DataFrame based on its (case-insensitive) extension.

    ``.csv`` is read with ``pd.read_csv(low_memory=False)``; ``.xlsx``/``.xls``
    with ``pd.read_excel``. Any other extension raises ValueError.
    """
    lowered = path.lower()
    if lowered.endswith((".xlsx", ".xls")):
        return pd.read_excel(path)
    if lowered.endswith(".csv"):
        return pd.read_csv(path, low_memory=False)
    raise ValueError(f"Unsupported file: {path}")

df = load_table(DATA_FILE)
print(f"\nUsing: {DATA_FILE} | shape={df.shape}")

# -------------------------
# Normalize required columns
# -------------------------
# Lower-cased header -> original header, so the required columns are matched
# case-insensitively; str() keeps non-string headers from crashing .lower().
cols_map = {str(c).lower(): c for c in df.columns}
need = ["incident_id", "narrative", "labels"]
for c in need:
    assert c in cols_map, f"Missing column '{c}'. Found: {list(df.columns)}"

# Rename whatever casing the file used to the canonical lower-case names.
df = df.rename(columns={
    cols_map["incident_id"]:"incident_id",
    cols_map["narrative"]:"narrative",
    cols_map["labels"]:"labels"
})
# Fill missing narratives BEFORE casting: astype(str) alone turns NaN into the
# literal text "nan", which would then be fed to both baselines as a real token.
df["narrative"] = df["narrative"].fillna("").astype(str)

# -------------------------
# Config
# -------------------------
# Number of most frequent labels kept as the label space.
TOPK = 50
# Seed used for Python's and NumPy's global RNGs and for the train/test split.
SEED = 42
# Fraction of evaluation rows held out as the TF-IDF test split.
TEST_SIZE = 0.2

random.seed(SEED)
np.random.seed(SEED)

# -------------------------
# Parse + clean labels
# -------------------------
def parse_labels(s):
    """Return the trimmed, non-empty ';'-separated parts of *s*.

    Non-string or blank input gives an empty list.
    """
    if not isinstance(s, str):
        return []
    if not s.strip():
        return []
    trimmed = map(str.strip, s.split(";"))
    return [token for token in trimmed if token]

def clean_label(l):
    """Normalise one raw label; return "" when it carries no information.

    Steps:
      * non-breaking spaces (raw or as ``&nbsp;`` / ``&nbsp``) become spaces,
        runs of whitespace collapse to one space, ends are trimmed;
      * bare placeholders ("", "none", "nan", "null", any case) are dropped;
      * a "Field: Value" label whose value is a placeholder is dropped too,
        otherwise it is re-emitted as "Field: Value" with single-space
        formatting around the first colon.

    The "Field: nan" rule mirrors the later, fixed version of this helper in
    the same notebook, so placeholder values never enter the TopK label space.
    """
    placeholders = {"", "none", "nan", "null"}

    l = str(l).replace("\xa0", " ").replace("&nbsp;", " ").replace("&nbsp", " ")
    l = re.sub(r"\s+", " ", l).strip()
    if l.lower() in placeholders:
        return ""

    if ":" in l:
        # Only the first colon separates field from value.
        field, value = (part.strip() for part in l.split(":", 1))
        if value.lower() in placeholders:
            return ""
        return f"{field}: {value}"

    return l

# Gold labels per row: parse the ';'-separated string, clean every label,
# then drop the ones clean_label() reduced to "".
df["gold"] = df["labels"].apply(parse_labels)
df["gold"] = df["gold"].apply(lambda labs: [clean_label(x) for x in labs])
df["gold"] = df["gold"].apply(lambda labs: [x for x in labs if x])

# Label space = the TOPK most frequent cleaned labels over the whole table.
all_gold = [lab for labs in df["gold"] for lab in labs]
freq = pd.Series(all_gold).value_counts()
label_space = freq.head(TOPK).index.tolist()
label_set = set(label_space)

# Keep only in-space labels; rows left without any label are excluded from evaluation.
df["gold_topk"] = df["gold"].apply(lambda labs: [l for l in labs if l in label_set])
df_eval = df[df["gold_topk"].map(len) > 0].copy()

print(f"\nTOPK={TOPK} | eval rows={len(df_eval)}/{len(df)}")
print("Top 10 labels:", label_space[:10])

# Fixed class order: indicator column j always corresponds to label_space[j].
mlb = MultiLabelBinarizer(classes=label_space)

def compute_metrics(true_list, pred_list):
    """Score predicted label lists against gold label lists.

    Both arguments are lists of label lists. They are binarised with the
    module-level ``mlb`` and scored with micro/macro F1, micro precision and
    micro recall (``zero_division=0``). Returns a dict of plain floats.
    """
    gold_matrix = mlb.fit_transform(true_list)
    pred_matrix = mlb.transform(pred_list)
    scorers = (
        ("micro_f1", f1_score, "micro"),
        ("macro_f1", f1_score, "macro"),
        ("micro_precision", precision_score, "micro"),
        ("micro_recall", recall_score, "micro"),
    )
    return {
        key: float(scorer(gold_matrix, pred_matrix, average=avg, zero_division=0))
        for key, scorer, avg in scorers
    }

# One dict per evaluated method; turned into the metrics table later.
rows = []
# Gold TopK labels for every evaluation row, in df_eval order.
Y_gold_all = df_eval["gold_topk"].tolist()

# -------------------------
# Baseline 1: Rule substring
# -------------------------
def normalize(s):
    """Return ``str(s)`` lower-cased, with whitespace runs squeezed to one space and trimmed."""
    text = str(s).lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Search key per label: the normalised text after the first ':' (the whole
# label when it has no colon).
label_value = {
    label: normalize(label.split(":", 1)[-1])
    for label in label_space
}

def rule_pred(text):
    """Predict every label whose key (>= 4 chars) occurs as a substring of *text*.

    Returns the matching labels as a sorted, duplicate-free list.
    """
    haystack = normalize(text)
    hits = {
        label
        for label in label_space
        if len(label_value[label]) >= 4 and label_value[label] in haystack
    }
    return sorted(hits)

# Rule baseline is scored on every evaluation row (no training involved).
df_eval["pred_rule"] = df_eval["narrative"].apply(rule_pred)
m_rule = compute_metrics(Y_gold_all, df_eval["pred_rule"].tolist())
rows.append(dict({"method": "Rule baseline (substring)"}, **m_rule))

# -------------------------
# Baseline 2: TF-IDF + OvR Logistic Regression
# -------------------------
# Raw narratives and the 0/1 indicator matrix (one column per label_space entry).
X = df_eval["narrative"].values
Ybin = mlb.fit_transform(Y_gold_all)

# Positional row indices go through the split as well, so test predictions can
# be mapped back to rows of df_eval later.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, Ybin, np.arange(len(df_eval)),
    test_size=TEST_SIZE, random_state=SEED
)

# Vocabulary and IDF weights are fitted on the training narratives only.
vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=80000)
Xtr = vec.fit_transform(X_train)
Xte = vec.transform(X_test)

# Columns of labels skipped below stay at probability 0 (never predicted).
probs = np.zeros((Xte.shape[0], len(label_space)), dtype=float)

print("\nTraining TF-IDF + LogisticRegression (OvR)...")
n_train = y_train.shape[0]
for j in range(len(label_space)):
    yj = y_train[:, j]
    n_pos = int(yj.sum())
    if n_pos < 5:
        # Too few positives to fit a meaningful classifier.
        continue
    if n_pos == n_train:
        # Label present in every training row: LogisticRegression.fit() would
        # raise on a single-class target, so predict it unconditionally.
        probs[:, j] = 1.0
        continue
    clf = LogisticRegression(max_iter=2000, class_weight="balanced")
    clf.fit(Xtr, yj)
    probs[:, j] = clf.predict_proba(Xte)[:, 1]

# Independent 0.5 threshold per label, then back to label-name lists.
y_pred = (probs >= 0.5).astype(int)
pred_tfidf = [[label_space[j] for j in np.where(row == 1)[0]] for row in y_pred]

gold_test = [Y_gold_all[i] for i in idx_test]
m_tfidf = compute_metrics(gold_test, pred_tfidf)
rows.append({"method":"TF-IDF + LogReg (test split)", **m_tfidf})

# -------------------------
# Save metrics
# -------------------------
metrics_df = pd.DataFrame(rows)
print("\n=== PAPER METRICS ===")
print(metrics_df)
metrics_df.to_csv("paper_results_metrics.csv", index=False)

# -------------------------
# Figures
# -------------------------
yt = mlb.fit_transform(gold_test)
yp = mlb.transform(pred_tfidf)

# mlb returns dense ndarrays (sparse_output defaults to False), which have no
# .multiply(); the elementwise product `*` gives the per-cell true positives.
support = np.asarray(yt.sum(axis=0)).ravel()
tp = np.asarray((yt * yp).sum(axis=0)).ravel()
# Per-label recall; labels with no gold support in the test split get 0.
rec = np.divide(tp, support, out=np.zeros_like(tp, dtype=float), where=support > 0)

# The 20 labels with the highest test-split support, most frequent first.
top_idx = np.argsort(-support)[:20]
top_labels = [label_space[i] for i in top_idx]

# NOTE(review): tick text is only the part before ':', so two labels sharing a
# field name get identical ticks — confirm this is intended for the paper.
plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), support[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Support (count in gold, test split)")
plt.title("Top-20 Label Support (ASRS TopK)")
plt.tight_layout()
plt.savefig("figure_top_labels_support.png", dpi=300)
plt.show()

plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), rec[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Recall (TF-IDF)")
plt.ylim(0, 1)
plt.title("TF-IDF Recall on Top-20 Labels (test split)")
plt.tight_layout()
plt.savefig("figure_top_labels_tfidf_recall.png", dpi=300)
plt.show()

# -------------------------
# Debug CSV
# -------------------------
# Per-row debug table: gold and predicted labels flattened to "; "-joined strings.
debug = df_eval[["incident_id","narrative","gold_topk"]].copy()
debug["gold_topk"] = debug["gold_topk"].apply(lambda x: "; ".join(x))
debug["pred_rule"] = df_eval["pred_rule"].apply(lambda x: "; ".join(x))

# TF-IDF predictions exist only for test rows (idx_test holds positions within
# df_eval); every other row stays "".
pred_tfidf_full = [""] * len(df_eval)
for local_i, global_i in enumerate(idx_test):
    pred_tfidf_full[global_i] = "; ".join(pred_tfidf[local_i])
debug["pred_tfidf_testonly"] = pred_tfidf_full

debug.to_csv("paper_predictions_debug.csv", index=False)

# -------------------------
# Zip + auto download
# -------------------------
artifacts = [
    "paper_results_metrics.csv",
    "paper_predictions_debug.csv",
    "figure_top_labels_support.png",
    "figure_top_labels_tfidf_recall.png",
]
zip_name = "paper_artifacts.zip"

# Bundle whichever artifacts exist; missing files are skipped silently.
with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as z:
    for f in artifacts:
        if os.path.exists(f):
            z.write(f)

print("\nSaved artifacts:")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        print(" -", f)

# Trigger a download for each individual artifact and for the zip.
from google.colab import files
print("\nDownloading artifacts...")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        files.download(f)

print("\nALL DONE.")


PHASE 1/2: Repairing environment (clean numpy stack)...

Uninstalling numpy/pandas/scipy/sklearn (ignore errors)...

Installing pinned clean stack (for paper baselines)...
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.0/61.0 kB 5.7 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.4/60.4 kB 152.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.0/18.0 MB 255.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.7/12.7 MB 211.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.8/35.8 MB 190.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.2/12.2 MB 221.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 250.9/250.9 kB 267.8 MB/s eta 0:00:00
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.2.2 requires transformers<6.0.0,>=4.41.0, which is not installed.
t

RuntimeError: numpy still broken

In [None]:
# ===== CELL 2 (FIXED): UPLOAD -> CLEAN LABELS -> FAST BASELINES -> EXPORT -> DOWNLOAD =====
import os, re, zipfile, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

print("ENV:", "numpy", np.__version__, "| pandas", pd.__version__)

# -------------------------
# 0) Upload
# -------------------------
# Prompt for the upload; abort when nothing was uploaded.
print("\n[UPLOAD] Please upload your files (csv/xlsx):")
print(" - asrs_paper_dataset_sample.*  (preferred)")
print(" - asrs_paper_dataset.*         (fallback)")
print(" - asrs_narratives_clean.*      (optional)\n")
uploaded = files.upload()
assert len(uploaded) > 0, "No files uploaded."

# Keys of `uploaded` are the file names used for selection below.
names = list(uploaded.keys())
print("\nUploaded:")
for n in names:
    print(" -", n)

def pick(names, keywords):
    """Return the first entry of *names* whose lower-cased form contains a keyword.

    Keywords are compared as given (expected lower case). Returns None when no
    name matches.
    """
    lowered_names = [name.lower() for name in names]
    matches = (
        original
        for original, lowered in zip(names, lowered_names)
        if any(keyword in lowered for keyword in keywords)
    )
    return next(matches, None)

# Prefer a sample file; fall back to the full dataset only when no sample was
# uploaded. ("paper_dataset" also matches sample file names, which is harmless
# because f_full is consulted only when f_sample is None.)
f_sample = pick(names, ["paper_dataset_sample", "dataset_sample", "sample"])
f_full   = pick(names, ["paper_dataset", "asrs_paper_dataset"])
DATA_FILE = f_sample if f_sample else f_full
assert DATA_FILE is not None, "Can't find asrs_paper_dataset_sample or asrs_paper_dataset in uploads."

def load_table(path):
    """Read a ``.csv`` / ``.xlsx`` / ``.xls`` file into a DataFrame.

    The extension check is case-insensitive; CSV files are read with
    ``low_memory=False``. Any other extension raises ValueError.
    """
    suffix_source = path.lower()
    is_csv = suffix_source.endswith(".csv")
    is_excel = suffix_source.endswith((".xlsx", ".xls"))
    if is_csv:
        return pd.read_csv(path, low_memory=False)
    if is_excel:
        return pd.read_excel(path)
    raise ValueError(f"Unsupported file type: {path}")

df = load_table(DATA_FILE)
print(f"\nUsing: {DATA_FILE} | shape={df.shape}")

# -------------------------
# 1) Normalize columns
# -------------------------
# Lower-cased header -> original header, so the required columns are matched
# case-insensitively; str() keeps non-string headers from crashing .lower().
cols_map = {str(c).lower(): c for c in df.columns}
for c in ["incident_id","narrative","labels"]:
    assert c in cols_map, f"Missing required column '{c}'. Found: {list(df.columns)}"

# Rename whatever casing the file used to the canonical lower-case names.
df = df.rename(columns={
    cols_map["incident_id"]:"incident_id",
    cols_map["narrative"]:"narrative",
    cols_map["labels"]:"labels"
})

# Fill missing narratives BEFORE casting: astype(str) alone turns NaN into the
# literal text "nan", which would then be fed to both baselines as a real token.
df["narrative"] = df["narrative"].fillna("").astype(str)

# -------------------------
# 2) Config
# -------------------------
# Number of most frequent labels kept as the label space.
TOPK = 50
# Seed used for Python's and NumPy's global RNGs and for the train/test split.
SEED = 42
# Fraction of evaluation rows held out as the test split.
TEST_SIZE = 0.2
FAST_ML_BASELINE = True   # True = run the fast SGD baseline; False = skip the ML baseline
random.seed(SEED)
np.random.seed(SEED)

# -------------------------
# 3) Label parsing + CLEAN (key fix: drop "xxx: nan" labels)
# -------------------------
def parse_labels(s):
    """Turn a ';'-separated label string into a list of stripped, non-empty labels.

    Returns [] for non-string input and for strings that are empty or blank.
    """
    usable = isinstance(s, str) and bool(s.strip())
    if not usable:
        return []
    labels = []
    for fragment in s.split(";"):
        fragment = fragment.strip()
        if fragment:
            labels.append(fragment)
    return labels

def clean_label(l):
    """Normalise one raw label; return "" when it should be discarded.

    Non-breaking spaces (raw or HTML-escaped) become spaces, whitespace runs
    collapse to one space and the ends are trimmed. Bare placeholders
    ("", "none", "nan", "null", any case) are discarded. For "Field: Value"
    labels, a placeholder or empty value discards the label; otherwise the
    label is rebuilt as "Field: Value" around the first colon.
    """
    placeholder_words = ("none", "nan", "null")

    text = str(l)
    for junk in ("\xa0", "&nbsp;", "&nbsp"):
        text = text.replace(junk, " ")
    text = re.sub(r"\s+", " ", text).strip()

    if not text or text.lower() in placeholder_words:
        return ""

    field, colon, value = text.partition(":")
    if not colon:
        # Plain label without a field prefix: keep as cleaned.
        return text

    # Key fix: a "Field: Value" whose value is nan/none/null/empty is dropped.
    field = field.strip()
    value = value.strip()
    if not value or value.lower() in placeholder_words:
        return ""
    return f"{field}: {value}"

# Gold labels per row: parse the ';'-separated string, clean every label,
# then drop the ones clean_label() reduced to "".
df["gold"] = df["labels"].apply(parse_labels)
df["gold"] = df["gold"].apply(lambda labs: [clean_label(x) for x in labs])
df["gold"] = df["gold"].apply(lambda labs: [x for x in labs if x])

# TopK is computed AFTER cleaning, so discarded placeholders cannot take slots.
all_gold = [lab for labs in df["gold"] for lab in labs]
freq = pd.Series(all_gold).value_counts()
label_space = freq.head(TOPK).index.tolist()
label_set = set(label_space)

# Keep only in-space labels; rows left without any label are excluded from evaluation.
df["gold_topk"] = df["gold"].apply(lambda labs: [l for l in labs if l in label_set])
df_eval = df[df["gold_topk"].map(len) > 0].copy()

print(f"\nTOPK={TOPK} | eval rows={len(df_eval)}/{len(df)}")
print("Top 10 labels:", label_space[:10])
# Guard: fail fast if any "...: nan" label survived the cleaning step.
assert all(": nan" not in x.lower() for x in label_space), "TopK still contains ': nan' -> check your input labels format."

# -------------------------
# 4) Metrics helpers
# -------------------------
# Fixed class order: indicator column j always corresponds to label_space[j].
mlb = MultiLabelBinarizer(classes=label_space)

def compute_metrics(true_list, pred_list):
    """Score predicted label lists against gold label lists.

    Both arguments are lists of label lists. They are binarised with the
    module-level ``mlb`` and scored with micro/macro F1, micro precision and
    micro recall (``zero_division=0``). Returns a dict of plain floats.
    """
    gold_matrix = mlb.fit_transform(true_list)
    pred_matrix = mlb.transform(pred_list)
    scorers = (
        ("micro_f1", f1_score, "micro"),
        ("macro_f1", f1_score, "macro"),
        ("micro_precision", precision_score, "micro"),
        ("micro_recall", recall_score, "micro"),
    )
    return {
        key: float(scorer(gold_matrix, pred_matrix, average=avg, zero_division=0))
        for key, scorer, avg in scorers
    }

# One dict per evaluated method; turned into the metrics table later.
rows = []
# Gold TopK labels for every evaluation row, in df_eval order.
Y_gold_all = df_eval["gold_topk"].tolist()

# -------------------------
# 5) Baseline 1: rule substring
# -------------------------
def normalize(s):
    """Stringify *s*, lower-case it, collapse whitespace runs to one space and trim."""
    squeezed = re.sub(r"\s+", " ", str(s).lower())
    return squeezed.strip()

# Search key per label: the normalised text after the first ':' (the whole
# label when it has no colon).
label_value = {
    label: normalize(label.split(":", 1)[-1])
    for label in label_space
}

def rule_pred(text):
    """Predict every label whose key (>= 4 chars) occurs as a substring of *text*.

    Returns the matching labels as a sorted, duplicate-free list.
    """
    haystack = normalize(text)
    hits = {
        label
        for label in label_space
        if len(label_value[label]) >= 4 and label_value[label] in haystack
    }
    return sorted(hits)

# Rule baseline is scored on every evaluation row (no training involved).
df_eval["pred_rule"] = df_eval["narrative"].apply(rule_pred)
m_rule = compute_metrics(Y_gold_all, df_eval["pred_rule"].tolist())
rows.append(dict({"method": "Rule baseline (substring)"}, **m_rule))

# -------------------------
# 6) Baseline 2: TF-IDF + OneVsRest SGD (FAST, paper-usable)
# -------------------------
# Stay None when the ML baseline is disabled; later sections check pred_test.
pred_test = None
gold_test = None
idx_test = None

if FAST_ML_BASELINE:
    # Raw narratives and the 0/1 indicator matrix (one column per label_space entry).
    X = df_eval["narrative"].values
    Ybin = mlb.fit_transform(Y_gold_all)

    # Positional row indices go through the split as well, so test predictions
    # can be mapped back to rows of df_eval later.
    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
        X, Ybin, np.arange(len(df_eval)), test_size=TEST_SIZE, random_state=SEED
    )

    # Vocabulary and IDF weights are fitted on the training narratives only.
    vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=80000)
    Xtr = vec.fit_transform(X_train)
    Xte = vec.transform(X_test)

    print("\nTraining TF-IDF + OneVsRest(SGDClassifier) ... (FAST)")
    # random_state=SEED pins SGD's data shuffling to the same seed as the split,
    # so the reported metrics do not depend on how much of the global NumPy RNG
    # state earlier code happened to consume.
    clf = OneVsRestClassifier(
        SGDClassifier(loss="log_loss", alpha=1e-5, max_iter=25, tol=1e-3, random_state=SEED)
    )
    clf.fit(Xtr, y_train)

    # predict_proba exists for log_loss; returns prob matrix
    probs = clf.predict_proba(Xte)
    # Independent 0.5 threshold per label, then back to label-name lists.
    y_pred = (probs >= 0.5).astype(int)

    pred_test = [[label_space[j] for j in np.where(row == 1)[0]] for row in y_pred]
    gold_test = [Y_gold_all[i] for i in idx_test]

    m_tfidf = compute_metrics(gold_test, pred_test)
    rows.append({"method":"TF-IDF + OvR SGD (test split)", **m_tfidf})

# -------------------------
# 7) Save metrics
# -------------------------
metrics_df = pd.DataFrame(rows)
print("\n=== PAPER METRICS ===")
print(metrics_df)
metrics_df.to_csv("paper_results_metrics.csv", index=False)

# -------------------------
# 8) Figures (use ML baseline test split if available; else use gold_all vs rule)
# -------------------------
if FAST_ML_BASELINE and (pred_test is not None):
    yt = mlb.fit_transform(gold_test)
    yp = mlb.transform(pred_test)
    title_suffix = " (test split, TF-IDF)"
else:
    yt = mlb.fit_transform(Y_gold_all)
    yp = mlb.transform(df_eval["pred_rule"].tolist())
    title_suffix = " (rule baseline)"

# mlb returns dense ndarrays (sparse_output defaults to False), which have no
# .multiply() — that call raised the AttributeError seen when this cell ran.
# The elementwise product `*` gives the per-cell true positives.
support = np.asarray(yt.sum(axis=0)).ravel()
tp = np.asarray((yt * yp).sum(axis=0)).ravel()
# Per-label recall; labels with no gold support get 0.
rec = np.divide(tp, support, out=np.zeros_like(tp, dtype=float), where=support > 0)

# The 20 labels with the highest support, most frequent first.
top_idx = np.argsort(-support)[:20]
top_labels = [label_space[i] for i in top_idx]

# NOTE(review): tick text is only the part before ':', so two labels sharing a
# field name get identical ticks — confirm this is intended for the paper.
plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), support[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Support (count)")
plt.title("Top-20 Label Support" + title_suffix)
plt.tight_layout()
plt.savefig("figure_top_labels_support.png", dpi=300)
plt.show()

plt.figure(figsize=(10,5))
plt.bar(range(len(top_idx)), rec[top_idx])
plt.xticks(range(len(top_idx)), [l.split(":")[0] for l in top_labels], rotation=45, ha="right")
plt.ylabel("Recall")
plt.ylim(0, 1)
plt.title("Recall on Top-20 Labels" + title_suffix)
plt.tight_layout()
plt.savefig("figure_top_labels_recall.png", dpi=300)
plt.show()

# -------------------------
# 9) Debug CSV
# -------------------------
# Per-row debug table: gold and predicted labels flattened to "; "-joined strings.
debug = df_eval[["incident_id","narrative","gold_topk"]].copy()
debug["gold_topk"] = debug["gold_topk"].apply(lambda x: "; ".join(x))
debug["pred_rule"] = df_eval["pred_rule"].apply(lambda x: "; ".join(x))

# Put ML preds only for test rows
# (idx_test holds positions within df_eval; all other rows stay "").
pred_ml_full = [""] * len(df_eval)
if FAST_ML_BASELINE and (pred_test is not None):
    for local_i, global_i in enumerate(idx_test):
        pred_ml_full[global_i] = "; ".join(pred_test[local_i])
debug["pred_tfidf_testonly"] = pred_ml_full

debug.to_csv("paper_predictions_debug.csv", index=False)

# -------------------------
# 10) Zip + download
# -------------------------
artifacts = [
    "paper_results_metrics.csv",
    "paper_predictions_debug.csv",
    "figure_top_labels_support.png",
    "figure_top_labels_recall.png",
]
zip_name = "paper_artifacts.zip"
# Bundle whichever artifacts exist; missing files are skipped silently.
with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as z:
    for f in artifacts:
        if os.path.exists(f):
            z.write(f)

print("\nSaved artifacts:")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        print(" -", f)

# Trigger a download for each individual artifact and for the zip.
print("\nDownloading...")
for f in artifacts + [zip_name]:
    if os.path.exists(f):
        files.download(f)

print("\nALL DONE.")


ENV: numpy 1.26.4 | pandas 2.2.2

[UPLOAD] Please upload your files (csv/xlsx):
 - asrs_paper_dataset_sample.*  (preferred)
 - asrs_paper_dataset.*         (fallback)
 - asrs_narratives_clean.*      (optional)



Saving asrs_narratives_clean.csv to asrs_narratives_clean (1).csv
Saving asrs_paper_dataset.csv to asrs_paper_dataset (1).csv
Saving asrs_paper_dataset_sample.csv to asrs_paper_dataset_sample (1).csv

Uploaded:
 - asrs_narratives_clean (1).csv
 - asrs_paper_dataset (1).csv
 - asrs_paper_dataset_sample (1).csv

Using: asrs_paper_dataset_sample (1).csv | shape=(2000, 3)

TOPK=50 | eval rows=687/2000
Top 10 labels: ['Reference: X', 'Altitude.AGL.Single Value: 0.0', 'Crew Size.Number Of Crew: Flight Crew Size', 'ASRS Report Number.Accession Number: ACN', 'Flight Plan: IFR', 'Detector.Person: Flight Crew', 'Aircraft Operator: Air Carrier', 'Operating Under FAR Part: 121', 'Altitude.MSL.Single Value: MSL', 'Flight Conditions: VMC']

Training TF-IDF + OneVsRest(SGDClassifier) ... (FAST)

=== PAPER METRICS ===
                          method  micro_f1  macro_f1  micro_precision  \
0      Rule baseline (substring)  0.086290  0.073068         0.233141   
1  TF-IDF + OvR SGD (test split)  0.4996

AttributeError: 'numpy.ndarray' object has no attribute 'multiply'

In [None]:
# ============================================================
# ASRS multi-label baseline: Rule baseline + TFIDF OvR(SGD)
# numpy 1.26.4 | pandas 2.2.2
# Fix for yt.multiply(yp): works for dense numpy arrays
# ============================================================

import os
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

# ----------------------------
# 0) Config
# ----------------------------
DATA_PATH = "asrs_paper_dataset_sample (1).csv"   # <-- change this to your file name
RANDOM_STATE = 42
TEST_SIZE = 0.30
TOPK = 50

# Regex fragments treated as label separators; add more if your labels use a different one.
LABEL_SEPS = [r"\|\|", r"\|", r";", r",", r"\t"]

# ----------------------------
# 1) Load CSV
# ----------------------------
# Fail early with a readable message when the configured file is missing.
assert os.path.exists(DATA_PATH), f"File not found: {DATA_PATH}"
df = pd.read_csv(DATA_PATH)

# Quick look at what was loaded: shape, header and the first three rows.
print("Loaded:", DATA_PATH, "| shape =", df.shape)
print("Columns:", list(df.columns))
print(df.head(3))

# ----------------------------
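# 2) Auto-detect the text column and the label column
#    - text column : longer average length, mostly free-form strings
#    - label column: usually contains separators / colons / short tokens, with more "discrete" unique values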
# ----------------------------
def guess_text_and_label_cols(df: pd.DataFrame):
    """Heuristically pick the free-text column and the multi-label column of *df*.

    Every object- or string-typed column is summarised as a tuple
    ``(col, avg_len, null_rate, sep_hit, colon_hit, uniq_ratio)`` where

    * ``avg_len``    - mean length of the stringified values,
    * ``null_rate``  - share of missing values,
    * ``sep_hit``    - share of values containing ``||``, ``|``, ``;``, ``,`` or a tab,
    * ``colon_hit``  - share of values containing ``:``,
    * ``uniq_ratio`` - distinct values divided by the row count.

    The text column maximises ``avg_len - 50*sep_hit - 20*colon_hit - 10*null_rate``;
    among the remaining columns the label column maximises
    ``80*sep_hit + 30*colon_hit - 0.2*avg_len - 10*null_rate``.

    Returns:
        ``(text_col, label_col, stats)`` with ``stats`` the list of tuples above.

    Raises:
        ValueError: if there is no candidate column, or only one (so no label
            column can be chosen).
    """
    # Accept both legacy object columns and dedicated string dtypes.
    obj_cols = [
        c for c in df.columns
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
    ]
    if len(obj_cols) == 0:
        raise ValueError("No object columns found; cannot guess text/label columns.")

    # Non-capturing group: a capturing group makes pandas emit a
    # "pattern ... has match groups" UserWarning on every str.contains call.
    sep_pattern = r"(?:\|\||\||;|,|\t)"
    n_rows = max(len(df), 1)

    # Per-column statistics used by both scoring functions below.
    stats = []
    for c in obj_cols:
        s = df[c].astype(str)
        avg_len = s.str.len().mean()
        null_rate = df[c].isna().mean()
        # Rough share of values that contain a separator.
        sep_hit = s.str.contains(sep_pattern).mean()
        colon_hit = s.str.contains(":", regex=False).mean()
        uniq_ratio = df[c].nunique() / n_rows
        stats.append((c, avg_len, null_rate, sep_hit, colon_hit, uniq_ratio))

    # Text column : large avg_len, comparatively few separators/colons.
    # Label column: many separators/colons, or comparatively short values.
    def text_score(x):
        c, avg_len, null_rate, sep_hit, colon_hit, uniq_ratio = x
        return avg_len - 50*sep_hit - 20*colon_hit - 10*null_rate

    def label_score(x):
        c, avg_len, null_rate, sep_hit, colon_hit, uniq_ratio = x
        return 80*sep_hit + 30*colon_hit - 0.2*avg_len - 10*null_rate

    text_col = max(stats, key=text_score)[0]
    label_col = max([x for x in stats if x[0] != text_col], key=label_score, default=None)
    if label_col is None:
        # With a single candidate column there is nothing left to be the label column.
        raise ValueError("Only one object column found; cannot determine label column.")
    label_col = label_col[0]
    return text_col, label_col, stats

# Run the heuristic once and echo the outcome so a wrong guess is easy to spot.
text_col, label_col, col_stats = guess_text_and_label_cols(df)

print("\n[Auto-detected]")
print("  text_col :", text_col)
print("  label_col:", label_col)

print("\n[Column stats: col, avg_len, null_rate, sep_hit, colon_hit, uniq_ratio]")
for row in col_stats:
    print(" ", row)

# ----------------------------
# 3) Parse multi-label strings
#    The label cell may look like:
#      - "A || B || C"
#      - "A;B;C"
#      - "['A','B']"
#      - a single-label string
# ----------------------------
# One alternation over every pattern in LABEL_SEPS, compiled once.
_sep_regex = re.compile("|".join(LABEL_SEPS))

def parse_labels(x):
    """Turn one raw label cell into a list of non-empty label strings.

    * Missing values give [].
    * A value shaped like a list literal ("[...]") is split on commas and each
      item is stripped of surrounding whitespace and quotes (lightweight
      parsing, no eval).
    * Anything else is split on the module-level ``_sep_regex`` separators.
    """
    if pd.isna(x):
        return []

    raw = str(x).strip()
    if raw.startswith("[") and raw.endswith("]"):
        body = raw[1:-1].strip()
        tokens = body.split(",") if body else []
        cleaned = (tok.strip().strip("'").strip('"') for tok in tokens)
    else:
        cleaned = (tok.strip() for tok in _sep_regex.split(raw))
    return [tok for tok in cleaned if tok]

# Fill missing text BEFORE casting: astype(str) first would turn NaN into the
# literal string "nan", after which fillna("") has nothing left to fill.
texts = df[text_col].fillna("").astype(str).tolist()
y_list = df[label_col].apply(parse_labels).tolist()

# Drop samples that ended up with no labels at all (optional step).
keep_idx = [i for i, lab in enumerate(y_list) if len(lab) > 0]
texts = [texts[i] for i in keep_idx]
y_list = [y_list[i] for i in keep_idx]

print(f"\nAfter removing empty-label rows: n={len(texts)}")

# ----------------------------
# 4) MultiLabelBinarizer
# ----------------------------
mlb = MultiLabelBinarizer(sparse_output=False)  # dense output here, so do not call .multiply on it later
Y = mlb.fit_transform(y_list)
labels = mlb.classes_

print("Num labels:", len(labels))
# Per-label counts over all kept samples, used to list the most frequent labels.
support = Y.sum(axis=0)
top_idx = np.argsort(-support)[:10]
print("Top 10 labels:", [labels[i] for i in top_idx])

# ----------------------------
# 5) Train/Test split
# ----------------------------
# Seeded random split of texts and their indicator rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    texts, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print(f"\nSplit: train={len(X_train)} test={len(X_test)}")

# ----------------------------
# 6) Metrics utilities
# ----------------------------
def multilabel_micro_macro(Y_true, Y_pred):
    """Compute micro/macro precision, recall and F1 for indicator matrices.

    Args:
        Y_true: numpy array (n_samples, n_labels); entries > 0 are positives.
        Y_pred: numpy array of the same shape; entries > 0 are predictions.

    Returns:
        dict with the scalars ``micro_precision``, ``micro_recall``,
        ``micro_f1``, ``macro_precision``, ``macro_recall``, ``macro_f1`` and
        a ``per_label`` dict of arrays (``support``, ``pred_pos``, ``tp``,
        ``precision``, ``recall``, ``f1``). Macro values average only over
        labels that occur at least once in ``Y_true``.
    """
    truth = (Y_true > 0).astype(int)
    guess = (Y_pred > 0).astype(int)

    # Per-label counts, kept as floats so the divisions below stay in float.
    support = truth.sum(axis=0).astype(float)     # true positives available
    pred_pos = guess.sum(axis=0).astype(float)    # positives predicted
    tp = (truth * guess).sum(axis=0).astype(float)  # dense elementwise product

    def _ratio(num, den):
        # Elementwise num / den; positions with a zero denominator stay 0.
        return np.divide(num, den, out=np.zeros_like(tp), where=den > 0)

    prec = _ratio(tp, pred_pos)
    rec = _ratio(tp, support)
    f1 = _ratio(2*prec*rec, prec + rec)

    # Macro: mean over labels present in the ground truth.
    seen = support > 0
    if seen.any():
        macro_p = prec[seen].mean()
        macro_r = rec[seen].mean()
        macro_f1 = f1[seen].mean()
    else:
        macro_p = macro_r = macro_f1 = 0.0

    # Micro: pool the counts over all labels first.
    TP, P, T = tp.sum(), pred_pos.sum(), support.sum()
    micro_p = TP / P if P > 0 else 0.0
    micro_r = TP / T if T > 0 else 0.0
    pr_sum = micro_p + micro_r
    micro_f1 = (2*micro_p*micro_r/pr_sum) if pr_sum > 0 else 0.0

    per_label = {
        "support": support,
        "pred_pos": pred_pos,
        "tp": tp,
        "precision": prec,
        "recall": rec,
        "f1": f1,
    }
    return {
        "micro_precision": micro_p,
        "micro_recall": micro_r,
        "micro_f1": micro_f1,
        "macro_precision": macro_p,
        "macro_recall": macro_r,
        "macro_f1": macro_f1,
        "per_label": per_label,
    }

# ----------------------------
# 7) Rule baseline (substring)
#    Plain substring matching, restricted to the TOPK labels that are most
#    frequent in the training split.
# ----------------------------
# Rank labels by training-set support and keep the TOPK highest.
train_support = Y_train.sum(axis=0)
topk_idx = np.argsort(-train_support)[:TOPK]
topk_labels = [labels[col] for col in topk_idx]

def rule_predict(texts, topk_labels):
    """Predict a label whenever its name occurs verbatim in the text.

    Matching is a case-insensitive substring test. Only labels listed in
    ``topk_labels`` can be predicted; every other column stays 0. Columns
    follow the order of the module-level ``labels`` sequence.

    Args:
        texts: Sized iterable of documents; each one is passed through str().
        topk_labels: Label names to search for.

    Returns:
        int ndarray of shape (len(texts), len(labels)) holding 0/1 entries.
    """
    preds = np.zeros((len(texts), len(labels)), dtype=int)
    column_of = {name: col for col, name in enumerate(labels)}
    for row, doc in enumerate(texts):
        haystack = str(doc).lower()
        # Columns of every candidate label whose lower-cased name is in the text.
        hits = [column_of[lab] for lab in topk_labels if lab.lower() in haystack]
        preds[row, hits] = 1
    return preds

# Score the substring rule on the held-out split.
Y_pred_rule = rule_predict(X_test, topk_labels)
m_rule = multilabel_micro_macro(Y_test, Y_pred_rule)

# ----------------------------
# 8) TF-IDF + OneVsRest(SGDClassifier)
# ----------------------------
print("\nTraining TF-IDF + OneVsRest(SGDClassifier) ... (FAST)")

# Unigram + bigram TF-IDF; the vocabulary is fitted on the training texts only.
vec = TfidfVectorizer(
    strip_accents="unicode",
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.98,
    max_features=200000,
)
Xtr = vec.fit_transform(X_train)
Xte = vec.transform(X_test)

clf = OneVsRestClassifier(
    SGDClassifier(
        loss="log_loss",      # logistic-regression style loss; gives probabilities / decision scores
        penalty="l2",
        alpha=1e-5,
        max_iter=2000,
        tol=1e-3,
        random_state=RANDOM_STATE,
    ),
    n_jobs=-1,
)
clf.fit(Xtr, Y_train)

# Prediction: threshold the decision scores at 0 when they are available
# (a per-row top-k rule would be an alternative); otherwise use predict().
if not hasattr(clf, "decision_function"):
    Y_pred_sgd = clf.predict(Xte).astype(int)
else:
    scores = clf.decision_function(Xte)
    Y_pred_sgd = (scores > 0).astype(int)

m_sgd = multilabel_micro_macro(Y_test, Y_pred_sgd)

# ----------------------------
# 9) Print summary table (like your screenshot)
# ----------------------------
# One row per method; the metric columns are copied from each result dict in
# a fixed order so both rows share the same layout.
res = pd.DataFrame([
    {
        "method": method_name,
        **{
            col: metrics[col]
            for col in ("micro_f1", "macro_f1", "micro_precision", "micro_recall")
        },
    }
    for method_name, metrics in (
        ("Rule baseline (substring)", m_rule),
        ("TF-IDF + OvR SGD (test split)", m_sgd),
    )
])

pd.set_option("display.max_columns", None)
print("\n=== PAPER METRICS ===")
print(res)

# ----------------------------
# 10) Extra: per-label recall of the most frequent labels (handy for debugging)
# ----------------------------
per = m_sgd["per_label"]
support, recall, f1 = per["support"], per["recall"], per["f1"]

# The 20 labels with the largest test-set support, most frequent first.
top_lbl = np.argsort(-support)[:20]
dbg = pd.DataFrame(
    {
        "label": [labels[col] for col in top_lbl],
        "support": support[top_lbl].astype(int),
        "recall": recall[top_lbl],
        "f1": f1[top_lbl],
    }
)
print("\n=== Top-20 labels (by support) — per-label recall/f1 (SGD) ===")
print(dbg.to_string(index=False))


AssertionError: File not found: asrs_paper_dataset_sample (1).csv

In [None]:
# ============================================================
# ASRS multi-label baseline with FILE UPLOAD (Colab-ready)
# Written against numpy 1.26.4 | pandas 2.2.2 (not pinned here; the cell prints the versions actually in use)
# ============================================================

# ----------------------------
# 0) Environment + Upload
# ----------------------------
import os
import re
import numpy as np
import pandas as pd

from google.colab import files

# Show the library versions that are actually loaded in this runtime.
print("numpy", np.__version__, "| pandas", pd.__version__)

# Upload instructions, emitted as a single multi-line message.
print("\n".join([
    "\n[UPLOAD] Please upload your files (csv/xlsx):",
    " - asrs_paper_dataset_sample.*  (preferred)",
    " - asrs_paper_dataset.*         (fallback)",
    " - asrs_narratives_clean.*      (optional)",
]))

uploaded = files.upload()

# Echo the names under which the uploads were stored.
print("\nUploaded:")
for uploaded_name in uploaded:
    print(" -", uploaded_name)

# ----------------------------
# 1) Choose dataset (priority)
# ----------------------------
def pick_dataset(files_dict):
    """Return the uploaded file name that should serve as the dataset.

    A name containing "asrs_paper_dataset_sample" wins; otherwise the first
    name containing "asrs_paper_dataset" is used. Matching is a plain,
    case-sensitive substring test on the dict keys, in insertion order.

    Args:
        files_dict: Mapping whose keys are uploaded file names.

    Returns:
        The selected key.

    Raises:
        ValueError: If no key contains either marker.
    """
    for marker in ("asrs_paper_dataset_sample", "asrs_paper_dataset"):
        chosen = next((name for name in files_dict.keys() if marker in name), None)
        if chosen is not None:
            return chosen
    raise ValueError("No ASRS dataset found in uploaded files.")

DATA_FILE = pick_dataset(uploaded)
print(f"\nUsing: {DATA_FILE}")

# ----------------------------
# 2) Load CSV / XLSX
# ----------------------------
# Compare the extension case-insensitively so names such as "DATA.CSV" or
# "data.XLSX" are loaded instead of being rejected as unsupported.
_data_file_lower = DATA_FILE.lower()
if _data_file_lower.endswith(".csv"):
    df = pd.read_csv(DATA_FILE)
elif _data_file_lower.endswith(".xlsx"):
    df = pd.read_excel(DATA_FILE)
else:
    raise ValueError("Unsupported file type.")

# Quick look at what was loaded.
print("shape =", df.shape)
print("columns =", list(df.columns))
display(df.head())

# ----------------------------
# 3) Guess text column & label column
# ----------------------------
def guess_text_and_label_cols(df):
    """Heuristically pick the narrative column and the label column.

    Only columns whose dtype equals "object" are considered. The text column
    is the one with the largest mean string length; among the remaining
    columns the label column maximises (share of rows containing a separator)
    + (share of rows containing a colon).

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        (text_col, label_col, stats) where ``stats`` is a list with one dict
        per candidate column (keys: col, avg_len, sep, colon).

    Raises:
        ValueError: If fewer than two object-dtype columns exist.
    """
    candidates = [name for name in df.columns if df[name].dtype == "object"]
    if len(candidates) < 2:
        raise ValueError("Need at least 2 string columns.")

    def _profile(name):
        # Values are cast to str first, so missing values count as "nan".
        as_text = df[name].astype(str)
        return {
            "col": name,
            "avg_len": as_text.str.len().mean(),
            "sep": as_text.str.contains(r"\|\||\||;|,|\t").mean(),
            "colon": as_text.str.contains(":").mean(),
        }

    stats = [_profile(name) for name in candidates]

    longest = max(stats, key=lambda rec: rec["avg_len"])
    text_col = longest["col"]

    remaining = [rec for rec in stats if rec["col"] != text_col]
    best_label = max(remaining, key=lambda rec: rec["sep"] + rec["colon"])
    label_col = best_label["col"]

    return text_col, label_col, stats

# Run the heuristic and report which columns it selected.
text_col, label_col, stats = guess_text_and_label_cols(df)

print("\n[Auto-detected columns]")
for caption, detected in ((" text_col :", text_col), (" label_col:", label_col)):
    print(caption, detected)

# ----------------------------
# 4) Parse multi-labels
# ----------------------------
# Separators tried in this order: "||", "|", ";", ",", TAB.
LABEL_SEPS = re.compile(r"\|\||\||;|,|\t")

def parse_labels(x):
    """Split one raw label cell into a list of clean label strings.

    Missing values give ``[]``. One pair of surrounding square brackets is
    removed, the rest is split on ``LABEL_SEPS``, and each piece is stripped
    of whitespace and surrounding quotes. Pieces that are empty AFTER quote
    removal are dropped, so ``"['']"`` yields ``[]`` rather than ``['']``.

    Args:
        x: Raw cell value from the label column.

    Returns:
        list[str]: Non-empty labels in their original order.
    """
    if pd.isna(x):
        return []
    s = str(x).strip()
    if s.startswith("[") and s.endswith("]"):
        s = s[1:-1]
    # Strip whitespace, then quotes, then whitespace that was inside the
    # quotes; only then filter, so no blank label can slip through.
    # NOTE(review): the recorded run shows labels such as "Reference: nan"
    # dominating — presumably "field: value" pairs with missing values; consider
    # filtering them where the label column is built.
    pieces = (p.strip().strip("'").strip('"').strip() for p in LABEL_SEPS.split(s))
    return [p for p in pieces if p]

# Fill missing narratives BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", after which fillna("") has nothing left to replace.
texts = df[text_col].fillna("").astype(str).tolist()
y_list = df[label_col].apply(parse_labels).tolist()

# drop empty-label rows
keep = [i for i, y in enumerate(y_list) if y]
texts = [texts[i] for i in keep]
y_list = [y_list[i] for i in keep]

print(f"\nEffective rows: {len(texts)}/{len(df)}")

# ----------------------------
# 5) MultiLabelBinarizer
# ----------------------------
from sklearn.preprocessing import MultiLabelBinarizer

# Dense 0/1 indicator matrix; columns follow mlb.classes_.
mlb = MultiLabelBinarizer(sparse_output=False)
Y = mlb.fit_transform(y_list)
labels = mlb.classes_

# Column sums give the per-label support; list the ten largest.
support = Y.sum(axis=0)
top10 = np.argsort(-support)[:10]
print("\nTop 10 labels:", [labels[col] for col in top10])

# ----------------------------
# 6) Train / Test split
# ----------------------------
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed.
Xtr_txt, Xte_txt, Ytr, Yte = train_test_split(
    texts,
    Y,
    random_state=42,
    test_size=0.30,
)

print(f"Split: train={len(Xtr_txt)} test={len(Xte_txt)}")

# ----------------------------
# 7) Metrics (NUMPY SAFE)
# ----------------------------
def multilabel_metrics(Y_true, Y_pred):
    """Score multi-label predictions given as indicator matrices.

    Args:
        Y_true: numpy array (n_samples, n_labels); entries > 0 are positives.
        Y_pred: numpy array of the same shape; entries > 0 are predictions.

    Returns:
        Tuple ``(micro_f1, macro_f1, micro_precision, micro_recall)``.
        Macro F1 averages only over labels that occur in ``Y_true``.
    """
    truth = (Y_true > 0).astype(int)
    guess = (Y_pred > 0).astype(int)

    # Per-label counts.
    support = truth.sum(axis=0)
    pred_pos = guess.sum(axis=0)
    tp = (truth * guess).sum(axis=0)

    def _ratio(num, den):
        # Elementwise num / den into a float buffer; zero denominators give 0.
        return np.divide(num, den, out=np.zeros_like(tp, float), where=den > 0)

    prec = _ratio(tp, pred_pos)
    rec = _ratio(tp, support)
    f1 = _ratio(2*prec*rec, prec+rec)

    present = support > 0
    macro_f1 = f1[present].mean() if present.any() else 0.0

    # Micro scores pool the counts over all labels.
    TP = tp.sum()
    P = pred_pos.sum()
    T = support.sum()
    micro_p = TP/P if P > 0 else 0
    micro_r = TP/T if T > 0 else 0
    pr_sum = micro_p + micro_r
    micro_f1 = 2*micro_p*micro_r/pr_sum if pr_sum > 0 else 0

    return micro_f1, macro_f1, micro_p, micro_r

# ----------------------------
# 8) Rule baseline (substring)
# ----------------------------
TOPK = 50
# Candidate labels: the TOPK columns with the largest training support.
topk_idx = np.argsort(-Ytr.sum(axis=0))[:TOPK]
topk_labels = [labels[col] for col in topk_idx]
label_to_i = {name: col for col, name in enumerate(labels)}

# A label is predicted when its lower-cased name occurs in the lower-cased text.
Y_pred_rule = np.zeros_like(Yte)
for row, doc in enumerate(Xte_txt):
    haystack = doc.lower()
    hits = [label_to_i[lab] for lab in topk_labels if lab.lower() in haystack]
    Y_pred_rule[row, hits] = 1

m_rule = multilabel_metrics(Yte, Y_pred_rule)

# ----------------------------
# 9) TF-IDF + OvR(SGD)
# ----------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

print("\nTraining TF-IDF + OneVsRest(SGDClassifier) ... (FAST)")

# Unigram + bigram TF-IDF; the vocabulary is fitted on the training texts only.
vec = TfidfVectorizer(
    max_features=200_000,
    max_df=0.98,
    min_df=2,
    ngram_range=(1, 2),
)
Xtr = vec.fit_transform(Xtr_txt)
Xte = vec.transform(Xte_txt)

# One binary log-loss SGD model per label column.
clf = OneVsRestClassifier(
    SGDClassifier(
        loss="log_loss",
        random_state=42,
        alpha=1e-5,
        tol=1e-3,
        max_iter=2000,
    ),
    n_jobs=-1,
)
clf.fit(Xtr, Ytr)

# Positive decision score => label predicted.
scores = clf.decision_function(Xte)
Y_pred_sgd = (scores > 0).astype(int)

m_sgd = multilabel_metrics(Yte, Y_pred_sgd)

# ----------------------------
# 10) Paper-style summary
# ----------------------------
# Each metrics tuple is (micro_f1, macro_f1, micro_precision, micro_recall);
# pair the values with their column names to build one row per method.
res = pd.DataFrame([
    {
        "method": method_name,
        **dict(zip(("micro_f1", "macro_f1", "micro_precision", "micro_recall"), scores_tuple)),
    }
    for method_name, scores_tuple in (
        ("Rule baseline (substring)", m_rule),
        ("TF-IDF + OvR SGD (test split)", m_sgd),
    )
])

print("\n=== PAPER METRICS ===")
display(res)


numpy 2.0.2 | pandas 2.2.2

[UPLOAD] Please upload your files (csv/xlsx):
 - asrs_paper_dataset_sample.*  (preferred)
 - asrs_paper_dataset.*         (fallback)
 - asrs_narratives_clean.*      (optional)


Saving asrs_narratives_clean.csv to asrs_narratives_clean.csv
Saving asrs_paper_dataset.csv to asrs_paper_dataset.csv
Saving asrs_paper_dataset_sample.csv to asrs_paper_dataset_sample.csv

Uploaded:
 - asrs_narratives_clean.csv
 - asrs_paper_dataset.csv
 - asrs_paper_dataset_sample.csv

Using: asrs_paper_dataset_sample.csv
shape = (2000, 3)
columns = ['incident_id', 'narrative', 'labels']


Unnamed: 0,incident_id,narrative,labels
0,557782,"DOWNWIND FROM DIETZ TO DFW RWY 17C APCH, DECLA...",ASRS Report Number.Accession Number: nan; ATC ...
1,134261,AT FL190 35 NM FROM BUF LOUD BUZZING WAS HEARD...,ASRS Report Number.Accession Number: nan; Airc...
2,535549,MULTIPLE ACR'S DEPARTING RWY 7 AT LAS. ON THIS...,ASRS Report Number.Accession Number: nan; Alti...
3,216923,AFTER A PRESSURIZATION PROBLEM AND DIVERSION D...,ASRS Report Number.Accession Number: nan; Airc...
4,804965,WHILE PERFORMING R AND L ENG COMPRESSOR WASHES...,ASRS Report Number.Accession Number: nan; Airc...



[Auto-detected columns]
 text_col : narrative
 label_col: labels

Effective rows: 2000/2000

Top 10 labels: ['Reference: nan', 'Make Model Name: nan', 'Function.Flight Crew: nan', 'Reporter Organization: nan', 'Date: nan', 'Qualification.Flight Crew: nan', 'ASRS Report Number.Accession Number: nan', 'Flight Phase: nan', 'State Reference: nan', 'Flight Plan: nan']
Split: train=1400 test=600

Training TF-IDF + OneVsRest(SGDClassifier) ... (FAST)

=== PAPER METRICS ===


Unnamed: 0,method,micro_f1,macro_f1,micro_precision,micro_recall
0,Rule baseline (substring),0.0,0.0,0.0,0.0
1,TF-IDF + OvR SGD (test split),0.570042,0.083796,0.64648,0.509768
