In [None]:
# binary bag of words representation of input data

# When True, every bag-of-words / sentence-position feature additionally gets an
# interaction column with the `spine_earlier` flag (built further down in this cell),
# which roughly doubles the number of model parameters.
SPINE_INTERACTION = False

import os
import joblib
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

# Input schema (one row per sentence):
#   File Path        - identifies the report the sentence came from
#   Sentence Number  - position of the sentence within its report
#   Sentence         - the sentence text itself
#   Brain Related    - label, 1 for yes and 0 for no
#   spine_earlier    - flag for whether certain spine keywords appeared earlier in the document
#                      (a constant can be passed to guarantee it is not used in models)

main_file = "ml/brain_sentences.csv"
supplementary_file = "ml/brain_sentences_supplementary.csv"

# a file that does not exist is treated like an empty table
if os.path.exists(main_file):
    df_main = pd.read_csv(main_file)
else:
    df_main = pd.DataFrame()

if os.path.exists(supplementary_file):
    df_supplementary = pd.read_csv(supplementary_file)
else:
    df_supplementary = pd.DataFrame()

# keep only the tables that actually contain rows, then combine what is left
available_frames = [frame for frame in (df_main, df_supplementary) if not frame.empty]
if len(available_frames) == 2:
    df = pd.concat(available_frames, ignore_index=True)
elif available_frames:
    df = available_frames[0]
else:
    df = pd.DataFrame()

df.to_csv("ml/brain_sentences_combined_full.csv", index=False)
print(f"Total rows after adding supplementary data: {len(df)}")

# Boilerplate sentences that should never reach the model.
# Every entry must end with a comma: Python silently concatenates adjacent string
# literals, which would fuse two patterns into one that matches neither phrase.
phrases_to_remove = [
    r'findings were discussed\s+(?:with|at|by)',
    r'preliminary report',
    r'dose reduction was obtained',
    r'no substantial differences',
    r'provided by .* to .* at',
    r'The preliminary report',  # already covered by 'preliminary report' (matching is case-insensitive)
    r'electronically reviewed',
    r'electronically signed',
    r'discussed with .*? by .*? at',
    r'if AEC could not be utilized then',
    r'^contrast dose:?(?:\s+\S+){0,2}\.?$',  # "contrast dose" at the start, optional colon, up to 2 more words, optional period
    r'^COMPARISON:(?:\s+\S+){0,2}\.?$',  # same shape for COMPARISON, but the colon is required
]

# compile the patterns into one case-insensitive regex (alternation = OR)
pattern = re.compile("|".join(phrases_to_remove), re.IGNORECASE)

# True for every row whose "Sentence" matches at least one boilerplate pattern;
# rows with a missing sentence count as "no match" (na=False)
mask = df["Sentence"].str.contains(pattern, na=False)

# keep only the non-matching rows and renumber them from zero
df = df.loc[~mask].reset_index(drop=True)

df.to_csv("ml/brain_sentences_combined.csv", index=False)
print(f"Total rows after adding supplementary data and removing generic sentences: {len(df)}")

# preprocess sentences: lowercase and remove punctuation
def clean_text(text):
    """Return *text* lowercased with every non-word, non-whitespace character removed.

    Non-string input (e.g. the NaN pandas uses for a missing sentence, or None)
    yields an empty string instead of raising, because the boilerplate filter
    keeps rows with a missing sentence.
    """
    if not isinstance(text, str):
        return ""
    return re.sub(r'[^\w\s]', '', text.lower())

df["Cleaned Sentence"] = df["Sentence"].apply(clean_text)

# binary (presence/absence, frequency-invariant) bag-of-words over the cleaned sentences
vectorizer = CountVectorizer(binary=True, stop_words="english")
X_bow = vectorizer.fit_transform(df["Cleaned Sentence"])

# features that are not part of the sentence text itself, as single-column arrays
sentence_feature = (df["Sentence Number"] <= 3).astype(int).values.reshape(-1, 1)
spine_feature = df["spine_earlier"].astype(int).values.reshape(-1, 1)

# fall back to the legacy accessor when get_feature_names_out is not available
name_getter = getattr(vectorizer, "get_feature_names_out", None)
if name_getter is None:
    name_getter = vectorizer.get_feature_names
feature_names = name_getter()

df_bow = pd.DataFrame(X_bow.toarray(), columns=feature_names)

# number of sentences each word occurs in (the matrix is binary)
word_counts = df_bow.sum(axis=0)

# keep only words seen in at least 5 sentences to prevent overfitting - the cutoff is
# arbitrary and regularization should make it so it doesn't matter
is_frequent = word_counts >= 5
words_to_keep = word_counts[is_frequent].index.tolist()

context_frames = [
    pd.DataFrame(sentence_feature, columns=["Sent_Num_LE3"]),
    pd.DataFrame(spine_feature, columns=["spine_earlier"]),
]
df_final = pd.concat([df_bow[words_to_keep], *context_frames], axis=1)

# the response variable goes in as the first column
df_final.insert(0, "Brain Related", df["Brain Related"].astype(int))

if SPINE_INTERACTION:
    # every column except the response and the raw spine flag (words and sentencenum<=3)
    feature_cols = df_final.columns.drop(["Brain Related", "spine_earlier"])
    # feature x spine_earlier, row by row, named "<feature>_x_spine"
    interactions = (
        df_final[feature_cols]
        .mul(df_final["spine_earlier"], axis=0)
        .add_suffix("_x_spine")
    )
    df_final = pd.concat([df_final, interactions], axis=1)

X_sparse = csr_matrix(df_final)

df_final.to_csv("ml/brain_sentences_sparse.csv", index=False)
print("Sparse BoW representation saved as ml/brain_sentences_sparse.csv.")
print(f"Columns: {df_final.columns}")

# persist a vectorizer restricted to the kept vocabulary for later use
filtered_vectorizer = CountVectorizer(
    binary=True,
    stop_words="english",
    vocabulary=words_to_keep,
)
joblib.dump(filtered_vectorizer, "ml/local_vectorizer.pkl")
print("Vectorizer saved as ml/local_vectorizer.pkl.")

# share of rows labelled 1
prop_brain_related = df_final["Brain Related"].mean()
print(f"Proportion brain-related: {prop_brain_related:.4f}")

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import json
import os

# there are two sampling strategies - randomly sample sentences or randomly sample reports
# to prevent data leakage of models with context, and so forth, we randomly sample reports

def split_sparse_by_report(sparse_csv, combined_csv, output_dir):
    """Write the feature rows of each report to ``<output_dir>/<report>/report.csv``.

    Both CSVs must have the same number of rows; row i of *combined_csv* supplies
    the "File Path" of row i of *sparse_csv*. A report is identified by the stem
    of its file path (file name without directory and extension). The helper
    columns "File Path" and "report_name" are not written to the per-report files.
    """
    features = pd.read_csv(sparse_csv)
    sentences = pd.read_csv(combined_csv)
    assert len(features) == len(sentences), "Mismatch between combined and sparse rows"

    features["File Path"] = sentences["File Path"]
    features["report_name"] = [Path(fp).stem for fp in features["File Path"]]

    root = Path(output_dir)
    root.mkdir(parents=True, exist_ok=True)

    helper_columns = ["File Path", "report_name"]
    for name, rows in features.groupby("report_name"):
        target_dir = root / name
        target_dir.mkdir(parents=True, exist_ok=True)
        rows.drop(columns=helper_columns).to_csv(target_dir / "report.csv", index=False)

# train/test split at report level
def get_train_test_indices_from_reports(base_dir, test_frac=0.2, seed=24, train_report_frac=1.0):
    """Split the reports under *base_dir* into train/test and return row indices.

    Each sub-directory of *base_dir* is one report and must contain a
    ``report.csv``. Row indices are positions in the concatenation of all
    reports taken in sorted report-name order, so they only line up with a table
    that is ordered the same way.

    Parameters:
        base_dir: directory holding one sub-directory per report.
        test_frac: fraction of reports (rounded down) assigned to the test set.
        seed: value passed to ``np.random.seed`` before sampling.
        train_report_frac: when below 1.0, only this fraction of the train
            reports is kept (for testing how models perform with less data);
            the dropped reports end up in neither index array.

    Returns:
        ``(train_indices, test_indices)`` as integer numpy arrays (integer dtype
        even when empty, so they can always be used for positional indexing).

    Side effects:
        Seeds numpy's global RNG and writes the chosen report names to
        ``ml/standardized_train_report_names.json`` and
        ``ml/standardized_test_report_names.json``.
    """
    np.random.seed(seed)
    base = Path(base_dir)
    all_reports = sorted(p.name for p in base.iterdir() if p.is_dir())

    test_size = int(test_frac * len(all_reports))
    test_set = set(np.random.choice(all_reports, size=test_size, replace=False).tolist())
    train_reports = [r for r in all_reports if r not in test_set]
    # sorted so the saved file does not depend on set iteration order
    test_reports = sorted(test_set)

    if train_report_frac < 1.0:
        n_keep = int(train_report_frac * len(train_reports))
        train_reports = np.random.choice(train_reports, size=n_keep, replace=False).tolist()
    train_set = set(train_reports)

    with open("ml/standardized_train_report_names.json", "w") as f:
        json.dump(train_reports, f)
    with open("ml/standardized_test_report_names.json", "w") as f:
        json.dump(test_reports, f)

    train_indices = []
    test_indices = []
    row_counter = 0

    # walk every report (used or not) so the running offset stays a global row position
    for report_name in all_reports:
        n_rows = len(pd.read_csv(base / report_name / "report.csv"))
        rows = range(row_counter, row_counter + n_rows)

        if report_name in test_set:
            test_indices.extend(rows)
        elif report_name in train_set:
            train_indices.extend(rows)

        row_counter += n_rows

    return np.array(train_indices, dtype=int), np.array(test_indices, dtype=int)


# write the per-report CSVs once; a non-empty output directory counts as "already done"
output_dir = "ml/sparse_individual"
already_split = os.path.exists(output_dir) and len(os.listdir(output_dir)) > 0
if already_split:
    print("Skipping split: reports already split.")
else:
    split_sparse_by_report(
        sparse_csv="ml/brain_sentences_sparse.csv",
        combined_csv="ml/brain_sentences_combined.csv",
        output_dir=output_dir,
    )
    print("Split reports.")

SAMPLING_SENTENCES = False

if not SAMPLING_SENTENCES:
    # get train test split: reuse the indices on disk when both files exist, otherwise draw a new split
    train_idx_path = "ml/standardized_train_indices_for_CRF_and_CLG.npy"
    test_idx_path  = "ml/standardized_test_indices_for_CRF_and_CLG.npy"

    if os.path.exists(train_idx_path) and os.path.exists(test_idx_path):
        train_indices = np.load(train_idx_path)
        test_indices  = np.load(test_idx_path)
        split_origin = "Loaded train/test indices from disk."
    else:
        train_indices, test_indices = get_train_test_indices_from_reports("ml/sparse_individual", test_frac=0.2)
        np.save(train_idx_path, train_indices)
        np.save(test_idx_path, test_indices)
        split_origin = "Randomized train/test split and saved indices."
    print("Number of train indices:", len(train_indices))
    print("Number of test indices:", len(test_indices))
    print(split_origin)

    df_sparse = pd.read_csv("ml/brain_sentences_sparse.csv")
    df_combined = pd.read_csv("ml/brain_sentences_combined.csv")
    assert len(df_sparse) == len(df_combined), "Mismatch between combined and sparse rows"

    df_sparse["File Path"] = df_combined["File Path"]

    # The indices are row positions in report-name order (reports are enumerated
    # sorted by name when the indices are built), so both tables are put into that
    # order before any positional indexing. The sort is stable, which keeps the
    # sentence order inside each report.
    df_combined["report_name"] = df_combined["File Path"].apply(lambda x: Path(x).stem)
    df_combined = df_combined.sort_values("report_name", kind="mergesort")
    df_sparse = df_sparse.loc[df_combined.index].reset_index(drop=True)
    df_combined = df_combined.reset_index(drop=True)

    y = df_sparse["Brain Related"].values
    X = df_sparse.drop(columns=["Brain Related", "File Path"])  # drop for model input
    X_sparse = csr_matrix(X.values)

    X_train = X.iloc[train_indices].reset_index(drop=True)
    X_test = X.iloc[test_indices].reset_index(drop=True)
    y_train = y[train_indices]
    y_test = y[test_indices]

    # a possibility of how to train the metamodel in our contextual logistic regression approach but we do leave one out training instead
    def delete_indices_for_local_subset(df_train, delete_frac=0.6, seed=24):
        """Pick rows of *df_train* to withhold from local-model training.

        Repeatedly draws a random report and a random position in it (never the
        report's last row) and withholds that row unless it or the row after it
        is already withheld. Stops once ``delete_frac`` of the candidate rows
        (all rows minus one per report) are withheld or after
        ``10 * target`` attempts.

        Parameters:
            df_train: table with a "report_name" column; its index is reset internally.
            delete_frac: fraction of candidate rows to withhold.
            seed: value passed to ``np.random.seed``.

        Returns:
            ``(local_train_indices, deleted_indices)``, both sorted lists of row positions.
        """
        np.random.seed(seed)
        df_train = df_train.reset_index(drop=True)
        grouped = df_train.groupby("report_name").indices
        deleted = set()
        total_candidates = sum(len(idxs) for idxs in grouped.values()) - len(grouped)
        target_to_delete = int(delete_frac * total_candidates)
        attempts = 0
        max_attempts = 10 * target_to_delete

        while len(deleted) < target_to_delete and attempts < max_attempts:
            report = np.random.choice(list(grouped.keys()))
            indices = grouped[report]
            if len(indices) <= 1:
                attempts += 1
                continue
            # upper bound is exclusive, so i + 1 is always a valid position
            i = np.random.randint(0, len(indices) - 1)
            idx_i = indices[i]
            idx_ip1 = indices[i + 1]
            if idx_i in deleted or idx_ip1 in deleted:
                attempts += 1
                continue
            deleted.add(idx_i)
            attempts += 1

        all_indices = set(range(len(df_train)))
        local_train_indices = sorted(all_indices - deleted)
        deleted_indices = sorted(deleted)
        return local_train_indices, deleted_indices

    df_train_combined = df_combined.iloc[train_indices].reset_index(drop=True)

    local_train_indices, deleted_indices = delete_indices_for_local_subset(
        df_train=df_train_combined,
        delete_frac=0.0,
        seed=24
    )

    print("Local model will train on", len(local_train_indices), "sentences.")
    print("This is because we have", len(deleted_indices), "sentences withheld from local model training.")
    print("The metamodel model will train on all", len(local_train_indices)+len(deleted_indices), "sentences.")

In [None]:
# add all the feature engineering one wants and then do leave one fold out generation of predictions from the base model to avoid data leakage or overconfident trusting of the base model
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import re
from pathlib import Path

train_indices = np.load("ml/standardized_train_indices_for_CRF_and_CLG.npy")
test_indices = np.load("ml/standardized_test_indices_for_CRF_and_CLG.npy")

df_combined = pd.read_csv("ml/brain_sentences_combined.csv")
df_sparse = pd.read_csv("ml/brain_sentences_sparse.csv")
assert len(df_combined) == len(df_sparse)

# attach the report path, then order the rows by report name; the stable sort keeps
# the sentences of each report in their original relative order
df_sparse["File Path"] = df_combined["File Path"]
df_sparse["report_name"] = [Path(fp).stem for fp in df_sparse["File Path"]]
df_sparse = (
    df_sparse
    .sort_values("report_name", kind="mergesort", ignore_index=True)
    .drop(columns=["report_name"])
)

# value imputed for a prior prediction when there is no prior sentence in the same report;
# 0.5 is the unbiased choice, any other value shifts it one way
OUT_OF_SCOPE_VALUE = 0.5

DO_PAIRWISE_BOW_INTERACTIONS = False
if DO_PAIRWISE_BOW_INTERACTIONS:
    from itertools import combinations

    # a column counts as plain bag-of-words when its name contains none of these markers
    exclude_patterns = ["Prior_", "_x_", "Facial_Bones", "spine_earlier", "Sent_Num", "Brain Related"]

    def is_bow(col):
        """Return True when *col* contains none of the excluded name fragments."""
        return not any(pat in col for pat in exclude_patterns)

    # numeric bag-of-words columns with at least 30 instances, to restrict the
    # number of combinations
    bow_cols = [
        col
        for col in df_sparse.columns
        if is_bow(col)
        and np.issubdtype(df_sparse.dtypes[col], np.number)
        and df_sparse[col].sum() >= 30
    ]

    print(f"Eligible BoW columns for pairwise interaction: {len(bow_cols)}")

    # one product column per unordered pair of eligible columns
    pairwise_interactions = {
        f"BOWPAIR_{col1}_x_{col2}": df_sparse[col1] * df_sparse[col2]
        for col1, col2 in combinations(bow_cols, 2)
    }

    print(f"Total pairwise interactions computed: {len(pairwise_interactions)}")

    if not pairwise_interactions:
        print("No eligible pairwise BoW interactions to add.")
    else:
        pairwise_df = pd.DataFrame(pairwise_interactions, index=df_sparse.index)

        # keep the 300 interaction columns with the largest column sums
        interaction_freq = pairwise_df.sum(axis=0)
        top_300_inter_cols = interaction_freq.sort_values(ascending=False).head(300).index
        pairwise_df = pairwise_df[top_300_inter_cols]

        df_sparse = pd.concat([df_sparse, pairwise_df], axis=1)

        print(f"Added {len(pairwise_df.columns)} most frequent pairwise BoW interaction columns.") 

# optionally, like the spine features, do a similar thing for facial bones (interactions and past flag)
DO_FACIAL_BONES_EARLIER = False
FACIAL_BONES_INTERACTION = False
if DO_FACIAL_BONES_EARLIER:
    # Add 'Sentence' for lookup
    df_sentences = pd.read_csv("ml/brain_sentences_combined.csv")
    df_sparse["Sentence"] = df_sentences["Sentence"]
    
    def has_facial_bones(prior_sentences):
        pattern = re.compile(r"(facial bones|_ial bones)", re.IGNORECASE)
        return any(pattern.search(s) for s in prior_sentences)
    
    facial_bones_earlier = []
    for idx in range(len(df_sparse)):
        current_file = df_sparse.at[idx, "File Path"]
        prior_indices = [i for i in range(idx) if df_sparse.at[i, "File Path"] == current_file]
        prior_sents = df_sparse.loc[prior_indices, "Sentence"].tolist() if prior_indices else []
        facial_bones_earlier.append(1 if has_facial_bones(prior_sents) else 0)
    df_sparse["Facial_Bones_earlier"] = facial_bones_earlier

    df_sparse.drop(columns=["Sentence"], inplace=True) # sentence is not sparse
    
    if FACIAL_BONES_INTERACTION:
        exclude_substrings = ["_x_spine", "_x_Prior1"]
        eligible_cols = [
            col for col in df_sparse.columns
            if np.issubdtype(df_sparse[col].dtype, np.number)
            and all(substr not in col for substr in exclude_substrings)
            and col not in ["Facial_Bones_earlier", "Brain Related"]  # exclude label
        ]

        interaction_df = pd.DataFrame({
            f"{col}_x_FacialBones": df_sparse[col] * df_sparse["Facial_Bones_earlier"]
            for col in eligible_cols
        }, index=df_sparse.index)
        df_sparse = pd.concat([df_sparse, interaction_df], axis=1)


y = df_sparse["Brain Related"].values
X = df_sparse.drop(columns=["Brain Related", "File Path"])

# first pass: zero the spine / sentence-number columns before the matrix is saved
cols_to_zero = X.filter(regex=r'(?i)spine_earlier|sent[\s_-]*num').columns
print(f"Zeroing {len(cols_to_zero)} columns:", list(cols_to_zero))
X.loc[:, cols_to_zero] = 0

X.to_parquet("ml/extended_base_X.parquet", index=False)
np.save("ml/extended_base_y.npy", y)

# second pass, applied to the matrix the model is trained on: depending on whether you
# want these columns or not, either pass can be toggled independently
cols_to_zero = X.filter(regex=r'(?i)spine_earlier|sent[\s_-]*num').columns
print(f"Zeroing {len(cols_to_zero)} columns:", list(cols_to_zero))
X.loc[:, cols_to_zero] = 0


# inverse regularization strength (C) used for every base-model fit below
BASE_MODEL_C = 1.0


unique_paths = df_sparse.loc[train_indices, "File Path"].unique()
print(f"Total train reports: {len(unique_paths)}")

# decide how many reports you want to keep:
p = 1.0   # e.g. keep 50% or 100% of your train reports to train on less data
n_keep = int(p * len(unique_paths))

if p < 1.0:
    # randomly choose that many reports
    np.random.seed(24)
    selected_paths = np.random.choice(unique_paths, size=n_keep, replace=False)
    print(f"Keeping {n_keep} reports:", selected_paths[:5], "...")

    # new train set: keep the train rows whose report was selected (order preserved);
    # one vectorized membership test instead of scanning selected_paths per row
    train_paths = df_sparse.loc[train_indices, "File Path"]
    train_indices = np.asarray(train_indices)[train_paths.isin(selected_paths).to_numpy()]

    print("New train size (sentences):", len(train_indices))
    print("Unique reports now:", 
          np.unique(df_sparse.loc[train_indices, "File Path"]).shape[0])

# out-of-fold base-model probability for every row; rows never predicted stay NaN
loo_preds = np.full(len(df_sparse), np.nan)


def _new_base_model():
    """Return an unfitted base model; every fold and the final fit use identical settings."""
    return LogisticRegression(C=BASE_MODEL_C, max_iter=10000, penalty="l2", solver="saga", fit_intercept=False)


LOO_K = 50  # Set to 1 for leave one out, >1 for leave-k-out in blocks for speed
if LOO_K == 1:
    # standard Leave one out
    for idx in tqdm(train_indices, desc="LOO on train set"):
        train_loo = train_indices[train_indices != idx]
        model = _new_base_model()
        model.fit(X.iloc[train_loo], y[train_loo])
        loo_preds[idx] = model.predict_proba(X.iloc[[idx]])[:, 1][0]
else:
    # leave-k-out in blocks (folds)
    # NOTE(review): a block is LOO_K consecutive train rows, so it can start or end mid-report
    train_indices_sorted = np.sort(train_indices)
    n_train = len(train_indices_sorted)
    for i in tqdm(range(0, n_train, LOO_K), desc=f"Leave-{LOO_K}-out in blocks"):
        block = train_indices_sorted[i:i+LOO_K]
        # everything outside the held-out block, in the original order
        train_loo = train_indices_sorted[~np.isin(train_indices_sorted, block)]
        model = _new_base_model()
        model.fit(X.iloc[train_loo], y[train_loo])
        loo_preds[block] = model.predict_proba(X.iloc[block])[:, 1]

# predict test set using base model trained on full train set ---
base_model = _new_base_model()
base_model.fit(X.iloc[train_indices], y[train_indices])
if len(test_indices) > 0:  # predict_proba rejects an empty input
    loo_preds[test_indices] = base_model.predict_proba(X.iloc[test_indices])[:, 1]

df_sparse["Model_Prediction"] = loo_preds

# compute prior features
# Prior_1_Prediction is the base-model prediction of the row directly above when that
# row has the same "File Path"; otherwise (first row of the table or of a report) it
# is OUT_OF_SCOPE_VALUE.
same_report_as_previous = df_sparse["File Path"].eq(df_sparse["File Path"].shift(1))
df_sparse["Prior_1_Prediction"] = (
    df_sparse["Model_Prediction"]
    .shift(1)
    .where(same_report_as_previous, OUT_OF_SCOPE_VALUE)
    .astype(float)
)

# shift for neutral to be 0 for interpretability
df_sparse["Prior_1_Prediction"] -= 0.5

# first file keeps Model_Prediction and File Path for the next stage; the second drops both
df_sparse.to_csv("ml/brain_sentences_sparse_meta_for_additive_logit.csv", index=False, float_format="%.6f")
df_sparse.drop(columns=["Model_Prediction", "File Path"], inplace=True)
df_sparse.to_csv("ml/brain_sentences_sparse_meta.csv", index=False, float_format="%.6f")
print("Updated dataset saved with leakage-free prior predictions")


In [None]:
# metamodel training and performance
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("ml/brain_sentences_sparse_meta_for_additive_logit.csv")

OUT_OF_SCOPE_VALUE = 0.5 

# shifted prior predictions for the previous HOW_MUCH_HISTORY sentences:
# Prior_k is the base-model prediction k rows earlier when that row has the same
# "File Path", otherwise OUT_OF_SCOPE_VALUE; 0.5 is then subtracted as before
HOW_MUCH_HISTORY = 4 # how many previous
prior_names = [f"Prior_{k}_Prediction" for k in range(1, HOW_MUCH_HISTORY + 1)]
for k, col_name in enumerate(prior_names, start=1):
    same_report = df["File Path"].eq(df["File Path"].shift(k))
    lagged = df["Model_Prediction"].shift(k).where(same_report, OUT_OF_SCOPE_VALUE)
    df[col_name] = lagged.astype(float) - 0.5

# aggregation features (mean, max, min) over all shifted priors
prior_block = df[prior_names]
df["Prior_1234on_Avg"] = prior_block.mean(axis=1)
df["Prior_1234on_Max"] = prior_block.max(axis=1)
df["Prior_1234on_Min"] = prior_block.min(axis=1)
# optional ablations (disabled): drop Prior_2/Prior_3, or every column starting with "Prior_"
df.drop(columns=["Model_Prediction"], inplace=True)

# same pairwise-interaction recipe as for the base model, for when the interactions
# should exist for the metamodel only
from itertools import combinations
DO_PAIRWISE_INTERACTIONS = False 
if DO_PAIRWISE_INTERACTIONS:
    exclude_patterns = ["Prior_", "_x_", "Facial_Bones", "spine_earlier", "Sent_Num", "Brain Related"]

    def is_bow(col):
        """Return True when *col* contains none of the excluded name fragments."""
        return not any(pat in col for pat in exclude_patterns)

    # numeric bag-of-words columns with at least 15 instances among the train rows
    bow_cols = [
        col
        for col in df.columns
        if is_bow(col)
        and np.issubdtype(df.dtypes[col], np.number)
        and df.loc[train_indices, col].sum() >= 15
    ]

    print(f"Eligible BoW columns for pairwise interaction: {len(bow_cols)}")

    pairwise_interactions = {
        f"{col1}_x_{col2}": df[col1] * df[col2]
        for col1, col2 in combinations(bow_cols, 2)
    }

    print(f"Total pairwise interactions computed: {len(pairwise_interactions)}")

    if not pairwise_interactions:
        print("No eligible pairwise BoW interactions to add.")
    else:
        pairwise_df = pd.DataFrame(pairwise_interactions, index=df.index)

        # rank interaction columns by their column sum over the train rows
        interaction_freq = pairwise_df.loc[train_indices].sum(axis=0)
        ranked = interaction_freq.sort_values(ascending=False)

        # NOTE(review): 1000 columns are added to df, but only the first 300 names are
        # written to the CSV below - confirm which count is intended
        top_300_inter_cols = ranked.head(1000).index
        pairwise_df = pairwise_df[top_300_inter_cols]

        df = pd.concat([df, pairwise_df], axis=1)

        print(f"Added {len(pairwise_df.columns)} most frequent pairwise BoW interaction columns.")

        top_300_inter_cols = ranked.head(300).index.tolist()
        pd.Series(top_300_inter_cols).to_csv("ml/top_bow_pairwise_cols.csv", index=False)


PRIOR_INTERACTION_MODE = 1  # 1: none, 2: exclude spine interactions, 3: all
exclude_cols = [col for col in df.columns if col.startswith("Prior_")] + ["Brain Related"]

if PRIOR_INTERACTION_MODE not in (1, 2, 3):
    raise ValueError("PRIOR_INTERACTION_MODE must be 1, 2, or 3")

if PRIOR_INTERACTION_MODE == 1:
    numeric_cols = []
else:
    # mode 2 additionally leaves out the "_x_spine" interaction columns
    skip_spine = PRIOR_INTERACTION_MODE == 2
    numeric_cols = [
        col
        for col in df.columns
        if col not in exclude_cols
        and np.issubdtype(df.dtypes[col], np.number)
        and not (skip_spine and "_x_spine" in col)
    ]

# only make interaction terms if needed: each selected column times the shifted prior-1 prediction
if numeric_cols:
    prior1 = df["Prior_1_Prediction"]
    interaction_data = {f"{col}_x_Prior1": df[col] * prior1 for col in numeric_cols}
    interactions_df = pd.DataFrame(interaction_data, index=df.index)
    df = pd.concat([df, interactions_df], axis=1)


# these can be false if you did it for the base model because you'd inherit it
DO_FACIAL_BONES_EARLIER = False
FACIAL_BONES_INTERACTION = False
import re
if DO_FACIAL_BONES_EARLIER:
    def has_facial_bones(prior_sentences):
        pattern = re.compile(r"(facial bones|_ial bones)", re.IGNORECASE)
        return any(pattern.search(s) for s in prior_sentences)
    
    facial_bones_earlier = []
    
    df_sentences = pd.read_csv("ml/brain_sentences_combined.csv")
    
    df_sentences["report_name"] = df_sentences["File Path"].apply(lambda fp: Path(fp).stem)
    df_sentences.sort_values("report_name", inplace=True, kind="mergesort", ignore_index=True)
    df_sentences.drop(columns=["report_name"], inplace=True)

    df["Sentence"] = df_sentences["Sentence"]
    
    for idx in range(len(df)):
        current_file = df.at[idx, "File Path"]
        # find all prior sentences from the same report
        prior_indices = [i for i in range(idx) if df.at[i, "File Path"] == current_file]
        prior_sents = df.loc[prior_indices, "Sentence"].tolist() if prior_indices else []
        facial_bones_earlier.append(1 if has_facial_bones(prior_sents) else 0)
        
    df["Facial_Bones_earlier"] = facial_bones_earlier
    df.drop(columns=["Sentence"], inplace=True)
    #print("Number of sentences with prior 'Facial Bones' mention:", df["Facial_Bones_earlier"].sum())

    if FACIAL_BONES_INTERACTION:
        exclude_substrings = ["_x_spine", "_x_Prior1"]
        eligible_cols = [col for col in df.columns
                         if np.issubdtype(df[col].dtype, np.number)
                         and all(substr not in col for substr in exclude_substrings)
                         and col not in ["Facial_Bones_earlier", "Brain Related"]] # label shouldn't be in here but just in case

        interaction_df = pd.DataFrame({
            f"{col}_x_FacialBones": df[col] * df["Facial_Bones_earlier"]
            for col in eligible_cols
        }, index=df.index)
        df = pd.concat([df, interaction_df], axis=1)


# the CRF input keeps "File Path"; the metamodel table is the same data without it
df.to_csv("ml/prep_for_classical_crf.csv", index=False, float_format="%.6f")

df = df.drop(columns=["File Path"])

df.to_csv("ml/brain_sentences_sparse_meta_more_aggregation.csv", index=False, float_format="%.6f")
print("Saved to ml/brain_sentences_sparse_meta_more_aggregation.csv")

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# multipliers applied to column groups below; 1.0 leaves a group unchanged
PRIOR_GENERALIZATION_SCALE_FACTOR = 1.0      # Prior_* columns, test rows only
PRIOR_SCALE_BEFORE_TO_GET_REG_APPLIED = 1.0  # Prior_* columns, all rows
SPINE_FEATURE_SCALE = 1.0                    # columns containing "_x_spine"
PRIOR_INTERACTION_FEATURE_SCALE = 1.0        # columns ending in "_x_Prior1"
FACIAL_BONES_INTERACTION_SCALE = 1.0         # columns ending in "_x_FacialBones"
CHOSEN_C = 1.0

df = pd.read_csv("ml/brain_sentences_sparse_meta_more_aggregation.csv")

y = df["Brain Related"].values
X = df.drop(columns=["Brain Related"])
X.index = df.index

cols_to_zero = X.filter(regex=r'(?i)spine_earlier|sent[\s_\-]*num').columns
X.loc[:, cols_to_zero] = 0

# optional scaling to enforce regularization differently per column group
prior_cols = [col for col in X.columns if col.startswith("Prior_")]
spine_cols = [col for col in X.columns if '_x_spine' in col]
prior1_interaction_cols = [col for col in X.columns if col.endswith('_x_Prior1')]
facial_bones_interaction_cols = [col for col in X.columns if col.endswith('_x_FacialBones')]

for group_cols, group_scale in (
    (prior_cols, PRIOR_SCALE_BEFORE_TO_GET_REG_APPLIED),
    (spine_cols, SPINE_FEATURE_SCALE),
    (prior1_interaction_cols, PRIOR_INTERACTION_FEATURE_SCALE),
    (facial_bones_interaction_cols, FACIAL_BONES_INTERACTION_SCALE),
):
    X[group_cols] = X[group_cols] * group_scale

X_train = X.iloc[train_indices].copy()
X_test = X.iloc[test_indices].copy()
y_train, y_test = y[train_indices], y[test_indices]

# extra factor on the prior columns of the test rows only
for col in prior_cols:
    if col not in X_test.columns:
        continue
    X_test[col] *= PRIOR_GENERALIZATION_SCALE_FACTOR

# fit the metamodel (L2-regularized logistic regression without intercept) on the train rows
meta_model_params = dict(
    max_iter=10000,
    penalty="l2",
    C=CHOSEN_C,
    solver="liblinear",
    fit_intercept=False,
)
model = LogisticRegression(**meta_model_params)
model.fit(X_train, y_train)

# accuracy on the rows the model was fitted on
y_pred_train = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f"Training Accuracy: {train_accuracy:.4f}")

# accuracy and per-class report on the held-out rows
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nTest Set Classification Report:\n", classification_report(y_test, y_pred))

# save the metamodel
import joblib
joblib.dump(model, "ml/logistic_regression_brain_related_meta_more_aggregation.pkl")
print("Meta model saved to ml/logistic_regression_brain_related_meta_more_aggregation.pkl")

# Evaluation of the fitted metamodel on the test rows: class-reweighted accuracy,
# confusion matrix, largest coefficients, ROC curve and a threshold sweep.
if True:
    cm = confusion_matrix(y_test, y_pred)
    
    cm_test = cm

    if cm_test.shape == (2, 2):
        # rows are true labels, columns are predicted labels
        TN = cm_test[0, 0]
        FP = cm_test[0, 1]
        FN = cm_test[1, 0]
        TP = cm_test[1, 1]

        recall_1 = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        recall_0 = TN / (TN + FP) if (TN + FP) > 0 else 0.0

        total = TN + FP + FN + TP
        actual_prop_0 = (TN + FP) / total
        actual_prop_1 = (TP + FN) / total
        print(f"Test set distribution: class 0 = {actual_prop_0:.2%}, class 1 = {actual_prop_1:.2%}")

        # accuracy the model would have if the classes occurred in the given proportions
        for target_prop_0, target_prop_1 in [
            (0.10, 0.90),
            (0.15, 0.85),
            (0.50, 0.50),
            (0.90, 0.10),
            (actual_prop_0, actual_prop_1)
        ]:
            weighted_acc = target_prop_1 * recall_1 + target_prop_0 * recall_0
            print(f"Weighted accuracy ({target_prop_1:.0%} class 1, {target_prop_0:.0%} class 0): {weighted_acc:.4f}")
    else:
        print("Unexpected confusion-matrix shape:", cm_test.shape)

    # reverse the order so that class 1 appears first
    cm = cm[::-1, ::-1]

    class_labels = ["Brain Related", "Not Brain Related"]

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()

    joblib.dump(model, "ml/logistic_regression_brain_related.pkl")
    print("Model trained and saved as ml/logistic_regression_brain_related.pkl.")

    feature_names = X.columns
    coefs = model.coef_[0]

    coef_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coefs
    })

    # compute absolute coefficients to rank by importance
    coef_df['abs_coef'] = coef_df['coefficient'].abs()

    # Sort by the absolute value of coefficients in descending order and select the top x
    topx = coef_df.sort_values(by='abs_coef', ascending=False).head(40)

    # determine the bar colors: blue if the coefficient is positive, red if negative
    colors = topx['coefficient'].apply(lambda x: 'blue' if x > 0 else 'red')

    # reversed so the largest coefficient ends up at the top of the horizontal bar chart
    plt.figure(figsize=(16, 14))
    plt.barh(topx['feature'][::-1], topx['coefficient'][::-1], color=colors[::-1])
    plt.xlabel("Coefficient Value")
    plt.title("Top Features by Coefficient Magnitude")
    plt.tight_layout()
    plt.show()

    y_scores = model.decision_function(X_test)  

    fpr, tpr, _ = roc_curve(y_test, y_scores)

    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Diagonal line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

    # `top25` is a historical name: the table holds the 45 largest coefficients,
    # so the heading reports the actual number of rows printed
    top25 = coef_df.sort_values(by='abs_coef', ascending=False).head(45)
    
    print(f"Top {len(top25)} features by absolute coefficient magnitude:")
    print(top25[['feature', 'coefficient']].to_string(index=False))

    from sklearn.metrics import f1_score, accuracy_score

    meta_probs = model.predict_proba(X_test)[:, 1]
    
    # sweep the decision threshold from 0.00 to 1.00 in steps of 0.01
    thresholds = np.arange(0.0, 1.01, 0.01)
    meta_results = []
    
    for thresh in thresholds:
        meta_preds = (meta_probs > thresh).astype(int)
        acc = accuracy_score(y_test, meta_preds)
        f1_cls1 = f1_score(y_test, meta_preds, pos_label=1, zero_division=0)
        f1_macro = f1_score(y_test, meta_preds, average="macro", zero_division=0)
        meta_results.append((thresh, f1_cls1, f1_macro, acc))
        print(f"Threshold {thresh:.2f}: Accuracy={acc:.4f}, F1_1={f1_cls1:.4f}, MacroF1={f1_macro:.4f}")
    
    meta_df = pd.DataFrame(meta_results, columns=["threshold", "F1_class1", "MacroF1", "accuracy"])
    meta_df.to_csv("meta_threshold_f1_acc.csv", index=False)
    print("Saved meta model threshold metrics (F1_1, MacroF1, accuracy) to meta_threshold_f1_acc.csv")
    
    plt.figure(figsize=(7, 4))
    plt.plot(meta_df["threshold"], meta_df["accuracy"], marker='o', label='Accuracy')
    plt.plot(meta_df["threshold"], meta_df["F1_class1"], marker='s', label='F1 (Class 1)')
    plt.plot(meta_df["threshold"], meta_df["MacroF1"], marker='^', label='Macro F1')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.title('Meta Model: Accuracy, F1 (Class 1) & Macro F1 vs Threshold')
    plt.legend()
    plt.xlim(0, 0.99)
    plt.ylim(0.8, 1)
    plt.grid(True, alpha=0.4)
    plt.show()
    
# persist the test-set probabilities and labels, both as separate arrays and bundled
scores_f32 = meta_probs.astype(np.float32)
labels_i32 = y_test.astype(np.int32)
np.save("ml/global_model_probabilities.npy", scores_f32)
np.save("ml/global_model_labels.npy", labels_i32)
np.savez("ml/global_model_test_scores_labels.npz", scores=scores_f32, labels=labels_i32)

In [None]:
# classical crf
import pandas as pd
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from pathlib import Path

df = pd.read_csv("ml/prep_for_classical_crf.csv")

# The CRF models label transitions itself, so every column whose name
# contains 'Prior' is removed from the feature set.
prior_cols = [name for name in df.columns if "Prior" in name]
df = df.drop(columns=prior_cols)

# Order rows by report stem with a stable sort (mergesort) so sentences of one
# report keep their relative order; the index is rebuilt as 0..n-1.
df["report_name"] = df["File Path"].map(lambda fp: Path(fp).stem)
df = df.sort_values("report_name", kind="mergesort", ignore_index=True)

# Overwrite the spine_earlier / sentence-number style columns
# (case-insensitive name match) with a constant 0.
zero_cols = df.filter(regex=r'(?i)spine_earlier|sent[\s_\-]*num').columns
_more_marker = "..." if len(zero_cols) > 10 else ""
print(f"Zeroing {len(zero_cols)} CRF columns:", list(zero_cols)[:10], _more_marker)
df.loc[:, zero_cols] = 0

# Tag every row with its split; rows in neither index set stay "unused".
df["Split"] = "unused"
for _split_label, _split_rows in (("train", train_indices), ("test", test_indices)):
    df.loc[_split_rows, "Split"] = _split_label

def row_to_feats(row):
    """Convert one sentence row into a CRF feature dict.

    Parameters
    ----------
    row : pandas.Series
        A single row of the prepared frame, indexed by column name.

    Returns
    -------
    dict
        ``{column_name: value}`` for every entry of ``row`` except the
        identifier, label and bookkeeping columns.
    """
    # Iterate the row's own index instead of the module-level ``df`` so the
    # result does not depend on global state (``df`` has its ``report_name``
    # column dropped further down in this cell).
    non_feature_cols = ("File Path", "report_name", "Brain Related", "Split")
    return {
        col: row[col]
        for col in row.index
        if col not in non_feature_cols
    }

X_train, y_train = [], []
X_test, y_test = [], []

# Maps a split name to the (feature sequences, label sequences) lists that
# receive reports belonging to that split.
_split_buckets = {
    "train": (X_train, y_train),
    "test": (X_test, y_test),
}

for report, group in df.groupby("report_name"):
    # The split of a report is read from its first sentence.
    split = group["Split"].iloc[0]
    bucket = _split_buckets.get(split)
    if bucket is None:
        # "unused" reports occur when training on less data; leave them out.
        continue

    # One feature dict per sentence, and the labels as strings for the CRF.
    X_seq = [row_to_feats(sentence) for _, sentence in group.iterrows()]
    y_seq = group["Brain Related"].astype(str).tolist()

    feature_seqs, label_seqs = bucket
    feature_seqs.append(X_seq)
    label_seqs.append(y_seq)


# Sorted, de-duplicated report stems for each split.
train_report_names = sorted(df.loc[df["Split"].eq("train"), "report_name"].unique())
test_report_names = sorted(df.loc[df["Split"].eq("test"), "report_name"].unique())

# Write each list of names to its own JSON file.
for _names, _json_path in (
    (train_report_names, "ml/from_classical_standardized_train_report_names.json"),
    (test_report_names, "ml/from_classical_standardized_test_report_names.json"),
):
    with open(_json_path, "w") as f:
        json.dump(_names, f)

print("Saved standardized train/test report names for CRF compatibility.")

# The helper column is no longer needed once the sequences are built.
del df["report_name"]


# CRF hyper-parameters: L-BFGS training with both c1 and c2 regularisation
# terms and all label transitions enabled.
_crf_params = {
    "algorithm": "lbfgs",
    "c1": 1.5,
    "c2": 2.0,
    "max_iterations": 10000,
    "all_possible_transitions": True,
}
crf = CRF(**_crf_params)

# Fit on the train sequences, then label every test sequence.
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)

_crf_report = flat_classification_report(
    y_test,
    y_pred,
    labels=["0", "1"],
    digits=4,
    target_names=["Not Brain‑Related", "Brain‑Related"],
)
print(_crf_report)

from sklearn.metrics import confusion_matrix

# Flatten the per-report sequences into sentence-level label lists.
y_test_flat = [label for report_labels in y_test for label in report_labels]
y_pred_flat = [label for report_labels in y_pred for label in report_labels]

acc = accuracy_score(y_test_flat, y_pred_flat)
print(f"Overall Accuracy: {acc:.4f}")

bacc = balanced_accuracy_score(y_test_flat, y_pred_flat)
print(f"Balanced Accuracy (macro recall): {bacc:.4f}")

cm_test = confusion_matrix(y_test_flat, y_pred_flat)

if cm_test.shape != (2, 2):
    print("Unexpected confusion-matrix shape:", cm_test.shape)
else:
    # Row = true class, column = predicted class.
    TN, FP, FN, TP = cm_test.ravel()

    # Per-class recall, guarded against a class with no true samples.
    recall_1 = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    recall_0 = TN / (TN + FP) if (TN + FP) > 0 else 0.0

    total = TN + FP + FN + TP
    actual_prop_0 = (TN + FP) / total
    actual_prop_1 = (TP + FN) / total
    print(f"Test set distribution: class 0 = {actual_prop_0:.2%}, class 1 = {actual_prop_1:.2%}")

    # Accuracy the model would show under different class mixes: the
    # recall of each class weighted by that class's assumed proportion.
    _class_mix_scenarios = [
        (0.10, 0.90),
        (0.15, 0.85),
        (0.50, 0.50),
        (0.90, 0.10),
        (actual_prop_0, actual_prop_1),
    ]
    for target_prop_0, target_prop_1 in _class_mix_scenarios:
        weighted_acc = target_prop_1 * recall_1 + target_prop_0 * recall_0
        print(f"Weighted accuracy ({target_prop_1:.0%} class 1, {target_prop_0:.0%} class 0): {weighted_acc:.4f}")

# Per-report accuracy: share of sentences whose predicted label equals the
# true label, for every test report that has at least one sentence.
per_report_accuracies = []
for y_true_seq, y_pred_seq in zip(y_test, y_pred):
    if not len(y_true_seq):
        continue
    matches = [t == p for t, p in zip(y_true_seq, y_pred_seq)]
    acc = sum(matches) / len(y_true_seq)
    per_report_accuracies.append(acc)

# Distribution of those accuracies across the test reports.
plt.figure(figsize=(7, 4))
plt.hist(per_report_accuracies, bins=15, color='dodgerblue', alpha=0.8)
plt.xlabel("Per-Report Accuracy")
plt.ylabel("Number of Reports")
plt.title("Histogram of CRF Accuracy per Test Report")
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Collect, sentence by sentence, the CRF marginal probability of label "1"
# together with the integer ground-truth label.
probs_flat = []
labels_flat = []

for X_seq, y_seq in zip(X_test, y_test):
    marginals = crf.predict_marginals_single(X_seq)
    paired = list(zip(marginals, y_seq))
    probs_flat.extend(marg["1"] for marg, _ in paired)
    labels_flat.extend(int(label) for _, label in paired)

# Sentence-level ROC curve and its area.
fpr, tpr, thresholds = roc_curve(labels_flat, probs_flat)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label=f'CRF ROC (AUC = {roc_auc:.4f})', lw=2)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("CRF ROC Curve (Sentence-Level)")
plt.legend()
plt.tight_layout()
plt.show()

from sklearn.metrics import f1_score, accuracy_score

# Sweep decision thresholds 0.00..1.00 in steps of 0.01 over the CRF marginal
# probabilities and record accuracy, class-1 F1 and macro F1 at each one.
# (The grid and result list were previously initialised twice.)
thresholds = np.arange(0.0, 1.01, 0.01)
crf_results = []

# Convert the score list to an array once instead of on every iteration.
_probs_for_sweep = np.asarray(probs_flat)

for thresh in thresholds:
    # A sentence is predicted class 1 when its probability exceeds the threshold.
    preds_at_thresh = (_probs_for_sweep > thresh).astype(int)
    acc = accuracy_score(labels_flat, preds_at_thresh)
    f1_cls1 = f1_score(labels_flat, preds_at_thresh, pos_label=1, zero_division=0)
    f1_macro = f1_score(labels_flat, preds_at_thresh, average="macro", zero_division=0)
    crf_results.append((thresh, f1_cls1, f1_macro, acc))
    print(f"Threshold {thresh:.2f}: Accuracy={acc:.4f}, F1_1={f1_cls1:.4f}, MacroF1={f1_macro:.4f}")

crf_df = pd.DataFrame(crf_results, columns=["threshold", "F1_class1", "MacroF1", "accuracy"])
crf_df.to_csv("crf_threshold_f1_acc.csv", index=False)
print("Saved CRF threshold metrics (F1_1, MacroF1, accuracy) to crf_threshold_f1_acc.csv")

# Plot the three metrics against the threshold; the y-axis is limited to
# [0.8, 1], so lower scores are not visible.
plt.figure(figsize=(7, 4))
plt.plot(crf_df["threshold"], crf_df["accuracy"], marker='o', label='Accuracy')
plt.plot(crf_df["threshold"], crf_df["F1_class1"], marker='s', label='F1 (Class 1)')
plt.plot(crf_df["threshold"], crf_df["MacroF1"], marker='^', label='Macro F1')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('CRF: Accuracy, F1 (Class 1) & Macro F1 vs Threshold')
plt.legend()
plt.grid(True, alpha=0.4)
plt.xlim(0, 0.99)
plt.ylim(0.8, 1)
plt.show()

# Sentence-level CRF probabilities and labels as fixed-width arrays.
probs_arr = np.asarray(probs_flat, dtype=np.float32)
labels_arr = np.asarray(labels_flat, dtype=np.int32)

# Written both as individual .npy files and as one combined .npz archive
# (keys: "scores" and "labels").
for _npy_path, _array in (
    ("ml/crf_model_probabilities.npy", probs_arr),
    ("ml/crf_model_labels.npy", labels_arr),
):
    np.save(_npy_path, _array)

np.savez(
    "ml/classical_crf_test_scores_labels.npz",
    scores=probs_arr,
    labels=labels_arr,
)



In [None]:
# Same pipeline as the earlier cells, re-run while varying the fraction of training reports (p) and the sampling seed.
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import re
from pathlib import Path
from sklearn.metrics import f1_score, balanced_accuracy_score

# Row indices of the standardized train/test split shared by the CRF and the
# logistic-regression pipeline.
train_indices = np.load("ml/standardized_train_indices_for_CRF_and_CLG.npy")
test_indices = np.load("ml/standardized_test_indices_for_CRF_and_CLG.npy")

# Fractions of the training reports to keep in each run.
p_s = [
    0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4,
    0.45, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 1.0,
]
# Seeds used to draw the report subset for a given fraction.
dataset_sample_seeds = list(range(1, 11))

# One result row is appended per (p, seed) combination.
results = list()

for p in p_s:
    for s in dataset_sample_seeds:
        if s>1 and p >= 1: continue
        print(f"Processing with p = {p}, seed = {s}")
        train_indices = np.load("ml/standardized_train_indices_for_CRF_and_CLG.npy") 
        
        # --- Load data ---
        df_combined = pd.read_csv("ml/brain_sentences_combined.csv")
        df_sparse = pd.read_csv("ml/brain_sentences_sparse.csv")
        assert len(df_combined) == len(df_sparse)
        df_sparse["File Path"] = df_combined["File Path"]
        df_sparse["report_name"] = df_sparse["File Path"].apply(lambda x: Path(x).stem)
        #df_sparse = df_sparse.sort_values("report_name").reset_index(drop=True)
        df_sparse = df_sparse.sort_values(
            "report_name",
            kind="mergesort",
            ignore_index=True
        )

        df_sparse.drop(columns=["report_name"], inplace=True)
        
        OUT_OF_SCOPE_VALUE = 0.5
      
        
        DO_PAIRWISE_BOW_INTERACTIONS = False 
        if DO_PAIRWISE_BOW_INTERACTIONS:
            from itertools import combinations
        
            # Substrings that mark a column as something other than a plain
            # bag-of-words feature.
            exclude_patterns = [
                "Prior_",
                "_x_",
                "Facial_Bones",
                "spine_earlier",
                "Sent_Num",
                "Brain Related",
            ]

            def is_bow(col):
                """Return True when `col` contains none of the excluded substrings."""
                return not any(pat in col for pat in exclude_patterns)
        
            bow_cols = [col for col in df_sparse.columns
                        if is_bow(col)
                        and np.issubdtype(df_sparse.dtypes[col], np.number)]
            
            bow_cols = [col for col in bow_cols if df_sparse[col].sum() >= 30]
            
            print(f"Eligible BoW columns for pairwise interaction: {len(bow_cols)}")
            
            pairwise_interactions = {}
            for col1, col2 in combinations(bow_cols, 2):
                inter_col = f"BOWPAIR_{col1}_x_{col2}"
                pairwise_interactions[inter_col] = df_sparse[col1] * df_sparse[col2]
            
            print(f"Total pairwise interactions computed: {len(pairwise_interactions)}")
            
            if pairwise_interactions:
                pairwise_df = pd.DataFrame(pairwise_interactions, index=df_sparse.index)
            
                interaction_freq = pairwise_df.sum(axis=0)
            
                top_300_inter_cols = interaction_freq.sort_values(ascending=False).head(300).index
            
                pairwise_df = pairwise_df[top_300_inter_cols]
            
                df_sparse = pd.concat([df_sparse, pairwise_df], axis=1)
            
                print(f"Added {len(pairwise_df.columns)} most frequent pairwise BoW interaction columns.") 
            else:
                print("No eligible pairwise BoW interactions to add.")
        
        DO_FACIAL_BONES_EARLIER = False
        FACIAL_BONES_INTERACTION = False
        if DO_FACIAL_BONES_EARLIER:
            df_sentences = pd.read_csv("ml/brain_sentences_combined.csv")
            df_sparse["Sentence"] = df_sentences["Sentence"]
            
            def has_facial_bones(prior_sentences):
                """Return True if any given sentence mentions "facial bones".

                Matching is case-insensitive and also accepts the variant
                "_ial bones".
                """
                for sentence in prior_sentences:
                    if re.search(r"(facial bones|_ial bones)", sentence, flags=re.IGNORECASE):
                        return True
                return False
            
            facial_bones_earlier = []
            for idx in range(len(df_sparse)):
                current_file = df_sparse.at[idx, "File Path"]
                prior_indices = [i for i in range(idx) if df_sparse.at[i, "File Path"] == current_file]
                prior_sents = df_sparse.loc[prior_indices, "Sentence"].tolist() if prior_indices else []
                facial_bones_earlier.append(1 if has_facial_bones(prior_sents) else 0)
            df_sparse["Facial_Bones_earlier"] = facial_bones_earlier
            df_sparse.drop(columns=["Sentence"], inplace=True)
            
            if FACIAL_BONES_INTERACTION:
                exclude_substrings = ["_x_spine", "_x_Prior1"]
                eligible_cols = [
                    col for col in df_sparse.columns
                    if np.issubdtype(df_sparse[col].dtype, np.number)
                    and all(substr not in col for substr in exclude_substrings)
                    and col not in ["Facial_Bones_earlier", "Brain Related"]  # exclude label!
                ]
        
                interaction_df = pd.DataFrame({
                    f"{col}_x_FacialBones": df_sparse[col] * df_sparse["Facial_Bones_earlier"]
                    for col in eligible_cols
                }, index=df_sparse.index)
                df_sparse = pd.concat([df_sparse, interaction_df], axis=1)
        
        
        # Split the frame into the label vector and the base-model feature
        # matrix (label and report path are not features).
        y = df_sparse["Brain Related"].values
        X = df_sparse.drop(columns=["Brain Related", "File Path"])

        # Overwrite the spine_earlier / sentence-number style columns
        # (case-insensitive name match) with a constant 0.
        cols_to_zero = X.filter(regex=r'(?i)spine_earlier|sent[\s_\-]*num').columns
        if len(cols_to_zero):
            X.loc[:, cols_to_zero] = 0
        
        # NOTE(review): the target name ends in .parquet but the file is
        # written with to_csv, so its contents are CSV text. Presumably any
        # consumer reads it with read_csv — confirm before renaming.
        X.to_csv("ml/extended_base_X.parquet", index=False)
        np.save("ml/extended_base_y.npy", y)
        
        
        
        BASE_MODEL_C = 1.0
        
        
        # Reports (by file path) that contribute sentences to the full train split.
        unique_paths = df_sparse.loc[train_indices, "File Path"].unique()
        print(f"Total train reports: {len(unique_paths)}")

        # p (fraction of train reports to keep) is set by the enclosing loop.
        n_keep = int(p * len(unique_paths))

        if p < 1.0:
            # Seed so that each (p, seed) pair draws the same subset every run.
            np.random.seed(s)
            selected_paths = np.random.choice(unique_paths, size=n_keep, replace=False)
            print(f"Keeping {n_keep} reports:", selected_paths[:5], "...")

            # Keep only the sentences of the selected reports. A vectorised
            # isin mask replaces the per-sentence `in ndarray` scan; it keeps
            # the original order and the integer dtype (also when nothing is
            # selected).
            keep_mask = df_sparse.loc[train_indices, "File Path"].isin(selected_paths).to_numpy()
            train_indices = np.asarray(train_indices)[keep_mask]

            print("New train size (sentences):", len(train_indices))
            print("Unique reports now:",
                  np.unique(df_sparse.loc[train_indices, "File Path"]).shape[0])
        
        # Base-model probabilities, one slot per row of df_sparse. Train rows
        # get a prediction from a model fitted without their own block; test
        # rows get one from the model fitted on the whole (sub-sampled) train
        # split. Rows in neither split stay NaN.
        loo_preds = np.full(len(df_sparse), np.nan)

        # Same hyper-parameters for every base-model fit below.
        base_lr_kwargs = dict(
            C=BASE_MODEL_C, max_iter=10000, penalty="l2", solver="saga", fit_intercept=False
        )

        # Heuristic block size. Clamped to >= 1 so a very small p cannot
        # produce a zero step for range() below.
        LOO_K = max(1, int(400 * p))
        if LOO_K == 1:
            train_idx_arr = np.asarray(train_indices)
            for idx in tqdm(train_indices, desc="LOO on train set"):
                # Every train row except the held-out one, in original order.
                train_loo = train_idx_arr[train_idx_arr != idx]
                model = LogisticRegression(**base_lr_kwargs)
                model.fit(X.iloc[train_loo], y[train_loo])
                loo_preds[idx] = model.predict_proba(X.iloc[[idx]])[:, 1][0]
        else:
            train_indices_sorted = np.sort(train_indices)
            n_train = len(train_indices_sorted)
            for i in tqdm(range(0, n_train, LOO_K), desc=f"Leave-{LOO_K}-out in blocks"):
                block = train_indices_sorted[i:i+LOO_K]
                # Vectorised "not in block" filter (replaces a per-element
                # membership scan in a Python loop).
                train_loo = train_indices_sorted[~np.isin(train_indices_sorted, block)]
                model = LogisticRegression(**base_lr_kwargs)
                model.fit(X.iloc[train_loo], y[train_loo])
                loo_preds[block] = model.predict_proba(X.iloc[block])[:, 1]

        # Final base model on the full (sub-sampled) train split; the test
        # rows are scored in one batch instead of one row at a time.
        base_model = LogisticRegression(**base_lr_kwargs)
        base_model.fit(X.iloc[train_indices], y[train_indices])
        if len(test_indices):
            loo_preds[test_indices] = base_model.predict_proba(X.iloc[test_indices])[:, 1]
        
        
        
        df_sparse["Model_Prediction"] = loo_preds
        
        df_sparse["Prior_1_Prediction"] = OUT_OF_SCOPE_VALUE
        for i in range(len(df_sparse)):
            current_file = df_sparse.at[i, "File Path"]
            if i >= 1:
                prev_file = df_sparse.at[i - 1, "File Path"]
                if current_file == prev_file:
                    df_sparse.at[i, "Prior_1_Prediction"] = float(df_sparse.at[i - 1, "Model_Prediction"])
                else:
                    df_sparse.at[i, "Prior_1_Prediction"] = float(OUT_OF_SCOPE_VALUE)
            else:
                df_sparse.at[i, "Prior_1_Prediction"] = float(OUT_OF_SCOPE_VALUE)
        
        df_sparse["Prior_1_Prediction"] = df_sparse["Prior_1_Prediction"].astype(float)
        df_sparse["Prior_1_Prediction"] -= 0.5
        
        df_sparse.to_csv("ml/brain_sentences_sparse_meta_for_additive_logit.csv", index=False, float_format="%.6f")
        df_sparse.drop(columns=["Model_Prediction", "File Path"], inplace=True)
        df_sparse.to_csv("ml/brain_sentences_sparse_meta.csv", index=False, float_format="%.6f")
        print("Updated dataset saved with correct, leakage-free prior predictions.")
        
        import pandas as pd
        import numpy as np
        from sklearn.metrics import roc_curve, auc
        from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
        import matplotlib.pyplot as plt
        import seaborn as sns
        
        # Reload the frame written above, which still carries the
        # Model_Prediction and File Path columns.
        df = pd.read_csv("ml/brain_sentences_sparse_meta_for_additive_logit.csv")
        
        # Placeholder used when there is no earlier sentence in the same report
        # (note: 1.0 here, whereas the first pass above used 0.5).
        OUT_OF_SCOPE_VALUE = 1.0
        
        # Build lagged prediction columns Prior_1..Prior_4: row i receives the
        # Model_Prediction of row i-k when both rows share the same File Path,
        # otherwise OUT_OF_SCOPE_VALUE. Each column is then shifted by -0.5.
        HOW_MUCH_HISTORY = 4 
        for k in range(1, HOW_MUCH_HISTORY+1):
            col_name = f"Prior_{k}_Prediction"
            df[col_name] = OUT_OF_SCOPE_VALUE
            for i in range(len(df)):
                current_file = df.at[i, "File Path"]
                if i >= k:
                    prev_file = df.at[i - k, "File Path"]
                    if current_file == prev_file:
                        df.at[i, col_name] = float(df.at[i - k, "Model_Prediction"])
                    else:
                        df.at[i, col_name] = float(OUT_OF_SCOPE_VALUE)
                else:
                    df.at[i, col_name] = float(OUT_OF_SCOPE_VALUE)
            df[col_name] = df[col_name].astype(float) - 0.5
        
        # NOTE(review): range(1, HOW_MUCH_HISTORY) covers k = 1..3 only, so
        # Prior_4_Prediction is not part of these aggregates despite the
        # "1234" in their names — confirm whether that is intended.
        df["Prior_1234on_Avg"] = df[[f"Prior_{k}_Prediction" for k in range(1, HOW_MUCH_HISTORY)]].mean(axis=1)
        df["Prior_1234on_Max"] = df[[f"Prior_{k}_Prediction" for k in range(1, HOW_MUCH_HISTORY)]].max(axis=1)
        df["Prior_1234on_Min"] = df[[f"Prior_{k}_Prediction" for k in range(1, HOW_MUCH_HISTORY)]].min(axis=1)
        #df.drop(columns=["Prior_2_Prediction", "Prior_3_Prediction"], inplace=True)
        df.drop(columns=["Model_Prediction"], inplace=True)
        # NOTE(review): this removes every column whose name starts with
        # "Prior_", i.e. the Prior_k_Prediction columns and the three
        # Prior_1234on_* aggregates built just above, so none of them reach
        # the later steps — confirm this is deliberate for this experiment.
        df = df.drop(columns=[c for c in df.columns if c.startswith("Prior_")])
        
        from itertools import combinations
        DO_PAIRWISE_INTERACTIONS = False
        if DO_PAIRWISE_INTERACTIONS:
            # Substrings that mark a column as something other than a plain
            # bag-of-words feature.
            exclude_patterns = [
                "Prior_",
                "_x_",
                "Facial_Bones",
                "spine_earlier",
                "Sent_Num",
                "Brain Related",
            ]

            def is_bow(col):
                """Return True when `col` contains none of the excluded substrings."""
                return not any(pat in col for pat in exclude_patterns)
            
            bow_cols = [col for col in df.columns
                        if is_bow(col)
                        and np.issubdtype(df.dtypes[col], np.number)]
            
            bow_cols = [col for col in bow_cols if df.loc[train_indices, col].sum() >= 15]
            
            print(f"Eligible BoW columns for pairwise interaction: {len(bow_cols)}")
            
            pairwise_interactions = {}
            for col1, col2 in combinations(bow_cols, 2):
                inter_col = f"{col1}_x_{col2}"
                pairwise_interactions[inter_col] = df[col1] * df[col2]
            
            print(f"Total pairwise interactions computed: {len(pairwise_interactions)}")
            
            if pairwise_interactions:
                pairwise_df = pd.DataFrame(pairwise_interactions, index=df.index)
            
                interaction_freq = pairwise_df.loc[train_indices].sum(axis=0)
            
                top_300_inter_cols = interaction_freq.sort_values(ascending=False).head(1000).index
            
                pairwise_df = pairwise_df[top_300_inter_cols]
            
                df = pd.concat([df, pairwise_df], axis=1)
            
                print(f"Added {len(pairwise_df.columns)} most frequent pairwise BoW interaction columns.")
        
                top_300_inter_cols = interaction_freq.sort_values(ascending=False).head(300).index.tolist()
                pd.Series(top_300_inter_cols).to_csv("ml/top_bow_pairwise_cols.csv", index=False)
            
            else:
                print("No eligible pairwise BoW interactions to add.")
        
        PRIOR_INTERACTION_MODE = 1
        exclude_cols = [col for col in df.columns if col.startswith("Prior_")] + ["Brain Related"]
        
        if PRIOR_INTERACTION_MODE == 1:
            numeric_cols = []
        elif PRIOR_INTERACTION_MODE == 2:
            numeric_cols = [col for col in df.columns
                        if col not in exclude_cols
                        and np.issubdtype(df.dtypes[col], np.number)
                        and "_x_spine" not in col]
        elif PRIOR_INTERACTION_MODE == 3:
           numeric_cols = [col for col in df.columns
                        if col not in exclude_cols
                        and np.issubdtype(df.dtypes[col], np.number)]
        
        else:
            raise ValueError("PRIOR_INTERACTION_MODE must be 1, 2, or 3")
        
        if numeric_cols:
            prior1 = df["Prior_1_Prediction"]
            interaction_data = {f"{col}_x_Prior1": df[col] * prior1 for col in numeric_cols}
            interactions_df = pd.DataFrame(interaction_data, index=df.index)
            df = pd.concat([df, interactions_df], axis=1)
        
        DO_FACIAL_BONES_EARLIER = False
        FACIAL_BONES_INTERACTION = False
        import re
        if DO_FACIAL_BONES_EARLIER:
            def has_facial_bones(prior_sentences):
                """Return True if any given sentence mentions "facial bones".

                Matching is case-insensitive and also accepts the variant
                "_ial bones".
                """
                for sentence in prior_sentences:
                    if re.search(r"(facial bones|_ial bones)", sentence, flags=re.IGNORECASE):
                        return True
                return False
            
            facial_bones_earlier = []
            
            df_sentences = pd.read_csv("ml/brain_sentences_combined.csv")
            df_sentences["report_name"] = df_sentences["File Path"].apply(lambda fp: Path(fp).stem)
            df_sentences.sort_values("report_name", inplace=True, ignore_index=True)
            df_sentences.drop(columns=["report_name"], inplace=True)
        
            df["Sentence"] = df_sentences["Sentence"]
            
            for idx in range(len(df)):
                current_file = df.at[idx, "File Path"]
                prior_indices = [i for i in range(idx) if df.at[i, "File Path"] == current_file]
                prior_sents = df.loc[prior_indices, "Sentence"].tolist() if prior_indices else []
                facial_bones_earlier.append(1 if has_facial_bones(prior_sents) else 0)
                
            df["Facial_Bones_earlier"] = facial_bones_earlier
            df.drop(columns=["Sentence"], inplace=True)
        
            if FACIAL_BONES_INTERACTION:
                exclude_substrings = ["_x_spine", "_x_Prior1"]
                eligible_cols = [col for col in df.columns
                                 if np.issubdtype(df[col].dtype, np.number)
                                 and all(substr not in col for substr in exclude_substrings)
                                 and col not in ["Facial_Bones_earlier", "Brain Related"]]
        
                interaction_df = pd.DataFrame({
                    f"{col}_x_FacialBones": df[col] * df["Facial_Bones_earlier"]
                    for col in eligible_cols
                }, index=df.index)
                df = pd.concat([df, interaction_df], axis=1)
        
        
        df.to_csv("ml/prep_for_classical_crf.csv", index=False, float_format="%.6f")
        
        df.drop(columns=["File Path"], inplace=True)
        
        df.to_csv("ml/brain_sentences_sparse_meta_more_aggregation.csv", index=False, float_format="%.6f")
        print("Saved to ml/brain_sentences_sparse_meta_more_aggregation.csv")
        
        import pandas as pd
        import numpy as np
        from sklearn.linear_model import LogisticRegression
        from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
        from sklearn.metrics import precision_score, recall_score
        
        PRIOR_GENERALIZATION_SCALE_FACTOR = 1.0
        PRIOR_SCALE_BEFORE_TO_GET_REG_APPLIED = 1.0
        SPINE_FEATURE_SCALE = 1.0
        PRIOR_INTERACTION_FEATURE_SCALE = 1.0
        FACIAL_BONES_INTERACTION_SCALE = 1.0
        CHOSEN_C = 1.0
        
        df = pd.read_csv("ml/brain_sentences_sparse_meta_more_aggregation.csv")
        
        y = df["Brain Related"].values
        X = df.drop(columns=["Brain Related"])
        X.index = df.index

        cols_to_zero = X.filter(regex=r'(?i)spine_earlier|sent[\s_\-]*num').columns
        if len(cols_to_zero):
            X.loc[:, cols_to_zero] = 0
        
        prior_cols = [col for col in X.columns if col.startswith("Prior_")]
        X[prior_cols] = X[prior_cols] * PRIOR_SCALE_BEFORE_TO_GET_REG_APPLIED
         
        spine_cols = [col for col in X.columns if '_x_spine' in col]
        X[spine_cols] = X[spine_cols] * SPINE_FEATURE_SCALE
        
        prior1_interaction_cols = [col for col in X.columns if col.endswith('_x_Prior1')]
        X[prior1_interaction_cols] = X[prior1_interaction_cols] * PRIOR_INTERACTION_FEATURE_SCALE
        
        facial_bones_interaction_cols = [col for col in X.columns if col.endswith('_x_FacialBones')]
        X[facial_bones_interaction_cols] = X[facial_bones_interaction_cols] * FACIAL_BONES_INTERACTION_SCALE
        
        X_train = X.iloc[train_indices].copy()
        X_test = X.iloc[test_indices].copy()
        y_train, y_test = y[train_indices], y[test_indices]
        
        for col in prior_cols:
            if col in X_test.columns:
                X_test[col] *= PRIOR_GENERALIZATION_SCALE_FACTOR
        
        model = LogisticRegression(
            max_iter=10000, penalty="l2", C=CHOSEN_C, solver="liblinear", fit_intercept=False
        )
        model.fit(X_train, y_train)
        
        y_pred_train = model.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_pred_train)
        print(f"Training Accuracy: {train_accuracy:.4f}")
        
        y_pred = model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        meta_acc = test_accuracy
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print("\nTest Set Classification Report:\n", classification_report(y_test, y_pred))

        meta_cr = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )

        m_prec_0 = meta_cr['0']['precision']
        m_rec_0  = meta_cr['0']['recall']
        m_prec_1 = meta_cr['1']['precision']
        m_rec_1  = meta_cr['1']['recall']

        meta_f1_0 = meta_cr['0']['f1-score']
        meta_f1_1 = meta_cr['1']['f1-score']
        meta_f1_macro = f1_score(y_test, y_pred, average='macro')
        meta_f1_weighted = f1_score(y_test, y_pred, average='weighted')
        meta_bal_acc = balanced_accuracy_score(y_test, y_pred)
        
                
        import joblib
        joblib.dump(model, "ml/logistic_regression_brain_related_meta_more_aggregation.pkl")
        print("Meta model saved to ml/logistic_regression_brain_related_meta_more_aggregation.pkl")
        
        if True:
            cm = confusion_matrix(y_test, y_pred)
            
            cm_test = cm
        
            if cm_test.shape == (2, 2):
                TN = cm_test[0, 0]
                FP = cm_test[0, 1]
                FN = cm_test[1, 0]
                TP = cm_test[1, 1]
        
                recall_1 = TP / (TP + FN) if (TP + FN) > 0 else 0.0
                recall_0 = TN / (TN + FP) if (TN + FP) > 0 else 0.0
        
                total = TN + FP + FN + TP
                actual_prop_0 = (TN + FP) / total
                actual_prop_1 = (TP + FN) / total
                print(f"Test set distribution: class 0 = {actual_prop_0:.2%}, class 1 = {actual_prop_1:.2%}")
        
                for target_prop_0, target_prop_1 in [
                    (0.10, 0.90),
                    (0.15, 0.85),
                    (0.50, 0.50),
                    (0.90, 0.10),
                    (actual_prop_0, actual_prop_1)
                ]:
                    weighted_acc = target_prop_1 * recall_1 + target_prop_0 * recall_0
                    print(f"Weighted accuracy ({target_prop_1:.0%} class 1, {target_prop_0:.0%} class 0): {weighted_acc:.4f}")
            else:
                print("Unexpected confusion-matrix shape:", cm_test.shape)
        
            cm = cm[::-1, ::-1]
        
            class_labels = ["Brain Related", "Not Brain Related"]
        
            plt.figure(figsize=(6, 5))
            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
            plt.xlabel("Predicted Label")
            plt.ylabel("True Label")
            plt.title("Confusion Matrix")
            plt.show()
        
            joblib.dump(model, "ml/logistic_regression_brain_related.pkl")
            print("Model trained and saved as ml/logistic_regression_brain_related.pkl.")
        
            feature_names = X.columns  
            coefs = model.coef_[0]
        
            coef_df = pd.DataFrame({
                'feature': feature_names,
                'coefficient': coefs
            })
        
            coef_df['abs_coef'] = coef_df['coefficient'].abs()
        
            topx = coef_df.sort_values(by='abs_coef', ascending=False).head(40)
        
            colors = topx['coefficient'].apply(lambda x: 'blue' if x > 0 else 'red')
        
            plt.figure(figsize=(16, 14))
            plt.barh(topx['feature'][::-1], topx['coefficient'][::-1], color=colors[::-1])
            plt.xlabel("Coefficient Value")
            plt.title("Top Features by Coefficient Magnitude")
            plt.tight_layout()
            plt.show()
        
            y_scores = model.decision_function(X_test)
        
            fpr, tpr, _ = roc_curve(y_test, y_scores)
        
            roc_auc = auc(fpr, tpr)
        
            plt.figure(figsize=(8, 6))
            plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
            plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic (ROC) Curve')
            plt.legend(loc="lower right")
            plt.grid()
            plt.show()
        
            # Rank features by absolute coefficient; the variable keeps its
            # historical name even though 45 rows are selected.
            top25 = coef_df.sort_values(by='abs_coef', ascending=False).head(45)

            # Report the number of rows actually shown instead of a hard-coded
            # "25", which disagreed with the head(45) selection above.
            print(f"Top {len(top25)} features by absolute coefficient magnitude:")
            print(top25[['feature', 'coefficient']].to_string(index=False))
        
        import pandas as pd
        from sklearn_crfsuite import CRF
        from sklearn_crfsuite.metrics import flat_classification_report
        from sklearn.metrics import accuracy_score, balanced_accuracy_score
        
        df = pd.read_csv("ml/prep_for_classical_crf.csv")
        
        prior_cols = [col for col in df.columns if "Prior" in col]
        df = df.drop(columns=prior_cols)
        
        df["report_name"] = df["File Path"].apply(lambda fp: Path(fp).stem)
        df.sort_values(
            "report_name",
            kind="mergesort",
            inplace=True,
            ignore_index=True
        )

        zero_cols = df.filter(regex=r'(?i)spine_earlier|sent[\s_\-]*num').columns
        if len(zero_cols):
            df.loc[:, zero_cols] = 0
        
        df["Split"] = "unused"
        df.loc[train_indices, "Split"] = "train"
        df.loc[test_indices,  "Split"] = "test"
        
        def row_to_feats(row):
            """Convert one sentence row into a CRF feature dict.

            Parameters
            ----------
            row : pandas.Series
                A single row of the prepared frame, indexed by column name.

            Returns
            -------
            dict
                ``{column_name: value}`` for every entry of ``row`` except the
                identifier, label and bookkeeping columns.
            """
            # Iterate the row's own index instead of the enclosing ``df`` so
            # the result does not depend on outer state (``df`` is rebound and
            # has columns dropped on every loop iteration).
            non_feature_cols = ("File Path", "report_name", "Brain Related", "Split")
            return {
                col: row[col]
                for col in row.index
                if col not in non_feature_cols
            }
        
        X_train, y_train = [], []
        X_test,  y_test  = [], []
        
        for report, group in df.groupby("report_name"):
            split = group["Split"].iloc[0]
            if split not in ("train", "test"):
                continue
        
            X_seq = [row_to_feats(r) for _, r in group.iterrows()]
            y_seq = group["Brain Related"].astype(str).tolist()
        
            if split == "train":
                X_train.append(X_seq)
                y_train.append(y_seq)
            else:
                X_test.append(X_seq)
                y_test.append(y_seq)
        
        df.drop(columns=["report_name"], inplace=True)
        
        # CRF hyper-parameters for this run (L-BFGS training; c1 / c2 are the
        # regularisation coefficients passed straight through to CRF).
        crf_settings = {
            "algorithm": "lbfgs",
            "c1": 1.5,
            "c2": 2.0,
            "max_iterations": 10000,
            "all_possible_transitions": True,
        }
        crf = CRF(**crf_settings)

        # Fit on the training reports, then label every test report.
        crf.fit(X_train, y_train)
        y_pred = crf.predict(X_test)

        # Sentence-level classification report over both classes.
        report_text = flat_classification_report(
            y_test,
            y_pred,
            labels=["0", "1"],
            target_names=["Not Brain‑Related", "Brain‑Related"],
            digits=4,
        )
        print(report_text)
        
        # Flatten the per-report sequences into sentence-level label lists.
        y_test_flat = []
        for true_seq in y_test:
            y_test_flat.extend(true_seq)
        y_pred_flat = []
        for pred_seq in y_pred:
            y_pred_flat.extend(pred_seq)

        flat_pair = (y_test_flat, y_pred_flat)

        # overall
        crf_acc = accuracy_score(*flat_pair)
        crf_bal_acc = balanced_accuracy_score(*flat_pair)

        # per-class precision / recall / F1, treating each label in turn as
        # the positive class
        per_class_scorers = (precision_score, recall_score, f1_score)
        c_prec_0, c_rec_0, crf_f1_0 = (
            scorer(*flat_pair, pos_label='0') for scorer in per_class_scorers
        )
        c_prec_1, c_rec_1, crf_f1_1 = (
            scorer(*flat_pair, pos_label='1') for scorer in per_class_scorers
        )

        # averaged F1
        crf_f1_macro = f1_score(*flat_pair, average='macro')
        crf_f1_weighted = f1_score(*flat_pair, average='weighted')
        

        # Record one metrics row for this (p, seed) run: identifiers first,
        # then the meta-model scores, then the CRF scores. Keyword order
        # fixes the column order of the results table.
        run_metrics = dict(
            p=p,
            seed=s,
            # meta
            meta_accuracy=meta_acc,
            meta_balanced_accuracy=meta_bal_acc,
            meta_prec_0=m_prec_0,
            meta_rec_0=m_rec_0,
            meta_f1_0=meta_f1_0,
            meta_prec_1=m_prec_1,
            meta_rec_1=m_rec_1,
            meta_f1_1=meta_f1_1,
            meta_f1_macro=meta_f1_macro,
            meta_f1_weighted=meta_f1_weighted,
            # crf
            crf_accuracy=crf_acc,
            crf_balanced_accuracy=crf_bal_acc,
            crf_prec_0=c_prec_0,
            crf_rec_0=c_rec_0,
            crf_f1_0=crf_f1_0,
            crf_prec_1=c_prec_1,
            crf_rec_1=c_rec_1,
            crf_f1_1=crf_f1_1,
            crf_f1_macro=crf_f1_macro,
            crf_f1_weighted=crf_f1_weighted,
        )
        results.append(run_metrics)

        
        # Overall and balanced accuracy were already computed from the same
        # flattened predictions for the results row; reuse them rather than
        # re-scoring.
        acc = crf_acc
        print(f"Overall Accuracy: {acc:.4f}")

        bacc = crf_bal_acc
        print(f"Balanced Accuracy (macro recall): {bacc:.4f}")

        from sklearn.metrics import confusion_matrix

        # Pin the label order so the matrix is always 2x2 with row/column 0 =
        # class "0" and row/column 1 = class "1". Without `labels=`, a test
        # fold that contains (and is predicted as) a single class yields a 1x1
        # matrix and the summary below would be skipped. This also matches the
        # labels passed to the classification report.
        cm_test = confusion_matrix(y_test_flat, y_pred_flat, labels=["0", "1"])
        TN, FP, FN, TP = cm_test.ravel()
        total = TN + FP + FN + TP

        if total > 0:
            # Per-class recall; a class with no true samples scores 0.0.
            recall_1 = TP / (TP + FN) if (TP + FN) > 0 else 0.0
            recall_0 = TN / (TN + FP) if (TN + FP) > 0 else 0.0

            actual_prop_0 = (TN + FP) / total
            actual_prop_1 = (TP + FN) / total
            print(f"Test set distribution: class 0 = {actual_prop_0:.2%}, class 1 = {actual_prop_1:.2%}")

            # Accuracy the model would reach under hypothetical class mixes:
            # each class's recall weighted by that class's assumed share.
            # The last entry uses the observed test-set mix.
            for target_prop_0, target_prop_1 in [
                (0.10, 0.90),
                (0.15, 0.85),
                (0.50, 0.50),
                (0.90, 0.10),
                (actual_prop_0, actual_prop_1)
            ]:
                weighted_acc = target_prop_1 * recall_1 + target_prop_0 * recall_0
                print(f"Weighted accuracy ({target_prop_1:.0%} class 1, {target_prop_0:.0%} class 0): {weighted_acc:.4f}")
        else:
            print("No labelled test sentences; skipping weighted-accuracy summary.")
        
        # NOTE(review): within this section pyplot is only imported further
        # down (before the ROC plot). Importing it here makes this plot
        # independent of whether an earlier part of the file already did.
        import matplotlib.pyplot as plt

        # Fraction of correctly labelled sentences for each non-empty test
        # report. Built as a comprehension so no loop variable overwrites the
        # overall `acc` printed above.
        per_report_accuracies = [
            sum(int(truth == guess) for truth, guess in zip(y_true_seq, y_pred_seq))
            / len(y_true_seq)
            for y_true_seq, y_pred_seq in zip(y_test, y_pred)
            if len(y_true_seq) > 0
        ]

        plt.figure(figsize=(7, 4))
        plt.hist(per_report_accuracies, bins=15, color='dodgerblue', alpha=0.8)
        plt.xlabel("Per-Report Accuracy")
        plt.ylabel("Number of Reports")
        plt.title("Histogram of CRF Accuracy per Test Report")
        plt.grid(True, axis='y')
        plt.tight_layout()
        plt.show()
        
        from sklearn.metrics import roc_curve, auc
        import matplotlib.pyplot as plt
        
        # Sentence-level P(label == "1") and the matching integer labels,
        # collected across all test reports for the ROC curve.
        probs_flat = []
        labels_flat = []

        for X_seq, y_seq in zip(X_test, y_test):
            marginals = crf.predict_marginals_single(X_seq)
            for marg, label in zip(marginals, y_seq):
                # presumably the marginals only carry labels seen during
                # fitting; if "1" never occurred in this training fold, score
                # it as probability 0.0 instead of raising KeyError and
                # aborting the whole run.
                probs_flat.append(marg.get("1", 0.0))
                labels_flat.append(int(label))

        fpr, tpr, thresholds = roc_curve(labels_flat, probs_flat)
        roc_auc = auc(fpr, tpr)

        plt.figure(figsize=(7, 5))
        plt.plot(fpr, tpr, label=f'CRF ROC (AUC = {roc_auc:.3f})', lw=2)
        # Diagonal reference line (FPR == TPR).
        plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("CRF ROC Curve (Sentence-Level)")
        plt.legend()
        plt.tight_layout()
        plt.show()
        
# Persist every collected metrics row (one per run) as a single CSV.
results_csv = "ml/meta_crf_metrics_diff_dataset_size_seed.csv"
df_res = pd.DataFrame(results)
df_res.to_csv(results_csv, index=False)
print(f"Saved {results_csv}")
