In [None]:
from pathlib import Path

import pandas as pd
from openslide import OpenSlide
import numpy as np
import matplotlib.pyplot as plt
import torch
import cv2
from torch.nn.functional import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from histopatseg.fewshot.protonet import ProtoNet, prototype_topk_vote
from histopatseg.models.foundation_models import load_model
from histopatseg.data.compute_embeddings_tcga_ut import load_hdf5
from histopatseg.evaluation.utils import aggregate_tile_embeddings, custom_balanced_group_kfold
from histopatseg.utils import get_device


In [None]:
# Load the saved ProtoNet checkpoint that is used further down to assign a
# pattern to every tile. The commented-out line is an alternative checkpoint
# (per its file name, built from LungHist700); swap the two lines to use it.
# NOTE: the path is relative to the current working directory.
protonet = ProtoNet.load("../models/protonet/cptac_enriched_uni2_20x_luad_differentiation_5_patterns.pt")
# protonet = ProtoNet.load("../models/protonet/lunghist700_luad_differentiation_uni2_20x.pt")

In [None]:
# Sanity check: shape of the prototype embeddings stored in the checkpoint.
print(protonet.prototype_embeddings.shape)

In [None]:
# Take the label map from the checkpoint itself so that it matches the loaded
# prototypes. The commented dictionaries are hand-written alternatives kept
# for reference only.
# label_map = {'Acinar adenocarcinoma': 0, 'Lepidic adenocarcinoma': 1, 'Micropapillary adenocarcinoma': 2, 'Normal': 3, 'Papillary adenocarcinoma': 4, 'Solid adenocarcinoma': 5}
label_map = protonet.label_map
# label_map = {"aca_bd": 0, "aca_md": 1, "aca_pd": 2, "nor": 3}
print(label_map)

In [None]:
# Project root, taken as the parent of the current working directory
# (this assumes the notebook is executed from a direct sub-folder of the project).
project_dir = Path(".").resolve().parent
print(f"Project Directory: {project_dir}")

In [None]:
def compute_roi_histograms(df, n_patterns):
    """
    Summarise each ROI by the relative frequency of its predicted tile patterns.

    Rows of ``df`` are grouped by the ``original_filename`` column. For every
    group the integer values of ``predicted_pattern`` are counted and the
    counts are divided by their total, so each histogram sums to 1.

    Parameters:
    - df: DataFrame with the columns ``original_filename``,
      ``predicted_pattern`` (non-negative integers) and ``class_name``.
    - n_patterns: minimum number of histogram bins; a histogram is longer
      only if a predicted index is >= n_patterns.

    Returns:
    - (roi_histograms, roi_labels): two dicts keyed by ``original_filename``,
      in the same key order. The first maps to the normalised histogram, the
      second to the ``class_name`` of the group's first row (the label is
      assumed to be constant within an ROI).
    """
    # Single pass over the groups: keep the raw counts and the label together.
    per_roi = {
        roi_name: (
            np.bincount(tiles["predicted_pattern"], minlength=n_patterns),
            tiles["class_name"].iloc[0],
        )
        for roi_name, tiles in df.groupby("original_filename")
    }

    # Turn raw counts into frequencies and split the pairs into two dicts.
    roi_histograms = {
        roi_name: counts / counts.sum() for roi_name, (counts, _) in per_roi.items()
    }
    roi_labels = {roi_name: label for roi_name, (_, label) in per_roi.items()}

    return roi_histograms, roi_labels

In [None]:
# Tile-level metadata, indexed by tile_id.
# The path is built relative to project_dir: joining project_dir with an
# absolute path makes pathlib discard project_dir entirely, which silently
# ties the notebook to one machine's directory layout.
metadata = pd.read_csv(
    project_dir / "data" / "processed" / "LungHist700_tiled" / "LungHist700_20x" / "metadata.csv"
).set_index("tile_id")

In [None]:
# Load the precomputed tile embeddings. np.load returns a lazily-read NpzFile
# that keeps the archive open, so the arrays are read inside a `with` block
# and the file handle is released as soon as they are in memory.
with np.load("../data/processed/embeddings/LungHist700/lunghist700_20x_UNI2_embeddings.npz") as data:
    embeddings = data["embeddings"]
    tile_ids = data["tile_ids"]

# One row per tile; each cell of the "embeddings" column holds that tile's vector.
embeddings_df = pd.DataFrame(
    {
        "tile_id": tile_ids,
        "embeddings": list(embeddings),
    }
).set_index("tile_id")

# Column-wise concat aligns both tables on the tile_id index (outer join):
# a tile present in only one table is kept, with NaN in the other's columns.
df = pd.concat([embeddings_df, metadata], axis=1)

In [None]:
# Keep only the tiles whose superclass is "aca"; use the commented line instead
# to keep the "nor" tiles as well.
# .copy() makes the result an independent frame, so adding a column to it
# afterwards does not raise pandas' SettingWithCopyWarning.
# df = df[(df["superclass"]=="aca") | (df["superclass"]=="nor")]
df = df[df["superclass"] == "aca"].copy()

In [None]:
# Classify every tile with the ProtoNet, one embedding at a time, and store the
# result in the "predicted_pattern" column.
# NOTE(review): element [0] of protonet.predict's return value is kept —
# presumably the predicted pattern index; confirm against ProtoNet.predict.
# NOTE(review): compute_roi_histograms feeds this column to np.bincount, which
# needs non-negative integers — verify the stored values have an integer dtype.
df["predicted_pattern"] = df["embeddings"].apply(
    lambda x: protonet.predict(torch.tensor(x, dtype=torch.float32))[0].numpy()
)

In [None]:
# Quick visual check of the first rows, including the new column.
df.head()

In [None]:
# Flatten the table into aligned NumPy arrays (one entry per tile).
# `embeddings` is rebound here from the raw loaded array to the rows that
# survived filtering.
embeddings = np.stack(df["embeddings"].values)
labels = df["class_name"].values
patient_ids = df["patient_id"].values
roi_ids = df["original_filename"].values

In [None]:
# Build the cross-validation splits, passing patient_ids as the grouping variable.
# NOTE(review): presumably this keeps all tiles of a patient on the same side of
# a split while balancing classes — confirm in custom_balanced_group_kfold.
cv = list(custom_balanced_group_kfold(
    embeddings,
    labels,
    patient_ids,
    n_splits=4,
))

In [None]:
# Only the split at index 3 is evaluated below (the last one, if the splitter
# yields n_splits splits); the other splits are not used in this notebook section.
train_idx, test_idx = cv[3]

In [None]:
# Select the tile rows of each side of the split.
# assumes the splitter yields positional (not label-based) indices — TODO confirm
df_train = df.iloc[train_idx]
df_test = df.iloc[test_idx]

In [None]:
# Per-ROI pattern histograms and labels, computed separately for each side;
# the number of bins is the number of entries in label_map.
train_histograms, train_labels = compute_roi_histograms(df_train, len(label_map))
test_histograms, test_labels = compute_roi_histograms(df_test, len(label_map))

In [None]:
# Convert dicts to arrays. The histogram and label dicts are filled in the same
# loop, so they share their key order and rows of X stay aligned with y.
X_train = np.stack(list(train_histograms.values()))
y_train = np.array(list(train_labels.values()))

X_test = np.stack(list(test_histograms.values()))
y_test = np.array(list(test_labels.values()))

# Train logistic regression on the pattern histograms.
# The `multi_class` argument is deprecated since scikit-learn 1.5; with more
# than two classes the default already fits a multinomial model, so it is no
# longer passed explicitly.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict and evaluate on the held-out ROIs
y_pred = clf.predict(X_test)

print("Classification report:")
print(classification_report(y_test, y_pred))

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
# Histograms and labels for every ROI in df (both sides of the split together).
# NOTE: this rebinds `labels`, which previously held the per-tile label array.
histograms, labels = compute_roi_histograms(df, len(label_map))

In [None]:
# Inputs for the rule-based grading below.
# NOTE(review): X and y are built from the *training* split only; the all-ROI
# `histograms` / `labels` computed in the previous cell are not used here —
# confirm this is intended.
X = np.array(list(train_histograms.values()))
y = np.array(list(train_labels.values()))

In [None]:
# Inspect the histogram matrix (one row per ROI).
X

In [None]:
# Inspect the pattern-name -> index mapping.
label_map

In [None]:
# Reverse lookup: index -> pattern name.
label_map_inv = {v: k for k, v in label_map.items()}

In [None]:
# Maps each pattern name to a class code ("aca_bd", "aca_md", "aca_pd", "nor").
# NOTE(review): not referenced elsewhere in the visible part of the notebook.
main_pattern_mapping = {
    'Acinar adenocarcinoma': "aca_md",
    'Lepidic adenocarcinoma': "aca_bd",
    'Micropapillary adenocarcinoma': "aca_pd",
    'Normal': "nor",
    'Papillary adenocarcinoma': "aca_md",
    'Solid adenocarcinoma': "aca_pd"
}

In [None]:
def grade_roi(ratios, high_grade_threshold=0.20, moderate_threshold=0.90):
    """
    Grade an ROI from the ratios of its five growth patterns.

    Parameters:
    - ratios: sequence or numpy array of exactly 5 values, in this order:
        [Acinar, Lepidic, Micropapillary, Papillary, Solid]
      There is no 'Normal' entry: a 6-element histogram must have it removed
      before calling this function.
    - high_grade_threshold: minimum combined Micropapillary + Solid ratio for
      the ROI to be graded 'aca_pd' (default 0.20).
    - moderate_threshold: minimum combined Acinar + Papillary ratio for the
      ROI to be graded 'aca_md' (default 0.90).

    Returns:
    - str: one of the class codes
        'aca_pd' (poorly differentiated),
        'aca_md' (moderately differentiated),
        'aca_bd' (well differentiated, also the fallback).

    Raises:
    - ValueError: if ``ratios`` does not contain exactly 5 values.
    """

    # Unpack the ratios for clarity; lepidic does not enter any rule and only
    # contributes through the fallback branch.
    acinar, _lepidic, micropapillary, papillary, solid = ratios

    # Compute high-grade component ratio
    high_grade_ratio = micropapillary + solid

    # The high-grade rule is checked first, so it wins over the moderate rule.
    if high_grade_ratio >= high_grade_threshold:
        return 'aca_pd'
    elif acinar + papillary >= moderate_threshold:
        return 'aca_md'
    else:
        return 'aca_bd'  # fallback when neither rule applies

In [None]:
# Apply the rule-based grader to every ROI histogram in X.
# NOTE: this rebinds y_pred, replacing the logistic-regression predictions.
y_pred = np.array([grade_roi(x) for x in X])

In [None]:
# Compare the rule-based grades with the ROI labels in y.
print("Classification report:")
print(classification_report(y, y_pred))

print("Confusion matrix:")
print(confusion_matrix(y, y_pred))

In [None]:
# Display the current value of `labels`.
labels

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix_percent(y_true, y_pred, labels, title="Confusion Matrix (Percent)"):
    """
    Plots a row-normalised confusion matrix as a heatmap of percentages.

    Each row (true class) is divided by its number of samples, so the cells of
    a row sum to 100. A class listed in ``labels`` that has no true samples
    gets a row of NaN, which the heatmap leaves blank; the division is guarded
    so this case no longer emits a divide-by-zero RuntimeWarning.

    Args:
        y_true (array-like): Ground truth labels.
        y_pred (array-like): Predicted labels.
        labels (list): Class labels to include, in the order used for both axes.
        title (str): Title of the plot.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Row totals kept as a column vector so they broadcast across each row.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percent = np.divide(
        cm.astype(float),
        row_totals,
        out=np.full(cm.shape, np.nan),  # rows without samples stay NaN
        where=row_totals != 0,
    ) * 100  # fractions -> percentages

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_percent, annot=True, fmt=".2f", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(title)
    plt.show()

# Example usage
# NOTE: rebinds `labels` once more, now to the class order shown on both axes.
# NOTE(review): "nor" is listed although df was restricted to superclass "aca";
# presumably no "nor" samples remain, which leaves that row/column without
# data — confirm, or drop "nor" from the list.
labels = ["aca_bd", "aca_md", "aca_pd", "nor"]
plot_confusion_matrix_percent(y, y_pred, labels=labels, title="Confusion Matrix: True vs Predicted (Percent)")