In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv
import pandas as pd
import umap
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

from histopatseg.constants import CLASS_MAPPING, SUPERCLASS_MAPPING, SUBCLASS_MAPPING

# Load variables from a local .env file into the process environment so that
# later os.getenv() lookups (e.g. LUNGHIST700_PATH) can find them.
load_dotenv()


In [None]:
# Magnification of the tile set to analyse; it selects both the metadata folder
# and the embeddings file below.
magnification = 10

# Fail early with an explicit message when the dataset location is not
# configured, instead of the opaque TypeError that Path(None) would raise.
dataset_root = os.getenv("LUNGHIST700_PATH")
if dataset_root is None:
    raise RuntimeError(
        "Environment variable LUNGHIST700_PATH is not set; "
        "define it in the shell or in a .env file before running this notebook."
    )
data_path = Path(dataset_root)

# Tile-level metadata, indexed by tile id. The "image_id" column is dropped
# here; images are identified through "original_filename" further down.
metadata = (
    pd.read_csv(data_path / f"LungHist700_{magnification}x/metadata.csv")
    .set_index("tile_id")
    .drop(columns=["image_id"])
)

# NOTE(review): absolute, machine-specific path -- this only resolves on the
# original author's workstation. Consider deriving it from a configurable root.
# The commented-out lines are alternative embedding files for the same tiles.
embeddings_path = Path(
    f"/home/valentin/workspaces/histopatseg/data/processed/LungHist700_embeddings/UNI2_{magnification}x.npz"
    # f"/home/valentin/workspaces/histopatseg/data/processed/LungHist700_embeddings/convnext_large_{magnification}x.npz"
    # f"/home/valentin/workspaces/histopatseg/data/processed/LungHist700_embeddings/bioptimus_{magnification}x.npz"
)

In [None]:
# Pull the three arrays out of the .npz archive in one go; the context manager
# closes the archive as soon as they have been read.
with np.load(embeddings_path) as data:
    embeddings, tile_ids, embedding_dim = (
        data[key] for key in ("embeddings", "tile_ids", "embedding_dim")
    )


In [None]:
# Inspect the dimensions of the embeddings array that was just loaded.
embeddings.shape

In [None]:
def aggregate_embeddings(embeddings, tile_ids, metadata):
    """Average tile embeddings per source image and collect matching metadata.

    Args:
        embeddings: 2-D array-like with one row per tile.
        tile_ids: Index labels of ``metadata``, one per row of ``embeddings``
            and in the same order.
        metadata: DataFrame indexed by tile id that has an
            ``original_filename`` column naming the image each tile belongs to.

    Returns:
        A tuple ``(aggregated_df, image_metadata)``. ``aggregated_df`` holds the
        mean embedding of every image (index named ``image_id``);
        ``image_metadata`` holds, for those same images and in the same order,
        the first non-null value of every metadata column.
    """
    # Look up the image each tile was cut from, in embedding-row order.
    tile_to_image = metadata.loc[tile_ids, "original_filename"].tolist()

    # One row per image: the mean over all of its tiles.
    per_image_mean = (
        pd.DataFrame(embeddings)
        .assign(image_id=tile_to_image)
        .groupby("image_id")
        .mean()
    )

    # Collapse the tile-level metadata to one row per image.
    first_of_each = dict.fromkeys(metadata.columns, "first")
    per_image_metadata = metadata.groupby("original_filename").agg(first_of_each)

    # Re-order the metadata rows so they line up with the aggregated embeddings.
    return per_image_mean, per_image_metadata.loc[per_image_mean.index.tolist()]

In [None]:
# Collapse the tile-level embeddings and metadata to one row per image.
embeddings_df, metadata_aggregated = aggregate_embeddings(embeddings, tile_ids, metadata)

In [None]:
# Preview the first rows of the per-image metadata.
metadata_aggregated.head()

In [None]:
# Keep only the images whose superclass is "aca", then derive the grouping ids
# and classification targets for that subset.
aca_metadata = metadata_aggregated[metadata_aggregated["superclass"] == "aca"]

embeddings_df = embeddings_df.loc[aca_metadata.index]
patient_ids = aca_metadata["patient_id"].values
labels = aca_metadata["subclass"].values
labels_mapped = np.array([SUBCLASS_MAPPING[name] for name in labels])

# Variant without the "aca" filter, classifying on class_name instead:
# patient_ids = metadata_aggregated["patient_id"].values
# labels = metadata_aggregated["class_name"].values
# labels_mapped = np.array([CLASS_MAPPING[label] for label in labels])

In [None]:
# class_name values of the "aca" images, followed by the distinct values present.
is_aca = metadata_aggregated["superclass"] == "aca"
class_name_labels = metadata_aggregated.loc[is_aca, "class_name"].values
np.unique(class_name_labels)

In [None]:
# Inspect the dimensions of the image-level embeddings frame.
embeddings_df.shape

In [None]:
# Five shuffled, label-stratified folds that never place one group on both
# sides of a split; the fixed seed makes the folds repeatable.
cv_splitter = StratifiedGroupKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Standardise the features, then classify with a 2-nearest-neighbour vote.
# Alternative final step: ("classifier", LogisticRegression(max_iter=1000)).
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=2)),
]
knn_classifier = Pipeline(steps=pipeline_steps)


In [None]:
# Grouped cross-validation of the classifier on the image-level embeddings.
all_predictions = []
all_true_labels = []
accuracies = []

# The fold indices produced by the splitter are row positions in
# `embeddings_df`, which is aligned with `labels_mapped` and `patient_ids`.
# The features must therefore come from that frame -- indexing the raw per-tile
# `embeddings` array with them would pair unrelated tile rows with the labels.
features = embeddings_df.to_numpy()

for fold, (train_idx, test_idx) in enumerate(
        cv_splitter.split(features, labels_mapped, groups=patient_ids)):
    # Leakage guard: no patient may appear on both sides of a fold.
    assert set(patient_ids[train_idx]).isdisjoint(set(patient_ids[test_idx]))
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = labels_mapped[train_idx], labels_mapped[test_idx]

    # Pipeline.fit refits every step, so nothing carries over between folds.
    knn_classifier.fit(X_train, y_train)
    y_pred = knn_classifier.predict(X_test)

    all_predictions.extend(y_pred)
    all_true_labels.extend(y_test)
    fold_accuracy = (y_pred == y_test).mean()
    accuracies.append(fold_accuracy)

    print(f"k-NN - Fold {fold + 1}: Accuracy = {fold_accuracy:.4f}")

# Metrics over the out-of-fold predictions of all folds pooled together.
concatenated_accuracy = accuracy_score(all_true_labels, all_predictions)
conf_matrix = confusion_matrix(all_true_labels, all_predictions)