In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import umap
from luadseg.eval.anorak import _load_index, _load_embeddings_matrix

In [None]:
# Tile configuration used to locate both the tile index and the embeddings.
tile_size = 448
mag_label = "20x"

# Directory that holds the tile index for this tile size / magnification.
tiles_dir = f"/home/valentin/workspaces/luadseg/data/tiles/ANORAK/tiles_{tile_size}_{mag_label}_pad"

# Load the parquet index, then write a CSV copy of it into the same directory.
df = pd.read_parquet(f"{tiles_dir}/index.parquet")
df.to_csv(f"{tiles_dir}/index.csv", index=False)


In [None]:
# Key the tile table by tile_idx so rows can be looked up by tile index later.
# NOTE: not idempotent -- set_index drops the "tile_idx" column by default, so
# re-running this cell on the same df raises KeyError.
df = df.set_index("tile_idx")

In [None]:
# Preview the first rows of the tile table.
df.head()

In [None]:
# Embedding files live under <root>/<run>/<run>.pt; the SN variant only
# differs by a suffix appended to the run name.
embeds_root = "/home/valentin/workspaces/luadseg/data/embeds/ANORAK"
run_name = f"UNI2_anorak_{tile_size}_{mag_label}"
sn_run_name = f"{run_name}_SN-train046_Da777"

pt_path = f"{embeds_root}/{run_name}/{run_name}.pt"
sn_pt_path = f"{embeds_root}/{sn_run_name}/{sn_run_name}.pt"

In [None]:
# Per-tile summary features derived from the ratio_0 .. ratio_6 columns.
all_ratio_cols = [f"ratio_{i}" for i in range(7)]
df["sum_ratios"] = df[all_ratio_cols].sum(axis=1)

# The maximum is taken over ratio_1 .. ratio_6 only (ratio_0 is left out --
# presumably the background / non-pattern class; TODO confirm).
pattern_cols = all_ratio_cols[1:]
df["max_ratio"] = df[pattern_cols].max(axis=1)

In [None]:
# Load both embedding files; each call yields the embedding matrix ([N, D])
# together with the tile indices it was stored with.
pt_file = Path(pt_path)
sn_pt_file = Path(sn_pt_path)

X, tile_idx = _load_embeddings_matrix(pt_file)
X_sn, tile_idx_sn = _load_embeddings_matrix(sn_pt_file)

In [None]:
# Number of positions where the two embedding files disagree on the tile index.
n_mismatched = (tile_idx != tile_idx_sn).sum()
n_mismatched  # should be 0

In [None]:
# Re-order (and subset) the tile table so its rows follow tile_idx, the tile
# indices returned alongside the embedding matrix. .loc raises KeyError if a
# tile_idx value is missing from the table's index.
df = df.loc[tile_idx]

In [None]:
# Keep only tiles where a single pattern covers more than half of the tile.
is_dominant = df["max_ratio"] > 0.5
df_filtered = df.loc[is_dominant]
df_filtered.shape


In [None]:
# Restrict both embedding matrices to the tiles kept in df_filtered.
#
# df was re-ordered with df.loc[tile_idx], so row k of df is assumed to describe
# row k of X / X_sn (the loader returns the matrix together with tile_idx).
# Rows are therefore selected by their *position* in df, not by the tile_idx
# label itself: the two only coincide when tile_idx is exactly 0..N-1 in order.
if X.shape[0] != len(df) or X_sn.shape[0] != len(df):
    # Also triggers when this cell is run twice, since X / X_sn are overwritten below.
    raise ValueError(
        f"Embedding rows ({X.shape[0]}, {X_sn.shape[0]}) do not match the tile table "
        f"({len(df)} rows); reload the embeddings before filtering."
    )

# Integer positions (within df) of the retained tiles, in df_filtered's row order.
keep_pos = np.flatnonzero(df.index.isin(df_filtered.index))

X = X[keep_pos]  # [N_filtered, D]
X_sn = X_sn[keep_pos]  # [N_filtered, D]

In [None]:
# Sanity check: shape of the SN embedding matrix after filtering.
X_sn.shape

In [None]:
# Label of each retained tile, in df_filtered's row order.
y = df_filtered["dominant_label"].values

In [None]:
# Project the embeddings X to 2-D with UMAP; random_state is fixed for repeatability.
umap_reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = umap_reducer.fit_transform(X)


In [None]:

# t-SNE
# Project the embeddings X to 2-D with t-SNE (perplexity 30, fixed seed).
# The `tsne` object is reused later to project X_sn with the same settings.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)


In [None]:
# Distinct labels present among the retained tiles.
np.unique(y)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Static side-by-side view of the two 2-D projections, colored by label.
# Inputs from earlier cells:
#   X_umap, X_tsne -- 2-D coordinates, one row per sample
#   y              -- one discrete label per sample

unique_classes = np.unique(y)
palette = sns.color_palette("tab10", len(unique_classes))
cmap = dict(zip(unique_classes, palette))  # label -> RGB color

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (axis, coordinates, title, x-label, y-label) for each panel.
panels = [
    (axes[0], X_umap, "UMAP projection", "UMAP-1", "UMAP-2"),
    (axes[1], X_tsne, "t-SNE projection", "tSNE-1", "tSNE-2"),
]

for ax, coords, title, xlabel, ylabel in panels:
    # One scatter call per class so each class gets its own legend entry.
    for cls in unique_classes:
        mask = y == cls
        ax.scatter(
            coords[mask, 0], coords[mask, 1],
            color=cmap[cls], s=10, label=f"Class {cls}"
        )
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

# Only the UMAP panel carries a legend; the colors are shared by both panels.
axes[0].legend(title="Class", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.tight_layout()
plt.show()


In [None]:
import plotly.express as px
import pandas as pd
from luadseg.data.constants import ANORAK_CLASS_MAPPING, PATTERN_COLORS


In [None]:
# Interactive t-SNE scatter of the retained tiles, colored by pattern class,
# with the tile ID shown on hover.
# NOTE(review): this cell rebinds `tile_idx` and `df`, which earlier cells use for
# the embedding tile indices and the tile table; those earlier cells cannot be
# re-run correctly after this one without reloading.
tile_idx = df_filtered.index.values
tile_id = df_filtered['tile_id'].values

# One row per retained tile: 2-D t-SNE coordinates, label and tile ID.
df = pd.DataFrame({
    'x': X_tsne[:, 0],
    'y': X_tsne[:, 1],
    'label': y,
    'tile_id': tile_id
})

# Map numeric labels to class names
df['label_name'] = df['label'].map(ANORAK_CLASS_MAPPING)


def rgb_to_hex(rgb):
    """Convert an (r, g, b) sequence of 0-255 integers to a '#rrggbb' string.

    Accepts any 3-item sequence (tuple, list, array), not only tuples.
    """
    return '#%02x%02x%02x' % tuple(rgb)


# Only keep colors for class names that appear in ANORAK_CLASS_MAPPING.
color_discrete_map = {k: rgb_to_hex(v) for k, v in PATTERN_COLORS.items() if k in ANORAK_CLASS_MAPPING.values()}

# --- plot ---
fig = px.scatter(
    df,
    x='x', y='y',
    color='label_name',
    color_discrete_map=color_discrete_map,
    hover_data=['tile_id'],
    # The coordinates come from X_tsne, so the title names t-SNE (it previously said UMAP).
    title="t-SNE projection (hover to see tile ID)"
)

fig.update_traces(marker=dict(size=5))
fig.update_layout(
    legend_title_text="Pattern class",
    legend=dict(itemsizing='constant')  # constant legend marker size, independent of trace marker size
)
fig.show()

In [None]:
# Inspect the tile_id column of the retained tiles.
df_filtered["tile_id"]

In [None]:
# Re-fit t-SNE on the SN embeddings (X_sn), reusing the same TSNE object and settings.
# NOTE(review): this overwrites X_tsne, so the projection of X is lost and every
# cell that reads X_tsne now depends on execution order.
X_tsne = tsne.fit_transform(X_sn)

In [None]:
# Interactive t-SNE scatter of the SN embeddings for the retained tiles, colored
# by pattern class, with the tile ID shown on hover. X_tsne is expected to hold
# the t-SNE of X_sn computed in the preceding cell.
# NOTE(review): this cell rebinds `tile_idx` and `df`, which earlier cells use for
# the embedding tile indices and the tile table; those earlier cells cannot be
# re-run correctly after this one without reloading.
tile_idx = df_filtered.index.values
tile_id = df_filtered['tile_id'].values

# One row per retained tile: 2-D t-SNE coordinates, label and tile ID.
df = pd.DataFrame({
    'x': X_tsne[:, 0],
    'y': X_tsne[:, 1],
    'label': y,
    'tile_id': tile_id
})

# Map numeric labels to class names
df['label_name'] = df['label'].map(ANORAK_CLASS_MAPPING)


def rgb_to_hex(rgb):
    """Convert an (r, g, b) sequence of 0-255 integers to a '#rrggbb' string.

    Accepts any 3-item sequence (tuple, list, array), not only tuples.
    """
    return '#%02x%02x%02x' % tuple(rgb)


# Only keep colors for class names that appear in ANORAK_CLASS_MAPPING.
color_discrete_map = {k: rgb_to_hex(v) for k, v in PATTERN_COLORS.items() if k in ANORAK_CLASS_MAPPING.values()}

# --- plot ---
fig = px.scatter(
    df,
    x='x', y='y',
    color='label_name',
    color_discrete_map=color_discrete_map,
    hover_data=['tile_id'],
    # The coordinates come from X_tsne (fit on X_sn); the title previously said
    # UMAP and was identical to the plot of the non-SN embeddings.
    title="t-SNE projection, SN embeddings (hover to see tile ID)"
)

fig.update_traces(marker=dict(size=5))
fig.update_layout(
    legend_title_text="Pattern class",
    legend=dict(itemsizing='constant')  # constant legend marker size, independent of trace marker size
)
fig.show()