In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q torch_geometric
!pip install -q class_resolver
!pip3 install pymatting

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymatting
  Downloading pymatting-1.1.14-py3-none-any.whl.metadata (7.7 kB)
Downloading pymatting-1.1.14-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymatting
Successfully installed pymatting-1.1.14


In [2]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import transforms
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from scipy import sparse
from scipy.sparse.linalg import eigsh
from torch.utils.data import TensorDataset, DataLoader, Subset
import random

In [3]:
data = np.load('/content/drive/MyDrive/TejaswiAbburi_va797/Dataset/Medmnist_data/pneumoniamnist_224.npz', allow_pickle=True)

all_images = np.concatenate([data['train_images'], data['val_images'], data['test_images']], axis=0)
all_labels = np.concatenate([data['train_labels'], data['val_labels'], data['test_labels']], axis=0).squeeze()

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Lambda(lambda x: x.repeat(3, 1, 1)),  # grayscale → 3-channel
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

images = torch.stack([transform(img) for img in all_images])
labels = torch.tensor(all_labels).long()

In [4]:
dataset = TensorDataset(images, labels)
class0_indices = [i for i in range(len(labels)) if labels[i] == 0]
class1_indices = [i for i in range(len(labels)) if labels[i] == 1]

random.seed(42)
sampled_class0 = random.sample(class0_indices, min(2000, len(class0_indices)))
sampled_class1 = random.sample(class1_indices, min(2000, len(class1_indices)))
combined_indices = sampled_class0 + sampled_class1
random.shuffle(combined_indices)

final_dataset = Subset(dataset, combined_indices)
final_loader = DataLoader(final_dataset, batch_size=64, shuffle=False)

device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
import torchvision.models as models
import torch.nn as nn

resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # Remove final classification layer
resnet = resnet.cuda() if torch.cuda.is_available() else resnet
resnet.eval()
resnet_feats = []
y_list = []

with torch.no_grad():
    for imgs, labels in final_loader:
        imgs = imgs.cuda() if torch.cuda.is_available() else imgs
        features = resnet(imgs)
        resnet_feats.append(features.cpu())
        y_list.extend(labels.cpu().tolist())
F = torch.cat(resnet_feats, dim=0).numpy().astype(np.float32)
y_labels = np.array(y_list).astype(np.float32)

print("Feature shape:", F.shape)
print("Label shape:", y_labels.shape)



Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 85.4MB/s]


Feature shape: (3583, 512)
Label shape: (3583,)


In [6]:
def tokencut_on_features(F_array, alpha=1e-6):
    N, D = F_array.shape

    # Normalize features row-wise
    norms = np.linalg.norm(F_array, axis=1, keepdims=True) + 1e-10
    F_norm = F_array / norms

    # Cosine similarity matrix
    W = np.dot(F_norm, F_norm.T)
    W = W + alpha

    # Normalized Laplacian
    d = np.sum(W, axis=1)
    d_inv_sqrt = np.diag(1.0 / np.sqrt(d + 1e-10))
    L = np.eye(N) - d_inv_sqrt @ W @ d_inv_sqrt

    L_sparse = sparse.csr_matrix(L)

    # Fiedler vector (2nd smallest eigenvector)
    vals, vecs = eigsh(L_sparse, k=2, which='SM')
    fiedler = vecs[:, 1]

    # Threshold by mean
    threshold = fiedler.mean()
    labels = (fiedler > threshold).astype(np.int64)

    return labels, fiedler

labels, scores = tokencut_on_features(F)

In [7]:
y_pred = labels
acc = accuracy_score(y_labels, y_pred)
inv_acc = accuracy_score(y_labels, 1 - y_pred)
if inv_acc > acc:
    y_pred = 1 - y_pred
    acc = inv_acc

prec = precision_score(y_labels, y_pred)
rec = recall_score(y_labels, y_pred)
f1 = f1_score(y_labels, y_pred)

# Normalize fiedler scores for logloss
probs = (scores - scores.min()) / (scores.max() - scores.min() + 1e-10)
logloss = log_loss(y_labels, probs)

print("===== TokenCut Results (PneumoniaMNIST) =====")
print("Accuracy Score:", acc)
print("Precision Score:", prec)
print("Recall Score:", rec)
print("F1 Score:", f1)
print("Log Loss:", logloss)

===== TokenCut Results (PneumoniaMNIST) =====
Accuracy Score: 0.8504046888082613
Precision Score: 0.9447144592952612
Recall Score: 0.7775
F1 Score: 0.8529895776193088
Log Loss: 0.5329166516244409


In [8]:
print(y_pred)

[0 1 0 ... 0 0 1]


In [9]:
print(y_labels)

[0. 1. 0. ... 0. 0. 1.]


In [10]:
num_runs = 10

acc_scores, prec_scores, rec_scores, f1_scores, log_losses = [], [], [], [], []

for run in range(num_runs):
    print(f"\n--- Run {run+1}/{num_runs} ---")
    np.random.seed(run)
    torch.manual_seed(run)

    y_pred, scores = tokencut_on_features(F)

    acc = accuracy_score(y_labels, y_pred)
    inv_acc = accuracy_score(y_labels, 1 - y_pred)
    if inv_acc > acc:
        y_pred = 1 - y_pred
        acc = inv_acc

    prec = precision_score(y_labels, y_pred, zero_division=0)
    rec = recall_score(y_labels, y_pred, zero_division=0)
    f1 = f1_score(y_labels, y_pred, zero_division=0)

    probs = (scores - scores.min()) / (scores.max() - scores.min() + 1e-10)
    logloss = log_loss(y_labels, probs)

    acc_scores.append(acc)
    prec_scores.append(prec)
    rec_scores.append(rec)
    f1_scores.append(f1)
    log_losses.append(logloss)

    print(f"Run {run+1} | Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | "
          f"F1: {f1:.4f} | LogLoss: {logloss:.4f}")

print("\n================ FINAL SUMMARY ================\n")
print(f"{'Metric':>15} | {'Mean':>10} ± {'Std':<10}")
print("-" * 50)
print(f"{'Accuracy':>15} | {np.mean(acc_scores):.4f} ± {np.std(acc_scores):.4f}")
print(f"{'Precision':>15} | {np.mean(prec_scores):.4f} ± {np.std(prec_scores):.4f}")
print(f"{'Recall':>15} | {np.mean(rec_scores):.4f} ± {np.std(rec_scores):.4f}")
print(f"{'F1 Score':>15} | {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")
print(f"{'Log Loss':>15} | {np.mean(log_losses):.4f} ± {np.std(log_losses):.4f}")


--- Run 1/10 ---
Run 1 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 1.1505

--- Run 2/10 ---
Run 2 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 0.5329

--- Run 3/10 ---
Run 3 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 1.1505

--- Run 4/10 ---
Run 4 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 1.1505

--- Run 5/10 ---
Run 5 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 1.1505

--- Run 6/10 ---
Run 6 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 0.5329

--- Run 7/10 ---
Run 7 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 1.1505

--- Run 8/10 ---
Run 8 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 1.1505

--- Run 9/10 ---
Run 9 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 0.5329

--- Run 10/10 ---
Run 10 | Acc: 0.8504 | Prec: 0.9447 | Rec: 0.7775 | F1: 0.8530 | LogLoss: 1.1505


         Metric 

In [11]:
max_probability_value = np.max(probs)
print("The maximum probability value in the array is:", max_probability_value)

The maximum probability value in the array is: 0.9999999988644296
