In [4]:
import numpy as np
import pandas as pd

from utils import KNNImageImputer, MeanImageImputer
from utils import Missingness
from utils import get_device
from utils import get_raw_data
from utils import plot_dec_performance
from utils import run_dec_pipeline

In [5]:
# Shared objects reused by the KNN and mean-imputation cells below.
missingness = Missingness()
# k=5 is forwarded to KNNImageImputer; presumably the neighbour count — confirm in utils.
knn_imputer = KNNImageImputer(k=5)
mean_imputer = MeanImageImputer()

device = get_device()

# Missingness levels in percent: 0, 10, ..., 90 (np.arange excludes the stop value 100).
missingness_percentages = np.arange(0, 100, 10)

# Loading data

In [3]:
# Load the 'mnist' data through the project helper. The three returned values are
# later passed to run_dec_pipeline as X_clean, y_true and data_indices.
tensor_x, labels, indices = get_raw_data('mnist', device=device)

# KNN Imputation

In [5]:
# Per-missingness-level score accumulators for the KNN-imputation sweep
# (two independent empty lists).
ari_scores_knn, nmi_scores_knn = [], []

In [6]:
# Sweep the MCAR missingness levels with KNN imputation and record the DEC
# clustering scores (ARI, NMI) returned by run_dec_pipeline for each level.
#
# The score lists are rebuilt at the top of this cell so that it is idempotent:
# re-running it (for example after an interrupted run) starts from empty lists
# instead of appending to stale entries, which would leave the lists longer
# than `missingness_percentages` and break the results table built from them.
ari_scores_knn = []
nmi_scores_knn = []

for mcar_percent in missingness_percentages:
    # The sweep is expressed in percent; missing_rate is passed as a fraction.
    missing_rate_float = mcar_percent / 100.0

    ari, nmi = run_dec_pipeline(
        X_clean=tensor_x,
        y_true=labels,
        data_indices=indices,
        missingness=missingness,
        corruption_type='mcar',
        missing_rate=missing_rate_float,
        imputer=knn_imputer,
        device=device,
        ae_epochs=20,
        dec_epochs=50
    )

    # One entry per missingness level, in the same order as missingness_percentages.
    ari_scores_knn.append(ari)
    nmi_scores_knn.append(nmi)

	No corruption applied
	Running imputation: KNNImageImputer
	- Training Autoencoder
Epoch 5/20: average loss = 0.0185
Epoch 10/20: average loss = 0.0130
Epoch 15/20: average loss = 0.0114


KeyboardInterrupt: 

In [None]:
# Tabulate the KNN-imputation sweep: one row per missingness level,
# with the matching ARI and NMI scores alongside.
results_df = pd.DataFrame(
    dict(
        Missingness=missingness_percentages,
        ARI=ari_scores_knn,
        NMI=nmi_scores_knn,
    )
)
print(results_df)

In [None]:
# Plot ARI and NMI against the missingness level for the KNN-imputation sweep.
plot_dec_performance(
    title='DEC Clustering Performance (KNN Imputation)',
    labels=['ARI', 'NMI'],
    score_arrays=[ari_scores_knn, nmi_scores_knn],
    missingness_percentages=missingness_percentages,
)

# Mean imputation

In [None]:
# Per-missingness-level score accumulators for the mean-imputation sweep
# (two independent empty lists).
ari_scores_mean, nmi_scores_mean = [], []

In [None]:
# Sweep the MCAR missingness levels with mean imputation and record the DEC
# clustering scores (ARI, NMI) returned by run_dec_pipeline for each level.
#
# The score lists are rebuilt at the top of this cell so that it is idempotent:
# re-running it (for example after an interrupted run) starts from empty lists
# instead of appending to stale entries, which would leave the lists longer
# than `missingness_percentages` and break the results table built from them.
ari_scores_mean = []
nmi_scores_mean = []

for mcar_percent in missingness_percentages:
    # The sweep is expressed in percent; missing_rate is passed as a fraction.
    missing_rate_float = mcar_percent / 100.0

    ari, nmi = run_dec_pipeline(
        X_clean=tensor_x,
        y_true=labels,
        data_indices=indices,
        missingness=missingness,
        corruption_type='mcar',
        missing_rate=missing_rate_float,
        imputer=mean_imputer,
        device=device,
        ae_epochs=20,
        dec_epochs=50
    )

    # One entry per missingness level, in the same order as missingness_percentages.
    ari_scores_mean.append(ari)
    nmi_scores_mean.append(nmi)

In [None]:
# Tabulate the mean-imputation sweep: one row per missingness level,
# with the matching ARI and NMI scores alongside.
results_df = pd.DataFrame(
    data=dict([
        ('Missingness', missingness_percentages),
        ('ARI (Mean-DEC)', ari_scores_mean),
        ('NMI (Mean-DEC)', nmi_scores_mean),
    ])
)
print(results_df)

In [None]:
# Plot ARI and NMI against the missingness level for the mean-imputation sweep.
plot_dec_performance(
    title='DEC Clustering Performance (Mean Imputation)',
    labels=['ARI', 'NMI'],
    score_arrays=[ari_scores_mean, nmi_scores_mean],
    missingness_percentages=missingness_percentages,
)

# Denoising Autoencoder

In [None]:
# NOTE(review): disabled draft setup for the denoising-autoencoder section.
# If re-enabled as written it would rebind the `ari_scores_knn` /
# `nmi_scores_knn` lists used by the KNN section and replace
# `missingness_percentages` with np.arange(0, 30, 10) for any cell run afterwards.
# Use section-specific names before turning this back on.
# ari_scores_knn = []
# nmi_scores_knn = []
#
# corruption_type = "mcar"
# missingness_percentages = np.arange(0, 30, 10)

In [None]:
# NOTE(review): disabled draft loop for the denoising-autoencoder section.
# - Unlike the active KNN/mean sweeps, the `run_dec_pipeline` call below passes
#   no `missingness`, `corruption_type`, `missing_rate` or `imputer` arguments;
#   confirm the intended call signature before re-enabling.
# - `corruption_kwargs` is built but never used in this cell.
# - The final `print` sits outside the loop, so it would report only the last
#   iteration's scores.
# for mcar_percent in missingness_percentages:
#     missing_rate_float = mcar_percent / 100.0
#     print(f"\n-------- Missingness percentage {missing_rate_float} --------")
#
#     corruption_kwargs = {"missing_rate": missing_rate_float}
#
#     # X_missing_flat, _ = missingness.apply_corruption(
#     #     tensor_x,
#     #     corruption_type='mcar',
#     #     missing_rate=missing_rate_float
#     # )
#     #
#     # X_missing_image = X_missing_flat.view(-1, 1, H_W, H_W)
#     #
#     # X_imputed_image = knn_impute_image(X_missing_image, k=K_KNN)
#     # X_imputed_flat = X_imputed_image.view(-1, N_FEATURES)
#
#     ari, nmi = run_dec_pipeline(
#         tensor_x,
#         labels,
#         indices,
#         device=device,
#         ae_epochs=20,
#         dec_epochs=75,
#         n_clusters=10,
#         latent_dim=10,
#         n_features=784,
#     )
#
#     ari_scores_knn.append(ari)
#     nmi_scores_knn.append(nmi)

# print(f"\n RESULTS ({mcar_percent}% MCAR): ARI={ari:.4f} | NMI={nmi:.4f}")