In [None]:
Incomplete Data imputation for effective Seed Data clustering. Using again the UCI seed dataset (without the label),
randomly delete 1 or 2 feature values in each data item. Then implement a regression approach (either a modified
K-NN regression or a modified locally weighted linear regression) to impute (predict) the values for the missing attributes
before applying clustering to the completed data to see how well the resulting clusters still represent the labels.
Evaluate how well your regressor can predict the missing values and how similar the resulting clusters are to clusters 
obtained on the original dataset.

In [1]:
import numpy as np
import random


def load_data():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt"
    data = np.loadtxt(url)
    return data[:, :-1], data[:, -1]


def remove_random_values(X, random_state=52):
    random.seed(random_state)
    for row in X:
        indices_to_remove = random.sample(range(7), random.randint(1, 2))
        for index in indices_to_remove:
            row[index] = np.nan
    return X


def euclidean_distance(x1, x2):
    valid_indices = ~np.isnan(x1) & ~np.isnan(x2)
    return np.sqrt(np.sum((x1[valid_indices] - x2[valid_indices]) ** 2))



def cdist(X, centroids, metric='euclidean'):
    distances = []
    for x in X:
        row = []
        for centroid in centroids:
            row.append(euclidean_distance(x, centroid))
        distances.append(row)
    return np.array(distances)


def knn_impute(X, missing_val, n_neighbors=5):
    X_imputed = X.copy()
    n = X.shape[0]
    col_means = np.nanmean(X, axis=0)  # Compute column means
    for i in range(n):
        if missing_val[i].any():
            valid_samples = [j for j in range(n) if i != j and not np.any(missing_val[j])]
            if len(valid_samples) == 0:  # If no valid samples
#                 print(f"No valid samples found for instance {i}. Imputing with column mean.")
                X_imputed[i, missing_val[i]] = col_means[missing_val[i]]
                continue
            distances = np.array([euclidean_distance(X[i], X[j]) for j in valid_samples])
            nearest_indices = np.argsort(distances)[:n_neighbors]
            nearest_values = X[valid_samples][nearest_indices][:, ~missing_val[i]]
            if nearest_values.size == 0:  # If no nearest_values
                print(f"No nearest neighbors found for instance {i}. Imputing with column mean.")
                X_imputed[i, missing_val[i]] = col_means[missing_val[i]]
                continue
            missing_values = np.nanmean(nearest_values, axis=0)
            X_imputed[i, missing_val[i]] = missing_values[:np.sum(missing_val[i])]
    return X_imputed



def kmeans_clustering(X, n_clusters=3, random_state=42, max_iter=300):
    np.random.seed(random_state)
    centroids = X[np.random.choice(X.shape[0], n_clusters, replace=False)]
    for _ in range(max_iter):
        distances = cdist(X, centroids, metric='euclidean')
        cluster_labels = np.argmin(distances, axis=1)
        new_centroids = np.array([np.nanmean(X[cluster_labels == i], axis=0) for i in range(n_clusters)])
        if np.all(np.isnan(new_centroids)):
            break  # Break if all new centroids are NaN
        elif np.allclose(centroids, new_centroids, atol=1e-4):
            break
        centroids = new_centroids
    return cluster_labels



def mean_squared_error(y_true, y_pred):
    missing_indices = np.isnan(y_true)
    if np.any(missing_indices):
        print("Warning: Missing values found in y_true. Cannot compute MSE.")
        return np.nan
    squared_diff = (y_true[~missing_indices] - y_pred[~missing_indices]) ** 2
    mse = np.mean(squared_diff)
    return mse




def silhouette_score(X, labels):
    if np.any(np.isnan(X)):
        print("Warning: Missing values found in X. Cannot compute silhouette score.")
        return np.nan
    distances = cdist(X, X, metric='euclidean')
    n = X.shape[0]
    A = np.array([np.nanmean(distances[i, labels == labels[i]]) for i in range(n)])
    B = np.array([np.nanmean(distances[i, labels != labels[i]], axis=0) if np.any(labels != labels[i]) else np.nan for i in range(n)])
    silhouette = np.nanmean((B - A) / np.maximum(A, B))
    return silhouette




X, y = load_data()
X_incomplete = remove_random_values(X.copy(), random_state=52)

missing_val = np.isnan(X_incomplete)

X_imputed = knn_impute(X_incomplete, missing_val)

mse = mean_squared_error(X[missing_val], X_imputed[missing_val])
print(f"Mean Squared Error for imputed values: {mse:.4f}")


labels_original = kmeans_clustering(X)
labels_imputed = kmeans_clustering(X_imputed)


silhouette_original = silhouette_score(X, labels_original) 
silhouette_imputed = silhouette_score(X_imputed, labels_imputed)

print(f"Silhouette score for the original dataset: {silhouette_original:.2f}")
print(f"Silhouette score for the imputed dataset: {silhouette_imputed:.2f}")




Mean Squared Error for imputed values: 1.5011
Silhouette score for the original dataset: 0.61
Silhouette score for the imputed dataset: 0.55
