In [3]:
from Helpers import helpers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler



In [4]:
# Load data
heart_dataset = helpers.load_dataset('./data/kd-laplace/kd-Laplace/heart-dataset/full_perturbation.csv')
heart_dataset_9_dim = heart_dataset[heart_dataset['dimension'] == 9].drop(columns=['dimension', 'epsilon', 'mechanism', 'class'])
heart_dataset_9_dim.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks
14882,134.701799,-2.489474,-0.047867,0.017656,0.011263,64.002392,62.000254,126.00015,1.999982
14883,156.153689,1.839922,1.277743,4.364227,2.040978,130.01651,67.99838,197.999144,6.000041
14884,141.164462,-0.261983,-0.895195,4.983556,1.992754,129.996432,68.000071,198.000004,5.000003
14885,155.977233,3.250912,0.076764,6.009119,1.998609,116.999654,52.999926,170.00003,10.999995
14886,143.318773,14.383951,-1.054274,4.965365,-0.003948,117.000221,53.000093,169.999978,9.000002


In [5]:
heart_dataset_scaled = pd.DataFrame(StandardScaler().fit_transform(heart_dataset_9_dim), columns=heart_dataset_9_dim.columns)
heart_dataset_scaled.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks
0,-0.36199,-1.20287,-0.196524,-1.275349,-0.623632,-0.165446,-1.068554,-2.119584,-0.701403
1,1.542345,-0.20313,-0.160819,0.246438,0.188487,1.529548,-0.865594,1.893747,0.655151
2,0.211716,-0.6885,-0.219347,0.463273,0.169191,1.529032,-0.865536,1.893795,0.316004
3,1.526681,0.122695,-0.193167,0.822335,0.171534,1.195324,-1.3731,0.333035,2.350802
4,0.402959,2.693525,-0.223631,0.456904,-0.629718,1.195338,-1.373094,0.333032,1.672537


In [6]:
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation
hierachichal_model = AgglomerativeClustering(n_clusters=4)
hierachichal_model.fit(heart_dataset_scaled)

In [14]:
plain_data = helpers.load_dataset('../data/heart-dataset/heart_numerical.csv')
plain_data.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,class
0,120.0,0.0,0.0,0.0,0.0,64.0,62.0,126.0,2.0,1.0
1,132.0,4.0,0.0,4.0,2.0,130.0,68.0,198.0,6.0,0.0
2,133.0,2.0,0.0,5.0,2.0,130.0,68.0,198.0,5.0,0.0
3,134.0,2.0,0.0,6.0,2.0,117.0,53.0,170.0,11.0,0.0
4,132.0,4.0,0.0,5.0,0.0,117.0,53.0,170.0,9.0,0.0


In [38]:
from sklearn.metrics import silhouette_score


def determine_best_cluster_number_based_on_sc(dataset, dataset_name, columns):
    amount_of_columns = len(columns)
    range_n_clusters = list(range(2,8))

    # Store silhouette scores and corresponding cluster numbers
    silhouette_scores = []
    cluster_numbers = []

    for n_clusters in range_n_clusters:
        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = AgglomerativeClustering(n_clusters=n_clusters)
        cluster_labels = clusterer.fit_predict(dataset)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed clusters
        silhouette_avg = silhouette_score(dataset, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        silhouette_scores.append(silhouette_avg)
        cluster_numbers.append(n_clusters)

    # Plot silhouette scores against number of clusters
    plt.plot(cluster_numbers, silhouette_scores, 'bx-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Silhouette Score (SC)')
    plt.title(f'Silhouette Score vs Number of Clusters for {dataset_name}')

    # Determine optimal number of clusters
    optimal_clusters = cluster_numbers[np.argmax(silhouette_scores)]
    save_loc = f'./results/params/{dataset_name}_agglomerative_optimal_cluster_{amount_of_columns}.png'
    plt.savefig(save_loc, dpi=300)
    plt.clf()
    print("Optimal number of clusters : ", optimal_clusters)
    return optimal_clusters

In [39]:
datasets = {
    'heart-dataset': '../data/heart-dataset/heart_numerical.csv',
    'seeds-dataset': '../data/seeds-dataset/rq2-nd.csv',
    'circle-dataset': '../RQ3/data/circle_1000_3d.csv',
    'line-dataset': '../RQ3/data/line_1000_3d.csv',
    'skewed-dataset': '../RQ3/data/skewed_1000_3d_.csv'
}

In [45]:
optimal_clusters = { 'dataset': [], 'optimal_clusters': [], 'dimensions': [] }

for dataset_name, dataset_loc in datasets.items():
    print(f'Running {dataset_name}')
    print('loc', dataset_loc)
    dataset = helpers.load_dataset(dataset_loc).drop(columns=['class'])
    for dimensions in [2, 3, len(dataset.columns)]:
        data_for_dim = dataset.iloc[:, :dimensions]
        optimal = determine_best_cluster_number_based_on_sc(data_for_dim, dataset_name, dataset.columns)
        optimal_clusters['dataset'].append(dataset_name)
        optimal_clusters['optimal_clusters'].append(optimal)
        optimal_clusters['dimensions'].append(dimensions)

Running heart-dataset
loc ../data/heart-dataset/heart_numerical.csv
For n_clusters = 2 The average silhouette_score is : 0.44796477101288795
For n_clusters = 3 The average silhouette_score is : 0.4005555113023231
For n_clusters = 4 The average silhouette_score is : 0.36328012843937796
For n_clusters = 5 The average silhouette_score is : 0.34241483561453595
For n_clusters = 6 The average silhouette_score is : 0.354018087996381
For n_clusters = 7 The average silhouette_score is : 0.358353340818107
Optimal number of clusters :  2
For n_clusters = 2 The average silhouette_score is : 0.9318303546396622
For n_clusters = 3 The average silhouette_score is : 0.9214291084413213
For n_clusters = 4 The average silhouette_score is : 0.7539862090407552
For n_clusters = 5 The average silhouette_score is : 0.4116414211244658
For n_clusters = 6 The average silhouette_score is : 0.4141808371644872
For n_clusters = 7 The average silhouette_score is : 0.3560560500540613
Optimal number of clusters :  2
For

<Figure size 640x480 with 0 Axes>

In [46]:
pd.DataFrame(optimal_clusters)

Unnamed: 0,dataset,optimal_clusters,dimensions
0,heart-dataset,2,2
1,heart-dataset,2,3
2,heart-dataset,3,9
3,seeds-dataset,3,2
4,seeds-dataset,3,3
5,seeds-dataset,2,7
6,circle-dataset,5,2
7,circle-dataset,2,3
8,circle-dataset,2,3
9,line-dataset,2,2
