In [3]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import os

In [6]:
path_to_datasets = '../../proj/data'
filenames = os.listdir(path_to_datasets)
filenames.sort()
print(filenames)

['SS-A.csv', 'SS-B.csv', 'SS-C.csv', 'SS-D.csv', 'SS-E.csv', 'SS-F.csv', 'SS-G.csv', 'SS-H.csv', 'SS-I.csv', 'SS-J.csv', 'SS-K.csv', 'SS-L.csv', 'SS-M.csv', 'SS-N.csv', 'SS-O.csv']


In [8]:
def normalize_values(min_val, max_val, value):
    return (value - min_val) / (max_val - min_val)

In [9]:
for name in filenames:
    path = path_to_datasets + '/' + name
    data = pd.read_csv(path)

    #x columns do not end in !, +, -
    #y columns end in !, +, -
    X_columns = [col for col in data.columns if not (col.endswith('+') or col.endswith('-') or col.endswith('!'))]
    y_columns = [col for col in data.columns if col.endswith('+') or col.endswith('-') or col.endswith('!')]

    X = data[X_columns]
    y = data[y_columns]

    #replacing any ? with nan
    X.replace('?', np.nan, inplace=True)

    #getting the column names for future use
    column_names = X.columns.tolist()
    y_column_names = y.columns.tolist()
    features = y_column_names

    centroids_data = []

    #using minmax scaler to retain original values
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    #nan values are replaced with the mean of the column
    imputer = SimpleImputer(strategy='mean')
    X = imputer.fit_transform(X)

    #dbscan model
    dbscan = DBSCAN(eps=0.1, min_samples=3)
    dbscan.fit(X)

    #generating cluster labels
    labels = dbscan.labels_
    data['Cluster'] = labels

    unique_labels = np.unique(labels)
    
    for label in unique_labels:
        if label == -1:
            continue  # Skip noise points
        cluster_size = np.sum(labels == label)
        cluster_centroid = np.mean(X[labels == label], axis=0)
        
        #getting back original values 
        cluster_centroid_original_scale = scaler.inverse_transform([cluster_centroid])
        
        cluster_centroid_formatted = [f"{value:.2f}" for value in cluster_centroid_original_scale[0]]
        
        y_centroid = np.mean(y.values[labels == label], axis=0)
        y_centroid_original_scale = y_centroid  # Assuming y values are not scaled
        y_centroid_formatted = [f"{value:.2f}" for value in y_centroid_original_scale]

        #zipping centroid values of x and y columns together
        centroid_data = {'Cluster': label, **dict(zip(column_names, cluster_centroid_formatted)), 
                         **dict(zip(y_column_names, y_centroid_formatted))}
        
        centroids_data.append(centroid_data)

    centroids_df = pd.DataFrame(centroids_data)
    #print(centroids_df)

    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    #print(f'Number of clusters formed: {n_clusters}')

    #converting strings to numeric
    centroids_df = centroids_df.apply(pd.to_numeric, errors='ignore')

    maxValues = centroids_df.max()
    minValues = centroids_df.min()

    d2h = []
    for index, centroid in centroids_df.iterrows():
        d = 0
        for column in features:
            if column.endswith("+"):
                d += (abs(1.0 - normalize_values(minValues[column], maxValues[column], centroid[column])))**2
            else:
                d += (abs(0.0 - normalize_values(minValues[column], maxValues[column], centroid[column])))**2
        d2h.append((d/len(features))**(1/2))
    
    d2h = [round(value, 3) for value in d2h]
    centroids_df['d2h'] = d2h
    
    #Print the centroids DataFrame
    #print(centroids_df)
    result_name = name + 'centroids.csv'
    centroids_df.to_csv(result_name,index=False)

    print("Min d2h: ",min(d2h))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('?', np.nan, inplace=True)
  centroids_df = centroids_df.apply(pd.to_numeric, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('?', np.nan, inplace=True)
  centroids_df = centroids_df.apply(pd.to_numeric, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('?', np.nan, inplace=True)
  centroids_df = centroids_df.apply(pd.to_numeric, errors='ignore')
A value is trying to be set on a copy of a slice from a DataF

   Cluster  Spout_wait  Spliters  Counters  Throughput+  Latency-    d2h
0        0       96.25       1.0       9.5     13105.50    295.55  0.153
1        1       96.25       2.0       9.5     15326.82    267.19  0.056
2        2       96.25       3.0       9.5     16190.46    257.65  0.019
3        3       96.25       4.0       9.5     16255.56    253.92  0.016
4        4       96.25       5.0       9.5     16390.76    253.40  0.010
5        5       96.25       6.0       9.5     16617.69    250.31  0.000
6        6    10000.00       1.0       9.5       424.17   7551.33  1.000
7        7    10000.00       2.0       9.5       555.01   6308.13  0.914
8        8    10000.00       3.0       6.0       583.87   6545.54  0.928
Min d2h:  0.0
    Cluster    A      B    C     A-     B-    d2h
0         0  0.0  15.71  3.0   9.68   6.63  0.239
1         1  0.0  15.71  4.0   9.70   6.61  0.239
2         2  1.0  15.71  3.0   9.91   7.58  0.328
3         3  1.0  15.71  4.0   9.93   7.61  0.332
4     

ValueError: min() arg is an empty sequence