In [None]:
!pip install -U kaleido

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[K     |████████████████████████████████| 79.9 MB 104 kB/s 
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score, f1_score

from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import plotly.io as pio
pio.renderers.default = 'pdf'

In [None]:
# Paths to the project datasets, in the order the notebook processes them.
# The trailing comment on each entry names the runner used for it and its index
# in this list.  Data8 sits at index 5 (ahead of Data6/Data7) so that the 3-D
# datasets occupy indices 0-5 and the 2-D datasets occupy indices 6-7.
data = [
    '../input/fdaproject1dataset/Data1.csv', # run3D 0
    '../input/fdaproject1dataset/Data2.csv', # run3D 1
    '../input/fdaproject1dataset/Data3.csv', # run3D 2
    '../input/fdaproject1dataset/Data4.csv', #Chain-link Dataset, run3D 3
    '../input/fdaproject1dataset/Data5.csv', #Atom Dataset, run3D 4
    '../input/fdaproject1dataset/Data8.csv',  # run3D 5
    '../input/fdaproject1dataset/Data6.csv', # run2D 6
    '../input/fdaproject1dataset/Data7.csv', #Circular Dataset, run 2D 7
]

# Clustering model associated with each dataset: model[i] pairs with data[i].
# The entries follow the same order as `data` (Data8 at index 5) and use exactly
# the strings that run3D/run2D compare `model_name` against.
model = [
    'k-means', # Data1, run3D 0
    'H-clusters', # Data2, run3D 1
    'k-means', # Data3, run3D 2
    'DBSCAN', #Chain-link Dataset (Data4), run3D 3
    'DBSCAN', #Atom Dataset (Data5), run3D 4
    'H-clusters', # Data8, run3D 5
    'k-means', # Data6, run2D 6
    'DBSCAN', #Circular Dataset (Data7), run 2D 7
]

In [None]:
def label_mapper(og, pred):
    """Relabel predicted cluster ids with the true label each one overlaps most.

    Cluster ids produced by a clustering model are arbitrary, so before they
    can be scored against ground truth each id is replaced by the ground-truth
    label that occurs most often among the samples assigned to that id.

    Parameters
    ----------
    og : array-like
        Ground-truth label of every sample.
    pred : array-like
        Predicted cluster id of every sample (same length as ``og``).  Ids may
        be any sortable values, including negative ones such as ``-1``.

    Returns
    -------
    numpy.ndarray
        One entry per sample: the ground-truth label assigned to that sample's
        predicted cluster.  Ties are broken in favour of the smallest label.

    Raises
    ------
    ValueError
        If ``og`` and ``pred`` do not contain the same number of samples.
    """
    true_arr = np.asarray(og).ravel()
    pred_arr = np.asarray(pred).ravel()
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"og and pred must have the same length, got {true_arr.size} and {pred_arr.size}"
        )
    if pred_arr.size == 0:
        # argmax over an empty contingency table would raise; nothing to map.
        return np.array([], dtype=true_arr.dtype)

    # Work with positions inside the sorted unique label sets rather than the
    # raw label values, so labels need not be 0..k-1 (or even non-negative).
    true_labels, true_idx = np.unique(true_arr, return_inverse=True)
    pred_labels, pred_idx = np.unique(pred_arr, return_inverse=True)

    # counts[r, c] = number of samples with true label r and predicted id c.
    counts = np.zeros((true_labels.size, pred_labels.size), dtype=np.int64)
    np.add.at(counts, (true_idx, pred_idx), 1)

    # For every predicted id pick the dominant true label, then translate the
    # row position back into the actual label value.
    best_true_label = true_labels[counts.argmax(axis=0)]
    return best_true_label[pred_idx]

In [None]:
def run3D(data, i, drop_cols = None, model_name = None, min_samples = 1, eps = 0.3): #for dataset data[0-5]
    """Cluster a 3-feature dataset, score it against its labels and plot the result.

    The frame must contain feature columns ``X1``, ``X2``, ``X3`` and a label
    column ``X4``.  The number of clusters requested from k-means / hierarchical
    clustering is the number of distinct values in ``X4``.  Predicted cluster
    ids are aligned with the true labels through ``label_mapper`` before the
    accuracy is computed.

    Two figures may be shown with the ``pdf`` renderer: a side-by-side 3-D
    scatter of original vs. predicted labels, and (for ``'H-clusters'`` only)
    a dendrogram of the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    i : int
        Dataset index, used only for the figure title (printed as ``i + 1``).
    drop_cols : label or list of labels, optional
        Columns removed before processing.  ``None`` drops nothing.
    model_name : {'k-means', 'H-clusters', 'DBSCAN'}
        Clustering algorithm to run.
    min_samples : int, default 1
        Passed to ``DBSCAN``; ignored by the other models.
    eps : float, default 0.3
        Passed to ``DBSCAN``; ignored by the other models.

    Returns
    -------
    dict
        ``{'y_pred': mapped predictions, 'model': fitted estimator,
        'accuracy': accuracy score}``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if i == 5:  # data[5] is Data8.csv; the title prints i+1, so use 7 to show "DATA : 8"
        i = 7

    feature_cols = ['X1', 'X2', 'X3']

    k_values = len(data['X4'].unique())
    # DataFrame.drop rejects None, so treat the default as "drop nothing".
    df = data.copy() if drop_cols is None else data.drop(drop_cols, axis = 1)
    # Select the features by name so the clustered columns are always the ones
    # that get plotted, regardless of what was (or was not) dropped.
    features = df[feature_cols]

    # `estimator` rather than `model`, to avoid shadowing the notebook-level list.
    if model_name == 'k-means':
        estimator = KMeans(n_clusters=k_values, random_state=0)
    elif model_name == 'H-clusters':
        estimator = AgglomerativeClustering(n_clusters=k_values, linkage="ward")
    elif model_name == 'DBSCAN':
        estimator = DBSCAN(eps=eps, min_samples=min_samples)
    else:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'k-means', 'H-clusters' or 'DBSCAN'"
        )
    estimator.fit(features)

    # Cluster ids are arbitrary; align them with the true labels before scoring.
    df['y_pred'] = label_mapper(df['X4'], estimator.labels_)
    accuracy = accuracy_score(df['X4'], df['y_pred'])

    fig = make_subplots(rows=1, cols=2, specs =[[{'type': 'scene'}, {'type': 'scene'}]],
                        subplot_titles=('Original Labels', 'Predicted Labels'))

    X, Y, Z = df['X1'], df['X2'], df['X3']
    # Same points in both panels; only the colouring / hover labels differ.
    for col, labels in enumerate((df['X4'], df['y_pred']), start=1):
        fig.add_trace(
            go.Scatter3d(x=X, y=Y, z=Z, mode='markers', hovertext = labels, marker = dict(color = labels)),
            row=1, col=col
        )

    fig.update_layout(
        height=600, width=1000,
        title_text= f"<b> DATA : {i+1}    MODEL : {model_name}   ACCURACY SCORE {accuracy*100} % </b>",
        font=dict(
        family="Courier New, monospace",
        size=12,  # Set the font size here
        color="black")
    )
    fig.show(renderer="pdf")

    if model_name == 'H-clusters':
        f = ff.create_dendrogram(features)
        f.update_layout(width=1000, height=700)
        f.show(renderer="pdf")

    return {'y_pred' : df['y_pred'], 'model' : estimator, 'accuracy' : accuracy}

In [None]:
# Colab-only: mount Google Drive at /content/drive/ so files stored there
# (the CSVs read from '/content/drive/MyDrive/...') are reachable from the kernel.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Run k-means and hierarchical clustering on each of the six 3-D datasets
# (data[0] .. data[5]).  The CSVs are read from the mounted Drive project
# folder; the file name for iteration i is taken from data[i] so that a
# different dataset is loaded on every pass.
# NOTE(review): assumes the Drive folder holds the same file names as `data`
# (Data1.csv ... Data5.csv, Data8.csv) - confirm.
DRIVE_DATA_DIR = '/content/drive/MyDrive/IE5374 Workspace/Colab Notebooks/Project_1'
for i in range(6):
    csv_name = data[i].rsplit('/', 1)[-1]
    df = pd.read_csv(f'{DRIVE_DATA_DIR}/{csv_name}')
    df.columns = ['X0', 'X1', 'X2', 'X3', 'X4']
    saves1 = run3D(df, i, ['X0'], model_name = 'k-means')
    saves2 = run3D(df, i, ['X0'], model_name = 'H-clusters')

ValueError: ignored

In [None]:
def run2D(data, i, drop_cols = None, model_name = None, min_samples = 1, eps = 0.3): #for dataset data[6-7]
    """Cluster a 2-feature dataset, score it against its labels and plot the result.

    The frame must contain feature columns ``X1``, ``X2`` and a label column
    ``X3``.  The number of clusters requested from k-means / hierarchical
    clustering is the number of distinct values in ``X3``.  Predicted cluster
    ids are aligned with the true labels through ``label_mapper`` before the
    accuracy is computed.

    Two figures may be shown with the ``pdf`` renderer: a side-by-side 2-D
    scatter of original vs. predicted labels, and (for ``'H-clusters'`` only)
    a dendrogram of the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    i : int
        Dataset number, used only for the figure title (printed as given).
    drop_cols : label or list of labels, optional
        Columns removed before processing.  ``None`` drops nothing.
    model_name : {'k-means', 'H-clusters', 'DBSCAN'}
        Clustering algorithm to run.
    min_samples : int, default 1
        Passed to ``DBSCAN``; ignored by the other models.
    eps : float, default 0.3
        Passed to ``DBSCAN``; ignored by the other models.

    Returns
    -------
    dict
        ``{'y_pred': mapped predictions, 'model': fitted estimator,
        'accuracy': accuracy score}``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    feature_cols = ['X1', 'X2']

    k_values = len(data['X3'].unique())
    # DataFrame.drop rejects None, so treat the default as "drop nothing".
    df = data.copy() if drop_cols is None else data.drop(drop_cols, axis = 1)
    # Select the features by name so the clustered columns are always the ones
    # that get plotted, regardless of what was (or was not) dropped.
    features = df[feature_cols]

    # `estimator` rather than `model`, to avoid shadowing the notebook-level list.
    if model_name == 'k-means':
        estimator = KMeans(n_clusters=k_values, random_state=0)
    elif model_name == 'H-clusters':
        estimator = AgglomerativeClustering(n_clusters=k_values, linkage="ward")
    elif model_name == 'DBSCAN':
        estimator = DBSCAN(eps=eps, min_samples=min_samples)
    else:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'k-means', 'H-clusters' or 'DBSCAN'"
        )
    estimator.fit(features)

    # Cluster ids are arbitrary; align them with the true labels before scoring.
    df['y_pred'] = label_mapper(df['X3'], estimator.labels_)
    accuracy = accuracy_score(df['X3'], df['y_pred'])

    fig = make_subplots(rows=1, cols=2, specs =[[{'type': 'xy'}, {'type': 'xy'}]],
                        subplot_titles=('Original Labels', 'Predicted Labels'))

    X, Y = df['X1'], df['X2']
    # Same points in both panels; only the colouring / hover labels differ.
    for col, labels in enumerate((df['X3'], df['y_pred']), start=1):
        fig.add_trace(
            go.Scatter(x=X, y=Y, mode='markers', hovertext = labels, marker = dict(color = labels)),
            row=1, col=col
        )

    fig.update_layout(
        height=600, width=1000,
        title_text= f"<b> DATA : {i}    MODEL : {model_name}   ACCURACY SCORE {accuracy*100} % </b>",
        font=dict(
        family="Courier New, monospace",
        size=12,  # Set the font size here
        color="black")
    )
    fig.show(renderer="pdf")

    if model_name == 'H-clusters':
        f = ff.create_dendrogram(features)
        f.update_layout(width=1000, height=700)
        f.show(renderer="pdf")

    return {'y_pred' : df['y_pred'], 'model' : estimator, 'accuracy' : accuracy}

In [None]:
# The 2-D datasets sit at the tail of `data` (index 6 onward); run both
# k-means and hierarchical clustering on each of them.
for i, csv_path in enumerate(data[6:], start=6):
    df = pd.read_csv(csv_path).set_axis(['X0', 'X1', 'X2', 'X3'], axis=1)
    saves1 = run2D(data=df, i=i, drop_cols=['X0'], model_name='k-means')
    saves2 = run2D(data=df, i=i, drop_cols=['X0'], model_name='H-clusters')

# Some Alternative Methods which give better clustering

Datasets 4, 5, and 7 are known as the chain-link, atom, and circular datasets, respectively. The complexity of these datasets can be handled by unsupervised models such as spectral clustering, DBSCAN, or kernel-based k-means. Here we have used DBSCAN, and its parameter settings were validated.

In [None]:
# Chain-link dataset (data[3]): load, rename the columns, cluster with DBSCAN.
df = pd.read_csv(data[3]).set_axis(['X0', 'X1', 'X2', 'X3', 'X4'], axis=1)
saves = run3D(data=df, i=3, drop_cols=['X0'], model_name='DBSCAN')

In [None]:
# Atom dataset (data[4]): load, rename the columns, cluster with DBSCAN.
df = pd.read_csv(data[4]).set_axis(['X0', 'X1', 'X2', 'X3', 'X4'], axis=1)
saves = run3D(data=df, i=4, drop_cols=['X0'], model_name='DBSCAN')

In [None]:
# Circular dataset (data[7]): load, rename the columns, cluster with DBSCAN.
df = pd.read_csv(data[7]).set_axis(['X0', 'X1', 'X2', 'X3'], axis=1)
saves = run2D(data=df, i=7, drop_cols=['X0'], model_name='DBSCAN')