In [8]:
import numpy as np
import pandas as pd
import plotly.express as px
import random
 

#X = np.random.random_integers(0, 1000, 2000) # this works but it publishes an error "This function is deprecated. Please call randint(0, 1000 + 1) instead"
x1 = random.sample(range(0, 1000), 1000) # random.sample cannot select more than the range
X = random.choices(x1, k=2000) # random.choices does sampling with replacement or repeat
n = np.random.normal(0, 0.2, 2000)

Y1 = 0.001 * np.asarray(X)  + np.asarray(n)  # np.asarray() converts the list to numpy array
Y2 = 0.001 * np.asarray(X) + 2 + np.asarray(n)

d1 ={'X':X, 'Y':Y1, 'target':'A'}
df1 = pd.DataFrame(data=d1)

d2 ={'X':X, 'Y':Y2, 'target':'B'}
df2 = pd.DataFrame(data=d2)

df_vertical_stack = pd.concat([df1, df2], axis=0) # Stack the DataFrames on top of each other

fig = px.scatter(df_vertical_stack, x="X", y="Y", color="target")
fig.show()

In [11]:
df_vertical_stack.tail()

Unnamed: 0,X,Y,target,prediction
1995,831,2.967293,B,0
1996,162,2.253098,B,0
1997,738,2.674869,B,0
1998,461,2.592634,B,0
1999,172,2.270229,B,0


In [9]:
from sklearn import cluster, datasets, mixture
from sklearn.cluster import AgglomerativeClustering,KMeans
from sklearn.preprocessing import StandardScaler 

#1 KMeans Clustring
kmeans = KMeans(n_clusters = 2, random_state = 0)
scaler = StandardScaler()
df_vertical_stack['prediction'] = kmeans.fit_predict(scaler.fit_transform(df_vertical_stack[['X','Y']]))
df_vertical_stack['prediction'] = df_vertical_stack['prediction'].astype(str)
df_vertical_stack['target'] = df_vertical_stack['target'].astype(str)
fig_KMeans = px.scatter(df_vertical_stack, x = "X", y = "Y", color = "prediction")
fig_KMeans.show()


In [3]:
from sklearn import cluster, datasets, mixture
from sklearn.cluster import AgglomerativeClustering,KMeans
from sklearn.preprocessing import StandardScaler 

#2 Spectral Clustring
spectral = cluster.SpectralClustering(n_clusters = 2, eigen_solver = 'arpack',affinity = "nearest_neighbors")
scaler = StandardScaler()
df_vertical_stack['prediction'] = spectral.fit_predict(scaler.fit_transform(df_vertical_stack[['X','Y']]))  # A question for Mohammad: Why this clustering doesn't work without scaling? (#df['prediction']=spectral.fit_predict(df[['X','Y']]))
df_vertical_stack['prediction'] = df_vertical_stack['prediction'].astype(str)
df_vertical_stack['target'] = df_vertical_stack['target'].astype(str)
fig_Spectral = px.scatter(df_vertical_stack, x = "X", y = "Y", color = "prediction")
fig_Spectral.show()

In [4]:
from sklearn import cluster, datasets, mixture
from sklearn.cluster import AgglomerativeClustering,KMeans
from sklearn.preprocessing import StandardScaler 

#3 Agglomerative Ward Clustring
aggloclust = AgglomerativeClustering(n_clusters = 2, affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None)
scaler = StandardScaler()
df_vertical_stack['prediction'] = aggloclust.fit_predict(scaler.fit_transform(df_vertical_stack[['X','Y']]))  # A question for Mohammad: Why this clustering doesn't work without scaling? (#df['prediction']=spectral.fit_predict(df[['X','Y']]))
df_vertical_stack['prediction'] = df_vertical_stack['prediction'].astype(str)
df_vertical_stack['target'] = df_vertical_stack['target'].astype(str)
fig_aggloclust = px.scatter(df_vertical_stack, x = "X", y = "Y", color = "prediction")
fig_aggloclust.show()



In [6]:
from sklearn.cluster import AgglomerativeClustering,KMeans
from sklearn import cluster, datasets, mixture
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import numpy as np
import pandas as pd

n_samples=1500
random_state =7
data = datasets.make_blobs(n_samples=n_samples, centers=4, cluster_std=[1.5, 2.5, 2.5,1.5], random_state=random_state)
df_blobs=pd.DataFrame(data=data[0],columns=['x','y'])
df_blobs['label']=data[1]
scaler=StandardScaler()

km = KMeans(n_clusters=4, random_state=0)

sl = cluster.SpectralClustering(n_clusters=4, eigen_solver='arpack',affinity="nearest_neighbors")   

agglo_ward = AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=4)
            
agglo_avg = AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='average', memory=None, n_clusters=4)
           

clustering_methods = [ ('KMeans', km), ('SpectralClustering', sl),('Agglomerative_ward',agglo_ward), ('Agglomerative_average',agglo_avg)]


for name, model in clustering_methods:
    #print(name)
    df_blobs['prediction-'+name] = model.fit_predict(scaler.fit_transform(df_blobs[['x','y']]))
    df_blobs['prediction-'+name] = df_blobs['prediction-'+name].astype(str)

    fig = px.scatter(df_blobs, x="x", y="y", color=df_blobs['prediction-'+name],symbol='label', title=name)
    fig.update_layout(coloraxis_showscale=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    fig.show()

In [7]:
from sklearn import metrics
from sklearn.metrics.cluster import fowlkes_mallows_score
from sklearn.metrics.cluster import contingency_matrix
import plotly.express as px

ARI_matrix = []
AMI_matrix = []
NMI_matrix = []
FMI_matrix = []


for Col_Name in df_blobs.columns[3:]:
       
    # Adjusted Rand Index (ARI) 
    ARI = metrics.adjusted_rand_score(df_blobs['label'], df_blobs[Col_Name])
    ARI_matrix.append(ARI)
    
    # Unadjusted Rand Index (RI)
    #RI = metrics.rand_score(df_blobs['label'], df_blobs[Col_Name])
    
    # Adjusted Mutual Information (AMI)
    AMI = metrics.adjusted_mutual_info_score(df_blobs['label'], df_blobs[Col_Name])
    AMI_matrix.append(AMI)
    
    # Normalized Mutual Information (NMI) 
    NMI = metrics.normalized_mutual_info_score(df_blobs['label'], df_blobs[Col_Name])
    NMI_matrix.append(NMI)
    
    # Fowlkes Mallows Index
    FMI = metrics.fowlkes_mallows_score(df_blobs[Col_Name], df_blobs['label'])
    FMI_matrix.append(FMI)

    d ={'ARI':ARI_matrix, 'AMI':AMI_matrix, 'NMI':NMI_matrix, 'FMI':FMI_matrix}
    df = pd.DataFrame(data=d)   
    
df['Type']=df_blobs.columns[3:]
df2=pd.melt(df, id_vars=['Type'])
fig = px.bar(df2, y=df2['value'], x=df2['variable'], color=df2['Type'], barmode='group', text='value')
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()    