In [None]:
import pandas as pd

# Load the feature table from data-preprocessed.csv; the first CSV column
# becomes the row index, so it is not treated as a feature.
df_model = pd.read_csv('../data/data-preprocessed.csv',index_col=0)
# Show the first rows as the cell output.
df_model.head()

# Hierarchical Clustering

In [2]:
from sklearn.metrics import silhouette_score, silhouette_samples
from matplotlib import pyplot as plt
import numpy as np

from sklearn.decomposition import PCA
import seaborn as sns

from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted agglomerative clustering model.

    A SciPy-style linkage matrix ``[child_a, child_b, distance, size]`` is
    assembled from the model's merge tree and passed to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``children_``, ``distances_`` and ``labels_``.
    **kwargs
        Forwarded to ``dendrogram``. ``color_threshold`` defaults to 10000
        and can be overridden by the caller.
    """
    n_samples = len(model.labels_)

    # counts[i] = number of original samples under the node created by merge i.
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Indices >= n_samples refer to the node built by an earlier merge.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Only a default: passing it as a literal keyword next to **kwargs would
    # raise "got multiple values for keyword argument" when the caller sets it.
    kwargs.setdefault("color_threshold", 10000)

    plt.title("Hierarchical Clustering Dendrogram")
    dendrogram(linkage_matrix, **kwargs)
    plt.xlabel("Number of points in node (or index of point if no parenthesis).")
    plt.show()

# Project the feature table onto three principal components. The resulting
# array is module-level state that show_pca reads for every clustering it draws.
pca = PCA(n_components=3)
pca_fit = pca.fit_transform(df_model)

def show_clusters_size(clusters):
    """Print a ``{label: number of samples}`` mapping for a clustering.

    Parameters
    ----------
    clusters : array-like
        Cluster label of every sample.
    """
    labels, sizes = np.unique(clusters, return_counts=True)
    # tolist() yields built-in scalars; NumPy >= 2 would otherwise print each
    # entry as np.int64(...), which makes the summary hard to read.
    print(dict(zip(labels.tolist(), sizes.tolist())))

def show_pca(clusters):
    """Scatter the module-level PCA projection, coloured by cluster.

    Draws the three pairwise views (PC1/PC2, PC1/PC3, PC2/PC3) on a 2x2 grid
    and removes the unused fourth axes.

    Parameters
    ----------
    clusters : array-like
        Cluster label of every sample, used as the ``hue`` of each scatter.
    """
    components = pd.DataFrame(data=pca_fit, columns=['PC1', 'PC2', 'PC3'])
    figure, grid = plt.subplots(2, 2, figsize=(15, 14))

    # (x component, y component, target axes) for each of the three views.
    panels = (
        ('PC1', 'PC2', grid[0][0]),
        ('PC1', 'PC3', grid[0][1]),
        ('PC2', 'PC3', grid[1][0]),
    )
    for x_name, y_name, panel in panels:
        sns.scatterplot(
            data=components,
            x=x_name,
            y=y_name,
            palette='Set1',
            hue=clusters,
            ax=panel,
        )

    # Only three views exist, so the bottom-right axes stays empty.
    figure.delaxes(grid[1][1])

def show_parallel_coordinates(clusters):
    """Draw a parallel-coordinates plot of ``df_model`` coloured by cluster.

    The labels are attached to a copy of ``df_model`` as a ``Cluster`` column;
    the module-level frame itself is left untouched.

    Parameters
    ----------
    clusters : array-like
        Cluster label of every row of ``df_model``.
    """
    _, axis = plt.subplots(1, 1, figsize=(8, 7))
    # assign() returns a new frame, so df_model never gains the extra column.
    labelled = df_model.assign(Cluster=clusters)
    set1_colors = sns.color_palette('Set1')
    pd.plotting.parallel_coordinates(labelled, 'Cluster', color=set1_colors, ax=axis)

def show_mean_values(clusters):
    """Print the mean of every ``df_model`` column within each cluster.

    Parameters
    ----------
    clusters : array-like
        Cluster label of every row of ``df_model``; used as the grouping key.
    """
    cluster_means = df_model.groupby(clusters).mean()
    print(cluster_means)

def show_silhouette_scores(clusters, k):
    """Print silhouette diagnostics for a clustering of ``df_model``.

    Printed values: the overall silhouette score, the mean silhouette of each
    cluster, the standard deviation of those means, and the number of samples
    with a negative silhouette.

    Parameters
    ----------
    clusters : array-like
        Cluster label of every row of ``df_model``.
    k : int
        Number of clusters. Kept for backward compatibility only: the
        per-cluster means are taken over the labels actually present in
        ``clusters``, so gaps in the label values (e.g. {0, 2, 3}) no longer
        produce a NaN mean or drop a cluster.
    """
    labels = np.asarray(clusters)

    # One pairwise-distance pass is enough: the overall score is defined as
    # the mean of the per-sample silhouette values.
    sample_values = silhouette_samples(df_model, labels, metric='euclidean')
    score = float(np.mean(sample_values))
    print('Silhouette Score: %.3f' % score)

    # float() keeps the printed list readable under NumPy >= 2 scalar reprs.
    means_lst = [
        float(sample_values[labels == label].mean())
        for label in np.unique(labels)
    ]
    print("Silhouette mean for each cluster:", means_lst)
    print("Std of silhouette mean:", np.std(means_lst))
    print("Number of negative silhouette values:", np.sum(sample_values < 0, axis=0))

# AGNES - Complete Linkage

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Build the full complete-linkage merge tree (no fixed number of clusters) and
# plot it. plot_dendrogram reads model.distances_, which scikit-learn fills in
# when distance_threshold is set.
# `affinity=` is intentionally not passed: it was removed in scikit-learn 1.4
# and euclidean distance is the default in every version.
AGNES = AgglomerativeClustering(n_clusters=None, linkage='complete', distance_threshold=0)
model = AGNES.fit(df_model)
plot_dendrogram(model, truncate_mode="lastp", p=6)

In [None]:
# Candidate numbers of clusters compared in the following cells: 3, 4 and 5.
Ks = range(3,6)

In [None]:
# Cluster sizes for each candidate k (complete linkage).
for n_clusters in Ks:
    # Euclidean distance is the estimator default; the `affinity=` keyword
    # (removed in scikit-learn 1.4) is therefore not passed.
    AGNES = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete').fit(df_model)
    Clusters = AGNES.labels_
    show_clusters_size(Clusters)

In [None]:
# PCA scatter plots for each candidate k (complete linkage).
for n_clusters in Ks:
    # Euclidean distance is the estimator default; the `affinity=` keyword
    # (removed in scikit-learn 1.4) is therefore not passed.
    AGNES = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete').fit(df_model)
    Clusters = AGNES.labels_
    show_pca(Clusters)

In [None]:
# Parallel-coordinates plot for each candidate k (complete linkage).
for n_clusters in Ks:
    # Euclidean distance is the estimator default; the `affinity=` keyword
    # (removed in scikit-learn 1.4) is therefore not passed.
    AGNES = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete').fit(df_model)
    Clusters = AGNES.labels_
    show_parallel_coordinates(Clusters)

In [None]:
# Per-cluster feature means for each candidate k (complete linkage).
for n_clusters in Ks:
    # Euclidean distance is the estimator default; the `affinity=` keyword
    # (removed in scikit-learn 1.4) is therefore not passed.
    AGNES = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete').fit(df_model)
    Clusters = AGNES.labels_
    show_mean_values(Clusters)

# AGNES - Ward Linkage

In [None]:
# Build the full Ward-linkage merge tree (no fixed number of clusters) and
# plot it. plot_dendrogram reads model.distances_, which scikit-learn fills in
# when distance_threshold is set.
# `affinity=` is intentionally not passed: it was removed in scikit-learn 1.4
# and euclidean distance is the default in every version.
AGNES = AgglomerativeClustering(n_clusters=None, linkage='ward', distance_threshold=0)
model = AGNES.fit(df_model)
plot_dendrogram(model, truncate_mode="lastp", p=4)

In [None]:
# Candidate numbers of clusters for Ward linkage: 3, 4 and 5
# (same range as the complete-linkage section).
Ks = range(3,6)

In [None]:
# Cluster sizes for each candidate k (Ward linkage).
for n_clusters in Ks:
    # Euclidean distance is the estimator default; the `affinity=` keyword
    # (removed in scikit-learn 1.4) is therefore not passed.
    AGNES = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward').fit(df_model)
    Clusters = AGNES.labels_
    show_clusters_size(Clusters)

In [None]:
# PCA scatter plots for each candidate k (Ward linkage).
for n_clusters in Ks:
    # Euclidean distance is the estimator default; the `affinity=` keyword
    # (removed in scikit-learn 1.4) is therefore not passed.
    AGNES = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward').fit(df_model)
    Clusters = AGNES.labels_
    show_pca(Clusters)

In [None]:
# Parallel-coordinates plot for each candidate k (Ward linkage).
for n_clusters in Ks:
    # Euclidean distance is the estimator default; the `affinity=` keyword
    # (removed in scikit-learn 1.4) is therefore not passed.
    AGNES = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward').fit(df_model)
    Clusters = AGNES.labels_
    show_parallel_coordinates(Clusters)

In [None]:
# Per-cluster feature means for each candidate k (Ward linkage).
for n_clusters in Ks:
    # Euclidean distance is the estimator default; the `affinity=` keyword
    # (removed in scikit-learn 1.4) is therefore not passed.
    AGNES = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward').fit(df_model)
    Clusters = AGNES.labels_
    show_mean_values(Clusters)

# AGNES - Single Linkage

In [None]:
# Build the full single-linkage merge tree (no fixed number of clusters) and
# plot it. plot_dendrogram reads model.distances_, which scikit-learn fills in
# when distance_threshold is set.
# `affinity=` is intentionally not passed: it was removed in scikit-learn 1.4
# and euclidean distance is the default in every version.
AGNES = AgglomerativeClustering(n_clusters=None, linkage='single', distance_threshold=0)
model = AGNES.fit(df_model)
plot_dendrogram(model, truncate_mode="lastp", p=100)

# BIRCH

In [None]:
from sklearn.cluster import Birch

# Grid over radius threshold and branching factor: print the resulting
# cluster sizes for every combination.
for threshold in [1.5,1.6,1.7,1.8]:
    for branching_factor in (2,3,4):
        BIRCH = Birch(branching_factor=branching_factor, n_clusters=None, threshold=threshold).fit(df_model)
        Clusters = BIRCH.predict(df_model)
        unique, counts = np.unique(Clusters, return_counts=True)
        # tolist() gives built-in ints; NumPy >= 2 would otherwise print each
        # entry as np.int64(...).
        sizes = dict(zip(unique.tolist(), counts.tolist()))
        print("For RadiusThreshold =",threshold,"and L =",branching_factor,"we have:",sizes)

In [None]:
# BIRCH settings for the cells below: the radius thresholds to compare, and L,
# which is passed as Birch's branching_factor by the cells that reference it.
RadiusThresholdRange = [1.5,1.6,1.7,1.8]
L = 4

In [None]:
# One set of PCA scatter plots per radius threshold.
for radius in RadiusThresholdRange:
    BIRCH = Birch(threshold=radius, branching_factor=L, n_clusters=None)
    BIRCH.fit(df_model)
    Clusters = BIRCH.predict(df_model)
    show_pca(Clusters)

In [None]:
# Silhouette diagnostics per radius threshold.
for threshold in RadiusThresholdRange:
    BIRCH = Birch(branching_factor=L, n_clusters=None, threshold=threshold)
    BIRCH.fit(df_model)
    # The predicted ids are not guaranteed to be contiguous. return_inverse
    # relabels the samples 0..k-1, so k = len(unique) matches the label values
    # that show_silhouette_scores receives. Silhouette values do not depend on
    # the label names, only on the grouping.
    unique, Clusters = np.unique(BIRCH.predict(df_model), return_inverse=True)
    print("For threshold = %.1f we have:" % (threshold))
    show_silhouette_scores(Clusters, len(unique))
# No Visualizer for BIRCH

In [None]:
# Parallel-coordinates plot per radius threshold.
for threshold in RadiusThresholdRange:
    # Uses the configured branching factor L instead of a hard-coded 10, so
    # the plots describe the same clusterings as the PCA and silhouette cells.
    # NOTE(review): confirm the 10 was not intentional.
    BIRCH = Birch(branching_factor=L, n_clusters=None, threshold=threshold)
    BIRCH.fit(df_model)
    Clusters = BIRCH.predict(df_model)
    show_parallel_coordinates(Clusters)

In [None]:
# Per-cluster feature means per radius threshold.
for threshold in RadiusThresholdRange:
    # Uses the configured branching factor L instead of a hard-coded 10, so
    # the means describe the same clusterings as the PCA and silhouette cells.
    # NOTE(review): confirm the 10 was not intentional.
    BIRCH = Birch(branching_factor=L, n_clusters=None, threshold=threshold)
    BIRCH.fit(df_model)
    Clusters = BIRCH.predict(df_model)
    show_mean_values(Clusters)

In [None]:
# `time` and `warnings` are used below but were never imported anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
import time
import warnings
from itertools import cycle, islice

from sklearn import cluster

# Side-by-side comparison of the four linkage criteria (5 clusters each),
# drawn on a 2-D PCA projection of the data.
plt.figure(figsize=(35, 6))

# Dedicated names for the 2-D projection: rebinding `pca` / `pca_fit` here
# would replace the 3-component projection that show_pca expects and break
# any later call to it.
pca_2d = PCA(n_components=2)
pca_fit_2d = pca_2d.fit_transform(df_model)
# The projection is the same for every panel, so build the frame once.
df_pca2 = pd.DataFrame(data=pca_fit_2d, columns=['PC1', 'PC2'])
plot_num = 1

X = df_model

# Set up cluster parameters
params = {"n_clusters": 5}
ward = cluster.AgglomerativeClustering(
    n_clusters=params["n_clusters"], linkage="ward"
)
complete = cluster.AgglomerativeClustering(
    n_clusters=params["n_clusters"], linkage="complete"
)
average = cluster.AgglomerativeClustering(
    n_clusters=params["n_clusters"], linkage="average"
)
single = cluster.AgglomerativeClustering(
    n_clusters=params["n_clusters"], linkage="single"
)

clustering_algorithms = (
    ("Single Linkage", single),
    ("Average Linkage", average),
    ("Complete Linkage", complete),
    ("Ward Linkage", ward),
)

# Colour cycle shared by all panels; sliced per panel to the number of labels.
base_colors = [
    "#377eb8",
    "#ff7f00",
    "#4daf4a",
    "#f781bf",
    "#a65628",
    "#984ea3",
    "#999999",
    "#e41a1c",
    "#dede00",
]

for name, algorithm in clustering_algorithms:
    t0 = time.time()
    # Silence the connectivity-matrix warning during fitting. None of the
    # estimators above is given a connectivity matrix, so this filter is
    # presumably a no-op here and is kept only as a safeguard.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the "
            + "connectivity matrix is [0-9]{1,2}"
            + " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning,
        )
        algorithm.fit(X)
    t1 = time.time()

    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)

    plt.subplot(1, len(clustering_algorithms), plot_num)
    plt.title(name, size=18)

    # One colour per label value 0..max(y_pred).
    colors = np.array(list(islice(cycle(base_colors), int(max(y_pred) + 1))))
    plt.scatter(
        df_pca2.iloc[:, 0], df_pca2.iloc[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors[y_pred], edgecolor="k"
    )
    # Fit time in the lower-right corner of the panel.
    plt.text(
        0.99,
        0.01,
        ("%.2fs" % (t1 - t0)).lstrip("0"),
        transform=plt.gca().transAxes,
        size=15,
        horizontalalignment="right",
    )
    plot_num += 1

plt.show()