## Clustering

In [None]:
%matplotlib inline  
# This line enables the inline plotting feature in Jupyter Notebook
import numpy as np  # Importing the numpy library for array and matrix operations
import seaborn;seaborn.set()  # Setting the plotting style to seaborn
import pylab as pl  # Importing the pylab module from matplotlib for creating plots
import matplotlib.pyplot as plt  # Importing the pyplot module from matplotlib for creating plots
from sklearn.datasets import make_blobs  # Importing the make_blobs function from scikit-learn to generate synthetic datasets
from sklearn.cluster import KMeans  # Importing the KMeans class from scikit-learn for performing K-Means clustering
from sklearn import datasets  # Importing the datasets module from scikit-learn for accessing pre-loaded datasets
from sklearn.decomposition import PCA  # Importing the PCA class from scikit-learn for performing Principal Component Analysis


In [None]:
# Cluster the Iris measurements with K-Means and visualise the result on a
# 2-D PCA projection of the data.
iris = datasets.load_iris()  # Load the Iris dataset bundled with scikit-learn

X, y = iris.data, iris.target  # Feature matrix X and target labels y

# The PCA projection is used for plotting only; clustering below runs on the
# full feature matrix X.
pca = PCA(n_components=2)

pca.fit(X)  # Fit the PCA model to the data

X_reduced = pca.transform(X)  # Project the data onto the 2 fitted components

print("Reduced dataset shape:", X_reduced.shape)

# `algorithm='auto'` was deprecated in scikit-learn 1.1 and removed in 1.3,
# where passing it raises a parameter-validation error. Omitting the argument
# selects the default algorithm on every version.
# `random_state` is fixed so the cluster labels (and therefore the colours in
# the plot) are reproducible across runs, matching the other cells.
k_means = KMeans(copy_x=True, init='k-means++',
                 max_iter=300, n_clusters=2, n_init=10,
                 random_state=0)

# Fit on the original features and get the cluster index of each sample.
y_pred = k_means.fit_predict(X)

# Scatter plot of the PCA projection, coloured by predicted cluster.
pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_pred,
           cmap='RdYlBu')
pl.xlabel('Principal component 1')
pl.ylabel('Principal component 2')
pl.title('K-Means clusters of Iris (PCA projection)')
pl.show()


In [None]:
# Build a synthetic 2-feature dataset of 150 samples around 3 centres.
# random_state=0 makes the generated sample reproducible.
X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)

# Show the raw, unlabelled points as white circles with a black outline.
plt.scatter(X[:, 0], X[:, 1],
            s=50, marker='o',
            c='white', edgecolor='black')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# K-Means with 3 clusters and random centroid initialisation.
# The seed is fixed so the resulting labels are reproducible.
km = KMeans(n_clusters=3,
            init='random',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)

# Fit on X and keep the cluster index assigned to each sample.
y_km = km.fit_predict(X)

In [None]:
# Draw each of the 3 clusters with its own colour and marker shape.
cluster_styles = [
    ('lightgreen', 's'),
    ('orange', 'o'),
    ('lightblue', 'v'),
]
for cluster_id, (colour, shape) in enumerate(cluster_styles):
    members = X[y_km == cluster_id]  # samples assigned to this cluster
    plt.scatter(
        members[:, 0], members[:, 1],
        s=50, c=colour,
        marker=shape, edgecolor='black',
        label=f'cluster {cluster_id + 1}'
    )

# Overlay the fitted centroids as large red stars.
centres = km.cluster_centers_
plt.scatter(
    centres[:, 0], centres[:, 1],
    s=250, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)

plt.legend(scatterpoints=1)
plt.grid()
plt.show()


In [None]:
# Elbow method: fit K-Means for 1..10 clusters and record each model's
# inertia (stored here as the "distortion") to compare cluster counts.
# NOTE: the loop rebinds `km`; once this cell has run, `km` refers to the
# model fitted with the largest cluster count, not the 3-cluster model
# created earlier in the notebook.
cluster_counts = range(1, 11)
distortions = []
for i in cluster_counts:
    km = KMeans(n_clusters=i,
                init='random',
                n_init=10,
                max_iter=300,
                tol=1e-04,
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)

# Distortion against number of clusters.
plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()