In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [28]:
# Read the iris measurements from the CSV file into a DataFrame
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows
df_iris.head(5)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [29]:
# Standardize the iris features with StandardScaler, then show the first five rows
iris_scaler = StandardScaler()
iris_scaled = iris_scaler.fit_transform(df_iris)
print(iris_scaled[:5])

[[-0.90068117 -1.3412724   1.03205722 -1.31297673]
 [-1.14301691 -1.3412724  -0.1249576  -1.31297673]
 [-1.38535265 -1.39813811  0.33784833 -1.31297673]
 [-1.50652052 -1.2844067   0.10644536 -1.31297673]
 [-1.02184904 -1.3412724   1.26346019 -1.31297673]]


In [30]:
# Set up a PCA model that reduces the four iris features to two components
pca = PCA(n_components=2)

In [31]:
# Fit the PCA model on the scaled data and project it onto the two components
iris_pca = pca.fit_transform(X=iris_scaled)
# Show the first five projected rows
print(iris_pca[:5])

[[-2.26454173  0.5057039 ]
 [-2.0864255  -0.65540473]
 [-2.36795045 -0.31847731]
 [-2.30419716 -0.57536771]
 [-2.38877749  0.6747674 ]]


In [32]:
# Wrap the PCA output in a DataFrame with one named column per component
pca_column_names = ["principal Component 1", "principal component 2"]
df_iris_pca = pd.DataFrame(iris_pca, columns=pca_column_names)
df_iris_pca.head()

Unnamed: 0,principal Component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [33]:
# Fetch the explained variance ratio of each of the two fitted PCA components
pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

In [34]:
# Find the best value for k with the elbow method
inertia = []
k = list(range(1, 10))

# Cluster on the two principal-component columns only. A later cell adds a
# "class" label column to df_iris_pca; selecting the features explicitly keeps
# those labels out of the fit if this cell is re-run afterwards.
pca_features = df_iris_pca[["principal Component 1", "principal component 2"]]

# Calculate the inertia for each candidate number of clusters
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_features)
    inertia.append(km.inertia_)

# Create the elbow curve: inertia plotted against k
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

  f"KMeans is known to have a memory leak on Windows "


In [35]:
# Use only the two principal-component columns as features. This cell adds a
# "class" column to df_iris_pca below, so fitting on the whole frame would feed
# the previous labels back into the model whenever the cell is re-run.
pca_features = df_iris_pca[["principal Component 1", "principal component 2"]]

# Initialize the K-Means model with three clusters and a fixed seed
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(pca_features)

# Predict clusters (kept as its own variable in case later cells read it)
predictions = model.predict(pca_features)

# Add the cluster labels as a new column
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal Component 1,principal component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [38]:
# Scatter plot of the two principal components, grouped by the "class" column
scatter_options = {
    "x": "principal Component 1",
    "y": "principal component 2",
    "hover_cols": ["class"],
    "by": "class",
}
df_iris_pca.hvplot.scatter(**scatter_options)