In [None]:
!conda install plotly

In [None]:
!conda install -c pyviz hvplot

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

ModuleNotFoundError: No module named 'hvplot'

In [None]:
# Read the preprocessed iris dataset from the Resources folder
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows
df_iris.head()

In [None]:
# The 4 features are measured on different scales, so standardize every
# column with StandardScaler before running PCA.

# Build the scaler, then fit it and transform the data in one step
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(df_iris)

# Show the first five standardized rows
print(iris_scaled[:5])

In [None]:
# Use PCA to reduce the number of features. n_components=2 asks PCA to keep
# two components, reducing the dataset's 4 features to 2.

# Initialize the PCA model (construction only; it is fitted in a later cell)
pca = PCA(n_components=2)

In [None]:
# NOTE: After the reduction in the next cell, the data is described by a smaller
# set of dimensions called principal components. These new components are the
# two main directions of variation, which retain most of the information in the
# original dataset.

In [None]:
# Apply dimensionality reduction to the scaled dataset.

# Fit the PCA model on the standardized data and project that same data onto
# the two principal components in a single call. The result is wrapped in a
# DataFrame in the next cell.
iris_pca = pca.fit_transform(iris_scaled)

In [None]:
# Wrap the PCA output in a DataFrame with one named column per component.
# `iris_pca` is the array returned by pca.fit_transform above; the original
# cell referenced an undefined name (`data_iris_pca`) and raised NameError.
df_iris_pca = pd.DataFrame(
    data=iris_pca,
    columns=["principal component 1", "principal component 2"],
)

# Preview the first rows
df_iris_pca.head()

In [None]:
# Use explained_variance_ratio_ to learn how much of the dataset's variance
# (information) can be attributed to each principal component.

# Fetch the explained variance ratio of the fitted PCA model (one value per component)
pca.explained_variance_ratio_

In [None]:
# Elbow method on the principal components: fit K-means for K = 1..10 and
# plot inertia against K. The curve is expected to bend at K = 3.

# Candidate values of K
k = list(range(1, 11))

# Inertia of a K-means model fitted for each candidate K
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(df_iris_pca).inertia_
    for n_clusters in k
]

# Tabulate K against inertia and draw the elbow curve
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [None]:
# Cluster the principal-component data with K-means using K = 3.
# Note: K = 2 could also be considered, but the elbow curve changes direction
# more clearly after 3.

# Select the feature columns explicitly. This cell adds a "class" column to
# df_iris_pca below, so fitting on the whole frame would feed the previous
# labels back into the model whenever the cell is re-run.
pc_columns = ["principal component 1", "principal component 2"]
pc_features = df_iris_pca[pc_columns]

# Initialize the K-means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model and get the cluster label of every row in one pass
# (fit_predict returns the fitted model's labels_)
predictions = model.fit_predict(pc_features)

# Add the predicted class column
df_iris_pca["class"] = predictions
df_iris_pca.head()

In [None]:
# Scatter plot of the clusters on the two principal components. With only two
# features the data is easier to analyze than in a 3D plot.

# Axis columns
x_axis, y_axis = "principal component 1", "principal component 2"

# One colour group per predicted class; the class is also shown on hover
df_iris_pca.hvplot.scatter(x=x_axis, y=y_axis, by="class", hover_cols=["class"])