In [19]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [4]:
# Load data
file_path = "Resources/shopping_data.csv"
df_shopping = pd.read_csv(file_path)
df_shopping.head(10)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0
5,6,No,22.0,17000,76.0
6,7,No,35.0,18000,6.0
7,8,No,23.0,18000,94.0
8,9,Yes,64.0,19000,3.0
9,10,No,30.0,19000,72.0


In [5]:
# Drop null
df_shopping = df_shopping.dropna()

In [6]:
# Find dupes
print(f"Duplicate entries: {df_shopping.duplicated().sum()}")

Duplicate entries: 0


In [7]:
df_shopping.drop(columns=["CustomerID"], inplace=True)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [8]:
# Transform string column
def change_string(member):
    if member == "Yes":
        return 1
    else:
        return 0
    
df_shopping["Card Member"] = df_shopping["Card Member"].apply(change_string)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [9]:
# Transform annual income
df_shopping["Annual Income"] = df_shopping["Annual Income"] / 1000
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [10]:
# Save cleanded data
file_path = "c:/users/zwa00/Class/Cryptocurrencies/Resources/shopping_data_cleaned.csv"
df_shopping.to_csv(file_path, index=False)


In [11]:
# Load data
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping = pd.read_csv(file_path)
df_shopping.head(10)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
5,0,22.0,17.0,76.0
6,0,35.0,18.0,6.0
7,0,23.0,18.0,94.0
8,1,64.0,19.0,3.0
9,0,30.0,19.0,72.0


In [12]:
df_shopping.hvplot.scatter(x="Annual Income", y="Spending Score (1-100)")

In [13]:
# Function to cluster and plot dataset
def test_cluster_amount(df, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model

    # Fitting model
    model.fit(df)

    # Add a new class column to df_iris
    df["class"] = model.labels_

In [16]:
test_cluster_amount(df_shopping, 7)
df_shopping.hvplot.scatter(x="Annual Income", y="Spending Score (1-100)", by="class")

In [17]:
fig = px.scatter_3d(
    df_shopping,
    x="Annual Income",
    y="Spending Score (1-100)",
    z="Age",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [18]:
# Find num of K values using elbow curve

inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_shopping)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [20]:
# Create the elbow curve in hvplot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [21]:
def get_clusters(k, data):
    # Create a copy of the DataFrame
    data = data.copy()

    # Initialize the K-Means model
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model
    model.fit(data)

    # Predict clusters
    predictions = model.predict(data)

    # Create return DataFrame with predicted clusters
    data["class"] = model.labels_

    return data

In [22]:
five_clusters = get_clusters(5, df_shopping)
five_clusters.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100),class
0,1,19.0,15.0,39.0,0
1,1,21.0,15.0,81.0,4
2,0,20.0,16.0,6.0,0
3,0,23.0,16.0,77.0,4
4,0,31.0,17.0,40.0,0


In [24]:
six_clusters = get_clusters(6, df_shopping)
six_clusters.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100),class
0,1,19.0,15.0,39.0,5
1,1,21.0,15.0,81.0,3
2,0,20.0,16.0,6.0,5
3,0,23.0,16.0,77.0,3
4,0,31.0,17.0,40.0,5


In [25]:
# Plot 2D scatter w/ x="Annual Income" and y="Speding score 1-100"
five_clusters.hvplot.scatter(x='Annual Income', y='Spending Score (1-100)', by='class')

In [26]:
# Plot the 3D-scatter with x="Annual Income", y="Spending Score (1-100)" and z="Age"
fig = px.scatter_3d(
    five_clusters,
    x="Age",
    y="Spending Score (1-100)",
    z="Annual Income",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [27]:
# Plot 2D scatter w/ x="Annual Income" and y="Speding score 1-100"
six_clusters.hvplot.scatter(x='Annual Income', y='Spending Score (1-100)', by='class')

In [28]:
# Plotting the 3D-Scatter with x="Annual Income", y="Spending Score (1-100)" and z="Age"
fig = px.scatter_3d(
    six_clusters,
    x="Age",
    y="Spending Score (1-100)",
    z="Annual Income",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [29]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [30]:
# Loading the preprocessed iris dataset
file_path = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(file_path)
df_iris.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [31]:
# Standardize data with StandardScaler
iris_scaled = StandardScaler().fit_transform(df_iris)
print(iris_scaled[0:5])

[[-0.90068117  1.03205722 -1.3412724  -1.31297673  0.19983354]
 [-1.14301691 -0.1249576  -1.3412724  -1.31297673  0.19983354]
 [-1.38535265  0.33784833 -1.39813811 -1.31297673  0.19983354]
 [-1.50652052  0.10644536 -1.2844067  -1.31297673  0.19983354]
 [-1.02184904  1.26346019 -1.3412724  -1.31297673  0.19983354]]


In [32]:
# Initialize PCA model
pca = PCA(n_components=2)

In [33]:
# Get two principal components for the iris data.
iris_pca = pca.fit_transform(iris_scaled)

In [34]:
df_iris_pca = pd.DataFrame(data=iris_pca, columns=["principal component 1", "principal component 2"])
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.182256,-0.812917
1,-2.051688,0.016021
2,-2.316445,-0.255681
3,-2.263657,-0.06775
4,-2.298328,-0.946665


In [35]:
# Fetch the explained variance ratio
pca.explained_variance_ratio_

array([0.59174783, 0.28586907])

In [36]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris_pca)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [37]:
# Initialize the K-means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(df_iris_pca)

# Predict clusters
predictions = model.predict(df_iris_pca)

# Add the predicted class columns
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.182256,-0.812917,0
1,-2.051688,0.016021,0
2,-2.316445,-0.255681,0
3,-2.263657,-0.06775,0
4,-2.298328,-0.946665,0


In [38]:
df_iris_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)

In [39]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas

In [41]:
# Load data
file = "Resources/new_iris_data.csv"
df_iris = pd.read_csv(file)
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [43]:
import plotly.figure_factory as ff

In [44]:
# Create the dendrogram
fig = ff.create_dendrogram(df_iris_pca, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()

In [45]:
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(df_iris_pca)

In [46]:
# Add a new class clolumn to df_iris
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.182256,-0.812917,1
1,-2.051688,0.016021,1
2,-2.316445,-0.255681,1
3,-2.263657,-0.06775,1
4,-2.298328,-0.946665,1


In [48]:
df_iris_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class" 
    
)