In [1]:
# Import the modules
import pandas as pd
import hvplot.pandas
from pathlib import Path

## Part 1: Create a Pandas DataFrame

In [2]:
# Location of the customer data, relative to the notebook's working directory
file_path = Path("../Resources/customers.csv")

# Load the CSV contents into a DataFrame
customers_df = pd.read_csv(filepath_or_buffer=file_path)

# Display the first five rows as a quick sanity check
customers_df.head(n=5)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
0,1.148534,4.606077,2.699069,-2.661824,1.526433,1.236671,0.211421,1.482896,-4.445627,-1.936831
1,-1.14941,-1.650549,2.530167,-3.227088,0.572138,4.1626,-0.291679,-1.237575,3.604765,-1.635689
2,0.332427,-0.887985,-0.309216,0.399891,0.828492,3.641945,-0.916946,-1.978024,1.056772,-1.882747
3,2.245599,3.826309,0.264039,0.095471,1.98438,0.373991,-0.280279,1.602786,-5.993331,-2.258925
4,0.705503,-1.312329,0.895406,-0.405408,1.116187,3.699562,-1.427985,-1.494409,1.156908,-1.434964


In [3]:
# Call the DataFrame's info() method to check each column's dtype and non-null count
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   feature_1   1000 non-null   float64
 1   feature_2   1000 non-null   float64
 2   feature_3   1000 non-null   float64
 3   feature_4   1000 non-null   float64
 4   feature_5   1000 non-null   float64
 5   feature_6   1000 non-null   float64
 6   feature_7   1000 non-null   float64
 7   feature_8   1000 non-null   float64
 8   feature_9   1000 non-null   float64
 9   feature_10  1000 non-null   float64
dtypes: float64(10)
memory usage: 78.2 KB


In [4]:
# Call the DataFrame's describe() method to compute per-column summary statistics
# (count, mean, std, min, quartiles, max)
customers_df.describe()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-0.022428,0.805748,1.942896,-2.36403,0.85498,1.232422,0.146269,0.833486,-0.53432,-1.219393
std,2.382021,2.335796,1.411307,1.716566,1.742986,3.250231,1.635576,2.039563,4.211831,1.979172
min,-6.259471,-4.649286,-2.894995,-8.735778,-4.641509,-9.11147,-4.260013,-4.911903,-9.522425,-6.083462
25%,-2.091657,-1.214774,1.026128,-3.438149,-0.23531,-0.333722,-0.967569,-0.894817,-4.129561,-2.505366
50%,0.16167,1.096439,1.905107,-2.437602,1.084556,1.367371,-0.222299,1.519069,-0.536849,-1.706372
75%,2.030005,2.513648,2.851613,-1.22973,2.287268,3.637304,1.061269,2.298862,2.626514,-0.553571
max,6.275723,7.955158,5.897102,4.296552,4.74135,8.705423,7.123969,5.789222,10.047819,5.413623


## Part 2: Use the Elbow Method to determine the optimal number of clusters for KMeans

In [5]:
# Import the KMeans, Birch, and AgglomerativeClustering modules from SKLearn
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch

In [6]:
# Empty list that will collect one inertia value per tested cluster count
inertia = list()

# Candidate cluster counts to evaluate: 1 through 10 inclusive
k = [*range(1, 11)]

In [7]:
# Fit a K-means model on "customers_df" for every candidate value in `k` and
# record the inertia (read from the fitted model's `inertia_` attribute).
#
# The list is rebuilt from scratch in this cell so that re-running the cell does
# not keep appending to values left over from a previous execution; otherwise
# `inertia` would grow longer than `k` and the elbow DataFrame could not be built.
inertia = []
for i in k:
    # random_state is fixed so that repeated runs give the same inertia values
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(customers_df)
    inertia.append(k_model.inertia_)

In [8]:
# Pair every tested k with its inertia, then wrap the pairs in a DataFrame
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows
df_elbow.head()

Unnamed: 0,k,inertia
0,1,58103.759171
1,2,32183.537923
2,3,17080.936423
3,4,14890.068176
4,5,12816.235532


In [9]:
# Plot inertia against k to identify the optimal value for k (the "elbow").
#
# The tick positions are read from the plotted data instead of the global `k`:
# a later cell rebinds `k` to a different range, so relying on the global would
# draw ticks that do not match the data if this cell is re-run afterwards.
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=df_elbow["k"].tolist()
)

## Part 3: Segment the data with K-means using the optimal number of clusters

In [10]:
# K-means configured with the cluster count chosen as optimal (3);
# random_state is fixed so repeated runs give the same result
model = KMeans(n_clusters=3, random_state=0)

# Train on the customer data, then label every row with its cluster.
# fit() returns the fitted estimator, so the two steps can be chained.
kmeans_predictions = model.fit(customers_df).predict(customers_df)

## Part 4: Cluster the data using AgglomerativeClustering and Birch

Using your optimal number of clusters found above, additionally estimate clusters by using both `AgglomerativeClustering` and `Birch`. Save each of these models and their results for comparison.

In [11]:
# Agglomerative clustering with the same cluster count used for K-means (3)
agglo_model = AgglomerativeClustering(n_clusters=3)

# Fit on the customer data; the per-row cluster labels are exposed as labels_
agglo_model.fit(customers_df)
agglo_predictions = agglo_model.labels_

In [12]:
# Birch clustering with the same cluster count used for K-means (3),
# fitted on the customer data in the same statement
birch_model = Birch(n_clusters=3).fit(customers_df)

# Assign every customer row to one of the Birch clusters
birch_predictions = birch_model.predict(customers_df)

In [13]:
# Preview the last ten cluster labels predicted by Birch
birch_predictions[-10:]

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 1], dtype=int64)

## Part 5: Compare the cluster results from KMeans, AgglomerativeClustering, and Birch

In [14]:
# One label column per algorithm, keyed by the column name it will receive
segment_labels = {
    "kmeans-segments": kmeans_predictions,
    "agglomerative-segments": agglo_predictions,
    "birch-segments": birch_predictions,
}

# assign() returns a new DataFrame with the extra columns appended,
# leaving customers_df itself unchanged
customers_predictions_df = customers_df.assign(**segment_labels)

# Show the three label columns side by side for the first three rows
customers_predictions_df[list(segment_labels)].head(3)

Unnamed: 0,kmeans-segments,agglomerative-segments,birch-segments
0,1,1,0
1,0,0,1
2,0,0,1


In [15]:
# Scatter plot of "feature_1" against "feature_2", with one colour group per
# K-means segment. A title is set so the figure identifies its algorithm when
# the three comparison plots are viewed together.
customers_predictions_df.hvplot.scatter(
    x="feature_1",
    y="feature_2",
    by="kmeans-segments",
    title="K-means segments"
)

In [16]:
# Scatter plot of "feature_1" against "feature_2", with one colour group per
# agglomerative segment. A title is set so the figure identifies its algorithm
# when the three comparison plots are viewed together.
customers_predictions_df.hvplot.scatter(
    x="feature_1",
    y="feature_2",
    by="agglomerative-segments",
    title="Agglomerative clustering segments"
)

In [17]:
# Scatter plot of "feature_1" against "feature_2", with one colour group per
# Birch segment. A title is set so the figure identifies its algorithm when the
# three comparison plots are viewed together.
customers_predictions_df.hvplot.scatter(
    x="feature_1",
    y="feature_2",
    by="birch-segments",
    title="Birch segments"
)

In [18]:
# Pairwise correlation (pandas default method) between the three label columns.
# NOTE(review): cluster ids are integers assigned independently by each
# algorithm, so a low coefficient may only reflect differently numbered
# clusters rather than genuinely different groupings - interpret with care.
segment_columns = ['kmeans-segments', 'agglomerative-segments', 'birch-segments']
customers_predictions_df[segment_columns].corr()

Unnamed: 0,kmeans-segments,agglomerative-segments,birch-segments
kmeans-segments,1.0,0.982562,0.302117
agglomerative-segments,0.982562,1.0,0.30896
birch-segments,0.302117,0.30896,1.0


### Bonus

In [19]:
# Peek at the first ten cluster labels predicted by one of the algorithms (Birch)
birch_predictions[:10]

array([0, 1, 1, 0, 1, 1, 0, 0, 0, 1], dtype=int64)

In [20]:
# The same information is available from the fitted model's labels_ attribute;
# show its first ten entries
birch_model.labels_[:10]

array([0, 1, 1, 0, 1, 1, 0, 0, 0, 1], dtype=int64)

In [21]:
# One empty list per algorithm, each collecting a score per tested cluster count
score_kmeans, score_agglomerative, score_birch = [], [], []

# Cluster counts to evaluate, starting at 2 and ending at 10 inclusive
k = [*range(2, 11)]

In [22]:
from sklearn import metrics

# Rebuild the score lists inside this cell so that re-running it does not append
# a second set of scores to lists left over from a previous execution.
score_kmeans = []
score_agglomerative = []
score_birch = []

# For every candidate cluster count, fit each of the three algorithms on
# "customers_df" and record the Calinski-Harabasz score of the resulting labels.
#
# The fitted estimators are bound to loop-local names (`*_k`) on purpose: using
# `agglo_model`, `agglo_predictions` or `birch_model` here would overwrite the
# three-cluster results produced earlier in the notebook with the models from
# the final iteration of this loop.
for i in k:
    kmeans_k = KMeans(n_clusters=i, random_state=0)
    kmeans_k.fit(customers_df)
    score_kmeans.append(
        metrics.calinski_harabasz_score(customers_df, kmeans_k.labels_)
    )

    agglo_k = AgglomerativeClustering(n_clusters=i)
    agglo_k.fit(customers_df)
    score_agglomerative.append(
        metrics.calinski_harabasz_score(customers_df, agglo_k.labels_)
    )

    birch_k = Birch(n_clusters=i)
    birch_k.fit(customers_df)
    score_birch.append(
        metrics.calinski_harabasz_score(customers_df, birch_k.labels_)
    )

In [23]:
# Calinski-Harabasz scores for K-means, one per tested cluster count.
# Shown as a bare last expression, matching the two score cells that follow.
score_kmeans

[803.7767901000835,
 1197.2339591364607,
 963.5244943567246,
 878.984431645905,
 781.1913330352697,
 722.008246754843,
 678.3825232189985,
 638.9870291772176,
 603.5166505532345]

In [24]:
# Calinski-Harabasz scores for AgglomerativeClustering, one per tested cluster count
score_agglomerative

[793.1761769443768,
 1173.3765904855773,
 920.430407435551,
 783.1374540348882,
 698.3124513125239,
 642.0342150282685,
 609.5331449471877,
 573.5727292902812,
 542.4260224059782]

In [25]:
# Calinski-Harabasz scores for Birch, one per tested cluster count
score_birch

[792.7549736617844,
 1172.1940395784054,
 905.8303632361597,
 807.3524405928957,
 710.299103155839,
 650.134014299598,
 601.7209094043105,
 569.5499222834262,
 533.4727554559031]

**Bonus Question:** If larger metric values indicate a better number of clusters, what cluster count is best? Does it vary by the algorithm selected?

>**Sample Answer**: In each of the three lists, the highest value is the second entry, which corresponds to the three-cluster count because the tested cluster counts start at two. Based on this metric, three clusters would actually be sufficient to classify these customers, regardless of which of these three algorithms is used.