In [1]:
# Required imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Load the Data Into a Pandas DataFrame

In [2]:
df = pd.read_csv("../Resources/ccinfo_transformed.csv")
df.head()

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary,customer_segments
0,-1.117341,1,24,-0.660703,-0.542779,1,0,0,0,1,2
1,-0.349942,0,26,-0.63637,-0.463994,1,0,0,0,1,2
2,-0.580162,0,34,-0.416808,-0.354013,0,0,0,0,1,0
3,-0.887121,1,37,-0.080152,-0.402077,0,0,0,0,1,0
4,-0.887121,1,57,-0.396855,0.523771,0,0,0,0,1,1


In [2]:
# Read in the CSV file as a Pandas Dataframe



Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary,customer_segments
0,-1.117341,1,24,-0.660703,-0.542779,1,0,0,0,1,2
1,-0.349942,0,26,-0.63637,-0.463994,1,0,0,0,1,2
2,-0.580162,0,34,-0.416808,-0.354013,0,0,0,0,1,0
3,-0.887121,1,37,-0.080152,-0.402077,0,0,0,0,1,0
4,-0.887121,1,57,-0.396855,0.523771,0,0,0,0,1,1


In [4]:
df.hvplot.scatter(
    x="limit_bal",
    y="age",
    by="customer_segments"
)

In [3]:
# Plot the clusters using the "limit_bal" and "age" columns



In [5]:
df.hvplot.scatter(
    x="bill_amt",
    y="pay_amt",
    by="customer_segments"
)

In [4]:
# Plot the clusters using the "bill_amt" and "pay_amt" columns



## Use PCA to reduce the number of factors 

In [7]:
# Import the PCA module
from sklearn.decomposition import PCA

In [55]:
# Instantiate the PCA instance and declare the number of PCA variables
model = PCA(n_components=2)


In [56]:
# Fit the PCA model on the transformed credit card DataFrame
dc_pca = model.fit_transform(df)

# Review the first 5 rows of list data



In [57]:
sum(model.explained_variance_ratio_), model.explained_variance_ratio_

(0.9559803169329603, array([0.93682923, 0.01915109]))

In [36]:
pca_df = pd.DataFrame(dc_pca, columns=["PCA1", "PCA2"])
pca_df

Unnamed: 0,PCA1,PCA2
0,-11.410632,-1.194262
1,-9.424725,-0.757322
2,-1.336207,-0.695344
3,1.678845,-0.766763
4,21.589432,-0.937315
...,...,...
4994,0.674327,-1.059701
4995,-1.294033,-0.219982
4996,9.640978,0.394783
4997,8.748418,2.784093


## PCA explained variance ratio

In [8]:
# Calculate the PCA explained variance ratio



array([0.95017303, 0.01898131])

## Creating the PCA DataFrame

In [9]:
# Create the PCA DataFrame


# Review the PCA DataFrame



Unnamed: 0,PCA1,PCA2
0,-11.410632,-1.194262
1,-9.424725,-0.757322
2,-1.336207,-0.695344
3,1.678845,-0.766763
4,21.589432,-0.937315


## Incorporating the PCA DataFrame into the elbow method

In [38]:
# Create a a list to store inertia values and the values of k
inertia = []
k =list(range(2,11))
for i in k:
    k_model = KMeans(n_clusters=i)
    k_model.fit(pca_df)
    inertia.append(k_model.inertia_)
# Append the value of the computed inertia from the `inertia_` attribute of teh KMeans model instance


# Define a DataFrame to hold the values for k and the corresponding inertia


# Review the DataFrame



In [40]:
pd.DataFrame({"k":k, "inertia":inertia}).set_index("k").hvplot()

In [11]:
# Plot the Elbow Curve



## Segmentation of the PCA data with K-means 

In [41]:
k_model = KMeans(n_clusters=4)
k_model.fit(pca_df)
label = k_model.predict(pca_df)

In [42]:
pca_df["cluster"] = label

In [45]:
pca_df.hvplot.scatter(x="PCA1", y="PCA2", by="cluster")

In [46]:
df["cluster"] = label

In [58]:
df

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary,customer_segments,cluster
0,-1.117341,1,24,-0.660703,-0.542779,1,0,0,0,1,2,1
1,-0.349942,0,26,-0.636370,-0.463994,1,0,0,0,1,2,1
2,-0.580162,0,34,-0.416808,-0.354013,0,0,0,0,1,0,3
3,-0.887121,1,37,-0.080152,-0.402077,0,0,0,0,1,0,3
4,-0.887121,1,57,-0.396855,0.523771,0,0,0,0,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
4994,-1.117341,1,36,-0.392558,-0.422089,0,0,0,0,1,0,3
4995,0.110497,1,34,-0.589218,-0.152107,0,1,0,0,0,0,3
4996,0.263977,1,45,1.115249,-0.163492,1,0,0,0,1,1,0
4997,1.108115,1,44,3.338132,0.760455,0,0,1,0,0,0,0


In [48]:
df.columns

Index(['limit_bal', 'marriage', 'age', 'bill_amt', 'pay_amt', 'default',
       'other', 'post-grad', 'primary', 'secondary', 'customer_segments',
       'cluster'],
      dtype='object')

In [51]:
df.hvplot.scatter(x="limit_bal", y="bill_amt", by="cluster")

In [12]:
# Define the model with 3 clusters


# Fit the model


# Make predictions


# Create a copy of the PCA DataFrame


# Add a class column with the labels



In [13]:
# Plot the clusters

