# In [1]:
# Required imports
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Read the pre-transformed credit card dataset from the hosted CSV file
# into a Pandas DataFrame
ccinfo_default_df = pd.read_csv("https://static.bc-edx.com/mbc/ai/m2/datasets/ccinfo-transformed.csv")

# Preview the first rows. A bare `ccinfo_default_df.head()` is only rendered
# when it is the last expression of a notebook cell, so print it explicitly
# to make the preview visible here.
print(ccinfo_default_df.head())

###################
#   Plot Clusters #
###################

# Draw one scatter plot per feature pair, coloring every point by the
# existing "customer_segments" label:
#   1. "limit_bal" against "age"
#   2. "bill_amt" against "pay_amt"
for x_column, y_column in (("limit_bal", "age"), ("bill_amt", "pay_amt")):
    ccinfo_default_df.plot.scatter(
        x=x_column,
        y=y_column,
        c="customer_segments",
        colormap="winter",
    )

###################
#  Fit PCA Model  #
###################

# Instantiate a PCA model that keeps two principal components
pca = PCA(n_components=2)

# Fit the PCA model on the transformed credit card DataFrame and project the
# rows onto the two components in one step.
# NOTE(review): the frame passed in still contains the "customer_segments"
# column used to color the plots above — confirm it is meant to be a PCA input.
ccinfo_pca = pca.fit_transform(ccinfo_default_df)

# Show the PCA explained variance ratio for reference. The bare attribute
# access was never displayed because it is not the last expression of the
# cell, so print it explicitly.
print(pca.explained_variance_ratio_)

# Create the PCA DataFrame, one named column per principal component
ccinfo_pca_df = pd.DataFrame(
    ccinfo_pca,
    columns=["PCA1", "PCA2"],
)

###################
#   Elbow Method  #
###################

# Candidate numbers of clusters to evaluate
k = list(range(1, 11))

# Fit one KMeans model per candidate k on the PCA DataFrame and collect the
# inertia reported by its `inertia_` attribute (`fit` returns the fitted
# estimator, so the attribute can be read from the chained call)
inertia = [
    KMeans(n_clusters=n_clusters, n_init='auto', random_state=1)
    .fit(ccinfo_pca_df)
    .inertia_
    for n_clusters in k
]

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame. The bare `df_elbow.head(10)` was never displayed
# because it is not the last expression of the cell, so print it explicitly.
print(df_elbow.head(10))

# Plot the Elbow Curve
df_elbow.plot.line(
    x="k",
    y="inertia",
)

# Report, for each consecutive pair of k values, by what percentage the
# inertia drops when moving from the smaller k to the larger one.
k = df_elbow["k"]
inertia = df_elbow["inertia"]

# Walk the underlying arrays pairwise (previous element, next element)
# instead of indexing by position.
k_values = k.to_numpy()
inertia_values = inertia.to_numpy()
for prev_k, next_k, prev_inertia, next_inertia in zip(
    k_values[:-1], k_values[1:], inertia_values[:-1], inertia_values[1:]
):
    percentage_decrease = (prev_inertia - next_inertia) / prev_inertia * 100
    print(f"Percentage decrease from k={prev_k} to k={next_k}: {percentage_decrease:.2f}%")


#######################
#   PCA Segmentation  #
#######################

# Build a KMeans model with 3 clusters
model = KMeans(n_clusters=3, n_init='auto', random_state=0)

# Fit the model on the PCA data, then predict a cluster label for each row
# of that same data
k_3 = model.fit(ccinfo_pca_df).predict(ccinfo_pca_df)

# Build a new DataFrame from the PCA DataFrame with the labels attached as
# the "customer_segments" column; the PCA DataFrame itself is left untouched
ccinfo_pca_predictions_df = ccinfo_pca_df.assign(customer_segments=k_3)

# Plot the two principal components, colored by the predicted segment
ccinfo_pca_predictions_df.plot.scatter(
    x="PCA1",
    y="PCA2",
    c="customer_segments",
    colormap="winter",
)

# Cell output: ModuleNotFoundError: No module named 'sklearn' (the `sklearn` module is provided by the scikit-learn package)