## Import Packages

In [73]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Import The Data

In [74]:
def wrangle(filepath):
    #read_file_to dataframe
    df=pd.read_csv(filepath)
    mask= (df["TURNFEAR"]==1) & (df["NETWORTH"]< 2e6)
    df=df[mask]
    return df

In [75]:
df = wrangle("SCFP2019.csv")
print(df.shape)
df.head()

(4418, 351)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
5,2,21,3790.476607,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
6,2,22,3798.868505,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,3,2,2
7,2,23,3799.468393,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
8,2,24,3788.076005,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
9,2,25,3793.066589,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2


## Explore The Data

In [71]:
# Calculate variance, get 5 largest features
Top5Var = df.var().sort_values().tail(5)
print(Top5Var)

NHNFIN      2.254163e+10
HOUSES      2.388459e+10
NETWORTH    4.847029e+10
NFIN        5.713939e+10
ASSET       8.303967e+10
dtype: float64


In [77]:
# Create horizontal bar chart of `top_ten_var`
fig = px.bar(
    x=Top5Var,
    y=Top5Var.index,
    title="SCF: High Variance Features"
            )
fig.update_layout(xaxis_title="Variance",yaxis_title="Feature")
fig.show()

In [78]:
Top5Var= Top5Var.tail(5).index.to_list()
Top5Var

['NHNFIN', 'HOUSES', 'NETWORTH', 'NFIN', 'ASSET']

## Normalization The Data

In [79]:
X = df[Top5VarT]
print("X shape:", X.shape)
X.head(2)

X shape: (4418, 5)


Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
5,12200.0,-6710.0,0.0,3900.0,5490.0
6,12600.0,-4710.0,0.0,6300.0,7890.0


In [80]:
X_summary = X.describe().loc[['mean', 'std']].astype(int)
X_summary

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
mean,72701,76387,74530,117330,149089
std,135950,220159,154546,239038,288166


### Check Duplicates And Nulls

In [81]:
print(f"Number Of Duplicated Values Is : {X.duplicated().sum()}")

Number Of Duplicated Values Is : 1277


In [82]:
X.drop_duplicates(inplace=True, ignore_index=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [83]:
print(f"Number Of Duplicated Values Is : {X.duplicated().sum()}")

Number Of Duplicated Values Is : 0


In [84]:
X.head(2)

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,12200.0,-6710.0,0.0,3900.0,5490.0
1,12600.0,-4710.0,0.0,6300.0,7890.0


In [85]:
X.shape

(3141, 5)

#### Check Nulls

In [86]:
X.isna().sum()

DEBT        0
NETWORTH    0
HOUSES      0
NFIN        0
ASSET       0
dtype: int64

### Check Outliers

In [87]:
features = ['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET']

for feature in features:
    fig = px.box(
        data_frame=X,
        x=feature,
        title=f"Distribution of {feature}"
    )
    fig.update_layout(xaxis_title="Value [$]")
    fig.show()

### Dealing With Outliers

In [88]:
Scaler = StandardScaler()

XS = Scaler.fit_transform(X)

XS = pd.DataFrame(XS,columns=X.columns)

In [89]:
print("X shape:", XS.shape)
XS.head(2)

X shape: (3141, 5)


Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,-0.512534,-0.398979,-0.529599,-0.539286,-0.559401
1,-0.509517,-0.390282,-0.529599,-0.528988,-0.550994


In [91]:
features = ['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET']

for feature in features:
    fig = px.box(
        data_frame=XS,
        x=feature,
        title=f"Distribution of {feature}"
    )
    fig.update_layout(xaxis_title="Value [$]")
    fig.show()

In [92]:
FX =XS.rank()

In [93]:
features = ['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET']

for feature in features:
    fig = px.box(
        data_frame=FX,
        x=feature,
        title=f"Distribution of {feature}"
    )
    fig.update_layout(xaxis_title="Value [$]")
    fig.show()

In [94]:
FX.isna().sum()

DEBT        0
NETWORTH    0
HOUSES      0
NFIN        0
ASSET       0
dtype: int64

In [58]:
FX.shape

(3141, 5)

## Create Model (KMeans)

### PCA For My Featuers

In [59]:
pca = PCA(n_components=2,random_state=42)
XPT = pca.fit_transform(FX)
XP = pd.DataFrame(XPT,columns=["PC1","PC2"])
print("X_pca shape:", XP.shape)

X_pca shape: (3141, 2)


### Elpow Method (inertia_errors) & silhouette_scores To Find Best K 

In [60]:
n_clusters = range(2,10)
inertia_errors = []
silhouette_scores = []

for k in n_clusters:
    
    model=make_pipeline(StandardScaler(),KMeans(n_clusters=k,random_state=42))
    
    #train model
    model.fit(XP)
    
    #calculate inertia_errors
    inertia_errors.append(model.named_steps["kmeans"].inertia_)
    
    #calculate silhouette_scores
    silhouette_scores.append(silhouette_score(XP,model.named_steps["kmeans"].labels_))



















In [95]:
print("Inertia:", inertia_errors)
print("\nSilhouette Scores:", silhouette_scores)

Inertia: [3789.0060745695673, 1974.826974932752, 1460.566615788342, 1162.5552353444073, 981.6252777797031, 836.3105683897344, 732.9171112940924, 648.5617211779478]

Silhouette Scores: [0.5352518071071541, 0.43367114920001704, 0.3884448100857108, 0.339290755913379, 0.3674970726143814, 0.3580867911180859, 0.37116631339505207, 0.33729733079976276]


In [96]:
fig = px.line(
    x=n_clusters ,y=inertia_errors , title="K-Means Model: Inertia vs Number of Clusters"
)
fig.update_layout(xaxis_title="Number of Clusters",yaxis_title="Inertia")
fig.show()

In [97]:
fig = px.line(
    x=n_clusters ,y=silhouette_scores , title="K-Means Model: Silhouette Score vs Number of Clusters"
)
fig.update_layout(xaxis_title="Number of Clusters",yaxis_title="Silhouette Score")
fig.show()

### Final Model With 3 Clusters

In [98]:
final_model = make_pipeline(StandardScaler(), KMeans(n_clusters=3,random_state=42))

In [99]:
final_model.fit(XP)





In [100]:
labels = final_model.named_steps["kmeans"].labels_
print(labels[:250])

[0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0
 0 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 0 0 0 0 0 1 1 2 2 2 2 2 2 2 2 2 0 0 0
 0 0 1 1 0 1 0 1 2 2 2 2 2 1 1 1 1 1 2 2 2 2 2 0 0 0 0 0 2 2 2 2 2 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 2 2 2 2 2 1
 1 2 2 2 2 2 1 2 2 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 0 0 0 0 2 2 2 2 1 1 1
 2 2 2 2 2 2 2 2 2 2 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 1 1 0 2 2
 2 2 2 2 2 2 1 1 1 1 0 0 0 0 0 1 1 1 1 1 2 2 2 1 1 1 1 0]


In [101]:
XG = FX.groupby(labels).mean()
XG

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,2010.053021,447.84217,1193.102343,1276.160912,1173.272503
1,697.182422,1497.607812,1100.109375,946.996484,1016.332031
2,2297.109048,2527.974286,2436.919048,2559.418095,2554.36381


In [103]:
fig = px.bar(
    XG,
    barmode="group",
    title="Mean Household Finances by Cluster" 
)
fig.update_layout(xaxis_title="Cluster",yaxis_title="Value [$]")
fig.show()

# Model-agnostic interpretability using PCA

In [104]:
# Create scatter plot of `PC2` vs `PC1`
fig = px.scatter(
    data_frame=XP,
    x="PC1",
    y="PC2",
    color=labels.astype(str),
    title="PCA Representation of Clusters"
                
)
fig.update_layout(xaxis_title="PC1",yaxis_title="PC2")
fig.show()

# And Finally We Done !!