# Consumer_Finance_In_USA_2022

In [141]:
import pandas as pd
import plotly.express as px

from jupyter_dash import JupyterDash
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from dash import Input, Output, dcc, html
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# importing data

In [142]:
df = pd.read_csv("SCFP2022.csv")
mask = df["TURNFEAR"] == 1
df = df[mask]
print("df shape", df.shape)
df.head()

df shape (3839, 356)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
20,5,51,7191.481109,2,19,1,8,2,1,0,...,1,3,1,2,1,2,5,4,2,2
21,5,52,7352.487205,2,19,1,8,2,1,0,...,1,3,1,2,1,2,5,4,2,2
22,5,53,7270.703541,2,19,1,8,2,1,0,...,1,3,1,3,1,2,5,5,2,2
23,5,54,7383.866597,2,19,1,8,2,1,0,...,1,3,1,3,1,2,5,5,2,2
24,5,55,7330.537669,2,19,1,8,2,1,0,...,1,3,1,2,1,2,5,4,2,2


## Using Jupyter_dash for creating app

In [143]:
# assigning the app
app = JupyterDash(__name__)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



### Layout

In [144]:
app.layout = html.Div(
    [
        html.H1("Survey of Consumer Finances"),
        html.H2("High Variance Features"),
        # Bar chart element
        dcc.Graph(id="bar-chart"),
        dcc.RadioItems(
            options=[
                {"label":"trimmed", "value": True},
                {"label":"not trimmed", "value": False}
            ],
            value=True,
            id="trim-button"
        
        ),
        html.H2("K-means clustering"),
        html.H3("Number of clustering (k)"),
        dcc.Slider(min=2, max=12, step=1, value=2, id="k-slider"),
        html.Div(id ="metrics"),
        dcc.Graph(id="pca-scatter")
    ]

)

# 1. Variance 

### Business Layer

In [145]:
def get_high_var_features(trimmed=True, return_feat_names=True):

    """Returns the five highest-variance features of ``df``. """
    if trimmed:
        top_five_features = (
            df.apply(trimmed_var).sort_values().tail(5)
        )
    else:
        top_five_features = df.var().sort_values().tail(5)
    if return_feat_names:
        top_five_features = top_five_features.index.tolist()
    return top_five_features

### Service Layer

In [146]:
@app.callback(
    Output("bar-chart", "figure"), Input("trim-button", "value")
)
def serve_bar_chart(trimmed=True):

    """Returns a horizontal bar chart of five highest-variance features."""
    top_five_features = get_high_var_features(trimmed=trimmed, return_feat_names=False)
    
    fig= px.bar(x=top_five_features, y=top_five_features.index, orientation="h")
    fig.update_layout(xaxis_title="Variance", yaxis_title="Features")
    return fig

# k-means Slider

### Business Layer

In [147]:
def get_model_metrics(trimmed=True, k=2, return_metrics=False):

    features = get_high_var_features(trimmed=trimmed, return_feat_names=True)
    X = df[features]
    model= make_pipeline(StandardScaler(), KMeans(n_clusters=k, n_init=10, random_state=42))
    model.fit(X)
    
    if return_metrics:
        i = model.named_steps["kmeans"].inertia_
        ss = silhouette_score(X, model.named_steps["kmeans"].labels_)
        metrics = {
            "inertia": round(i),
            "silhouette": round(ss, 3)
        }
        return metrics
    return model

### Service Layer

In [148]:
@app.callback(
    Output("metrics", "children"),
    Input("trim-button", "value"),
    Input("k-slider", "value")
)
def serve_metrics(trimmed=True, k=2):

    metrics = get_model_metrics(trimmed=trimmed, k = k, return_metrics=True)
    text = [
        html.H3(f"Inertia: {metrics['inertia']}"),
        html.H3(f"Silhouette Score: {metrics['silhouette']}")
    ]
    return text

# PCA Scatter plot

### Business Layer

In [149]:
def get_pca_labels(trimmed=True, k=2):

    """
    ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    features = get_high_var_features(trimmed=trimmed, return_feat_names=True)
    X = df[features]
    
    transformer = PCA(n_components=2, random_state=42)
    
    X_t = transformer.fit_transform(X)
    X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"])
    
    model = get_model_metrics(trimmed=trimmed, k=k, return_metrics=False)
    X_pca["labels"] = model.named_steps["kmeans"].labels_.astype(str)
    X_pca.sort_values("labels", inplace = True)
    
    return X_pca

### Service Layer

In [150]:
@app.callback(
    Output("pca-scatter", "figure"),
    Input("trim-button", "value"),
    Input("k-slider", "value")
)
def serve_scatter_plot(trimmed=True, k=2):

    fig = px.scatter(
        data_frame=get_pca_labels(trimmed=trimmed, k=k),
        x="PC1",
        y="PC2",
        color="labels",
        title="PCA Representation of Clusters"
    )
    fig.update_layout(xaxis_title="PC1", yaxis_title="PC2")
    return fig

In [152]:
app.run_server(host="localhost", port=6001)

Dash app running on http://localhost:6001/
