<a href="https://colab.research.google.com/github/tuancompa2610/Unsupervised-Consumed-Finance-in-USA/blob/main/Interactive_Dash_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install dash
!pip install jupyter_dash

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dash
  Downloading dash-2.8.1-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dash-table==5.0.0
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting dash-html-components==2.0.0
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Installing collected packages: dash-table, dash-html-components, dash-core-components, dash
Successfully installed dash-2.8.1 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jupyter_dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting ansi2html
  Downloadin

In [3]:
import pandas as pd
import plotly.express as px
from dash import Input, Output, dcc, html
from jupyter_dash import JupyterDash
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [5]:
from google.colab import files
uploaded = files.upload()

Saving scfp2019excel.zip to scfp2019excel.zip


In [4]:
def wrangle(filepath):

    """Read SCF data file into ``DataFrame``.

    Returns only credit fearful households whose net worth is less than $2 million.

    Parameters
    ----------
    filepath : str
        Location of CSV file.
    """
    df = pd.read_csv(filepath)
    mask = (df["TURNFEAR"] == 1) & (df["NETWORTH"] < 2e6)
    df = df[mask]
    return df

In [6]:
df = wrangle("/content/scfp2019excel.zip")
print(df.shape)
df.head()

(4418, 351)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
5,2,21,3790.476607,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
6,2,22,3798.868505,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,3,2,2
7,2,23,3799.468393,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
8,2,24,3788.076005,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
9,2,25,3793.066589,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2


In [7]:
app = JupyterDash(__name__)

In [8]:
app.layout = html.Div(
	[
        #Application title
        html.H1("Survey of Consumer Finances"),
        # Bar chart element
        html.H2("High Variance Features"),
        # Bar chart graph
        dcc.Graph(id = "bar-chart"),
        dcc.RadioItems(
        	options = [
                {"label": "trimmed", "value": True},
                {"label": "not trimmed", "value": False}
            ],
            value = True,
            id = "trim-button"
        ),
        html.H2("K-means Clustering"),
        html.H3("Number of Clusters [k]"),
        dcc.Slider(min = 2, max = 12, step = 1, value = 2, id = "k-slider"),
        html.Div(id = "metrics"),
        # PCA scatter plot
        dcc.Graph(id = "pca-scatter")
    ]
)


In [9]:
def get_high_var_features(trimmed = True, return_feat_names = True):

    """Returns the five highest-variance features of ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    return_feat_names : bool, default=False
        If ``True``, returns feature names as a ``list``. If ``False``
        returns ``Series``, where index is feature names and values are
        variances.
    """
    # Calculate variance
    if trimmed:
        top_five_features = (
        	df.apply(trimmed_var).sort_values().tail(5)
        )
    else:
        top_five_features = df.var().sort_values().tail(5)
    # Extract names
    if return_feat_names:
        top_five_features = top_five_features.index.to_list()
    return top_five_features

In [10]:
get_high_var_features()

['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET']

In [11]:
@app.callback(
	Output("bar-chart", "figure"), Input("trim-button", "value")
)
def serve_bar_chart(trimmed = True):

    """Returns a horizontal bar chart of five highest-variance features.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.
    """
    # Get features
    top_five_features = get_high_var_features(trimmed = trimmed, return_feat_names = False)
    # Build bar chart
    fig = px.bar(
    	x = top_five_features,
        y = top_five_features.index,
        orientation = "h"
    )
    fig.update_layout(xaxis_title = "Variance", yaxis_title = "Feature")
    
    return fig

In [12]:
def get_model_metrics(trimmed = True, k = 2, return_metrics = False):

    """Build ``KMeans`` model based on five highest-variance features in ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False`` returns ``KMeans`` model. If ``True`` returns ``dict``
        with inertia and silhouette score.

    """
    # Get high var features
    features = get_high_var_features(trimmed = trimmed, return_feat_names = True) 
    # Create features matrix
    X = df[features]
    # Build model
    model = make_pipeline(StandardScaler(), KMeans(n_clusters = k, random_state = 42))
    model.fit(X)
    
    if return_metrics:
        i = model.named_steps["kmeans"].inertia_
        ss = silhouette_score(X, model.named_steps["kmeans"].labels_)
        metrics = {
            "inertia": round(i),
            "silhouette": round(ss, 3)
        }
        return metrics
    return model

In [13]:
get_model_metrics(trimmed = True, k = 20, return_metrics = False)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=20, random_state=42))])

In [14]:
@app.callback(
	Output("metrics", "children"),
    Input("trim-button", "value"),
    Input("k-slider", "value")
)
def serve_metrics(trimmed = True, k = 2):

    """Returns list of ``H3`` elements containing inertia and silhouette score
    for ``KMeans`` model.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    # Get metrics
    metrics = get_model_metrics(trimmed = trimmed, k = k, return_metrics = True)
    # Add metrics to HTML elements
    text = [
        html.H3(f"Inertia: {metrics['inertia']}"),
        html.H3(f"Silhouette Score: {metrics['silhouette']}")
    ]
    return text

In [15]:
serve_metrics(k = 20)

[H3('Inertia: 1456'), H3('Silhouette Score: 0.497')]

In [16]:
def get_pca_labels(trimmed = True, k = 2):

    """
    ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    # Create feature matrix
    features = get_high_var_features(trimmed = trimmed, return_feat_names = True)
    X = df[features]
    # Build transformer
    transformer = PCA(n_components=2, random_state = 42)
    # Transform data
    X_t = transformer.fit_transform(X)
    X_pca = pd.DataFrame(X_t, columns = ["PC1", "PC2"])
    
    # Add labels
    model = get_model_metrics(trimmed = trimmed, k = k, return_metrics= False)
    X_pca["labels"] = model.named_steps["kmeans"].labels_.astype(str)
    X_pca.sort_values("labels", inplace = True)
    return X_pca

In [17]:
get_pca_labels().head()

Unnamed: 0,PC1,PC2,labels
2208,889749.557584,467355.407904,0
1056,649765.113978,174994.130637,0
1057,649536.017166,176269.044416,0
1058,649536.017166,176269.044416,0
1059,649765.113978,174994.130637,0


In [18]:
@app.callback(
	Output("pca-scatter", "figure"),
    Input("trim-button", "value"),
    Input("k-slider", "value")
)
def serve_scatter_plot(trimmed = True, k = 2):

    """Build 2D scatter plot of ``df`` with ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    fig = px.scatter(
    	data_frame = get_pca_labels(trimmed = trimmed, k = k),
        x = "PC1",
        y = "PC2",
        color = "labels",
        title = "PCA Representation of Clusters"
    )
    fig.update_layout(xaxis_title = "PCA1", yaxis_title = "PCA2")
    return fig

In [19]:
serve_scatter_plot(k = 2)

In [23]:
app.run_server(host="0.0.0.0", mode="external")

Dash app running on:


<IPython.core.display.Javascript object>