In [72]:
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.graph_objects as go
import matplotlib
import pandas as pd
import numpy as np
from scipy import stats
from ipywidgets import widgets, Layout
#import distinctipy

In [73]:
# read data
# Remote CSV holding the respondent table; later cells read the raw coded
# columns as well as "cluster", "embed_x" and "embed_y" from it.
DATA_URL = r"https://github.com/x-zhe/CS765-project/raw/main/atus_embed_cluster.csv"
df = pd.read_csv(DATA_URL)

In [74]:
# replace names
# Derive human-readable columns from the raw coded columns. The new columns
# are added to `df` itself because the plotting cells look them up by name.
df["Age"] = df["TEAGE"]
df["Sex"] = df["TESEX"].map({1: "Male", 2: "Female"})
# Codes 1-2 collapse to "employed" and 3-5 to "unemployed"; any code that is
# not listed becomes NaN.
df["Employment"] = df["TELFS"].map({1: "employed", 2: "employed",
                                    3: "unemployed", 4: "unemployed", 5: "unemployed"})
# Child counts are kept as strings so they behave as categories; counts
# outside 0-9 become NaN.
df["No of child"] = df["TRCHILDNUM"].map({n: str(n) for n in range(10)})
# Every race code other than 1, 2 and 4 (including missing) is labelled "Other".
df["Race"] = df["PTDTRACE"].map({1: "White",
                                 2: "Black",
                                 4: "Asian"}).fillna('Other')

# readable column name -> raw column. Negative raw values are replaced by NaN
# (presumably they are missing-value codes - TODO confirm against the codebook).
NONNEGATIVE_COLUMNS = {
    "Weekly earnings": "TRERNWA",
    "Personal Care": "t01",
    "Household Activity": "t02",
    "Care for Household": "t03",
    "Care for non Household": "t04",
    "Work": "t05",
    "Education": "t06",
    "Consumer Purchases": "t07",
    "Professional & Personal Care": "t08",
    "Household Services": "t09",
    "Govt. & Civic service": "t10",
    "Eat & Drink": "t11",
    "Socialization, Relax, & Leisure": "t12",
    "Sports, Exercise, & Recreation": "t13",
    "Religious & Spiritual Activities": "t14",
    "Volunteer": "t15",
    "Telephone Calls": "t16",
}

for _label, _raw in NONNEGATIVE_COLUMNS.items():
    # Series.mask builds a new Series (upcasting to float only when a value is
    # actually replaced), so no NaN is ever written in place into an integer
    # column - that in-place upcast is deprecated in recent pandas.
    df[_label] = df[_raw].mask(df[_raw] < 0)

In [75]:
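# generate colors (one-off step, kept for reference: its result is hard-coded as `colors` in the next cell, so distinctipy is not needed at run time)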
# N = df["cluster"].nunique()
# # -1(outliers) uses gray (#E3E3E3)
# colors = distinctipy.get_colors(N-1, exclude_colors=[(227,227,227), (1, 1, 1), (0,0,0)])
# print(colors)
# display the colours
# distinctipy.color_swatch(colors, show_text=True)

In [76]:
# Hard-coded palette of 28 RGB tuples (each component a float in [0, 1]).
# Presumably the saved output of the commented-out distinctipy snippet elsewhere
# in this notebook - TODO confirm. Consumed when building the cluster colormap.
colors = [(0.08582131724458708, 0.9990722354012624, 0.12296636585395293), (1.0, 0.0, 1.0), (0.0, 0.5, 1.0), (1.0, 0.5, 0.0), (0.5, 0.25, 0.5), (0.3226565662101364, 0.9757753294094614, 0.8107101474996194), (0.8031057559399262, 0.9821537186624913, 0.12585635298868825), (0.82921568158574, 0.4872287479907974, 0.9660413907118607), (1.0, 0.0, 0.0), (0.0, 0.0, 1.0), (0.0, 0.5, 0.0), (0.0, 0.0, 0.5), (0.054590962374978846, 0.6206575613600794, 0.5116058034515054), (0.6476246931731252, 0.6768290017659921, 0.46447648399808406), (0.9676884698905099, 0.3694512938988951, 0.49459453214639815), (0.5271571233202129, 0.08681608998515178, 0.04337190607094521),
          (0.524967211661483, 0.49607909539830974, 0.007472566233008626), (0.4800442592595744, 0.16986339954670204, 0.9792886188527113), (1.0, 0.0, 0.5), (0.42304651757512224, 0.6111611365346267, 0.8377963756236279), (0.015205753913233089, 0.31836695668585147, 0.34345903340237427), (0.9919328878489869, 0.8895907805231833, 0.5449656662031589), (0.44694411649385746, 0.955760163674265, 0.3495385133608162), (0.0, 1.0, 0.5), (0.09978108978860067, 0.28702725260404804, 0.7584249020239144), (0.7802882434469021, 0.1641298593239623, 0.7219683153992351), (0.688134232152867, 0.8202108346270685, 0.9598752142320948), (0.3723021299514827, 0.7662881796695086, 0.06152480442771757)]


In [77]:
# allocate clusters to colors
# Sorted unique cluster labels; also used for the radio buttons and the traces.
cls = df["cluster"].sort_values().unique()
# Gray followed by the palette (name kept for any code that still reads it).
cl_colors = ["#d8d8d8"] + colors

# cluster label -> hex color string.
# The outlier label -1 is matched explicitly instead of assuming that it is
# the first sorted label, so a data set without outliers no longer paints its
# first real cluster gray. The palette index wraps around, so having more
# clusters than palette entries reuses colors instead of raising IndexError.
colormap = {}
_palette_pos = 0
for _cl in cls:
    if _cl == -1:
        colormap[_cl] = matplotlib.colors.to_hex(cl_colors[0])
    else:
        colormap[_cl] = matplotlib.colors.to_hex(colors[_palette_pos % len(colors)])
        _palette_pos += 1

In [78]:
# cluster subplot

def plot_cluster(t, title="All clusters"):
    """Scatter plot of the embedding with one trace per cluster.

    A trace is created for every label in the module-level ``cls`` (colored
    through the module-level ``colormap``), including labels that have no
    rows in ``t``.

    Args:
        t (Pandas DataFrame): data source; needs the columns "cluster",
            "embed_x" and "embed_y"
        title (str): plot title, e.g. "All clusters" or "Cluster X"

    Returns:
        fig (Plotly figure): the plot
    """
    def _trace_for(label):
        # rows of `t` that belong to this cluster
        members = t.query(f"cluster == {label}")
        return go.Scatter(
            x=members["embed_x"],
            y=members["embed_y"],
            mode="markers",
            marker=dict(color=colormap[label]),
            name=f"Cluster {label}",
        )

    fig = go.Figure(data=[_trace_for(label) for label in cls])

    # legend pinned inside the plot area, top-left corner
    legend_style = dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
    fig.update_layout(
        height=700,
        width=650,
        template="seaborn",
        title=dict(text=title),
        legend=legend_style,
        font=dict(family="Arial"),
        margin=dict(l=0, r=0, t=30, b=0),
    )

    fig.update_xaxes(title=dict(text="Embedding x"))
    fig.update_yaxes(title=dict(text="Embedding y"))
    return fig

# t = df
# fig = plot_cluster(t, title="All clusters")
# fig.show()


In [79]:
# 95% CI
def mean_confidence_interval(data, confidence=0.95):
    """Full width of the two-sided Student-t confidence interval of the mean.

    NaN entries are ignored, matching the NaN-skipping behavior of the pandas
    mean/median that this value is plotted next to.

    Args:
        data (array-like): one-dimensional numeric sample
        confidence (float): confidence level, e.g. 0.95

    Returns:
        float: upper bound minus lower bound of the interval, i.e. TWICE the
            half-width ``h`` in ``mean +/- h``. NaN when fewer than two
            non-NaN observations are available.
    """
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    n = values.size
    if n < 2:
        # the standard error is undefined for an empty or single-value sample
        return np.nan
    half_width = stats.sem(values) * stats.t.ppf((1 + confidence) / 2., n - 1)
    return half_width * 2

In [80]:
# activity plot: check mean/median
def plot_activity(t, aggmode, isSort, acts, color="silver"):
    """Plot a bar chart of the aggregated value of each activity column.

    Args:
        t (Pandas DataFrame): data source; must contain every column in ``acts``
        aggmode (str): "mean" (bars carry 95% confidence-interval error bars)
            or "median" (no error bars)
        isSort (bool): if True the bars are sorted descendingly by height,
            otherwise they keep the order of ``acts``
        acts (list): activity columns to show
        color (str): bar color

    Returns:
        fig (Plotly figure): the plot

    Raises:
        ValueError: if ``aggmode`` is neither "mean" nor "median"
    """
    fig = go.Figure()

    if aggmode == "mean":
        means = t[acts].mean(axis=0)
        # mean_confidence_interval returns the FULL interval width, while
        # plotly draws `error_y.array` on each side of the bar. Halve it so
        # the whisker spans exactly the 95% CI instead of twice its width.
        half_widths = t[acts].apply(
            lambda x: mean_confidence_interval(x, confidence=0.95), axis=0) / 2
        fig.add_trace(go.Bar(x=acts,
                             y=means,
                             error_y=dict(array=half_widths),
                             marker=dict(color=color),
                             ),
                      )
    elif aggmode == "median":
        meds = t[acts].median(axis=0)
        fig.add_trace(go.Bar(x=acts,
                             y=meds,
                             marker=dict(color=color),
                             ),
                      )
    else:
        # an unknown mode used to yield a silently empty chart
        raise ValueError(
            'aggmode must be "mean" or "median", got {!r}'.format(aggmode))

    fig.update_layout(height=300,
                      width=600,
                      template="seaborn",
                      font=dict(family="Arial"),
                      margin=dict(l=0, r=0, t=0, b=0)
                      )
    fig.update_xaxes(title=dict(text="Activity"))
    fig.update_yaxes(title=dict(text=aggmode.capitalize() + " minute per day"))

    # 'trace' keeps the bars in the order they were passed in
    categoryorder = 'total descending' if isSort else 'trace'
    fig.update_xaxes(categoryorder=categoryorder)
    return fig


# t = df
# aggmode = "mean"
# isSort = True

# acts = ["t01", "t02", "t03", "t04", "t05", "t06", "t07", "t08",
#         "t09", "t10", "t11", "t12", "t13", "t14", "t15", "t16", "t18", ]
# fig = plot_activity(t, aggmode, isSort, acts)
# fig.show()


In [81]:
# demo variable distribution
def plot_demo(t, display_mode, vars, group_var, continuous_vars):
    """Plot the distribution of one or two demographic variables.

    Args:
        t (Pandas DataFrame): data source. It is not modified.
        display_mode (str): Vis type. Options: "Histogram", "Density diagram", "Pie".
            Any other value yields an empty figure.
        vars (tuple of str): x variable, y variable. Must be column names in t. "None" refers to missing var
        group_var (str): variable for grouping. Must be a column name in t, or "None" for no grouping.
        continuous_vars (list): a list of continuous variables; every other
            variable is treated as categorical

    Returns:
        fig (Plotly figure): the plot (an empty figure when nothing can be drawn)
    """
    fig = go.Figure()  # default fig, returned when no branch below applies
    var1, var2 = vars  # x-variable, y-variable. "None" means not selected
    group_var = group_var if group_var != "None" else None
    selected = [v for v in (var1, var2) if v != "None"]

    if display_mode == "Pie":  # pie chart, only uses the first selected variable
        if selected:
            var = selected[0]
            if var in continuous_vars:
                # a continuous variable is cut into 4 equal-width bins first
                labels = [str(i) for i in pd.cut(t[var], bins=4)]
            else:
                labels = t[var]
            # assign() works on a copy: writing a "bin" column into `t` itself
            # would leak it into the caller's frame (or write to a slice of it)
            pie_data = t.assign(bin=labels)
            fig = px.pie(pie_data,
                         names="bin",
                         facet_col=group_var,
                         facet_col_wrap=2,)

    elif len(selected) == 1:
        var = selected[0]
        if display_mode == "Histogram":  # histogram
            fig = px.histogram(t,
                               x=var,
                               color=group_var)
        elif display_mode == "Density diagram":  # violin plot
            fig = px.violin(t,
                            x=var,
                            box=True,
                            color=group_var,
                            )

    elif len(selected) == 2:
        var1_continuous = var1 in continuous_vars
        var2_continuous = var2 in continuous_vars
        if display_mode == "Histogram":  # 2D histogram, whatever the data types
            fig = px.density_heatmap(t,
                                     x=var1,
                                     y=var2,
                                     facet_col=group_var,
                                     facet_col_wrap=2,
                                     )
        elif display_mode == "Density diagram":
            if var1_continuous and var2_continuous:  # both var are continuous
                fig = px.density_contour(t,
                                         x=var1,
                                         y=var2,
                                         facet_col=group_var,
                                         facet_col_wrap=2,
                                         histnorm="density",
                                         )
                fig.update_traces(selector=dict(type="histogram2dcontour"),
                                  contours=dict(coloring="fill",
                                                showlabels=True),
                                  showscale=False
                                  )
            else:  # at least one var is categorical
                # horizontal violins when the continuous variable is on x
                orientation = "h" if var1_continuous else "v"
                fig = px.violin(t,
                                x=var1,
                                y=var2,
                                color=group_var,
                                box=True,
                                orientation=orientation,
                                )

    fig.update_layout(height=300,
                      width=600,
                      template="seaborn",
                      font=dict(family="Arial"),
                      margin=dict(l=0, r=0, t=30, b=0)
                      )
    return fig
    
# t = df
# display_mode = "Histogram"  # hist, violin, pie
# vars = ("Age", "Weekly earnings") ##shusmi
# group_var = "Race"
# continuous_vars = ["Age", "Weekly earnings"]
# fig = plot_demo(t, display_mode, vars, group_var, continuous_vars)
# fig.show()

In [82]:
# make interactive plot

# Numeric columns: offered as X/Y choices and passed to plot_demo as continuous.
continuous_vars = [
    "Age",
    "Weekly earnings",
    "Personal Care",
    "Household Activity",
    "Care for Household",
    "Care for non Household",
    "Work",
    "Education",
    "Consumer Purchases",
    "Professional & Personal Care",
    "Household Services",
    "Govt. & Civic service",
    "Eat & Drink",
    "Socialization, Relax, & Leisure",
    "Sports, Exercise, & Recreation",
    "Religious & Spiritual Activities",
    "Volunteer",
    "Telephone Calls",
]
# Categorical columns: the only ones offered for grouping.
cat_vars = [
    "Sex",
    "Race",
    "Employment",
    "No of child",
]
# Everything selectable as X or Y, categorical variables first.
variables = [*cat_vars, *continuous_vars]

# Panel (A): which cluster to show; "All clusters" plus one entry per label.
cluster_box = widgets.RadioButtons(description='Clusters',
                                   value="All clusters",
                                   options=["All clusters"] + list(cls),
                                   layout={'width': 'max-content'}
                                   )

# Panel (B): aggregation used for the activity bars.
aggmode_box = widgets.ToggleButtons(description='Aggregation: ',
                                    value='mean',
                                    options=["mean", "median"],
                                    style={"button_width": "auto"},
                                    )

# Panel (B): sort the activity bars by height.
# The margin is "0px": the previous "0 px" is not a valid CSS length, so the
# intended zero margin was never applied.
sort_box = widgets.Checkbox(description="Sort by size",
                            value=False,
                            layout=Layout(width='30%', margin="0px")
                            )

# Panel (C): chart type; the option strings are matched literally in plot_demo.
display_box = widgets.Dropdown(description='Graph type: ',
                               value="Histogram",
                               options=["Histogram", "Density diagram", "Pie"]
                               )

# Panel (C): x and y variables; the string "None" means "not selected".
var1_box = widgets.Dropdown(description='X: ',
                            value="Race",
                            options=["None"]+variables)

var2_box = widgets.Dropdown(description='Y: ',
                            value="Age",
                            options=["None"]+variables)

# Panel (C): optional grouping variable (categorical variables only).
group_box = widgets.Dropdown(description='Group: ',
                             value="Sex",
                             options=["None"]+cat_vars)


def cluster_on_click(change):
    """Redraw all three figures after the cluster selection changed.

    Args:
        change: change notification handed over by the widget's observer
            mechanism. It is not inspected; the current widget values are
            read directly instead.
    """
    selected = cluster_box.value
    show_all = selected == "All clusters"
    if show_all:
        t = df
        title = "All clusters"
        bar_color = "silver"
    else:
        t = df[df["cluster"] == selected]
        title = "Cluster {}".format(selected)
        bar_color = colormap[selected]

    new_g1 = plot_cluster(t, title=title)
    new_g2 = plot_activity(t,
                           aggmode=aggmode_box.value,
                           isSort=sort_box.value,
                           acts=acts,
                           color=bar_color)
    new_g3 = plot_demo(t,
                       display_mode=display_box.value,
                       vars=(var1_box.value, var2_box.value),
                       group_var=group_box.value,
                       continuous_vars=continuous_vars)

    # replace data and layout of the live widgets with the fresh figures
    g1.update(new_g1.to_dict(), overwrite=True)
    if show_all:
        # attach the click handler to every trace again
        g1.for_each_trace(lambda trace: trace.on_click(point_on_click))
    g2.update(new_g2.to_dict(), overwrite=True)
    g3.update(new_g3.to_dict(), overwrite=True)


def point_on_click(trace, points, selector):
    """Select the cluster of a clicked scatter point.

    Args:
        trace: the scatter trace the handler is attached to; its name has the
            form "Cluster <label>"
        points: click information; ``points.point_inds`` is empty when no
            point of this trace was hit
        selector: unused
    """
    if points.point_inds:
        # second whitespace-separated token of the trace name is the label
        label_token = trace.name.split()[1]
        # assigning the value fires the observer registered on cluster_box
        cluster_box.value = int(label_token)


def fig2_on_click(change):
    """Redraw the activity bar chart after one of its controls changed.

    Args:
        change: change notification handed over by the widget's observer
            mechanism. It is not inspected; the current widget values are
            read directly instead.
    """
    selected = cluster_box.value
    if selected == "All clusters":
        t = df
        bar_color = "silver"
    else:
        t = df[df["cluster"] == selected]
        bar_color = colormap[selected]

    new_g2 = plot_activity(t,
                           aggmode=aggmode_box.value,
                           isSort=sort_box.value,
                           acts=acts,
                           color=bar_color)
    # replace data and layout of the live widget with the fresh figure
    g2.update(new_g2.to_dict(), overwrite=True)


def fig3_on_click(change):
    """Redraw the distribution figure after one of its controls changed.

    Args:
        change: change notification handed over by the widget's observer
            mechanism. It is not inspected; the current widget values are
            read directly instead.
    """
    selected = cluster_box.value
    if selected == "All clusters":
        t = df
    else:
        t = df[df["cluster"] == selected]

    chosen_vars = (var1_box.value, var2_box.value)
    new_g3 = plot_demo(t,
                       display_mode=display_box.value,
                       vars=chosen_vars,
                       group_var=group_box.value,
                       continuous_vars=continuous_vars)

    # blank the existing annotations (subplot titles) before the update
    for annotation in g3.layout.annotations:
        annotation.update(text="")
    # replace data and layout of the live widget with the fresh figure
    g3.update(new_g3.to_dict(), overwrite=True)


# Wire every control to the callback that redraws the figure(s) it affects.
cluster_box.observe(cluster_on_click, names="value")
for _control in (aggmode_box, sort_box):
    _control.observe(fig2_on_click, names="value")
for _control in (display_box, var1_box, var2_box, group_box):
    _control.observe(fig3_on_click, names="value")

t = df
# raw activity columns shown in the bar chart: t01 ... t16 plus t18
acts = ["t{:02d}".format(code) for code in range(1, 17)] + ["t18"]

# Initial state of the three live figures, mirroring the widget defaults.
g1 = go.FigureWidget(plot_cluster(t))
g2 = go.FigureWidget(
    plot_activity(t, aggmode="mean", isSort=False, acts=acts)
)
_initial_demo = plot_demo(
    t,
    display_mode="Histogram",
    vars=("Race", "Age"),
    group_var="Sex",
    continuous_vars=continuous_vars,
)
g3 = go.FigureWidget(_initial_demo)
# clicking a point selects its cluster
g1.for_each_trace(lambda trace: trace.on_click(point_on_click))

# Left half: panel (A) label and cluster selector next to the scatter plot.
_panel_a_controls = widgets.VBox(
    [widgets.HTML(value="<b><font size=5>(A)</b>"), cluster_box],
    layout=Layout(display='flex', flex_flow='column', align_items="center", width='20%'),
)
_left_half = widgets.HBox([_panel_a_controls, g1],
                          layout=Layout(align_items="center"))

# Right half: panel (B) controls and bar chart above panel (C) controls and chart.
_panel_b_header = widgets.HBox(
    [widgets.HTML(value="<b><font size=5>(B)</b>"), aggmode_box, sort_box]
)
_right_half = widgets.VBox(
    [
        _panel_b_header,
        g2,
        widgets.HTML(value="<b><font size=5>(C)</b>"),
        widgets.HBox([var1_box, var2_box]),
        widgets.HBox([group_box, display_box]),
        g3,
    ],
    layout=Layout(justify_content="flex-end"),
)

box = widgets.HBox([_left_half, _right_half])

# ATUS data clustering and exploratory data analysis

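* (A) Clustering results of survey respondents. Embeddings are computed with UMAP from the respondents' time use across activities. Select a cluster with the radio buttons or by clicking one of its points.
* (B) Time-use pattern (mean or median per activity) of the respondents in the selected cluster, or in all clusters.
* (C) Multivariate distribution of the selected respondents: up to two variables (X, Y), optionally grouped by a categorical variable.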

In [83]:
# Render the assembled widget layout as the output of this cell.
display(box)

HBox(children=(HBox(children=(VBox(children=(HTML(value='<b><font size=5>(A)</b>'), RadioButtons(description='…

**Note**


* Activity codes

| t01 | t02 | t03 | t04 | t05 | t06 | t07 | t08 |
|-----|-----|-----|-----|-----|-----|-----|-----|
| Personal Care | Household Activities | Caring for household members | Caring for non-household members | Work and Work Related | Education | Consumer Purchases | Professional and Personal care services |
| **t09** | **t10** | **t11** | **t12** | **t13** | **t14** | **t15** | **t16** |
| Household Services | Govt. & Civic Obligation | Eating and Drinking | Socializing, Relaxing, and Leisure | Sports, Exercise, and Recreation | Religious and Spiritual Activities | Volunteer Activities | Telephone Calls |