# Compare before & after CB

In [1]:
import os, sys, re, io, math
import matplotlib, shap, xgboost
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import *
from math import isnan, nan
from matplotlib.widgets import Slider
from glob import glob
from ipywidgets import *
from datetime import datetime, timedelta
from dateutil.tz import tzlocal
import ipywidgets as widgets
from IPython.display import *
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.options.display.width = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 100000
pd.options.display.float_format = '{0:f}'.format

# Disable scrolling for all output widgets
style = """
    <style>
       .jupyter-widgets-output-area .output_scroll {
            height: unset !important;
            border-radius: unset !important;
            -webkit-box-shadow: unset !important;
            box-shadow: unset !important;
        }
        .jupyter-widgets-output-area  {
            height: auto !important;
        }
    </style>
    """
display(HTML(style))

# disable autosave so that multiple user browser can attach the same notebook
%autosave 0

from pandas_serializer import *
from compare_CB import *

# Start of the "CB" period, as a timestamp in the machine's local timezone.
CB_start_date = pd.Timestamp('2020-4-7', tz='tzlocal()')
# Both comparison windows stay at least this far away from the CB start date ...
CB_boundary_gap = pd.to_timedelta('3D')
# ... and reach at most this far away from it.
CB_boundary_end = pd.to_timedelta('45D')

# Singapore COVID dates: 2020-4-7 2020-6-2 2020-6-19
# Window A lies before the CB start, window B after it; each is a [start, stop] pair.
DateRangeA = [CB_start_date - offset for offset in (CB_boundary_end, CB_boundary_gap)]
DateRangeB = [CB_start_date + offset for offset in (CB_boundary_gap, CB_boundary_end)]
DateRanges = [DateRangeA, DateRangeB]

Autosave disabled


In [2]:
# 'Load Data' button; its click handler is attached further down in this cell.
button = widgets.Button(description='Load Data', style={'font_weight': 'bold'})
def on_button_clicked(b):
    """Load the first available data file into the notebook globals.

    Click handler of the 'Load Data' button. The candidate paths are tried in
    order; the first one that exists on disk is read with ``pandas_load`` and
    the search stops. On success two globals are set:

    * ``all_data`` - whatever ``pandas_load`` returned for the file;
    * ``all_cols`` - column names of the first entry of ``all_data``.

    When none of the candidates exists a message is printed and the globals
    are left untouched, so the user can tell that nothing was loaded.

    Args:
        b: The clicked button as passed by ipywidgets (unused).
    """
    global all_data, all_cols
    candidates = [
        '5.decrypted/izedAa85XXrDS85XlwrOsIDU/all-data.pson.gz',
        '5.decrypted/q8KKsBwu0cryrVM3VMBNW35Q/all-data2.pson.gz',
    ]
    for path in candidates:
        if not os.path.exists(path):
            continue
        print('Loading files ...', end='')
        all_data = pandas_load(path)
        first_key = list(all_data.keys())[0]
        all_cols = all_data[first_key].columns.tolist()
        print('Done')
        break
    else:
        # Only reached when the loop never hit `break`, i.e. no candidate file exists.
        print('No data file found; looked for: ' + ', '.join(candidates))
# Attach the click handler, then render the button as this cell's output.
button.on_click(on_button_clicked)
display(button)

Button(description='Load Data', style=ButtonStyle(font_weight='bold'))

In [3]:
# Widgets for choosing the two comparison windows: four date pickers, an apply
# button, and an output area that receives the handler's printed feedback.
button1 = widgets.Button(
    description='Set Comparison Date-range',
    style={'font_weight': 'bold'},
    layout=Layout(width='250px'),
)
out1 = widgets.Output()
datepicker4 = [
    widgets.DatePicker(value=initial, description=label, layout=Layout(width='250px'), disabled=False)
    for label, initial in (
        ('RangeA Start', DateRangeA[0]),
        ('RangeA Stop', DateRangeA[1]),
        ('RangeB Start', DateRangeB[0]),
        ('RangeB Stop', DateRangeB[1]),
    )
]


@out1.capture(clear_output=True)
def button_set_compare_range(b):
    """Copy the four date-picker values into ``DateRangeA`` / ``DateRangeB``.

    Each picker value is passed through ``pd.to_datetime``. The globals are
    left unchanged when both ends of range A, or both ends of range B, are
    ``None``; a range with a single ``None`` end is accepted as-is. On success
    the new ranges are printed (first 10 characters of each bound) into
    ``out1``, whose previous content is cleared on every call.

    Args:
        b: The clicked button as passed by ipywidgets (unused).
    """
    global DateRangeA, DateRangeB
    picked = [pd.to_datetime(picker.value) for picker in datepicker4]
    range_a, range_b = picked[:2], picked[2:4]
    if range_a.count(None) > 1 or range_b.count(None) > 1:
        return
    DateRangeA, DateRangeB = range_a, range_b
    print('DateRange set to A=[%.10s, %.10s), B=[%.10s, %.10s)' % (*DateRangeA, *DateRangeB))


button1.on_click(button_set_compare_range)
display(HBox(datepicker4 + [button1]), out1)

HBox(children=(DatePicker(value=Timestamp('2020-02-22 00:00:00+0800', tz='tzlocal()'), description='RangeA Sta…

Output()

# Compare stats

In [4]:
@interact(show=widgets.ToggleButton(value=False,description='Show Comparison'))
def show_compare(show):
    """Compute and display the comparison table for the two date ranges.

    Bound to the 'Show Comparison' toggle through ``interact``. The result of
    ``get_compare(all_data, DateRangeA, DateRangeB)`` is stored in the global
    ``compare_res`` (a later cell reads it) and displayed.

    Args:
        show: Toggle state. When falsy the output is cleared and nothing is
            computed.
    """
    global compare_res
    if not show:
        return clear_output()
    # Same guard as the other cells: without loaded data there is nothing to
    # compare, and a short message beats a NameError traceback.
    if 'all_data' not in globals():
        print('No data loaded yet - click "Load Data" first.')
        return
    compare_res = get_compare(all_data, DateRangeA, DateRangeB)
    display(compare_res)

interactive(children=(ToggleButton(value=False, description='Show Comparison'), Output()), _dom_classes=('widg…

In [5]:
# Compact view of the comparison table: selected features (rows) and statistics
# (columns), every cell formatted to 4 significant digits.
# The table is passed to display() explicitly: IPython only auto-displays a bare
# expression that is the last top-level statement of a cell, and an expression
# nested inside an `if` block is evaluated and then thrown away.
if 'compare_res' in globals():
    display(
        compare_res.loc[
            [
                'heart.daily_HR_mean',
                'heart.daily_HR_min',
                'steps.daily_n_steps',
                'steps.daily_n_pos_readings',
                'gps-mobility_Hometime',
                'gps-mobility_RoG',
                'sleep_tot_hrs',
                'sleep_mean_efficiency',
                'sleep_tot_deep_hrs',
            ],
            ['before_mean', 'after_mean', 'ttest_rel', 'wilcoxon', 'p-test', 't-test'],
        ]
        # String cells: keep only their first whitespace-separated token.
        .applymap(lambda t: t.split()[0] if type(t)==str else t)
        # Convert every cell to float and format it with 4 significant digits.
        .applymap(lambda t: "%#.4g"%float(t))
        .rename(index={'steps.daily_n_pos_readings':'steps.daily_walk_minutes'})
    )

# Plot Shap

In [6]:
@interact(figwidth=widgets.IntText(value=20, description='Figure width'), show=widgets.ToggleButton(value=False,description='Show Shap Plot'))
def show_shap(figwidth, show):
    """Draw the SHAP plot for the two comparison date ranges.

    Bound to the 'Figure width' box and the 'Show Shap Plot' toggle through
    ``interact``; delegates the actual work to ``get_shap``.

    Args:
        figwidth: Forwarded to ``get_shap`` as ``figwidth``.
        show: Toggle state. When falsy the output is cleared and nothing is
            computed.
    """
    if not show:
        return clear_output()
    # Same guard as the other cells: without loaded data there is nothing to
    # plot, and a short message beats a NameError traceback.
    if 'all_data' not in globals():
        print('No data loaded yet - click "Load Data" first.')
        return
    # The returned figure/values are only needed by the (disabled) savefig line below.
    f, shap_values, X = get_shap(all_data, DateRangeA, DateRangeB, figwidth=figwidth, max_display=9999)
    # f.savefig("/summary_plot1.png", bbox_inches='tight', dpi=600)

interactive(children=(IntText(value=20, description='Figure width'), ToggleButton(value=False, description='Sh…

# Compare Distribution

In [9]:
def dt2str(dt):
    """Return ``str(dt)`` cut at its first whitespace (the date part of a 'date time' text)."""
    return str(dt).split()[0]


def dr2str(dr):
    """Return the date parts of ``dr[0]`` and ``dr[1]`` joined by a tab."""
    return '\t'.join(dt2str(bound) for bound in (dr[0], dr[1]))
def plot_kde(Users, Feature, DateRanges, Plot):
    """Plot KDE curves and box plots of one feature, one series per date range.

    For every date range the feature values of all selected users are pooled
    (NaNs dropped). The pooled samples are then drawn side by side: kernel
    density estimates on the left, box plots on the right.

    Args:
        Users: Keys of the global ``all_data`` whose frames are pooled.
        Feature: Name of the feature to plot. Read directly from the frame
            when it is one of its columns, otherwise from the first element
            returned by ``summarize(df)``.
        DateRanges: Text with one range per line, each line holding two
            whitespace-separated bounds (start, stop). Blank lines are skipped.
        Plot: When falsy (or when no user is selected) the output is cleared
            and nothing is drawn.

    Returns:
        None. Prints a message and stops early when the pooled sample of a
        range is empty or contains a single distinct value.
    """
    if not Plot or not Users:
        return clear_output()

    def get(df, feature):
        # Prefer the raw column; otherwise look the feature up in summarize()'s first result.
        return df[feature] if feature in df.columns else summarize(df)[0][feature]

    drs = [L.split() for L in DateRanges.split('\n') if L.strip()]
    samples = []
    for ii, dr in enumerate(drs, 1):
        sample = pd.concat([get(all_data[user][dr[0]:dr[1]], Feature) for user in Users], ignore_index=True).dropna()
        if sample.empty:
            return print('Population in DateRange%d is empty!'%ii)
        distinct = set(sample)
        if len(distinct) == 1:
            # The original formatted `set(sampleA)`, an undefined name, so this
            # branch raised NameError instead of printing the message.
            return print('Population in DateRange%d has only 1 value! %s'%(ii, distinct))
        samples += [sample]

    # One column per date range. The frame is also stored as `os.dfc`, as in the
    # original code (looks like a hook for ad-hoc inspection from other cells).
    os.dfc = dfc = pd.concat(samples, axis=1, ignore_index=True)
    dfc.columns = ['KDE [%s, %s)'%(dt2str(dr[0]), dt2str(dr[1])) for dr in drs]
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=[16,4], gridspec_kw={'width_ratios':[2,1]})
    # NOTE(review): a figsize is given both to plt.subplots above and to the kde
    # call below; both values are kept exactly as in the original.
    dfc.plot.kde(bw_method='silverman', ax=axs[0], figsize=[24,9])
    dfc.boxplot(ax=axs[1], rot=45)
    
def onclick_show_hourly(t):
    """Refill the feature dropdown to match the 'Show hourly features' checkbox.

    Observer callback of ``showhourly0``: ``dropdown2`` offers ``cols_all``
    while the box is ticked and ``cols_nohourly`` otherwise.

    Args:
        t: Change notification passed by the widget (unused).
    """
    if showhourly0.value:
        dropdown2.options = cols_all
    else:
        dropdown2.options = cols_nohourly

# Build the distribution-comparison UI; skipped entirely until data has been loaded.
if 'all_data' in globals():
    # Feature names offered in the dropdown. A column named '<base>_NNh' (two
    # digits followed by 'h') counts as an hourly feature; cols_all holds every
    # column plus the bare '<base>' of each hourly one, cols_nohourly leaves the
    # '_NNh' names out. The patterns are raw strings so that `\d` reaches the
    # regex engine without triggering Python's invalid-escape warning.
    cols_all = sorted(set([c[:-4] for c in all_cols if re.search(r'_\d\dh$', c)])|set(all_cols))
    cols_nohourly = [c for c in cols_all if not re.search(r'_\d\dh$', c)]
    dropdown1 = widgets.SelectMultiple(options=sorted(all_data.keys()), description='Participant', layout=Layout(width='400px'))
    dropdown2 = widgets.Dropdown(options=cols_nohourly, description='Feature', layout=Layout(width='400px'))
    showhourly0 = widgets.Checkbox(value=False, description='Show hourly features')
    # Ticking the checkbox swaps the dropdown between cols_all and cols_nohourly.
    showhourly0.observe(onclick_show_hourly)
    # Default date ranges, one per line: start<TAB>stop.
    dateranges = widgets.Textarea(value='2020-02-22\t2020-04-04\n2020-04-10\t2020-05-22\n2020-06-02\t2020-06-19\n2020-06-20\t2020-08-31', description='DateRanges')
    button2 = widgets.ToggleButton(value=False, description='Plot Distribution')
    # interactive() re-runs plot_kde whenever one of the bound widgets changes;
    # the controls are laid out by hand and only the last child of W (its
    # output area) is displayed below them.
    W = interactive(plot_kde, Users=dropdown1, Feature=dropdown2, DateRanges=dateranges, Plot=button2)
    display(HBox([dropdown1, VBox([dropdown2, showhourly0]), dateranges, button2]), W.children[-1])

SyntaxError: 'return' outside function (<ipython-input-9-c5d5c681d10e>, line 26)

# For Testing

In [8]:
# Debug helper: written to show, for one participant, the per-column maximum of
# the columns whose name starts with 'tapsLog' and contains 'dur'.
if 'all_data' in globals():
    for ii,(name, df) in enumerate(all_data.items()):
        # NOTE: `or True` makes this condition always true, so every iteration is
        # skipped and nothing below runs. Removing `or True` would restrict the
        # inspection to the entry at position 2.
        if ii!=2 or True:
            continue
        # Keep only the columns starting with 'tapsLog' whose name contains 'dur'.
        df1 = df[[col for col in df.columns if (col.startswith('tapsLog') and 'dur' in col)]]
        display(name)
        display(df1.max())