# Compare before & after CB

In [1]:
import os, sys, re, io, math
import matplotlib, shap, xgboost
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import *
from math import isnan, nan
from matplotlib.widgets import Slider
from glob import glob
from ipywidgets import *
from datetime import datetime, timedelta
from dateutil.tz import tzlocal
import ipywidgets as widgets
from IPython.display import clear_output, display, HTML
# Stretch the notebook container to the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Raise pandas' display limits (line width, number of columns and rows shown)
# and render floats in plain fixed-point notation.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 100000)
pd.set_option('display.float_format', '{0:f}'.format)

from pandas_serializer import *

# Date around which the notebook splits data into a "before" and an "after" period.
# NOTE(review): tz is passed as the literal string 'tzlocal()' rather than a
# tzlocal() object (dateutil.tz.tzlocal is imported above) — confirm pandas
# resolves this string to the machine's local time zone as intended. Either way
# the boundary depends on the zone of the machine running the notebook.
CB_start_date = pd.Timestamp('2020-04-07', tz='tzlocal()')
# Width of the window excluded on each side of CB_start_date: rows closer than
# this to the start date are assigned to neither period.
CB_boundary_gap = pd.to_timedelta('7D')
# Base directory of the dataset (the same directory the data file is loaded from).
main_path = './5.decrypted/q8KKsBwu0cryrVM3VMBNW35Q/'

In [2]:
# Load the dataset from the directory configured in `main_path` rather than
# repeating the path literal, so the location only has to be changed in one place.
# `all_data` is iterated with `.items()` further down, i.e. it is used as a
# mapping from a name to a DataFrame.
all_data = pandas_load(os.path.join(main_path, 'all-data2.pson.gz'))

In [3]:
def get_stats(df, funcs=('max',)):
    """Call each named zero-argument method on ``df`` and collect the results.

    Parameters
    ----------
    df : object
        Any object exposing the requested methods (e.g. a DataFrame or Series).
    funcs : iterable of str, default ``('max',)``
        Names of the methods to invoke on ``df``, e.g. ``['max', 'min']``.

    Returns
    -------
    list
        One result per entry of ``funcs``, in the same order.

    Raises
    ------
    AttributeError
        If ``df`` has no attribute with one of the given names.
    """
    # Look the method up with getattr instead of eval: eval would execute any
    # expression smuggled in through `funcs`, not just a method name.
    return [getattr(df, func_name)() for func_name in funcs]

def compare_stats(df):
    """Summarise ``df`` separately for the period before and after CB_start_date.

    Rows whose index lies within ``CB_boundary_gap`` of ``CB_start_date``
    (boundaries included) belong to neither period and are ignored.

    Returns a DataFrame with, for every statistic in (mean, std, max, min,
    median), a ``before_<stat>`` column followed by an ``after_<stat>`` column.
    """
    before_cutoff = CB_start_date - CB_boundary_gap
    after_cutoff = CB_start_date + CB_boundary_gap
    periods = (
        ('before', df[df.index < before_cutoff]),
        ('after', df[df.index > after_cutoff]),
    )

    # Insertion order fixes the column order of the result:
    # before_mean, after_mean, before_std, after_std, ...
    stats = {}
    for stat_name in ('mean', 'std', 'max', 'min', 'median'):
        for period_label, period_df in periods:
            stats['%s_%s' % (period_label, stat_name)] = getattr(period_df, stat_name)()
    return pd.DataFrame(stats)
    
def summarize(df):
    """Collapse hourly columns into one row-wise mean column per group.

    A column counts as hourly when its name is a string ending in ``_NNh``
    (underscore, two digits, ``h``); the text before that suffix is the group
    name. All other columns are carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    ret : pandas.DataFrame
        The non-hourly columns of ``df`` (original order) followed by one
        column per group holding the row-wise mean of that group's columns.
    col_groups : collections.defaultdict
        Maps each group name to the list of hourly column names it contains.
    """
    hourly_suffix = re.compile('_[0-9][0-9]h$')
    col_groups = defaultdict(list)
    grouped_cols = set()
    for col in df.columns:
        # Non-string labels (e.g. integers) cannot carry the suffix; keep them as-is.
        match = hourly_suffix.search(col) if isinstance(col, str) else None
        if match:
            col_groups[col[:match.start()]].append(col)
            grouped_cols.add(col)

    # Exclude exactly the columns that were grouped. Testing the truncated name
    # against the group names would also drop unrelated columns whose name merely
    # shares a group's prefix (e.g. 'a_xyz' next to 'a_01h').
    # The explicit copy makes `ret` independent of `df`, so the assignments below
    # neither warn about chained assignment nor risk touching the input frame.
    ret = df[[col for col in df.columns if col not in grouped_cols]].copy()
    for grp, cols in col_groups.items():
        ret[grp] = df[cols].mean(axis=1)
    return ret, col_groups
    

# Compare stats

In [4]:
@interact(show=widgets.ToggleButton(value=False,description='Show Comparison'))
def show_compare(show):
    """Toggle callback: show the before/after statistics averaged over all datasets.

    When ``show`` is false the cell output is cleared. Otherwise every frame in
    ``all_data`` is summarised and compared, and the per-dataset tables are
    averaged row label by row label before being displayed.
    """
    if show:
        per_dataset = [compare_stats(summarize(frame)[0]) for _, frame in all_data.items()]
        stacked = pd.concat(per_dataset)
        # Rows from different datasets share the same labels; average them per label.
        display(stacked.groupby(stacked.index).mean())
        return None
    return clear_output()

interactive(children=(ToggleButton(value=False, description='Show Comparison'), Output()), _dom_classes=('widg…

# Plot Shap

In [5]:
# Seed NumPy's global random number generator.
# NOTE: this runs once, when the cell is executed — not on every widget toggle —
# so repeated calls to show_shap() continue the same random stream instead of
# restarting it.
np.random.seed(0)
# Only needed by the alternative random-forest branch of show_shap(), which is
# currently disabled.
from sklearn.ensemble import RandomForestRegressor

@interact(show=widgets.ToggleButton(value=False,description='Show Shap Plot'))
def show_shap(show):
    """Toggle callback: fit a before/after model and draw its SHAP summary plot.

    When ``show`` is false the cell output is cleared. Otherwise the summarised
    frames of all datasets are pooled, rows are labelled -1 (before the CB
    window) or +1 (after it), a tree model is trained to predict that label,
    and the SHAP values of the model on the training rows are plotted.
    """
    if not show:
        return clear_output()

    pooled = pd.concat([summarize(frame)[0] for _, frame in all_data.items()])
    before_rows = pooled[pooled.index < CB_start_date - CB_boundary_gap]
    after_rows = pooled[pooled.index > CB_start_date + CB_boundary_gap]

    # Features: all "before" rows followed by all "after" rows.
    # Target: -1 for the former, +1 for the latter, in the same order.
    X = pd.concat([before_rows, after_rows])
    n_before = len(before_rows.index)
    n_after = len(after_rows.index)
    Y = pd.Series([-1] * n_before + [1] * n_after)

    # Model selection is hard-wired; flip the flag to try the random forest.
    use_xgboost = True
    if use_xgboost:
        # gradient-boosted trees (xgboost), 100 boosting rounds
        model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=Y), 100)
    else:
        # random forest regression
        model = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=10)
        model.fit(X, Y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    # Open a fresh figure before plotting; keep the handle for optional saving.
    f = plt.figure()
    shap.summary_plot(shap_values, X, max_display=9999)
    # f.savefig("/summary_plot1.png", bbox_inches='tight', dpi=600)

interactive(children=(ToggleButton(value=False, description='Show Shap Plot'), Output()), _dom_classes=('widge…

# For Testing

In [6]:
# Ad-hoc debugging loop, currently switched off: `ii!=2 or True` is always
# true, so every iteration hits `continue` and none of the statements below it
# ever run. Removing the `or True` would restrict the output to the third
# dataset (ii == 2).
for ii,(name, df) in enumerate(all_data.items()):
    if ii!=2 or True:
        continue
    # Keep only the columns whose name starts with 'tapsLog' and contains 'dur'.
    df1 = df[[col for col in df.columns if (col.startswith('tapsLog') and 'dur' in col)]]
    display(name)
    # Per-column maximum of the selected columns.
    display(df1.max())
