<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

# Imports and Scripts

In [3]:
%%javascript
// ToC script: fetch and execute the table-of-contents helper with jQuery's getScript.
// NOTE(review): this runs JavaScript downloaded from a third-party host every time the
// cell executes (needs network access; the script content is not pinned) -- confirm this
// is acceptable. Presumably it populates the "toc" div declared at the top of the notebook.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [4]:
# Code hider, source: http://chris-said.io/2016/02/13/how-to-make-polished-jupyter-presentations-with-optional-code-visibility/
from IPython.display import HTML

# Markup injected into the notebook page: a script that hides every 'div.input'
# once the document is ready, plus a "Show Code"/"Hide Code" button that calls
# code_toggle() to flip their visibility.
code_toggle_markup = '''
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
'''

# Last expression of the cell, so the HTML object is rendered as the cell output.
HTML(code_toggle_markup)



In [1]:
# imports and constants
import json
import pickle
import datetime as dt

import gmaps
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import multiprocessing
import numpy as np
import pandas as pd
import pingouin as pg
import seaborn as sns

from IPython.display import display, HTML
import ipywidgets as widgets

# user imports
import data_processing.analysis_utils as ls_utils

%matplotlib inline

import matplotlib as mpl
# Render every figure at 300 dpi.
mpl.rcParams.update({'figure.dpi': 300})

# Load Data

In [14]:
# Survey frames used throughout the notebook; all_spin_fill_df carries the
# SPIN/GAD columns and all_shift_df the PHQ columns used in the sections below.
spin_gad_pickle = "ls_data/jama_data/all_state_shift_no_imp.df"
phq_pickle = "ls_data/jama_data/all_phq_shift_imp.df"

all_spin_fill_df = pd.read_pickle(spin_gad_pickle)
all_shift_df = pd.read_pickle(phq_pickle)

**Note**: the code below was originally sourced from `lifesense_cluster_change_over_time`, under the "Correlates Over Time" heading.

In [15]:
# build cal_daily
# Collapse the hourly call features to one row per participant (pid) and calendar day.
all_cal = pd.read_pickle("wave1_features/cal_hr.df")
all_cal['date'] = all_cal['hour'].dt.floor('d')
# numeric_only=True restricts the sum to the numeric feature columns. The datetime
# 'hour' column cannot be summed: older pandas dropped it silently, pandas >= 2.0
# raises a TypeError unless numeric_only is requested explicitly.
cal_daily = all_cal.groupby(['pid', 'date']).sum(numeric_only=True)
cal_daily = cal_daily.reset_index()
# 1.0 for Monday-Friday, 0.0 for Saturday/Sunday (dayofweek: Monday == 0)
cal_daily["is_wkday"] = (pd.to_datetime(cal_daily['date']).dt.dayofweek < 5).astype(float)

display(cal_daily.head())

Unnamed: 0,pid,date,tot_call_count,tot_call_duration,is_wkday
0,746649,2019-07-23,12,1814,1.0
1,746649,2019-07-24,15,4572,1.0
2,746649,2019-07-25,20,2363,1.0
3,746649,2019-07-26,15,8063,1.0
4,746649,2019-07-27,10,1748,0.0


In [16]:
# build sms_daily
# Collapse the hourly SMS features to one row per participant (pid) and calendar day.
all_sms = pd.read_pickle("wave1_features/sms_hr.df")
all_sms['date'] = all_sms['hour'].dt.floor('d')
# numeric_only=True restricts the sum to the numeric feature columns. The datetime
# 'hour' column cannot be summed: older pandas dropped it silently, pandas >= 2.0
# raises a TypeError unless numeric_only is requested explicitly.
sms_daily = all_sms.groupby(['pid', 'date']).sum(numeric_only=True)
sms_daily = sms_daily.reset_index()
# 1.0 for Monday-Friday, 0.0 for Saturday/Sunday (dayofweek: Monday == 0)
sms_daily["is_wkday"] = (pd.to_datetime(sms_daily['date']).dt.dayofweek < 5).astype(float)

display(sms_daily.head())

Unnamed: 0,pid,date,tot_sms_count,tot_sms_length,in_sms_count,in_sms_length,out_sms_count,out_sms_length,is_wkday
0,746649,2019-07-23,22,1738,22.0,1738.0,0.0,0.0,1.0
1,746649,2019-07-24,24,1464,24.0,1464.0,0.0,0.0,1.0
2,746649,2019-07-25,14,936,14.0,936.0,0.0,0.0,1.0
3,746649,2019-07-26,23,2135,23.0,2135.0,0.0,0.0,1.0
4,746649,2019-07-27,21,990,21.0,990.0,0.0,0.0,0.0


In [17]:
# build fga_daily
# Collapse the hourly foreground-app features to one row per participant (pid) and day.
all_fga = pd.read_pickle("wave1_features/fga_hr.df")
# Combined browser usage: the 'browser' and 'chrome' columns added together
all_fga['all_browser'] = all_fga['browser'] + all_fga['chrome']
all_fga['date'] = all_fga['hr'].dt.floor('d')
# numeric_only=True restricts the sum to the numeric feature columns. The datetime
# 'hr' column cannot be summed: older pandas dropped it silently, pandas >= 2.0
# raises a TypeError unless numeric_only is requested explicitly.
fga_daily = all_fga.groupby(['pid', 'date']).sum(numeric_only=True)
fga_daily = fga_daily.reset_index()
# 1.0 for Monday-Friday, 0.0 for Saturday/Sunday (dayofweek: Monday == 0)
fga_daily["is_wkday"] = (pd.to_datetime(fga_daily['date']).dt.dayofweek < 5).astype(float)

display(fga_daily.head())

Unnamed: 0,pid,date,katana,orca,messaging,launcher,chrome,email,instagram,youtube,maps,snapchat,browser,all_browser,is_wkday
0,746649,2019-07-23,0.0,0.0,3380.0,1753.0,1490.0,1113.0,211.0,1080.0,240.0,0.0,165.0,1655.0,1.0
1,746649,2019-07-24,0.0,105.0,1888.0,2831.0,965.0,713.0,1203.0,0.0,0.0,0.0,931.0,1896.0,1.0
2,746649,2019-07-25,0.0,135.0,5835.0,1826.0,98.0,770.0,368.0,0.0,73.0,0.0,509.0,607.0,1.0
3,746649,2019-07-26,0.0,301.0,5288.0,1986.0,195.0,1362.0,842.0,0.0,1410.0,0.0,1065.0,1260.0,1.0
4,746649,2019-07-27,0.0,0.0,3166.0,686.0,0.0,110.0,3310.0,0.0,847.0,0.0,0.0,0.0,0.0


In [18]:
%%time
# build sloc_daily (step 1): load the hourly semantic-location frame and tag
# each row with the calendar day its 'hr' timestamp falls on.
all_sloc = pd.read_pickle("wave1_features/sloc_hr.df")
sloc_hour_stamps = all_sloc['hr']
all_sloc['date'] = sloc_hour_stamps.dt.floor('d')

CPU times: user 246 ms, sys: 5 s, total: 5.25 s
Wall time: 5.24 s


In [19]:
%%time

# build sloc_daily (step 2): clip the duration columns, then sum per participant and day.
sloc_cols = list(all_sloc.columns[all_sloc.columns.str.endswith('dur')])
# Bound every '*dur' column to [0, 100000000] before summing
# (units are not visible here -- TODO confirm what the upper cap represents).
all_sloc[sloc_cols] = all_sloc[sloc_cols].clip(0, 100000000)
# numeric_only=True restricts the sum to the numeric feature columns. The datetime
# 'hr' column cannot be summed: older pandas dropped it silently, pandas >= 2.0
# raises a TypeError unless numeric_only is requested explicitly.
sloc_daily = all_sloc.groupby(['pid', 'date']).sum(numeric_only=True)
sloc_daily = sloc_daily.reset_index()
# 1.0 for Monday-Friday, 0.0 for Saturday/Sunday (dayofweek: Monday == 0)
sloc_daily["is_wkday"] = (pd.to_datetime(sloc_daily['date']).dt.dayofweek < 5).astype(float)

display(sloc_daily.head())

Unnamed: 0,pid,date,food_dur,home_dur,work_dur,exercise_dur,anothers_home_dur,religion_dur,travel_dur,shopping_dur,...,n/a_religion,n/a_travel,n/a_shopping,n/a_errand,n/a_health,n/a_education,n/a_entertainment,n/a_other,tot_tansitions,is_wkday
0,746649,2019-07-23,0.0,21869.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,2,0,0,0,9,1.0
1,746649,2019-07-24,0.0,54305.0,7379.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,1,13,1.0
2,746649,2019-07-25,0.0,48121.0,0.0,0.0,0.0,753.0,0.0,0.0,...,1,0,0,0,1,0,1,1,10,1.0
3,746649,2019-07-26,0.0,49696.0,0.0,3824.0,3726.0,0.0,0.0,3790.0,...,0,0,2,0,1,0,1,1,17,1.0
4,746649,2019-07-27,0.0,28339.0,0.0,536.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,6,0.0


CPU times: user 1.23 s, sys: 4.13 s, total: 5.36 s
Wall time: 5.36 s


In [20]:
# Remove every sloc_daily column whose name contains 'n/a'.
is_na_col = sloc_daily.columns.str.contains('n/a')
drop_cols = sloc_daily.columns[is_na_col]
sloc_daily = sloc_daily.drop(columns=drop_cols)

In [21]:
# feature columns
# NOTE(review): hidden-state dependency -- circ_cols, fus_cols, sms_cols, cal_cols and
# fga_cols are only assigned in the later "# columns" cell, so on a fresh kernel
# (Restart & Run All, top to bottom) this cell raises NameError. It should be moved
# below that cell (fus_cols additionally needs fus_daily to be loaded first).
sloc_dur_cols = sloc_daily.columns[sloc_daily.columns.str.endswith('dur')].tolist()
# sms_cols and cal_cols are taken as-is from the columns cell
# (no per-sensor filtering is applied to them here)
loc_cols = circ_cols + fus_cols

# Every sensor feature that later gets a lagged ('_shift') and a difference ('_change') column
feat_sensor_cols = sloc_dur_cols + sms_cols + cal_cols + loc_cols + fga_cols

# Names of the derived columns, in the same order as feat_sensor_cols
sensor_cols_shift = [s + '_shift' for s in feat_sensor_cols]
sensor_cols_change = [s + '_change' for s in feat_sensor_cols]

# All SPIN/GAD

## Build SPIN/GAD Sequence DF

In [24]:
# load fus and circ data
# NOTE(review): these files are named after 'gad_start' while the other sensor windows
# in this section are anchored on 'spin_start' -- confirm the two anchors coincide.
fus_daily = pd.read_pickle("ls_data/jama_data/fus_gad_start_14_-1.df")
# Context manager so the file handle is closed as soon as the dict is loaded.
# pickle.load can execute arbitrary code: only use with trusted project files.
with open("ls_data/jama_data/circ_gad_start_14_-1.dict", "rb") as circ_file:
    circ_dict = pickle.load(circ_file)

In [25]:
# correction for Series values in list, though it doesn't seem to have affected anything??
# Some entries in the per-column value lists are pandas Series rather than scalars;
# unwrap those to their first element and keep every other value unchanged.
# .iloc[0] takes the first element by position: plain v[0] is a label lookup on a
# Series and fails (or relies on a deprecated positional fallback) when the index
# has no label 0.
new_dict = {
    col: [v.iloc[0] if isinstance(v, pd.Series) else v for v in vals]
    for col, vals in circ_dict.items()
}
circ_dict = new_dict

In [26]:
# columns: feature names per sensor stream
sms_cols = [
    'tot_sms_count', 'tot_sms_length',
    'in_sms_count', 'in_sms_length',
    'out_sms_count', 'out_sms_length',
]
cal_cols = ['tot_call_count', 'tot_call_duration']
fga_cols = [
    'katana', 'orca', 'messaging', 'email', 'instagram',
    'youtube', 'maps', 'snapchat', 'browser', 'chrome',
]

circ_cols = ['circ_movt_tot', 'circ_movt_wkday', 'circ_movt_wkend']

# Fused-location features: names ending in total/wkend/wkday, except names starting with 'date'
fus_names = fus_daily.columns
has_feature_suffix = (
    fus_names.str.endswith('total')
    | fus_names.str.endswith('wkend')
    | fus_names.str.endswith('wkday')
)
fus_cols = list(fus_names[has_feature_suffix & ~fus_names.str.startswith('date')])

# Every sloc_daily column except the key/flag columns (i.e. not only the '*dur' ones)
non_feature_cols = ['pid', 'date', 'is_wkday']
sloc_cols = list(sloc_daily.columns.drop(non_feature_cols))

In [27]:
# Midnight-normalized timestamp of each 'spin_start'
spin_start_times = all_spin_fill_df['spin_start']
all_spin_fill_df['date'] = spin_start_times.dt.normalize()

In [28]:
# SPIN total of the preceding row of the same participant, and the change from it
spin_by_pid = all_spin_fill_df.groupby('pid')['spin_total']
all_spin_fill_df['spin_total_shift'] = spin_by_pid.shift(1)
all_spin_fill_df['spin_diff'] = all_spin_fill_df['spin_total'].sub(
    all_spin_fill_df['spin_total_shift']
)

In [29]:
# GAD total of the preceding row of the same participant, and the change from it
gad_by_pid = all_spin_fill_df.groupby('pid')['gad_total']
all_spin_fill_df['gad_total_shift'] = gad_by_pid.shift(1)
all_spin_fill_df['gad_diff'] = all_spin_fill_df['gad_total'].sub(
    all_spin_fill_df['gad_total_shift']
)

In [30]:
%%time

# One {column: values} mapping per sensor stream, anchored on 'spin_start'.
# Presumably the window covers the 14 days before each assessment (post_days=-1);
# the exact semantics live in ls_utils.build_col_dict -- TODO confirm.
spin_window = {'pre_days': 14, 'post_days': -1}

sms_dict = ls_utils.build_col_dict(sms_cols, sms_daily, all_spin_fill_df, 'spin_start', **spin_window)
cal_dict = ls_utils.build_col_dict(cal_cols, cal_daily, all_spin_fill_df, 'spin_start', **spin_window)
fga_dict = ls_utils.build_col_dict(fga_cols, fga_daily, all_spin_fill_df, 'spin_start', **spin_window)
sloc_dict = ls_utils.build_col_dict(sloc_cols, sloc_daily, all_spin_fill_df, 'spin_start', **spin_window)

CPU times: user 51.3 s, sys: 65.7 ms, total: 51.3 s
Wall time: 51.4 s


In [31]:
# Work on a copy so the survey frame itself is left unchanged
all_dig_state_df = all_spin_fill_df.copy()

sensor_dicts = [sms_dict, cal_dict, fga_dict, sloc_dict, circ_dict]
sensor_cols = sms_cols + cal_cols + fga_cols + sloc_cols + circ_cols

# Attach every entry of every sensor mapping as a column
for feature_map in sensor_dicts:
    for feature_name in feature_map:
        all_dig_state_df[feature_name] = feature_map[feature_name]

# merge fused location on participant and study week, then turn +/-inf into NaN
all_dig_state_df = (
    all_dig_state_df
    .merge(fus_daily, how='left', on=['pid', 'study_wk'])
    .replace([np.inf, -np.inf], np.nan)
)

## Shift feature columns

In [32]:
# '<feature>_shift' = value of the feature in the preceding row of the same participant
all_dig_state_df[sensor_cols_shift] = (
    all_dig_state_df
    .groupby('pid')[feat_sensor_cols]
    .shift(periods=1)
)

In [33]:
# '<feature>_change' = current value minus the lagged ('_shift') value
for feat, feat_shift, feat_change in zip(feat_sensor_cols, sensor_cols_shift, sensor_cols_change):
    all_dig_state_df[feat_change] = all_dig_state_df[feat].sub(all_dig_state_df[feat_shift])

# Reverse all SPIN/GAD

## Build SPIN/GAD Sequence DF

In [34]:
# load fus and circ data
# NOTE(review): these files are named after 'gad_start' while the other sensor windows
# in this section are anchored on 'spin_start' -- confirm the two anchors coincide.
fus_daily = pd.read_pickle("ls_data/jama_data/fus_gad_start_0_14.df")
# Context manager so the file handle is closed as soon as the dict is loaded.
# pickle.load can execute arbitrary code: only use with trusted project files.
with open("ls_data/jama_data/circ_gad_start_0_14.dict", "rb") as circ_file:
    circ_dict = pickle.load(circ_file)

In [35]:
# correction for Series values in list, though it doesn't seem to have affected anything??
# Some entries in the per-column value lists are pandas Series rather than scalars;
# unwrap those to their first element and keep every other value unchanged.
# .iloc[0] takes the first element by position: plain v[0] is a label lookup on a
# Series and fails (or relies on a deprecated positional fallback) when the index
# has no label 0.
new_dict = {
    col: [v.iloc[0] if isinstance(v, pd.Series) else v for v in vals]
    for col, vals in circ_dict.items()
}
circ_dict = new_dict

In [36]:
# columns: feature names per sensor stream
sms_cols = [
    'tot_sms_count', 'tot_sms_length',
    'in_sms_count', 'in_sms_length',
    'out_sms_count', 'out_sms_length',
]
cal_cols = ['tot_call_count', 'tot_call_duration']
fga_cols = [
    'katana', 'orca', 'messaging', 'email', 'instagram',
    'youtube', 'maps', 'snapchat', 'browser', 'chrome',
]

circ_cols = ['circ_movt_tot', 'circ_movt_wkday', 'circ_movt_wkend']

# Fused-location features: names ending in total/wkend/wkday, except names starting with 'date'
fus_names = fus_daily.columns
has_feature_suffix = (
    fus_names.str.endswith('total')
    | fus_names.str.endswith('wkend')
    | fus_names.str.endswith('wkday')
)
fus_cols = list(fus_names[has_feature_suffix & ~fus_names.str.startswith('date')])

# Every sloc_daily column except the key/flag columns (i.e. not only the '*dur' ones)
non_feature_cols = ['pid', 'date', 'is_wkday']
sloc_cols = list(sloc_daily.columns.drop(non_feature_cols))

In [37]:
# Midnight-normalized timestamp of each 'spin_start'
spin_start_times = all_spin_fill_df['spin_start']
all_spin_fill_df['date'] = spin_start_times.dt.normalize()

In [38]:
# SPIN total of the preceding row of the same participant, and the change from it
spin_by_pid = all_spin_fill_df.groupby('pid')['spin_total']
all_spin_fill_df['spin_total_shift'] = spin_by_pid.shift(1)
all_spin_fill_df['spin_diff'] = all_spin_fill_df['spin_total'].sub(
    all_spin_fill_df['spin_total_shift']
)

In [39]:
# GAD total of the preceding row of the same participant, and the change from it
gad_by_pid = all_spin_fill_df.groupby('pid')['gad_total']
all_spin_fill_df['gad_total_shift'] = gad_by_pid.shift(1)
all_spin_fill_df['gad_diff'] = all_spin_fill_df['gad_total'].sub(
    all_spin_fill_df['gad_total_shift']
)

In [40]:
%%time

# One {column: values} mapping per sensor stream, anchored on 'spin_start'.
# Presumably the window covers the 14 days following each assessment (pre_days=0);
# the exact semantics live in ls_utils.build_col_dict -- TODO confirm.
spin_rev_window = {'pre_days': 0, 'post_days': 14}

sms_dict = ls_utils.build_col_dict(sms_cols, sms_daily, all_spin_fill_df, 'spin_start', **spin_rev_window)
cal_dict = ls_utils.build_col_dict(cal_cols, cal_daily, all_spin_fill_df, 'spin_start', **spin_rev_window)
fga_dict = ls_utils.build_col_dict(fga_cols, fga_daily, all_spin_fill_df, 'spin_start', **spin_rev_window)
sloc_dict = ls_utils.build_col_dict(sloc_cols, sloc_daily, all_spin_fill_df, 'spin_start', **spin_rev_window)

CPU times: user 50.3 s, sys: 48.1 ms, total: 50.4 s
Wall time: 50.4 s


In [41]:
# Work on a copy so the survey frame itself is left unchanged
all_rev_dig_state_df = all_spin_fill_df.copy()

sensor_dicts = [sms_dict, cal_dict, fga_dict, sloc_dict, circ_dict]
sensor_cols = sms_cols + cal_cols + fga_cols + sloc_cols + circ_cols

# Attach every entry of every sensor mapping as a column
for feature_map in sensor_dicts:
    for feature_name in feature_map:
        all_rev_dig_state_df[feature_name] = feature_map[feature_name]

# merge fused location on participant and study week, then turn +/-inf into NaN
all_rev_dig_state_df = (
    all_rev_dig_state_df
    .merge(fus_daily, how='left', on=['pid', 'study_wk'])
    .replace([np.inf, -np.inf], np.nan)
)

In [42]:
# Sanity check: shape of the array of distinct participant ids (shown as cell output)
rev_state_pids = all_rev_dig_state_df['pid'].unique()
rev_state_pids.shape

(267,)

## Shift feature columns

In [43]:
# '<feature>_shift' = value of the feature in the preceding row of the same participant
all_rev_dig_state_df[sensor_cols_shift] = (
    all_rev_dig_state_df
    .groupby('pid')[feat_sensor_cols]
    .shift(periods=1)
)

In [46]:
# '<feature>_change' = current value minus the lagged ('_shift') value
for feat, feat_shift, feat_change in zip(feat_sensor_cols, sensor_cols_shift, sensor_cols_change):
    all_rev_dig_state_df[feat_change] = all_rev_dig_state_df[feat].sub(all_rev_dig_state_df[feat_shift])

# All PHQ

## Build PHQ Sequence DF

In [47]:
# load fus and circ data
fus_daily = pd.read_pickle("ls_data/jama_data/fus_phq_start_14_-1.df")
# Context manager so the file handle is closed as soon as the dict is loaded.
# pickle.load can execute arbitrary code: only use with trusted project files.
with open("ls_data/jama_data/circ_phq_start_14_-1.dict", "rb") as circ_file:
    circ_dict = pickle.load(circ_file)

In [48]:
# correction for Series values in list, though it doesn't seem to have affected anything??
# Some entries in the per-column value lists are pandas Series rather than scalars;
# unwrap those to their first element and keep every other value unchanged.
# .iloc[0] takes the first element by position: plain v[0] is a label lookup on a
# Series and fails (or relies on a deprecated positional fallback) when the index
# has no label 0.
new_dict = {
    col: [v.iloc[0] if isinstance(v, pd.Series) else v for v in vals]
    for col, vals in circ_dict.items()
}
circ_dict = new_dict

In [49]:
# columns: feature names per sensor stream
sms_cols = [
    'tot_sms_count', 'tot_sms_length',
    'in_sms_count', 'in_sms_length',
    'out_sms_count', 'out_sms_length',
]
cal_cols = ['tot_call_count', 'tot_call_duration']
fga_cols = [
    'katana', 'orca', 'messaging', 'email', 'instagram',
    'youtube', 'maps', 'snapchat', 'browser', 'chrome',
]

circ_cols = ['circ_movt_tot', 'circ_movt_wkday', 'circ_movt_wkend']

# Fused-location features: names ending in total/wkend/wkday, except names starting with 'date'
fus_names = fus_daily.columns
has_feature_suffix = (
    fus_names.str.endswith('total')
    | fus_names.str.endswith('wkend')
    | fus_names.str.endswith('wkday')
)
fus_cols = list(fus_names[has_feature_suffix & ~fus_names.str.startswith('date')])

# Every sloc_daily column except the key/flag columns (i.e. not only the '*dur' ones)
non_feature_cols = ['pid', 'date', 'is_wkday']
sloc_cols = list(sloc_daily.columns.drop(non_feature_cols))

In [50]:
# Midnight-normalized timestamp of each 'phq_start'
phq_start_times = all_shift_df['phq_start']
all_shift_df['date'] = phq_start_times.dt.normalize()

In [51]:
# PHQ total ('phq_total_sc') of the preceding row of the same participant, and the change from it
phq_by_pid = all_shift_df.groupby('pid')['phq_total_sc']
all_shift_df['phq_total_shift'] = phq_by_pid.shift(1)
all_shift_df['phq_diff'] = all_shift_df['phq_total_sc'].sub(
    all_shift_df['phq_total_shift']
)

In [52]:
%%time

# One {column: values} mapping per sensor stream, anchored on 'phq_start'.
# Presumably the window covers the 14 days before each assessment (post_days=-1);
# the exact semantics live in ls_utils.build_col_dict -- TODO confirm.
phq_window = {'pre_days': 14, 'post_days': -1}

sms_dict = ls_utils.build_col_dict(sms_cols, sms_daily, all_shift_df, 'phq_start', **phq_window)
cal_dict = ls_utils.build_col_dict(cal_cols, cal_daily, all_shift_df, 'phq_start', **phq_window)
fga_dict = ls_utils.build_col_dict(fga_cols, fga_daily, all_shift_df, 'phq_start', **phq_window)
sloc_dict = ls_utils.build_col_dict(sloc_cols, sloc_daily, all_shift_df, 'phq_start', **phq_window)

CPU times: user 55.7 s, sys: 51.9 ms, total: 55.8 s
Wall time: 55.8 s


In [53]:
# Work on a copy so the survey frame itself is left unchanged
all_phq_df = all_shift_df.copy()

sensor_dicts = [sms_dict, cal_dict, fga_dict, sloc_dict, circ_dict]
sensor_cols = sms_cols + cal_cols + fga_cols + sloc_cols + circ_cols

# Attach every entry of every sensor mapping as a column
for feature_map in sensor_dicts:
    for feature_name in feature_map:
        all_phq_df[feature_name] = feature_map[feature_name]

# merge fused location on participant and study week
all_phq_df = all_phq_df.merge(fus_daily, how='left', on=['pid', 'study_wk'])
# NOTE(review): unlike the SPIN/GAD frames, +/-inf values are NOT replaced by NaN here;
# the call is left disabled as in the original -- confirm this is intentional.
#all_phq_df = all_phq_df.replace([np.inf, -np.inf], np.nan)

In [54]:
# Sanity check: shape of the array of distinct participant ids (shown as cell output)
phq_pids = all_phq_df['pid'].unique()
phq_pids.shape

(272,)

## Shift feature columns

In [55]:
# '<feature>_shift' = value of the feature in the preceding row of the same participant
all_phq_df[sensor_cols_shift] = (
    all_phq_df
    .groupby('pid')[feat_sensor_cols]
    .shift(periods=1)
)

In [56]:
# '<feature>_change' = current value minus the lagged ('_shift') value
for feat, feat_shift, feat_change in zip(feat_sensor_cols, sensor_cols_shift, sensor_cols_change):
    all_phq_df[feat_change] = all_phq_df[feat].sub(all_phq_df[feat_shift])

# Reverse all PHQ

In [57]:
# load fus and circ data
fus_daily = pd.read_pickle("ls_data/jama_data/fus_phq_start_0_14.df")
# Context manager so the file handle is closed as soon as the dict is loaded.
# pickle.load can execute arbitrary code: only use with trusted project files.
with open("ls_data/jama_data/circ_phq_start_0_14.dict", "rb") as circ_file:
    circ_dict = pickle.load(circ_file)

In [58]:
# correction for Series values in list, though it doesn't seem to have affected anything??
# Some entries in the per-column value lists are pandas Series rather than scalars;
# unwrap those to their first element and keep every other value unchanged.
# .iloc[0] takes the first element by position: plain v[0] is a label lookup on a
# Series and fails (or relies on a deprecated positional fallback) when the index
# has no label 0.
new_dict = {
    col: [v.iloc[0] if isinstance(v, pd.Series) else v for v in vals]
    for col, vals in circ_dict.items()
}
circ_dict = new_dict

In [59]:
# columns: feature names per sensor stream
sms_cols = [
    'tot_sms_count', 'tot_sms_length',
    'in_sms_count', 'in_sms_length',
    'out_sms_count', 'out_sms_length',
]
cal_cols = ['tot_call_count', 'tot_call_duration']
fga_cols = [
    'katana', 'orca', 'messaging', 'email', 'instagram',
    'youtube', 'maps', 'snapchat', 'browser', 'chrome',
]

circ_cols = ['circ_movt_tot', 'circ_movt_wkday', 'circ_movt_wkend']

# Fused-location features: names ending in total/wkend/wkday, except names starting with 'date'
fus_names = fus_daily.columns
has_feature_suffix = (
    fus_names.str.endswith('total')
    | fus_names.str.endswith('wkend')
    | fus_names.str.endswith('wkday')
)
fus_cols = list(fus_names[has_feature_suffix & ~fus_names.str.startswith('date')])

# Every sloc_daily column except the key/flag columns (i.e. not only the '*dur' ones)
non_feature_cols = ['pid', 'date', 'is_wkday']
sloc_cols = list(sloc_daily.columns.drop(non_feature_cols))

In [60]:
# Midnight-normalized timestamp of each 'phq_start'
phq_start_times = all_shift_df['phq_start']
all_shift_df['date'] = phq_start_times.dt.normalize()

In [61]:
# PHQ total ('phq_total_sc') of the preceding row of the same participant, and the change from it
phq_by_pid = all_shift_df.groupby('pid')['phq_total_sc']
all_shift_df['phq_total_shift'] = phq_by_pid.shift(1)
all_shift_df['phq_diff'] = all_shift_df['phq_total_sc'].sub(
    all_shift_df['phq_total_shift']
)

In [62]:
%%time

# One {column: values} mapping per sensor stream, anchored on 'phq_start'.
# Presumably the window covers the 14 days following each assessment (pre_days=0);
# the exact semantics live in ls_utils.build_col_dict -- TODO confirm.
phq_rev_window = {'pre_days': 0, 'post_days': 14}

sms_dict = ls_utils.build_col_dict(sms_cols, sms_daily, all_shift_df, 'phq_start', **phq_rev_window)
cal_dict = ls_utils.build_col_dict(cal_cols, cal_daily, all_shift_df, 'phq_start', **phq_rev_window)
fga_dict = ls_utils.build_col_dict(fga_cols, fga_daily, all_shift_df, 'phq_start', **phq_rev_window)
sloc_dict = ls_utils.build_col_dict(sloc_cols, sloc_daily, all_shift_df, 'phq_start', **phq_rev_window)

CPU times: user 54.7 s, sys: 55.1 ms, total: 54.8 s
Wall time: 54.8 s


In [63]:
# Work on a copy so the survey frame itself is left unchanged
all_rev_phq_df = all_shift_df.copy()

sensor_dicts = [sms_dict, cal_dict, fga_dict, sloc_dict, circ_dict]
sensor_cols = sms_cols + cal_cols + fga_cols + sloc_cols + circ_cols

# Attach every entry of every sensor mapping as a column
for feature_map in sensor_dicts:
    for feature_name in feature_map:
        all_rev_phq_df[feature_name] = feature_map[feature_name]

# merge fused location on participant and study week
all_rev_phq_df = all_rev_phq_df.merge(fus_daily, how='left', on=['pid', 'study_wk'])
# NOTE(review): unlike the SPIN/GAD frames, +/-inf values are NOT replaced by NaN here;
# the call is left disabled as in the original -- confirm this is intentional.
#all_rev_phq_df = all_rev_phq_df.replace([np.inf, -np.inf], np.nan)

In [64]:
# Sanity check: shape of the array of distinct participant ids (shown as cell output)
rev_phq_pids = all_rev_phq_df['pid'].unique()
rev_phq_pids.shape

(272,)

## Shift feature columns

In [65]:
# '<feature>_shift' = value of the feature in the preceding row of the same participant
all_rev_phq_df[sensor_cols_shift] = (
    all_rev_phq_df
    .groupby('pid')[feat_sensor_cols]
    .shift(periods=1)
)

In [66]:
# '<feature>_change' = current value minus the lagged ('_shift') value
for feat, feat_shift, feat_change in zip(feat_sensor_cols, sensor_cols_shift, sensor_cols_change):
    all_rev_phq_df[feat_change] = all_rev_phq_df[feat].sub(all_rev_phq_df[feat_shift])

## Dump feature dfs for all participants

In [67]:
# Persist the four assembled feature frames, one pickle per frame
# (dict order == write order: SPIN/GAD, reverse SPIN/GAD, PHQ, reverse PHQ).
feature_dumps = {
    "wave1_features/all_gad_spin_corr_ind_feats.df": all_dig_state_df,
    "wave1_features/all_rev_gad_spin_corr_ind_feats.df": all_rev_dig_state_df,
    "wave1_features/all_phq_corr_ind_feats.df": all_phq_df,
    "wave1_features/all_rev_phq_corr_ind_feats.df": all_rev_phq_df,
}
for dump_path, feature_df in feature_dumps.items():
    feature_df.to_pickle(dump_path)