<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

# Imports and Scripts

In [1]:
%%javascript
// ToC script: downloads a table-of-contents helper and runs it in the browser.
// NOTE(review): this executes JavaScript fetched from an external host each time
// the cell runs -- consider vendoring or pinning the script.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
# Code hider, source: http://chris-said.io/2016/02/13/how-to-make-polished-jupyter-presentations-with-optional-code-visibility/
# Renders a "Show Code" / "Hide Code" button in the cell output. When the page
# is ready the script hides every `div.input` element; each button press
# toggles their visibility through code_toggle().
from IPython.display import HTML

HTML('''
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
''')



In [3]:
# imports and constants
import json
import pickle
import datetime as dt

import gmaps
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import multiprocessing
import numpy as np
import pandas as pd
import pingouin as pg
import seaborn as sns

from IPython.display import display, HTML
import ipywidgets as widgets

# user imports
import data_processing.analysis_utils as ls_utils

%matplotlib inline

import matplotlib as mpl
# Set the default figure resolution to 300 DPI for figures created below.
mpl.rcParams['figure.dpi'] = 300

# Load Data

In [4]:
# SPIN/GAD survey table. Later cells read its 'pid', 'study_wk', 'spin_start',
# 'spin_total' and 'gad_total' columns.
spin_fill_df = pd.read_pickle("ls_data/jama_data/all_state_shift_no_imp.df")

# SPIN/GAD

In [5]:
# load fus and circ data (GAD-anchored window, file suffix "14_-1")
fus_daily = pd.read_pickle("ls_data/jama_data/fus_gad_start_14_-1.df")
# Use a context manager so the file handle is closed as soon as the dict is read.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open("ls_data/jama_data/circ_gad_start_14_-1.dict", "rb") as circ_file:
    circ_dict = pickle.load(circ_file)

In [6]:
# Some entries of the circ lists are pandas Series rather than plain values;
# replace each such entry with its first element. `.iloc[0]` selects by
# position, whereas `v[0]` is a label lookup on integer-indexed Series and its
# positional fallback on other indexes is deprecated in pandas.
# (Original author's note: this correction did not seem to change any results.)
new_dict = {
    col: [v.iloc[0] if isinstance(v, pd.Series) else v for v in vals]
    for col, vals in circ_dict.items()
}
circ_dict = new_dict

In [7]:
circ_cols = ['circ_movt_tot', 'circ_movt_wkday', 'circ_movt_wkend']

# Fused-location feature columns: names ending in total/wkend/wkday,
# excluding anything whose name starts with 'date'.
fus_names = fus_daily.columns
has_feature_suffix = (
    fus_names.str.endswith('total')
    | fus_names.str.endswith('wkend')
    | fus_names.str.endswith('wkday')
)
is_date_col = fus_names.str.startswith('date')
fus_cols = list(fus_names[has_feature_suffix & ~is_date_col])


### Calculate shifted change

In [8]:
# 'date' is the SPIN start timestamp with its time-of-day set to midnight.
spin_fill_df['date'] = spin_fill_df['spin_start'].dt.normalize()

In [9]:
# Previous SPIN total within the same participant (missing on a participant's first row).
prev_spin_total = spin_fill_df.groupby('pid')['spin_total'].shift(1)
spin_fill_df['spin_total_shift'] = prev_spin_total
# Change in SPIN total relative to that previous row.
spin_fill_df['spin_diff'] = spin_fill_df['spin_total'] - prev_spin_total

In [10]:
# Previous GAD total within the same participant (missing on a participant's first row).
prev_gad_total = spin_fill_df.groupby('pid')['gad_total'].shift(1)
spin_fill_df['gad_total_shift'] = prev_gad_total
# Change in GAD total relative to that previous row.
spin_fill_df['gad_diff'] = spin_fill_df['gad_total'] - prev_gad_total

In [11]:
# Work on a copy so spin_fill_df keeps only the survey columns.
dig_state_df = spin_fill_df.copy()

sensor_dicts = [circ_dict]
sensor_cols = circ_cols

# Attach each sensor feature list as a new column of the survey table.
for feature_map in sensor_dicts:
    for feature_name, feature_vals in feature_map.items():
        dig_state_df[feature_name] = feature_vals

# Left-join the fused-location features on participant and study week,
# then treat infinite values as missing.
dig_state_df = pd.merge(
    dig_state_df,
    fus_daily,
    how='left',
    on=['pid', 'study_wk'],
).replace([np.inf, -np.inf], np.nan)

In [12]:
# Identifier columns; the same pair is used as the merge key for the fused-location data.
id_cols = ['pid', 'study_wk']

## Movement split

In [13]:
# Every location feature column: fused-location columns followed by the circ_movt columns.
loc_cols = fus_cols + circ_cols

# Column groups that are row-averaged into the movt_distance_agg,
# movt_duration_agg and movt_variance_agg composite columns.
distance_cols = ['velocity_total', 'dist_total']
duration_cols = ['entropy_total', 'norm_entropy_total', 'circ_movt_tot']
variance_cols = ['cluster_total', 'loc_var_total']

In [14]:
agg_df = dig_state_df.copy()

# Standardize every location feature column (subtract column mean, divide by column std).
loc_mean = agg_df[loc_cols].mean()
loc_std = agg_df[loc_cols].std()
agg_df[loc_cols] = agg_df[loc_cols].sub(loc_mean).div(loc_std)

# Composite movement scores: row-wise mean of each standardized feature group.
for agg_name, member_cols in (
    ('movt_distance_agg', distance_cols),
    ('movt_variance_agg', variance_cols),
    ('movt_duration_agg', duration_cols),
):
    agg_df[agg_name] = agg_df[member_cols].mean(axis=1)

## Shift columns

In [15]:
# Columns whose name ends in "agg", followed by the individual location features.
is_agg_col = agg_df.columns.str.endswith("agg")
agg_cols = list(agg_df.columns[is_agg_col]) + loc_cols
# Matching names for the previous-row ("_shift") and difference ("_change") columns.
agg_cols_shift = [f'{name}_shift' for name in agg_cols]
agg_cols_change = [f'{name}_change' for name in agg_cols]

In [16]:
# Previous-row value of every feature, taken within each participant.
prev_features = agg_df.groupby('pid')[agg_cols].shift(periods=1)
agg_df[agg_cols_shift] = prev_features

In [17]:
# Per-feature change: current value minus the participant's previous value.
for feature, feature_prev, feature_change in zip(agg_cols, agg_cols_shift, agg_cols_change):
    agg_df[feature_change] = agg_df[feature].sub(agg_df[feature_prev])

In [18]:
# Summary statistics of the per-participant change in GAD total.
agg_df['gad_diff'].describe()

count    1033.000000
mean       -0.073572
std         3.814295
min       -14.000000
25%        -2.000000
50%         0.000000
75%         2.000000
max        15.000000
Name: gad_diff, dtype: float64

In [19]:
# Summary statistics of the per-participant change in SPIN total.
agg_df['spin_diff'].describe()

count    1032.000000
mean        0.005814
std         6.628713
min       -39.000000
25%        -3.000000
50%         0.000000
75%         3.000000
max        36.000000
Name: spin_diff, dtype: float64

# Reverse SPIN/GAD

## Build reversed df

In [20]:
# load fus and circ data (GAD-anchored reversed window, file suffix "0_14")
rev_fus_daily = pd.read_pickle("ls_data/jama_data/fus_gad_start_0_14.df")
# Use a context manager so the file handle is closed as soon as the dict is read.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open("ls_data/jama_data/circ_gad_start_0_14.dict", "rb") as rev_circ_file:
    rev_circ_dict = pickle.load(rev_circ_file)

In [21]:
# Some entries of the reversed circ lists are pandas Series rather than plain
# values; replace each such entry with its first element. `.iloc[0]` selects by
# position, whereas `v[0]` is a label lookup on integer-indexed Series and its
# positional fallback on other indexes is deprecated in pandas.
# (Original author's note: this correction did not seem to change any results.)
rev_new_dict = {
    col: [v.iloc[0] if isinstance(v, pd.Series) else v for v in vals]
    for col, vals in rev_circ_dict.items()
}
rev_circ_dict = rev_new_dict

### Calculate shifted change

In [22]:
# Recomputes the same 'date' column as the earlier SPIN/GAD section (SPIN start
# timestamp with time-of-day set to midnight); the result depends only on 'spin_start'.
spin_fill_df['date'] = spin_fill_df['spin_start'].dt.normalize()

In [23]:
# Previous SPIN total within the same participant (missing on a participant's first row).
prev_spin_total = spin_fill_df.groupby('pid')['spin_total'].shift(1)
spin_fill_df['spin_total_shift'] = prev_spin_total
# Change in SPIN total relative to that previous row.
spin_fill_df['spin_diff'] = spin_fill_df['spin_total'] - prev_spin_total

In [24]:
# Previous GAD total within the same participant (missing on a participant's first row).
prev_gad_total = spin_fill_df.groupby('pid')['gad_total'].shift(1)
spin_fill_df['gad_total_shift'] = prev_gad_total
# Change in GAD total relative to that previous row.
spin_fill_df['gad_diff'] = spin_fill_df['gad_total'] - prev_gad_total

In [25]:
# Build the reversed-window SPIN/GAD table: survey rows plus the reversed
# circ features and the reversed fused-location features.
rev_dig_state_df = spin_fill_df.copy()

sensor_dicts = [rev_circ_dict]
sensor_cols = circ_cols

# Attach each reversed circ feature list as a new column.
for sensor_dict in sensor_dicts:
    for col, vals in sensor_dict.items():
        rev_dig_state_df[col] = vals

# merge fused location
# Join the reversed-window frame (rev_fus_daily) so it matches rev_circ_dict;
# merging fus_daily here would mix look-back location features into the
# reversed analysis.
rev_dig_state_df = rev_dig_state_df.merge(rev_fus_daily, how='left', on=['pid', 'study_wk'])
# Treat infinite values as missing.
rev_dig_state_df = rev_dig_state_df.replace([np.inf, -np.inf], np.nan)

## Aggregate features

In [26]:
rev_agg_df = rev_dig_state_df.copy()

# Standardize every location feature column (subtract column mean, divide by column std).
rev_loc_mean = rev_agg_df[loc_cols].mean()
rev_loc_std = rev_agg_df[loc_cols].std()
rev_agg_df[loc_cols] = rev_agg_df[loc_cols].sub(rev_loc_mean).div(rev_loc_std)

# Composite movement scores: row-wise mean of each standardized feature group.
for agg_name, member_cols in (
    ('movt_distance_agg', distance_cols),
    ('movt_variance_agg', variance_cols),
    ('movt_duration_agg', duration_cols),
):
    rev_agg_df[agg_name] = rev_agg_df[member_cols].mean(axis=1)

## Shift columns

In [27]:
# Previous-row value of every feature, taken within each participant.
rev_prev_features = rev_agg_df.groupby('pid')[agg_cols].shift(periods=1)
rev_agg_df[agg_cols_shift] = rev_prev_features

In [28]:
# Per-feature change: current value minus the participant's previous value.
for feature, feature_prev, feature_change in zip(agg_cols, agg_cols_shift, agg_cols_change):
    rev_agg_df[feature_change] = rev_agg_df[feature].sub(rev_agg_df[feature_prev])

In [29]:
# Summary statistics of the per-participant change in GAD total. 'gad_diff' is
# derived from spin_fill_df only, so it does not depend on the sensor window.
rev_agg_df['gad_diff'].describe()

count    1033.000000
mean       -0.073572
std         3.814295
min       -14.000000
25%        -2.000000
50%         0.000000
75%         2.000000
max        15.000000
Name: gad_diff, dtype: float64

In [30]:
# Summary statistics of the per-participant change in SPIN total. 'spin_diff' is
# derived from spin_fill_df only, so it does not depend on the sensor window.
rev_agg_df['spin_diff'].describe()

count    1032.000000
mean        0.005814
std         6.628713
min       -39.000000
25%        -3.000000
50%         0.000000
75%         3.000000
max        36.000000
Name: spin_diff, dtype: float64

# 2 weeks look-back PHQ

In [31]:
# PHQ survey table. Later cells read its 'pid', 'study_wk', 'phq_start' and
# 'phq_diff' columns.
shift_df = pd.read_pickle("ls_data/jama_data/all_phq_shift_imp.df")

In [32]:
# load fus and circ data (PHQ-anchored window, file suffix "14_-1")
# NOTE: this rebinds fus_daily and circ_dict, which held the GAD-anchored data
# in the SPIN/GAD sections above.
fus_daily = pd.read_pickle("ls_data/jama_data/fus_phq_start_14_-1.df")
# Use a context manager so the file handle is closed as soon as the dict is read.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open("ls_data/jama_data/circ_phq_start_14_-1.dict", "rb") as circ_file:
    circ_dict = pickle.load(circ_file)

In [33]:
# Some entries of the circ lists are pandas Series rather than plain values;
# replace each such entry with its first element. `.iloc[0]` selects by
# position, whereas `v[0]` is a label lookup on integer-indexed Series and its
# positional fallback on other indexes is deprecated in pandas.
# (Original author's note: this correction did not seem to change any results.)
new_dict = {
    col: [v.iloc[0] if isinstance(v, pd.Series) else v for v in vals]
    for col, vals in circ_dict.items()
}
circ_dict = new_dict

In [34]:
# 'date' is the PHQ start timestamp with its time-of-day set to midnight.
shift_df['date'] = shift_df['phq_start'].dt.normalize()

In [35]:
# Work on a copy so shift_df keeps only the survey columns.
phq_state_df = shift_df.copy()

sensor_dicts = [circ_dict]
sensor_cols = circ_cols

# Attach each sensor feature list as a new column of the PHQ table.
for feature_map in sensor_dicts:
    for feature_name, feature_vals in feature_map.items():
        phq_state_df[feature_name] = feature_vals

# Left-join the fused-location features on participant and study week,
# then treat infinite values as missing.
phq_state_df = pd.merge(
    phq_state_df,
    fus_daily,
    how='left',
    on=['pid', 'study_wk'],
).replace([np.inf, -np.inf], np.nan)

phq_state_df.head()

Unnamed: 0,cluster,diff,mean_phq,phq01_sc,phq02_sc,phq03_sc,phq04_sc,phq05_sc,phq06_sc,phq07_sc,...,entropy_wkend,loc_var_total,loc_var_wkday,loc_var_wkend,norm_entropy_total,norm_entropy_wkday,norm_entropy_wkend,velocity_total,velocity_wkday,velocity_wkend
0,3.0,NaT,9.714286,1,2,1,2,0,1,0,...,0.516003,-3.270545,-2.933953,-5.241677,0.228997,0.262965,0.190544,7.228884,8.305369,4.826638
1,3.0,20 days 19:32:10,9.714286,1,2,2,2,0,1,1,...,0.318589,-3.992103,-3.525627,-6.091229,0.180792,0.189827,0.177808,5.144077,6.141039,2.649001
2,3.0,21 days 00:04:12,9.714286,1,1,2,1,0,1,1,...,0.661245,-4.514528,-4.420466,-4.85595,0.171747,0.149136,0.228775,5.056099,4.347594,6.839301
3,3.0,21 days 02:46:57,9.714286,1,1,3,2,0,1,1,...,0.357938,-4.964505,-4.796124,-5.919636,0.18079,0.201921,0.149272,3.989529,4.033579,3.825982
4,3.0,20 days 21:08:34,9.714286,1,1,3,3,1,2,2,...,0.687668,-3.686849,-4.510184,-3.103347,0.213816,0.211217,0.268102,4.790311,4.550216,5.398051


## Aggregate Features

In [36]:
phq_agg_df = phq_state_df.copy()

# Standardize every location feature column (subtract column mean, divide by column std).
phq_loc_mean = phq_agg_df[loc_cols].mean()
phq_loc_std = phq_agg_df[loc_cols].std()
phq_agg_df[loc_cols] = phq_agg_df[loc_cols].sub(phq_loc_mean).div(phq_loc_std)

# Composite movement scores: row-wise mean of each standardized feature group.
for agg_name, member_cols in (
    ('movt_distance_agg', distance_cols),
    ('movt_variance_agg', variance_cols),
    ('movt_duration_agg', duration_cols),
):
    phq_agg_df[agg_name] = phq_agg_df[member_cols].mean(axis=1)

## Shift feature columns

In [37]:
# Previous-row value of every feature, taken within each participant.
phq_prev_features = phq_agg_df.groupby('pid')[agg_cols].shift(periods=1)
phq_agg_df[agg_cols_shift] = phq_prev_features

In [38]:
# Per-feature change: current value minus the participant's previous value.
for feature, feature_prev, feature_change in zip(agg_cols, agg_cols_shift, agg_cols_change):
    phq_agg_df[feature_change] = phq_agg_df[feature].sub(phq_agg_df[feature_prev])

In [39]:
# Summary statistics of the 'phq_diff' column (not computed in the visible
# cells; presumably it arrives with the loaded PHQ table -- TODO confirm).
phq_agg_df['phq_diff'].describe()

count    1360.000000
mean        0.048272
std         3.013952
min       -15.000000
25%        -1.000000
50%         0.000000
75%         1.000000
max        18.000000
Name: phq_diff, dtype: float64

# Reverse PHQ

## Build reversed df

In [40]:
# load fus and circ data (PHQ-anchored reversed window, file suffix "0_14")
# NOTE: this rebinds rev_fus_daily and rev_circ_dict, which held the
# GAD-anchored reversed data in the "Reverse SPIN/GAD" section above.
rev_fus_daily = pd.read_pickle("ls_data/jama_data/fus_phq_start_0_14.df")
# Use a context manager so the file handle is closed as soon as the dict is read.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open("ls_data/jama_data/circ_phq_start_0_14.dict", "rb") as rev_circ_file:
    rev_circ_dict = pickle.load(rev_circ_file)

In [41]:
# Some entries of the reversed circ lists are pandas Series rather than plain
# values; replace each such entry with its first element. `.iloc[0]` selects by
# position, whereas `v[0]` is a label lookup on integer-indexed Series and its
# positional fallback on other indexes is deprecated in pandas.
# (Original author's note: this correction did not seem to change any results.)
rev_new_dict = {
    col: [v.iloc[0] if isinstance(v, pd.Series) else v for v in vals]
    for col, vals in rev_circ_dict.items()
}
rev_circ_dict = rev_new_dict

### Attach reversed sensor features

In [42]:
# Work on a copy so shift_df keeps only the survey columns.
rev_phq_df = shift_df.copy()

sensor_dicts = [rev_circ_dict]
sensor_cols = circ_cols

# Attach each reversed circ feature list as a new column of the PHQ table.
for feature_map in sensor_dicts:
    for feature_name, feature_vals in feature_map.items():
        rev_phq_df[feature_name] = feature_vals

# Left-join the reversed fused-location features on participant and study week,
# then treat infinite values as missing.
rev_phq_df = pd.merge(
    rev_phq_df,
    rev_fus_daily,
    how='left',
    on=['pid', 'study_wk'],
).replace([np.inf, -np.inf], np.nan)

## Aggregate features

In [43]:
rev_phq_agg_df = rev_phq_df.copy()

# Standardize every location feature column (subtract column mean, divide by column std).
rev_phq_loc_mean = rev_phq_agg_df[loc_cols].mean()
rev_phq_loc_std = rev_phq_agg_df[loc_cols].std()
rev_phq_agg_df[loc_cols] = rev_phq_agg_df[loc_cols].sub(rev_phq_loc_mean).div(rev_phq_loc_std)

# Composite movement scores: row-wise mean of each standardized feature group.
for agg_name, member_cols in (
    ('movt_distance_agg', distance_cols),
    ('movt_variance_agg', variance_cols),
    ('movt_duration_agg', duration_cols),
):
    rev_phq_agg_df[agg_name] = rev_phq_agg_df[member_cols].mean(axis=1)

## Shift columns

In [44]:
# Previous-row value of every feature, taken within each participant.
rev_phq_prev_features = rev_phq_agg_df.groupby('pid')[agg_cols].shift(periods=1)
rev_phq_agg_df[agg_cols_shift] = rev_phq_prev_features

In [45]:
# Per-feature change: current value minus the participant's previous value.
for feature, feature_prev, feature_change in zip(agg_cols, agg_cols_shift, agg_cols_change):
    rev_phq_agg_df[feature_change] = rev_phq_agg_df[feature].sub(rev_phq_agg_df[feature_prev])

In [46]:
# Summary statistics of the 'phq_diff' column (not computed in the visible
# cells; presumably it arrives with the loaded PHQ table -- TODO confirm).
rev_phq_agg_df['phq_diff'].describe()

count    1360.000000
mean        0.048272
std         3.013952
min       -15.000000
25%        -1.000000
50%         0.000000
75%         1.000000
max        18.000000
Name: phq_diff, dtype: float64

# Dump final aggregate DataFrames

In [47]:
# Persist the SPIN/GAD feature tables: look-back window first, reversed window second.
agg_df.to_pickle("wave1_features/all_gad_spin_gps_corr.df")
rev_agg_df.to_pickle("wave1_features/all_rev_gad_spin_gps_corr.df")

In [48]:
# Persist the PHQ feature tables: look-back window first, reversed window second.
phq_agg_df.to_pickle("wave1_features/all_phq_gps_corr.df")
rev_phq_agg_df.to_pickle("wave1_features/all_rev_phq_gps_corr.df")