# Imports and Utils

In [1]:
# imports and constants
import json
import pickle
import datetime as dt

import gmaps
from linearmodels import PanelOLS, FirstDifferenceOLS
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import multiprocessing
import numpy as np
import pandas as pd
import pingouin as pg
import statsmodels.api as sm
import seaborn as sns

from IPython.display import display, HTML
import ipywidgets as widgets

# user imports
from utils.process_data import *
from utils.stats import *
from utils.lifesense_utils import *


import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300

%matplotlib inline

# Import Data

In [2]:
# Read the wave 1 participant ids: one id per line, surrounding whitespace removed.
with open("data_pull/ids/wave1_ids.txt", "r") as internal_f:
    wave1_ids = [raw_id.strip() for raw_id in internal_f]

# Semantic Location Processing

In [20]:
phq_loc = "/data/tliu/all_ema_data/evening_phq8"

# Load each participant's evening PHQ-8 frame and combine them in one concat.
# Growing a frame with DataFrame.append inside a loop re-copies the accumulated
# rows on every iteration, and the method is not available in pandas >= 2.0.
phq_frames = [pd.read_pickle("{}/{}.df".format(phq_loc, pid)) for pid in wave1_ids]
# sort=True is carried over from the original append(..., sort=True) call.
even_phq_df = pd.concat(phq_frames, sort=True) if phq_frames else pd.DataFrame()

In [21]:
even_ema_loc = "/data/tliu/all_ema_data/evening_ema"

# Load each participant's evening EMA frame and combine them in one concat.
# Growing a frame with DataFrame.append inside a loop re-copies the accumulated
# rows on every iteration, and the method is not available in pandas >= 2.0.
ema_frames = [pd.read_pickle("{}/{}.df".format(even_ema_loc, pid)) for pid in wave1_ids]
# sort=True is carried over from the original append(..., sort=True) call.
even_ema_df = pd.concat(ema_frames, sort=True) if ema_frames else pd.DataFrame()


In [49]:
# Run both EMA frames through format_time, then report their dimensions.
even_phq_df, even_ema_df = format_time(even_phq_df), format_time(even_ema_df)

for formatted_df in (even_phq_df, even_ema_df):
    print(formatted_df.shape)

(1280, 132)
(7268, 139)


In [50]:
# Per-response metadata copied onto every extracted location row.
keep_cols = ['source', 'timestamp', 'timezone-offset', 'data_source', 'date']

def extract_locs(row, loc_attr, max_nbr):
    """Extracts the labelled locations stored in one wide survey row.

    A row holds its locations in numbered columns such as ``place-latitude-0``
    and ``place-kind-0``. Every slot with a non-null latitude becomes one
    output row; slots whose latitude column is absent or null are skipped.

    Args:
        row (pd.Series): one survey response; must contain every column in ``keep_cols``
        loc_attr (list): un-numbered attribute names, e.g. ``'place-kind'``
        max_nbr (int): number of slots to scan (slots ``0 .. max_nbr - 1``)

    Returns:
        DataFrame indexed ``0 .. k-1`` with one row per labelled location, holding
        the ``keep_cols`` metadata plus one column per attribute (NaN where the
        attribute is missing). Columns are sorted by name. Empty if no slot is set.
    """
    records = []
    for i in range(max_nbr):
        lat = "place-latitude-{}".format(i)
        # no latitude column/value for this slot means no location was recorded
        if lat not in row.index or pd.isnull(row[lat]):
            continue
        # populate metadata
        record = {col: row[col] for col in keep_cols}
        # populate location attributes
        for attr in loc_attr:
            attr_name = "{}-{}".format(attr, i)
            if attr_name in row.index and not pd.isnull(row[attr_name]):
                record[attr] = row[attr_name]
            else:
                record[attr] = np.nan
        records.append(record)
    # Build the frame once instead of appending row by row (quadratic, and
    # DataFrame.append is not available in pandas >= 2.0).
    return pd.DataFrame(records).sort_index(axis=1)

In [51]:
# Collect the numbered "place*" columns, then drop the trailing "-<n>" suffix
# so that each location attribute is listed once.
place_mask = even_ema_df.columns.str.startswith('place')
cols = even_ema_df.columns[place_mask].tolist()
attributes = list({col.rpartition('-')[0] for col in cols})

In [52]:
nbrs = [int(col.split('-')[-1]) for col in cols]
max(nbrs)

13

In [53]:
attributes

['place-kind',
 'place-name',
 'place-with-others',
 'place-medical-type',
 'place-longitude',
 'place-other',
 'place-latitude',
 'place-home-other',
 'place-home-type']

In [55]:
%%time

semantic_locs = pd.DataFrame()
for i, row in even_ema_df.iterrows():
    df = extract_locs(row, attributes, 14)
    semantic_locs = semantic_locs.append(df)

CPU times: user 3min 13s, sys: 993 ms, total: 3min 14s
Wall time: 3min 14s


In [59]:
cols = list(even_phq_df.columns[even_phq_df.columns.str.startswith('place')].values)
nbrs = [int(col.split('-')[-1]) for col in cols]
max(nbrs)

12

In [60]:
%%time

semantic_phq_locs = pd.DataFrame()
for i, row in even_phq_df.iterrows():
    df = extract_locs(row, attributes, 13)
    semantic_phq_locs = semantic_phq_locs.append(df)

CPU times: user 26.1 s, sys: 6.94 ms, total: 26.2 s
Wall time: 26.1 s


In [62]:
semantic_locs.shape

(20919, 14)

In [65]:
semantic_locs.head()

Unnamed: 0,data_source,date,place-home-other,place-home-type,place-kind,place-latitude,place-longitude,place-medical-type,place-name,place-other,place-with-others,source,timestamp,timezone-offset
0,wave1,2019-07-31,,,Work,41.69691753636364,-91.61501816363636,,,,,8343773,1564554000.0,-18000.0
1,wave1,2019-07-31,,,Home,41.75054922,-91.613768644,,,,,8343773,1564554000.0,-18000.0
2,wave1,2019-07-31,,,Other…,42.037710972000006,-91.657516312,,Tax Act,This is a tax software company that was conduc...,,8343773,1564554000.0,-18000.0
3,wave1,2019-07-31,,,Food and Drink,41.70016786,-91.60922634,,Culver's,,alone,8343773,1564554000.0,-18000.0
0,wave1,2019-07-31,,,Home,41.750478669662925,-91.61372453370784,,,,,8343773,1564628000.0,-18000.0


In [66]:
# Stack the PHQ-derived locations under the EMA-derived ones, then order by
# participant and date. pd.concat replaces DataFrame.append, which is not
# available in pandas >= 2.0.
# NOTE: this cell is not idempotent - re-running it stacks the PHQ rows again.
semantic_locs = pd.concat([semantic_locs, semantic_phq_locs])
semantic_locs = semantic_locs.sort_values(by=['source', 'date'])

In [119]:
semantic_locs.shape
semantic_locs['place-kind'].value_counts()

Home                                                       8859
Work                                                       5271
Food and Drink                                             2291
Another Person's Home                                      1837
I Was Not There                                            1815
Shopping                                                   1688
Other…                                                     1241
Errand                                                      962
Medical/Dentist/Mental Health                               945
Education                                                   713
Entertainment                                               679
Gym/Exercise                                                673
Commute/Travel (Airport, Bus Stop, Train Station, Etc.)     644
Place of Worship (Church, Temple, Etc.)                     307
Beauty/Grooming                                             141
Childcare                               

In [80]:
# Duplicate 'source' under 'pid', the column name tag_semantic_locs filters on.
# NOTE(review): 'source' appears to hold the participant id - confirm.
semantic_locs['pid'] = semantic_locs['source']

In [81]:
semantic_locs.to_pickle("ls_data/slocs.df")

## Semantic Location labelling

In [110]:
from geopy.distance import distance

def tag_semantic_locs(pid, sloc_df, file_loc, cluster_rad=500):
    """
    Attaches a semantic 'place-kind' label to each raw location reading.

    A reading receives the label of the first labelled location (in sloc_df row
    order) that lies strictly within cluster_rad metres of it; readings with no
    such location are labelled NaN. Only labelled locations dated on or before
    the latest reading date in the participant's file are considered.

    Args:
        pid (str): participant id
        sloc_df (df): the semantic location DataFrame loaded from file
        file_loc (str): directory containing the participant's "<pid>.df" pickle
        cluster_rad (int): the maximum distance, in metres, for a match

    Returns:
        DataFrame with columns pid, date, time, latitude, longitude and
        place-kind, or None when the participant's location file has no rows
    """
    print(pid)
    readings = pd.read_pickle("{}/{}.df".format(file_loc, pid))
    if readings.shape[0] < 1:
        return
    readings = format_time(readings)

    # restrict the labelled locations to this participant and to dates no later
    # than the last reading in this file
    labelled = sloc_df.loc[sloc_df['pid'] == pid]
    labelled = labelled[labelled['date'] <= max(readings['date'])]

    def _first_label_within_radius(reading):
        """Returns the place-kind of the first labelled location inside the radius, else NaN."""
        reading_point = (reading['latitude'], reading['longitude'])
        for _, candidate in labelled.iterrows():
            candidate_point = (candidate['place-latitude'], candidate['place-longitude'])
            if distance(reading_point, candidate_point).m < cluster_rad:
                return candidate['place-kind']
        return np.nan

    readings['place-kind'] = [
        _first_label_within_radius(reading) for _, reading in readings.iterrows()
    ]

    return readings[['pid', 'date', 'time', 'latitude', 'longitude', 'place-kind']]
    
    

In [89]:
max(semantic_locs['date'])

Timestamp('2019-11-19 00:00:00')

In [None]:
pd.read_pickle

In [95]:
%%time
fus_loc = "/data/tliu/wk4_ls_data/pdk-location"
test = tag_semantic_locs(wave1_ids[0], semantic_locs, fus_loc)

08343773
Index(['accuracy', 'altitude', 'bearing', 'encrypted_transmission',
       'generator', 'generator-id', 'latitude', 'location_timestamp',
       'longitude', 'observed', 'provider', 'source', 'speed', 'timestamp',
       'timezone', 'timezone-offset', 'pid', 'data_source', 'adj_ts', 'time',
       'date', 'hour'],
      dtype='object')
CPU times: user 10.4 s, sys: 25.8 ms, total: 10.4 s
Wall time: 10.4 s


In [114]:
def build_sloc(pid, in_loc, out_loc):
    """Tags one participant's raw location readings and pickles the result.

    Reads "<in_loc>/<pid>.df" through tag_semantic_locs (using the notebook-level
    semantic_locs frame) and writes whatever it returns to "<out_loc>/<pid>.df".
    """
    out_path = "{}/{}.df".format(out_loc, pid)
    tagged = tag_semantic_locs(pid, semantic_locs, in_loc)
    pd.to_pickle(tagged, out_path)

In [115]:
wks = [4, 7, 10, 13, 16]
fus_loc = "/data/tliu/wk{}_ls_data/pdk-location"
sem_loc = "/data/tliu/wk{}_ls_data/semantic-location"

# One (pid, input dir, output dir) task per participant for each week in wks.
f_args = []
for wk in wks:
    wk_in, wk_out = fus_loc.format(wk), sem_loc.format(wk)
    f_args.extend((pid, wk_in, wk_out) for pid in wave1_ids)
        

In [116]:
%%time

import multiprocessing

with multiprocessing.Pool(12) as pool:
    results = pool.starmap(build_sloc, f_args)

21594071
01254121
75282136
11770862
08343773
71219000
57473014
91788916
28949890
58081753
01225297
74626135
20206315
03384972
65381988
73518938
19663467
47688944
18583649
05261598
33250639
39106805
56184073
27761141
34262165
00746649
32573840
50550619
90229239
56723660
72685265
93519386
07854544
61131074
89434074
08007167
99050875
31456993
80657933
38646138
53236058
80504454
17328943
27099517
29149362
01495950
14113160
80700486
78681731
69335292
35493515
18156803
83062037
89346491
58780031
02144163
55979795
79819446
09489685
79510141
72038219
59764431
48625414
70483015
03327555
26080346
58740880
59654069
28939704
62860600
65143770
87400142
86283726
64292248
71043609
74575289
17294720
13567195
27762780
42258080
45517860
99338619
14753485
76854891
53435128
22086591
51456954
80206225
95556839
06638392
73916801
79316475
38890840
03233601
69452375
39548248
26957252
28244292
46002724
55542659
14549710
50765631
27330785
56910929
87929316
12807049
86756971
77842251
62375942
52982527
81049144
7

38890840
55463070
62463869
02144163
55979795
29878406
00746649
68756107
89346491
32309079
80504454
44933937
59764431
91788916
20206315
09611865
08007167
76366191
58740880
53236058
74626135
18740846
80700486
35493515
79819446
58081753
14113160
83062037
22498610
77842251
11927637
74739196
99338619
47688944
93519386
79510141
07854544
29149362
23066392
64292248
17294720
39106805
56184073
27762780
09269616
42871706
43292038
11436422
26957252
22086591
48625414
53808826
80206225
73916801
71043609
03233601
13567195
23388083
27099517
89346491
71676393
75348018
45517860
44293762
44667026
28949890
70483015
03327555
61762096
27330785
50765631
14549710
28021601
62375942
52982527
81049144
59764431
09489685
95556839
06638392
79819446
87929316
28244292
64142475
46002724
21894119
90763832
42215399
58740880
45433155
22656406
59222410
86283726
08103884
44655272
52064875
64292248
93606382
32718334
50707558
36969413
67900112
21594071
76432041
48367404
47363974
91048552
35576469
28458341
22086591
55915099
9

## Hourly Semantic Location Processing

In [131]:
# Maps the raw survey "place-kind" labels onto the short names used as column
# prefixes. Keys must match the survey strings exactly: any label that is not
# listed here maps to NaN and is then filed under "other" by build_sloc_hr.
sloc_map = {
    "Food and Drink" : "food",
    "Home" : "home",
    "Work" : "work",
    "Gym/Exercise" : "exercise",
    "Another Person's Home" : "anothers_home",
    "Place of Worship" : "religion",
    # full label as it appears in the place-kind value counts
    "Place of Worship (Church, Temple, Etc.)" : "religion",
    "Commute/Travel" : "travel",
    # full label as it appears in the place-kind value counts
    "Commute/Travel (Airport, Bus Stop, Train Station, Etc.)" : "travel",
    "Shopping" : "shopping",
    "Errand" : "errand",
    "Medical/Dentist/Mental Health" : "health",
    "Education" : "education",
    "Entertainment" : "entertainment",
    "Other..." : "other",
    # the data uses a single ellipsis character (U+2026), not three periods
    "Other\u2026" : "other",
    np.nan : "n/a"
}

In [145]:
def process_transition_hr(time, sloc_group):
    """Summarises one hour of labelled location readings.

    Walks the readings in row order, counting a transition whenever the
    formatted place label changes and crediting the elapsed time to the label
    being left. The final label is credited up to the end of the hour. Time
    before the hour's first reading is not attributed to any label.

    Args:
        time (pd.Timestamp): start of the hour being summarised
        sloc_group (df): the hour's readings, with 'place-kind-fmt' and 'time'
            columns; must be non-empty, and every label must be a value of
            sloc_map. Presumably ordered by time - TODO confirm.

    Returns:
        dict with 'hr', a '<label>_dur' entry (seconds) per label, a
        '<from>_<to>' count per ordered pair of distinct labels, and
        'tot_tansitions' (key spelling kept as-is for existing consumers)
    """
    labels = list(sloc_map.values())
    num_transitions = 0
    transition_dict = {'hr': time}

    for sloc in labels:
        transition_dict[sloc + '_dur'] = 0

    for sloc_i in labels:
        for sloc_j in labels:
            # compare by value: `is not` tests object identity, which is not
            # guaranteed to agree with equality for strings
            if sloc_i != sloc_j:
                transition_dict[sloc_i + '_' + sloc_j] = 0

    first_row = sloc_group.iloc[0]
    cur_loc = first_row['place-kind-fmt']
    cur_time = first_row['time']
    for _, row in sloc_group.iterrows():
        next_loc = row['place-kind-fmt']
        next_time = row['time']
        # value comparison, so equal labels held in distinct string objects are
        # not counted as a transition (which would also hit a missing
        # '<label>_<label>' key)
        if next_loc != cur_loc:
            num_transitions += 1
            transition_dict[cur_loc + '_dur'] += (next_time - cur_time).total_seconds()
            transition_dict[cur_loc + '_' + next_loc] += 1
            cur_loc = next_loc
            cur_time = next_time

    # credit the last label with the time remaining until the end of the hour
    transition_dict[cur_loc + '_dur'] += ((time + pd.Timedelta(1, unit='h')) - cur_time).total_seconds()

    transition_dict['tot_tansitions'] = num_transitions
    return transition_dict


def build_sloc_hr(pid, loc):
    """Builds the hourly semantic-location summary for one participant.

    Loads "<loc>/<pid>.df", maps each reading's 'place-kind' through sloc_map
    (labels without a mapping become 'other'), summarises every hour that has
    readings with process_transition_hr, and resamples to a continuous hourly
    series in which hours without readings sum to zero.

    Args:
        pid (str): participant id
        loc (str): directory holding the tagged semantic-location pickles

    Returns:
        DataFrame with one row per hour, the 'hr' timestamp, the summary columns
        and a 'pid' column; None when the pickle holds None or has no rows
    """
    print(pid)
    sloc_pid = pd.read_pickle("{}/{}.df".format(loc, pid))
    # build_sloc pickles None for participants whose location file was empty
    if sloc_pid is None or sloc_pid.shape[0] < 1:
        return

    sloc_pid['hour'] = sloc_pid['time'].dt.floor('H')
    # labels missing from sloc_map come back as NaN from map() and are filed under 'other'
    sloc_pid['place-kind-fmt'] = sloc_pid['place-kind'].map(sloc_map).fillna('other')

    # Collect one summary dict per hour and build the frame once; appending a
    # one-row frame per hour re-copies the accumulated rows each time and
    # DataFrame.append is not available in pandas >= 2.0.
    hourly_records = [
        process_transition_hr(hr, hr_group) for hr, hr_group in sloc_pid.groupby("hour")
    ]
    sloc_hr = pd.DataFrame(hourly_records)

    sloc_hr = sloc_hr.set_index('hr')
    sloc_hr = sloc_hr.resample('1H').sum()
    sloc_hr = sloc_hr.reset_index()
    sloc_hr['pid'] = pid

    return sloc_hr

In [146]:
%%time

test = build_sloc_hr(wave1_ids[0], sem_loc.format(4))

08343773
CPU times: user 6.57 s, sys: 7.97 ms, total: 6.58 s
Wall time: 6.57 s


In [147]:
sensor_locs = [sem_loc]
wks = [4, 7, 10, 13, 16]

# load the wave 1 participant ids, one per line
with open("data_pull/ids/wave1_ids.txt", "r") as internal_f:
    wave1_ids = [raw_id.strip() for raw_id in internal_f]

    
def process_sensor_data(pids, loc, out_loc, func, n_procs=4):
    """Wrapper function for processing sensor data.

    Calls ``func(pid, loc)`` for every pid in a process pool, stacks the
    returned DataFrames and pickles the combined frame.

    Args:
        pids (list): list of pids to process
        loc (str): the file location
        out_loc (str): the output file name and location
        func (function): the processing function to apply; called as
            ``func(pid, loc)`` and expected to return a DataFrame or None
        n_procs (int): the number of processes to spin up

    Returns:
        None, but writes the combined DataFrame to ``out_loc`` as a pickle
        (an empty DataFrame when no pid produced a result)
    """
    func_args = [(pid, loc) for pid in pids]
    with multiprocessing.Pool(n_procs) as pool:
        results = pool.starmap(func, func_args)

    # func returns None for participants without data; leave those out
    frames = [res for res in results if res is not None]
    # One concat instead of append-per-result: appending re-copies the
    # accumulated frame each time and DataFrame.append is not available in
    # pandas >= 2.0.
    df = pd.concat(frames) if frames else pd.DataFrame()

    df.to_pickle(out_loc)
        

# Output path template for the hourly summary; "{}" is filled with the week number.
sloc_str = "ls_data/wk{}/sloc_hr.df"

def process_all_data(pids, wks, n_procs=4):
    """Runs the hourly semantic-location build for every week in ``wks``.

    For each week, build_sloc_hr is applied to all ``pids`` via
    process_sensor_data, reading from that week's ``sem_loc`` directory and
    writing to that week's ``sloc_str`` path.
    """
    for week in wks:
        week_in = sem_loc.format(week)
        week_out = sloc_str.format(week)
        process_sensor_data(pids, week_in, week_out, build_sloc_hr, n_procs)

In [None]:
%%time

process_all_data(wave1_ids, wks, n_procs=12)

44667026
08343773
28244292
27099517
61762096
21594071
29878406
50550619
84469352
48367404
47363974
28949890
58780031
01254121
46002724
59222410
44655272
18583649
91788916
20206315
10285142
32718334
56912666
09489685
80504454
59654069
51419094
28939704
05261598
70483015
33250639
97678130
62860600
99050875
76854891
86283726
53435128
03327555
27761141
34262165
49001726
47688944
37168430
42258080
31456993
32573840
87485171
77842251
36969413
90496706
16777771
50765631
15565415
19410615
98250113
74575289
81729157
75348018
39106805
56184073
95556839
50730294
55313474
71219000
71676393
98621494
01495950
39548248
52982527
73518938
81049144
69452375
44909649
52581458
44293762


# Notes

- sanity check: whether visits to a medical professional correspond with a change in self-reported medical visits
- look at location activity across different temporal strides

## Implementation Plan

- pull semantic location from cluster labels from current and previous weeks, but not future weeks
    - do we want to do previous weeks?
- TODO: port to the lifesense utils file
- use same clustering algorithm as previously implemented in lifesense_analysis.ipynb notebook
- then run same t+1 analysis, with a held-out set
- validate circadian rhythm 