# Imports and Functions

In [1]:
# imports and constants
import json
import pickle

import gmaps
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import multiprocessing
import numpy as np
import pandas as pd
import pingouin as pg
import seaborn as sns

from IPython.display import display, HTML
import ipywidgets as widgets

# user imports
from utils.process_data import *
from utils.stats import *
from utils.lifesense_utils import *

%matplotlib inline

# Extract REDCap Surveys

In [119]:
wk_redcap_df = pd.read_excel("data_pull/LS_Wave1_REDCap_wk1_4_7_10_110719.xlsx",
                            sheet_name=0)
baseline_redcap_df = pd.read_excel("data_pull/LS_Wave1_SC_BL_WK1_Data_081419-revised.xlsx",
                            sheet_name=0)

mapping_df = pd.read_csv("data_pull/Wave1LifeSenseEnroll_DATA_LABELS_2019-08-15_0929.csv")

baseline_redcap_dict = dict(zip(mapping_df['Case#:'], mapping_df['LifeSense Study App ID:    ']))
wk_redcap_dict = dict(zip(mapping_df['ID:'], mapping_df['LifeSense Study App ID:    ']))

In [120]:
with open("data_pull/ids/wave1_ids.txt", "r") as internal_f:
    wave1_ids = [line.strip() for  line in internal_f.readlines()]

In [121]:
wk_redcap_df['pid'] = wk_redcap_df['record_id'].map(wk_redcap_dict)
baseline_redcap_df['pid'] = baseline_redcap_df['study_id'].map(baseline_redcap_dict)

def pad_pid(row):
    if len(row['pid']) < 8:
        return ('0' * (8-len(row['pid']))) + row['pid']
    else:
        return row['pid']

wk_redcap_df['pid'] = wk_redcap_df['pid'].astype(str)
wk_redcap_df['pid'] = wk_redcap_df.apply(pad_pid, axis=1)

baseline_redcap_df['pid'] = baseline_redcap_df['pid'].astype(str)
baseline_redcap_df['pid'] = baseline_redcap_df.apply(pad_pid, axis=1)

## Build State DataFrame

In [130]:
base_rc_cols = baseline_redcap_df.columns[baseline_redcap_df.columns.str.startswith('shaps') |
                                           baseline_redcap_df.columns.str.startswith('phq') | 
                                           baseline_redcap_df.columns.str.startswith('gad') |
                                           baseline_redcap_df.columns.str.startswith('spin') |
                                           baseline_redcap_df.columns.str.startswith('psqi')]

wk_rc_cols = wk_redcap_df.columns[wk_redcap_df.columns.str.startswith('shaps') |
                                           wk_redcap_df.columns.str.startswith('phq') | 
                                           wk_redcap_df.columns.str.startswith('gad') |
                                           wk_redcap_df.columns.str.startswith('spin') |
                                           wk_redcap_df.columns.str.startswith('psqi')]

In [131]:
state_df = wk_redcap_df[['pid', 'redcap_event_name'] + list(wk_rc_cols)]
# no surveys conducted in wk1
state_df = state_df[~(state_df['redcap_event_name'] == 'week_1_arm_1')]

In [132]:
start_cols = list(state_df.columns[state_df.columns.str.endswith('start')])

for col in start_cols:
    baseline_redcap_df[col] = baseline_redcap_df['dt_screen']

# populate baseline columns to match the week columns
baseline_redcap_df['redcap_event_name'] = 'week_0_arm_1'
baseline_redcap_df['psqi_05j_1'] = baseline_redcap_df['psqi_other']
baseline_redcap_df['psqi_05j_2'] = baseline_redcap_df['psqi_05j']

In [133]:
state_df = state_df.append(baseline_redcap_df[state_df.columns])

In [134]:
state_df = state_df.sort_values(by=['pid', 'gad_start'])
print(state_df.shape)
# drop rows with no survey responses
print(state_df.dropna(subset=wk_rc_cols, how='all').shape)
state_df = state_df.dropna(subset=wk_rc_cols, how='all')
state_df.head(15)

(2628, 65)
(1066, 65)


Unnamed: 0,pid,redcap_event_name,gad_start,gad01,gad02,gad03,gad04,gad05,gad06,gad07,...,spin_08,spin_09,spin_10,spin_11,spin_12,spin_13,spin_14,spin_15,spin_16,spin_17
39,746649,week_0_arm_1,2019-07-22,1.0,1.0,1.0,1.0,2.0,2.0,2.0,...,4.0,4.0,3.0,4.0,4.0,2.0,4.0,4.0,4.0,4.0
405,746649,week_4_arm_1,2019-08-23 19:04:16,3.0,3.0,3.0,3.0,3.0,3.0,2.0,...,4.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,2.0,4.0
406,746649,week_7_arm_1,2019-09-14 15:19:45,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,3.0
407,746649,week_10_arm_1,2019-10-04 16:25:27,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
93,1225297,week_0_arm_1,2019-07-24,3.0,3.0,3.0,2.0,1.0,1.0,2.0,...,4.0,4.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0
1005,1225297,week_4_arm_1,2019-08-26 14:53:14,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,3.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0
1006,1225297,week_7_arm_1,2019-09-16 15:52:35,2.0,2.0,2.0,2.0,1.0,1.0,2.0,...,3.0,3.0,3.0,3.0,3.0,1.0,3.0,3.0,2.0,3.0
1007,1225297,week_10_arm_1,2019-10-05 18:47:20,2.0,2.0,2.0,2.0,0.0,1.0,2.0,...,3.0,4.0,3.0,4.0,3.0,1.0,3.0,3.0,2.0,2.0
207,1254121,week_0_arm_1,2019-07-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2301,1254121,week_4_arm_1,2019-08-23 11:08:33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [135]:
state_df.to_pickle("ls_data/state_0_10.df")

### Process PHQ 

In [156]:
phq_cols = list(baseline_redcap_df.columns[baseline_redcap_df.columns.str.startswith('phq')])
phq_df = baseline_redcap_df[['pid'] + phq_cols]
phq_df['phq_start'] = baseline_redcap_df['dt_screen']
phq_df['redcap_event_name'] = 'week_0_arm_1'
phq_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,pid,phq01_sc,phq02_sc,phq03_sc,phq04_sc,phq05_sc,phq06_sc,phq07_sc,phq08_sc,phq_total_sc,phq10_sc,phq_start,redcap_event_name
0,91048552,0,0,2,1,0,0,0,0,3,1.0,2019-07-22,week_0_arm_1
1,31456993,3,3,1,2,2,1,2,1,15,1.0,2019-07-22,week_0_arm_1
2,51735262,0,1,1,1,0,1,0,0,4,0.0,2019-07-22,week_0_arm_1
3,69452375,1,1,0,1,0,3,1,0,7,2.0,2019-07-22,week_0_arm_1
4,28021601,0,0,0,0,0,1,0,0,1,1.0,2019-07-22,week_0_arm_1


In [157]:
# process PHQ values populated via EMA

even_phq8_df = pd.DataFrame()

for pid in wave1_ids:
    even_df = pd.read_pickle("/data/tliu/all_ema_data/evening_phq8/{}.df".format(pid))
    even_phq8_df = even_phq8_df.append(even_df, sort=True)

In [158]:
morn_phq8_df = pd.DataFrame()

for pid in wave1_ids:
    morn_df = pd.read_pickle("/data/tliu/all_ema_data/morning_phq8/{}.df".format(pid))
    morn_phq8_df = morn_phq8_df.append(morn_df, sort=True)

In [159]:
even_phq8_df = format_time(even_phq8_df)
morn_phq8_df = format_time(morn_phq8_df)

In [160]:
def get_redcap_event_name(row):
    """populates redcap event name based on date."""

    if (row['date'] > pd.to_datetime('2019-07-23')) and (row['date'] < pd.to_datetime('2019-08-12')):
        return 'week_1_arm_1'
    
    if (row['date'] > pd.to_datetime('2019-08-13')) and (row['date'] < pd.to_datetime('2019-09-02')):
        return 'week_4_arm_1'
    
    if (row['date'] > pd.to_datetime('2019-09-03')) and (row['date'] < pd.to_datetime('2019-09-23')):
        return 'week_7_arm_1'
    
    if (row['date'] > pd.to_datetime('2019-09-24')) and (row['date'] < pd.to_datetime('2019-10-14')):
        return 'week_10_arm_1'
    
    if (row['date'] > pd.to_datetime('2019-10-15')) and (row['date'] < pd.to_datetime('2019-11-04')):
        return 'week_13_arm_1'
    
    if (row['date'] > pd.to_datetime('2019-11-05')) and (row['date'] < pd.to_datetime('2019-11-25')):
        return 'week_16_arm_1'
    

even_phq8_df['redcap_event_name'] = even_phq8_df.apply(get_redcap_event_name, axis=1)
morn_phq8_df['redcap_event_name'] = morn_phq8_df.apply(get_redcap_event_name, axis=1)

In [161]:
phq_dict = {
    'pleasure': 'phq01_sc',
    'depression': 'phq02_sc',
    'sleep': 'phq03_sc',
    'energy': 'phq04_sc',
    'appetite': 'phq05_sc',
    'feeling-bad': 'phq06_sc',
    'concentration': 'phq07_sc',
    'movement': 'phq08_sc',
    'difficulty': 'phq10_sc',
    'time': 'phq_start'
}

for k, v in phq_dict.items():
    even_phq8_df[v] = even_phq8_df[k]
    morn_phq8_df[v] = morn_phq8_df[k]
    
even_phq8_df['phq_total_sc'] = even_phq8_df[even_phq8_df.columns[even_phq8_df.columns.str.startswith("phq0")]].astype(int).sum(axis=1)
morn_phq8_df['phq_total_sc'] = morn_phq8_df[morn_phq8_df.columns[morn_phq8_df.columns.str.startswith("phq0")]].astype(int).sum(axis=1)

In [164]:
phq_df = phq_df.append(morn_phq8_df[phq_df.columns])
phq_df = phq_df.append(even_phq8_df[phq_df.columns])
phq_df = phq_df.sort_values(by=["pid", "phq_start"])
phq_df.head(20)

Unnamed: 0,pid,phq01_sc,phq02_sc,phq03_sc,phq04_sc,phq05_sc,phq06_sc,phq07_sc,phq08_sc,phq_total_sc,phq10_sc,phq_start,redcap_event_name
39,746649,2,1,3,1,1,1,1,2,12,3,2019-07-22,week_0_arm_1
0,746649,2,2,2,3,2,1,1,0,13,1,2019-07-30 12:31:50,week_1_arm_1
0,746649,2,2,2,3,2,1,1,0,13,1,2019-07-30 12:31:50,week_1_arm_1
0,746649,1,2,1,2,0,1,0,0,7,2,2019-08-05 23:32:00,week_1_arm_1
0,746649,1,2,1,2,0,1,0,0,7,2,2019-08-05 23:32:00,week_1_arm_1
1,746649,1,2,2,2,0,1,1,0,9,2,2019-08-26 19:04:10,week_4_arm_1
1,746649,1,2,2,2,0,1,1,0,9,2,2019-08-26 19:04:10,week_4_arm_1
1,746649,1,1,1,2,2,3,2,0,12,2,2019-09-10 14:28:31,week_7_arm_1
1,746649,1,1,1,2,2,3,2,0,12,2,2019-09-10 14:28:31,week_7_arm_1
2,746649,1,1,2,1,0,1,1,0,7,2,2019-09-16 19:08:22,week_7_arm_1


In [167]:
phq_df = phq_df.drop_duplicates()

In [168]:
phq_df['redcap_event_name'].value_counts()

week_1_arm_1     477
week_10_arm_1    470
week_13_arm_1    435
week_7_arm_1     411
week_16_arm_1    379
week_4_arm_1     306
week_0_arm_1     282
Name: redcap_event_name, dtype: int64

In [169]:
phq_df.to_pickle("ls_data/phq_0_16.df")

# Extract Wk 4 Features

## Utilities

In [6]:
fus_loc = "/data/tliu/wk{}_ls_data/pdk-location"
fga_loc = "/data/tliu/wk{}_ls_data/pdk-foreground-application"
sms_loc = "/data/tliu/wk{}_ls_data/pdk-text-messages"
cal_loc = "/data/tliu/wk{}_ls_data/pdk-phone-calls"

sensor_locs = [fus_loc, fga_loc, sms_loc, cal_loc]
wks = [7,10]

# load data
with open("data_pull/ids/wave1_ids.txt", "r") as internal_f:
    wave1_ids = [line.strip() for  line in internal_f.readlines()]

    
def process_sensor_data(pids, loc, out_loc, func, n_procs=4):
    """Wrapper function for processing sensor data.
    
    Args:
        pids (list): list of pids to process
        #wk (int): the week of data to process
        loc (str): the file location
        out_loc (str): the output file name and location
        func (function): the processing function to apply
        n_procs (int): the number of processes to spin up
    
    Returns:
        None, but writes to 
    """
    #loc = loc.format(wk)
    func_args = [(pid, loc) for pid in pids]
    with multiprocessing.Pool(n_procs) as pool:
        results = pool.starmap(func, func_args)
        
    df = pd.DataFrame()

    for res in results:
        df = df.append(res)
    
    df.to_pickle(out_loc)
        

fus_str = "ls_data/wk{}/fus_daily.df"
circ_str = "ls_data/wk{}/circ_movt.df"
fga_str = "ls_data/wk{}/fga_hr.df"
cal_str = "ls_data/wk{}/cal_hr.df"
sms_str = "ls_data/wk{}/sms_hr.df"

def process_all_data(pids, wks, n_procs=4):    
    for wk in wks:
        #process_sensor_data(pids, fus_loc.format(wk), fus_str.format(wk), build_fus, n_procs)
        process_sensor_data(pids, fus_loc.format(wk), circ_str.format(wk), build_circadian_stats, n_procs)
        process_sensor_data(pids, fga_loc.format(wk), fga_str.format(wk), build_fga_hr, n_procs)
        process_sensor_data(pids, cal_loc.format(wk), cal_str.format(wk), build_cal_hr, n_procs)
        process_sensor_data(pids, sms_loc.format(wk), sms_str.format(wk), build_sms_hr, n_procs)

In [7]:
%%time

process_all_data(wave1_ids, [10])

08343773
50550619
44667026
28949890
27099517
70483015
03327555
28939704
09489685
18583649
62860600
74575289
95556839
86283726
99050875
06638392
69452375
31456993
42258080
28244292
46002724
12807049
01495950
59222410
39548248
44655272
84469352
44909649
32718334
69335292
51419094
36969413
01254121
58780031
21594071
59654069
90496706
80504454
05261598
33250639
76854891
53435128
98621494
50765631
27761141
34262165
67900112
77842251
52982527
81049144
32573840
87485171
31477083
75348018
45433155
50730294
29878406
61762096
48367404
47363974
10285142
56596866
91788916
20206315
16777771
15565415
56912666
47688944
74371880
97678130
71676393
39106805
56184073
76432041
44293762
49001726
71219000
93627939
37168430
66873010
73518938
14753485
19410615
39725031
81729157
89434074
38646138
51456954
20706360
90638927
65143770
85752121
79316475
78911129
75696701
38890840
13051775
55542659
24936642
74739196
56910929
75437581
81968737
79439002
53808826
98250113
62463869
02970060
21894119
55313474
32309079
4

In [3]:
%%time

import multiprocessing
fus_loc = "/data/tliu/wk10_ls_data/pdk-location"
fus_args = [(pid, fus_loc) for pid in wave1_ids]

with multiprocessing.Pool(8) as pool:
    
    fus_results = pool.starmap(build_fus, fus_args)

44667026
52982527
44655272
(1838, 18)
98621494
(3449, 18)
53435128
81049144
(3756, 18)
(4449, 18)
27099517
(4608, 18)
28949890
(4038, 18)
08343773
50550619
(4936, 18)
(4379, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


32718334
(2786, 18)
36969413
(1668, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


18583649
(4563, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


70483015
21594071
(3793, 18)
45433155
(4197, 18)
03327555
(4783, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


09489685
(4118, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


48367404


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


28939704
(1553, 18)
47363974
(4426, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


05261598
(25, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


86283726
(3812, 18)
33250639
(4767, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


99050875
(1076, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


77842251
(4392, 18)
31456993
(4675, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


62860600
(1016, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


27761141
56912666
(4322, 18)
34262165
(2882, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


74575289
(4266, 18)
67900112
(2291, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


42258080
(5719, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


32573840
(21, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


87485171
(4451, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


97678130
(3005, 18)
31477083
(4166, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


69452375
(3409, 18)
01495950
(4113, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


29878406
(4905, 18)
75348018
(4431, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


39548248
(1676, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


44909649
(4381, 18)
12807049
(4743, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


49001726
(2909, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


95556839
(2283, 18)
69335292
(3590, 18)
37168430
(2336, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


06638392
(1775, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


91788916
20206315
(3016, 18)
85752121
(3855, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


28244292
(940, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


46002724


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


(3434, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


84469352
(3754, 18)
13051775
(4448, 18)
47688944
(4808, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


39106805
58780031
(1665, 18)
56184073
(4645, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


01254121
(4432, 18)
59654069
(4169, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


59222410
(1160, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


61762096
(2418, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


50730294
(4768, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


71219000
(3502, 18)
51419094
(4117, 18)
52064875
(4265, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


73518938
(2505, 18)
79439002
(3130, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


56596866
(3952, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


76854891
66873010
(2092, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


89434074
(967, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


74371880
(4737, 18)
38646138
(3773, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


70035688
(3118, 18)
90496706
(4298, 18)
39725031
(4582, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


31574721
(2004, 18)
65143770
(4716, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


80504454
(4511, 18)
98250113
(4195, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


81720300
(2713, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


02817507
(2071, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


90638927
(3994, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


76432041
(3733, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


58093242
(3500, 18)
50765631
(2915, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


13250317
(4446, 18)
10285142
(1840, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


55313474
(4465, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


46484562
(3324, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


78911129
(1082, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


16777771
(21, 18)
14753485
(4510, 18)
38890840
(4535, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


15565415
(4841, 18)
75696701
(2942, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


75282136
(4066, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


67597747
(2141, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


30501084
(2282, 18)
24936642
(3771, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


19674187
(4352, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


81968737
(4515, 18)
74739196
(4113, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


83085276
(4326, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


93627939
(2657, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


51456954


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


(4434, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


71676393
(2240, 18)
19410615
(602, 17)
53808826
(4259, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


81729157
(4520, 18)
23066392
(4674, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


56723660
(4190, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


79316475
(4227, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


20706360
(3445, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


93606382
(2891, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


44293762
(1002, 18)
55542659
(2817, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


81558830
(4153, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


56910929
(609, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


02970060
(4230, 18)
52581458
(4328, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


75437581
(4316, 18)
21894119
(3458, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


90229239
(1113, 18)
84902402
(4503, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


38588231
(467, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


72685265
(4813, 18)
62463869
(4529, 18)
53097921
(2913, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


42871706
(1, 15)
43292038
11436422
(4090, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


28540480
(3934, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


45761494
40932643
(410, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


07974290
(710, 18)
23388083
(3893, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


81249330
(707, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


68744652
(2469, 18)
99127649
(991, 18)
41606321
(4275, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


80657933
54461187
(3835, 18)
(2580, 18)
32309079
(4726, 18)
43589028
(4049, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


17328943
(2408, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


18156803
(2742, 18)
22498610
(2439, 18)
57473014
(4279, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


74589634
(4393, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


02144163
(628, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


78352234
(4785, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


55979795
(3602, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


11770862
(469, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


71189891
(3680, 18)
03384972
(2918, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


54841471
(3573, 18)
39561926
(4114, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


78681731
(4159, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


61131074
(4605, 18)
73960495
(3979, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


11927637
(2500, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


03939827
(462, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


27762780
(4481, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


90763832
(4783, 18)
19663467
(2991, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


09269616
(3527, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


72038219
(2649, 18)
71745031
53874087
(3519, 18)
36795256
(4482, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


08007167
(4573, 18)
26080346
(3346, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


87400142
(3978, 18)
73326278
(3440, 18)
80206225
(5592, 18)
29384065
(3186, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


54004910
(4098, 18)
55915099
(3308, 18)
46175798
(3720, 18)
53236058
(2011, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


80700486
(3752, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


65149091
(2030, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


13567195
(4075, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


70027963
50939076
(527, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


65741560
(3877, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


62599280
(826, 18)
12616311
(4121, 18)
45517860
(3689, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


83963249
(735, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


43093019
(3410, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


35493515
(3102, 18)
03578019
(3184, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


06400675
(4622, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


83275234
(2424, 18)
05565365
(4106, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


14549710
(3721, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


83062037
(4681, 18)
89346491
(4247, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


87929316
(830, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


42215399
(3054, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


79510141
(3777, 18)
66507502
(4528, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


27330785
(3763, 18)
08103884
(4274, 18)
55463070
(3557, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


59764431
(3908, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


44933937
(1501, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


17294720
(3635, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


35576469
(3271, 18)
28021601
(1937, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


76562623
(2084, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


76366191
(3975, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


64142475
(1701, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


10099555
(4397, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


58740880
(4258, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


74626135
(2816, 18)
28458341
(4483, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


22656406
(4347, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


90587846
(3805, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


50707558
(4035, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


90934495
(4029, 18)
84877086
(3958, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


07854544
74805749
(4084, 18)
29149362
(4141, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


91048552
(1863, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


99338619
(4494, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


51735262
(2225, 18)
48315222
(2558, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


78327476
(4405, 18)
01225297
(1759, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


46222210
(4208, 18)
67615491
(4371, 18)
65381988
(4681, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


51612397
(862, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


38459884
(3926, 18)
73142171
(4818, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


39854689
(2762, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


58081753
(2311, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


00746649
(3997, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


57973631
(2727, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


93519386
(4555, 18)
26957252
(4367, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


22352222
(4006, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


77579838
(2496, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


14196469
79819446
(3555, 18)
89057862
(5019, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


32888746
(1257, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


21150752
(4177, 18)
04918121
(4329, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


86756971
(2629, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


14113160
(4055, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


01766910
(4491, 18)
65696941
08007329
(4640, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


94277599
(3461, 18)
64292248
(2627, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


74133461
(4254, 18)
47505792
(4734, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


83056303
(3284, 18)
62808613
(4160, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


50931782
(2482, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


48625414
(4169, 18)
22086591
(4459, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


71043609
(2622, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


82727218
(5383, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


73916801
(554, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


03233601
(4638, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


04133537
(4385, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


68756107
(3495, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


62375942
(5038, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


09611865
(3617, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


29584096


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)
  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


18740846
(2332, 18)


  label_group['entropy'] = -(np.log(label_group) * label_group).sum(axis=1)


CPU times: user 786 ms, sys: 252 ms, total: 1.04 s
Wall time: 4min 11s


In [5]:
fus_df = pd.DataFrame()

for df in fus_results:
    fus_df = fus_df.append(df)
fus_df.head()
#fus_df.to_pickle("ls_data/wk10/fus_daily.df")

In [54]:
fus_loc = "/data/tliu/wk10_ls_data/pdk-location"
test = build_fus("29584096", fus_loc)

29584096


In [56]:
test = pd.read_pickle("ls_data/wk7/sms_hr.df")
test.shape

(112587, 8)

In [58]:
test.append(None).shape

(112587, 8)

## Fused Location

In [None]:
%%time

import multiprocessing
fus_loc = "/data/tliu/wk4_ls_data/pdk-location"
fus_args = [(pid, fus_loc) for pid in wave1_ids]

with multiprocessing.Pool(8) as pool:
    
    fus_results = pool.starmap(build_fus, fus_args)

In [None]:
fus_df = pd.DataFrame()

for df in fus_results:
    fus_df = fus_df.append(df)
fus_df.head()

In [None]:
# Should only be run once
#fus_df.to_pickle("ls_data/wk4/fus_daily.df")

In [None]:
%%time

import multiprocessing
fus_loc = "/data/tliu/wk4_ls_data/pdk-location"
fus_args = [(pid, fus_loc) for pid in wave1_ids]

with multiprocessing.Pool(12) as pool:
    
    circ_results = pool.starmap(build_circadian_stats, fus_args)

In [None]:
circ_df = pd.DataFrame()

for df in circ_results:
    circ_df = circ_df.append(df)
    
circ_df.head()

In [None]:
# only needs to be run once
#circ_df.to_pickle("ls_data/wk4/circ_movt.df")

## Foreground application

In [None]:
%%time

import multiprocessing

fga_loc = "/data/tliu/wk4_ls_data/pdk-foreground-application"
fga_args = [(pid, fga_loc) for pid in wave1_ids]

with multiprocessing.Pool(12) as pool:
    
    fga_results = pool.starmap(build_fga_hr, fga_args)

In [None]:
fga_df = pd.DataFrame()

for df in fga_results:
    fga_df = fga_df.append(df)
    
fga_df.head()

In [None]:
# only needs to be run once
#fga_df.to_pickle("ls_data/wk4/fga_hr.df")

## Calls

In [None]:
%%time

import multiprocessing

cal_loc = "/data/tliu/wk4_ls_data/pdk-phone-calls"
cal_args = [(pid, cal_loc) for pid in wave1_ids]

with multiprocessing.Pool(8) as pool:
    
    cal_results = pool.starmap(build_cal_hr, cal_args)

In [None]:
cal_df = pd.DataFrame()

for df in cal_results:
    cal_df = cal_df.append(df)
    
cal_df.head()

In [None]:
# only needs to be run once
#cal_df.to_pickle("ls_data/wk4/cal_hr.df")

## Texts

In [None]:
%%time

import multiprocessing

sms_loc = "/data/tliu/wk4_ls_data/pdk-text-messages"
sms_args = [(pid, sms_loc) for pid in wave1_ids]

with multiprocessing.Pool(8) as pool:
    
    sms_results = pool.starmap(build_sms_hr, sms_args)

In [None]:
sms_df = pd.DataFrame()

for df in sms_results:
    sms_df = sms_df.append(df)
    
sms_df.head()

In [None]:
# only needs to be run once
#sms_df.to_pickle("ls_data/wk4/sms_hr.df")