## Recurrent event: Exploratory Data Analysis

- This notebook dedicated to understanding the recurrent event process
- Start with End-of-Day (EOD) EMA
- Move to self-report and 

In [16]:
# Import packages and set directory
import pandas as pd
import numpy as np
import datetime as datetime
import os
os.getcwd()
dir = "/home/wdempsey/ExpanDrive/box/MD2K Northwestern/Processed Data/smoking-lvm-cleaned-data/"
os.chdir(dir)
os.getcwd()

'/home/wdempsey/ExpanDrive/box/MD2K Northwestern/Processed Data/smoking-lvm-cleaned-data'

In [8]:
# Response windows for end-of-day EMA
keys = ['8to9', '9to10', '10to11', '11to12',
        '12to13','13to14','14to15','15to16',
        '16to17','17to18','18to19','19to20']


In [30]:
# read EOD data and participant entry/exit dates
eod_ema = pd.read_csv(dir + '/final/eod-ema-final.csv')
participant_dates = pd.read_csv(dir + 'participant-dates-v3.csv')

3/18/19


In [43]:
# Construct user-days and record how many cigarettes
# are reported in the EOD EMA
eod_dates = []
for irow in range(0,eod_ema.shape[0]):
    row = eod_ema.iloc[irow]
    quit_iloc = np.where(participant_dates['participant'] == row['participant_id'])
    quit_row = participant_dates.iloc[quit_iloc[0][0]]
    quit_time = datetime.datetime.strptime(quit_row['quit_date'], '%m/%d/%y')
    if row['status'] == "MISSED":
        continue
    try:
        time = datetime.datetime.strptime(row['date'], '%m/%d/%Y %H:%M')
    except:
        time = datetime.datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S')
    #print row['participant_id'], time
    #print quit_row['participant'], quit_time
    temp_diff = time - quit_time
    if time.hour  == 0 or time.hour == 1:
        date = np.array([row['participant_id'], temp_diff.days-1])
        date = np.append(date, np.sum(np.array(row[keys])))
    else:
        date = np.array([row['participant_id'], temp_diff.days])
        date = np.append(date, np.sum(np.array(row[keys])))
    #print date
    eod_dates.append(date)
    
eod_dates = np.asarray(eod_dates)

In [39]:
## Each day get the mean and standard deviation
## in the number of reported cigarettes using end-of-day EMA
for day in range(-3, 11):
    print day, np.round(np.nanmean(eod_dates[eod_dates[:,1] == day,2]),3), np.round(np.nanstd(eod_dates[eod_dates[:,1] == day,2]),3)

 -3 4.409 2.972
-2 4.809 3.226
-1 4.279 2.88
0 1.692 2.04
1 1.237 1.286
2 1.788 2.446
3 1.364 2.556
4 1.212 2.041
5 1.629 2.294
6 1.469 2.136
7 1.839 2.554
8 1.788 1.981
9 1.727 2.689
10 2.0 3.215


In [36]:
eod_survival = []
for id in set(eod_dates[:,0]):
    subset = eod_dates[eod_dates[:,0] == id,:]
    if any(subset[:,1] > 0):
        subset = subset[subset[:,1] > 0,:]
        nonzero_iloc = np.where(subset[:,2] > 0)[0]
        zero_iloc = np.where(subset[:,2] == 0)[0]
        if len(nonzero_iloc) == 0:
            if len(zero_iloc) != 0:
                temp = np.append(subset[np.max(zero_iloc),0:2], 0)
        else:
            temp = np.append(subset[np.max(nonzero_iloc), 0:2], 1)
        eod_survival.append(temp)

eod_survival = np.asarray(eod_survival)

In [47]:
# Read data for self-report
selfreport = pd.read_csv(dir + 'final/self-report-smoking-final.csv')

In [45]:
'''
Construct user-days and record how many cigarettes 
are reported using selfreport
'''

sr_dates = []
for irow in range(0,selfreport.shape[0]):
    row = selfreport.iloc[irow]
    quit_iloc = np.where(participant_dates['participant'] == row['participant_id'])
    quit_row = participant_dates.iloc[quit_iloc[0][0]]
    quit_time = datetime.datetime.strptime(quit_row['quit_date'], '%m/%d/%y')
    try:
        time = datetime.datetime.strptime(row['date'], '%m/%d/%y %H:%M')
    except:
        time = datetime.datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S')
    temp_diff = time - quit_time
    if time.hour  == 0 or time.hour == 1:
        date = np.array([row['participant_id'], temp_diff.days-1])
    else:
        date = np.array([row['participant_id'], temp_diff.days])
    sr_dates.append(date)
    
sr_dates = np.asarray(sr_dates)

In [184]:
'''
For each user, find the first user-day on which 
they self-reported smoking
''' 

sr_survival = []
for id in set(ec_dates[:,0]):
    subset = ec_dates[ec_dates[:,0] == id,:]
    if any(subset[:,1] > 0):
        subset = subset[subset[:,1] > 0,:]
        temp = [id, np.max(subset[:,1])]
        ec_survival.append(temp)

ec_survival = np.asarray(ec_survival)

In [186]:
# Read data for random EMA
random_original_cloud_ids = [201, 203, 206, 210, 221, 226, 229] 
random_ema = pd.read_csv(dir + 'random-ema.csv')
random_ema = random_ema.drop(['offset'], axis = 1)

random_ema_alternative = pd.read_csv(dir + 'random-ema-alternative.csv')
random_ema_backup = pd.read_csv(dir + 'random-ema-backup.csv')

temp_random_original = random_ema[random_ema['participant_id'].isin(random_original_cloud_ids)]
temp_random_alt = random_ema_alternative[~random_ema_alternative['participant_id'].isin(random_original_cloud_ids)]

random_complete = pd.concat([temp_random_original, temp_random_alt, random_ema_backup])

In [223]:
# Use to look 
random_dates = []
for irow in range(0,random_complete.shape[0]):
    row = random_complete.iloc[irow]
    quit_iloc = np.where(participant_dates['participant'] == row['participant_id'])
    quit_row = participant_dates.iloc[quit_iloc[0][0]]
    quit_time = datetime.datetime.strptime(quit_row['quit_date'], '%m/%d/%y')
    if row['status'] == "MISSED":
        continue
    try:
        time = datetime.datetime.strptime(row['date'], '%m/%d/%y %H:%M')
    except:
        time = datetime.datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S')
    #print row['participant_id'], time
    #print quit_row['participant'], quit_time
    temp_diff = time - quit_time
    if time.hour  == 0 or time.hour == 1:
        date = np.array([row['participant_id'], temp_diff.days-1])
        if row['smoke'] == 'Yes':
            date = np.append(date, [1])
            random_dates.append(date)
        if row['smoke'] == 'No':
            date = np.append(date, [0])
            random_dates.append(date)
    else:
        date = np.array([row['participant_id'], temp_diff.days])
        if row['smoke'] == 'Yes':
            date = np.append(date, 1)
            random_dates.append(date)
        if row['smoke'] == 'No':
            date = np.append(date, 0)
            random_dates.append(date)

random_dates = np.asarray(random_dates)

In [227]:
random_survival = []
for id in set(random_dates[:,0]):
    subset = random_dates[random_dates[:,0] == id,:]
    if any(subset[:,1] > 0):
        subset = subset[subset[:,1] > 0,:]
        nonzero_iloc = np.where(subset[:,2] == 1)[0]
        zero_iloc = np.where(subset[:,2] == 0)[0]
        if len(nonzero_iloc) == 0:
            if len(zero_iloc) != 0:
                temp = np.append(subset[np.max(zero_iloc),0:2], 0)
        else:
            temp = np.append(subset[np.max(nonzero_iloc), 0:2], 1)
        random_survival.append(temp)

random_survival = np.asarray(random_survival)

random_survival

array([[202,   7,   1],
       [203,  13,   0],
       [205,   8,   1],
       [206,   9,   0],
       [207,   8,   0],
       [208,   3,   1],
       [211,  11,   1],
       [212,   6,   1],
       [213,   1,   0],
       [214,   4,   1],
       [215,  11,   0],
       [216,  10,   1],
       [217,   8,   1],
       [218,   1,   1],
       [219,  10,   1],
       [222,   1,   1],
       [223,  11,   0],
       [224,   9,   1],
       [225,  10,   1],
       [227,  10,   1],
       [228,   8,   1],
       [229,   3,   1],
       [230,  11,   0],
       [231,   2,   1],
       [233,   8,   1],
       [234,   1,   0],
       [235,   4,   0]])

In [300]:
all_ema_survival = []

for id in set(random_survival[:,0]) | set(ec_survival[:,0]) | set(eod_survival[:,0]):
    temp = np.array([id])
    random_subset = random_survival[random_survival[:,0] == id,:]
    if not len(random_subset) == 0:
        temp = np.append(temp, random_subset[0][1:3])
    else:
        temp = np.append(temp, [-1,-1])
    ec_subset = ec_survival[ec_survival[:,0] == id,:]
    if not len(ec_subset) == 0:
        ec_day = ec_subset[0][1]
        if temp[1] > ec_day or temp[1] == -1:
            temp[1:3] = [ec_day,1]
    eod_subset = eod_survival[eod_survival[:,0] == id, :]
    if not len(eod_subset) == 0:
        if eod_subset[0][1] < temp[1] and eod_subset[0][2] == 1:
            temp[1:3] = eod_subset[0][1:3]
        elif eod_subset[0][1] > temp[1] and eod_subset[0][2] == 0 and temp[2] == 0:
            temp[1:3] = eod_subset[0][1:3]
    all_ema_survival.append(temp)
    
all_ema_survival = np.asarray(all_ema_survival)

In [353]:
km_list = [1]
var_list = [0]
for day in range(1,13):
    n_day = np.sum(all_ema_survival[:,1] >= day, dtype='f')
    subset = all_ema_survival[all_ema_survival[:,1] == day,:]
    subset = subset[subset[:,2] == 1]
    d_day = subset.shape[0]
    km_list.append(1-d_day/n_day)
    var_list.append(d_day/(n_day * (n_day - d_day)))

km_est = np.cumprod(km_list)

var_est = np.multiply(np.power(km_est,2),np.cumsum(var_list))

km_stderr = np.sqrt(var_est)

upper_km_est = km_est + 1.96*km_stderr
lower_km_est = km_est - 1.96*km_stderr

print upper_km_est
print km_est
print lower_km_est




[1.         0.98675767 0.90651021 0.84278664 0.84278664 0.80746358
 0.73190752 0.69185671 0.60736767 0.41944874 0.22607129 0.22607129
 0.22607129]
[1.         0.85714286 0.74025974 0.66233766 0.66233766 0.62094156
 0.53814935 0.49675325 0.41396104 0.24837662 0.09935065 0.09935065
 0.09935065]
[ 1.          0.72752804  0.57400927  0.48188869  0.48188869  0.43441954
  0.34439118  0.30164979  0.2205544   0.07730451 -0.02736999 -0.02736999
 -0.02736999]
