## Recurrent event: Exploratory Data Analysis

- This notebook dedicated to understanding the recurrent event process
- End-of-Day (EOD) EMA: compute first day of EOD
- Self-report: compute first day of self-reported smoking since quit date and 

In [70]:
# Import packages and set directory
import pandas as pd
import numpy as np
import datetime as datetime
import os
print(os.getcwd())
dir = "../final-data"

C:\Users\wdem\Documents\GitHub\sense2stop-lvm\final-data


'C:\\Users\\wdem\\Documents\\GitHub\\sense2stop-lvm\\final-data'

In [41]:
# Response windows for end-of-day EMA
keys = ['8to9', '9to10', '10to11', '11to12',
        '12to13','13to14','14to15','15to16',
        '16to17','17to18','18to19','19to20']


In [42]:
# read EOD data and participant entry/exit dates
eod_ema = pd.read_csv(os.path.join(os.path.realpath(dir), 'eod-ema-final.csv'))
participant_dates = pd.read_csv(os.path.join(os.path.realpath(dir),'participant-dates-v3.csv'))

In [43]:
# Construct user-days and record how many cigarettes
# are reported in the EOD EMA
eod_dates = []
for irow in range(0,eod_ema.shape[0]):
    row = eod_ema.iloc[irow]
    quit_iloc = np.where(participant_dates['participant'] == row['participant_id'])
    quit_row = participant_dates.iloc[quit_iloc[0][0]]
    quit_time = datetime.datetime.strptime(quit_row['quit_date'], '%m/%d/%y')
    if row['status'] == "MISSED":
        continue
    try:
        time = datetime.datetime.strptime(row['date'], '%m/%d/%Y %H:%M')
    except:
        time = datetime.datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S')
    #print row['participant_id'], time
    #print quit_row['participant'], quit_time
    temp_diff = time - quit_time
    if time.hour  == 0 or time.hour == 1:
        date = np.array([row['participant_id'], temp_diff.days-1])
        date = np.append(date, np.sum(np.array(row[keys])))
    else:
        date = np.array([row['participant_id'], temp_diff.days])
        date = np.append(date, np.sum(np.array(row[keys])))
    #print date
    eod_dates.append(date)
    
eod_dates = np.asarray(eod_dates)
eod_dates

array([[201.,  -3.,   2.],
       [201.,  -2.,   9.],
       [203.,   6.,   0.],
       ...,
       [269.,   8.,   0.],
       [269.,   9.,   0.],
       [269.,  10.,   0.]])

In [44]:
## Each day get the mean and standard deviation
## in the number of reported cigarettes using end-of-day EMA
for day in range(-3, 11):
    print(day, np.round(np.nanmean(eod_dates[eod_dates[:,1] == day,2]),3), np.round(np.nanstd(eod_dates[eod_dates[:,1] == day,2]),3))

-3 4.444 2.948
-2 4.812 3.193
-1 4.227 2.867
0 1.7 2.015
1 1.237 1.286
2 1.735 2.429
3 1.324 2.529
4 1.176 2.022
5 1.629 2.294
6 1.424 2.118
7 1.839 2.554
8 1.735 1.975
9 1.676 2.665
10 2.0 3.215


In [65]:
eod_survival = []
for id in np.unique(eod_dates[:,0]):
    subset = eod_dates[eod_dates[:,0] == id,:]
    if any(subset[:,1] > 0):
        subset = subset[subset[:,1] > 0,:]
        nonzero_iloc = np.where(subset[:,2] > 0)[0]
        zero_iloc = np.where(subset[:,2] == 0)[0]
        if len(nonzero_iloc) == 0:
            if len(zero_iloc) != 0:
                temp = np.append(subset[np.max(zero_iloc),0:2], 0)
        else:
            temp = np.append(subset[np.max(nonzero_iloc), 0:2], 1)
        eod_survival.append(temp)

eod_survival = np.asarray(eod_survival)

  


In [71]:
# Read data for self-report
selfreport = pd.read_csv(os.path.join(os.path.realpath(dir), 'self-report-smoking-final.csv'))

In [72]:
'''
Extract all smoking events and label with correct user-day
'''

sr_dates = []
for irow in range(0,selfreport.shape[0]):
    row = selfreport.iloc[irow]
    quit_iloc = np.where(participant_dates['participant'] == row['participant_id'])
    quit_row = participant_dates.iloc[quit_iloc[0][0]]
    quit_time = datetime.datetime.strptime(quit_row['quit_date'], '%m/%d/%y')
    try:
        time = datetime.datetime.strptime(row['date'], '%m/%d/%y %H:%M')
    except:
        time = datetime.datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S')
    temp_diff = time - quit_time
    if time.hour  == 0 or time.hour == 1:
        date = np.array([row['participant_id'], temp_diff.days-1])
    else:
        date = np.array([row['participant_id'], temp_diff.days])
    sr_dates.append(date)
    
sr_dates = np.asarray(sr_dates)

array([-4, -3, -2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
      dtype=int64)

In [73]:
'''
Aggregate smoking events for each user-day
'''

sr_pd = pd.DataFrame(sr_dates)
sr_pd.columns = ['user', 'day']
sr_userdays = sr_pd.groupby(['user','day']).size()

sr_userdays.groupby('day').mean()

day
-4     2.500000
-3     4.380952
-2     7.210526
-1     6.056604
 0     3.000000
 1     2.476190
 2     3.578947
 3     4.000000
 4     5.153846
 5     5.307692
 6     5.833333
 7     5.571429
 8     6.076923
 9     3.800000
 10    5.642857
 11    2.000000
dtype: float64

In [96]:
'''
For each user, find the first user-day on which 
they self-reported smoking
''' 

sr_survival = []
for id in set(sr_dates[:,0]):
    subset = sr_dates[sr_dates[:,0] == id,:]
    if any(subset[:,1] > 0):
        subset = subset[subset[:,1] > 0,:]
        temp = [id, np.max(subset[:,1])]
        sr_survival.append(temp)

sr_survival = np.asarray(sr_survival)

In [99]:
# Read data for random EMA
random_ema = pd.read_csv(dir + 'final/random-ema-final.csv')

In [111]:
'''
Extract all smoking events from random EMA and label with correct user-day
'''

random_dates = []
for irow in range(0,random_ema.shape[0]):
    row = random_ema.iloc[irow]
    quit_iloc = np.where(participant_dates['participant'] == row['participant_id'])
    quit_row = participant_dates.iloc[quit_iloc[0][0]]
    quit_time = datetime.datetime.strptime(quit_row['quit_date'], '%m/%d/%y')
    if row['status'] == "MISSED":
        continue
    try:
        time = datetime.datetime.strptime(row['date'], '%m/%d/%y %H:%M')
    except:
        time = datetime.datetime.strptime(row['date'], '%Y-%m-%d %H:%M:%S')
    #print row['participant_id'], time
    #print quit_row['participant'], quit_time
    temp_diff = time - quit_time
    if time.hour  == 0 or time.hour == 1:
        date = np.array([row['participant_id'], temp_diff.days-1])
        if row['smoke'] == 'Yes':
            date = np.append(date, [1])
            random_dates.append(date)
        if row['smoke'] == 'No':
            date = np.append(date, [0])
            random_dates.append(date)
    else:
        date = np.array([row['participant_id'], temp_diff.days])
        if row['smoke'] == 'Yes':
            date = np.append(date, 1)
            random_dates.append(date)
        if row['smoke'] == 'No':
            date = np.append(date, 0)
            random_dates.append(date)

random_dates = np.asarray(random_dates)

In [112]:
'''
Aggregate smoking events for each user-day
within the random EMA
'''

random_pd = pd.DataFrame(random_dates)
random_pd.columns = ['user', 'day', 'smoke']
random_pd_userdays = random_pd.groupby(['user','day'])['smoke'].sum().reset_index()

In [113]:
random_survival = []
for id in set(random_dates[:,0]):
    subset = random_dates[random_dates[:,0] == id,:]
    if any(subset[:,1] > 0):
        subset = subset[subset[:,1] > 0,:]
        nonzero_iloc = np.where(subset[:,2] == 1)[0]
        zero_iloc = np.where(subset[:,2] == 0)[0]
        if len(nonzero_iloc) == 0:
            if len(zero_iloc) != 0:
                temp = np.append(subset[np.max(zero_iloc),0:2], 0)
        else:
            temp = np.append(subset[np.max(nonzero_iloc), 0:2], 1)
        random_survival.append(temp)

random_survival = np.asarray(random_survival)

array([[256,  10,   0],
       [258,  10,   1],
       [259,   3,   1],
       [260,   5,   1],
       [261,   9,   1],
       [262,   1,   1],
       [263,  10,   0],
       [264,  11,   0],
       [265,   6,   1],
       [267,   7,   1],
       [269,   3,   1],
       [202,   7,   1],
       [203,  13,   0],
       [205,   8,   1],
       [206,   9,   0],
       [207,   8,   0],
       [208,   3,   1],
       [211,  11,   1],
       [212,   6,   1],
       [213,   1,   0],
       [214,   4,   1],
       [215,  11,   0],
       [216,  10,   1],
       [217,   8,   1],
       [218,   1,   1],
       [219,  10,   1],
       [222,   1,   1],
       [223,  11,   0],
       [224,   9,   1],
       [225,  10,   1],
       [226,   8,   1],
       [227,  10,   1],
       [228,   8,   1],
       [229,   3,   1],
       [230,  11,   0],
       [231,   2,   1],
       [233,   8,   1],
       [234,   1,   0],
       [235,   4,   0],
       [238,  10,   1],
       [240,  10,   1],
       [241,   1

In [300]:
all_ema_survival = []

for id in set(random_survival[:,0]) | set(ec_survival[:,0]) | set(eod_survival[:,0]):
    temp = np.array([id])
    random_subset = random_survival[random_survival[:,0] == id,:]
    if not len(random_subset) == 0:
        temp = np.append(temp, random_subset[0][1:3])
    else:
        temp = np.append(temp, [-1,-1])
    ec_subset = ec_survival[ec_survival[:,0] == id,:]
    if not len(ec_subset) == 0:
        ec_day = ec_subset[0][1]
        if temp[1] > ec_day or temp[1] == -1:
            temp[1:3] = [ec_day,1]
    eod_subset = eod_survival[eod_survival[:,0] == id, :]
    if not len(eod_subset) == 0:
        if eod_subset[0][1] < temp[1] and eod_subset[0][2] == 1:
            temp[1:3] = eod_subset[0][1:3]
        elif eod_subset[0][1] > temp[1] and eod_subset[0][2] == 0 and temp[2] == 0:
            temp[1:3] = eod_subset[0][1:3]
    all_ema_survival.append(temp)
    
all_ema_survival = np.asarray(all_ema_survival)

In [353]:
km_list = [1]
var_list = [0]
for day in range(1,13):
    n_day = np.sum(all_ema_survival[:,1] >= day, dtype='f')
    subset = all_ema_survival[all_ema_survival[:,1] == day,:]
    subset = subset[subset[:,2] == 1]
    d_day = subset.shape[0]
    km_list.append(1-d_day/n_day)
    var_list.append(d_day/(n_day * (n_day - d_day)))

km_est = np.cumprod(km_list)

var_est = np.multiply(np.power(km_est,2),np.cumsum(var_list))

km_stderr = np.sqrt(var_est)

upper_km_est = km_est + 1.96*km_stderr
lower_km_est = km_est - 1.96*km_stderr

print upper_km_est
print km_est
print lower_km_est




[1.         0.98675767 0.90651021 0.84278664 0.84278664 0.80746358
 0.73190752 0.69185671 0.60736767 0.41944874 0.22607129 0.22607129
 0.22607129]
[1.         0.85714286 0.74025974 0.66233766 0.66233766 0.62094156
 0.53814935 0.49675325 0.41396104 0.24837662 0.09935065 0.09935065
 0.09935065]
[ 1.          0.72752804  0.57400927  0.48188869  0.48188869  0.43441954
  0.34439118  0.30164979  0.2205544   0.07730451 -0.02736999 -0.02736999
 -0.02736999]
