## Random EMA and HTMGs: Exploratory Data Analysis

- This notebook examines the number of HTMGs in the minutes prior to a Random EMA in which the response to the smoking question was 'No'.
- For multiple window lengths (Delta), we compute the following descriptive statistics:
    + Mean number of HTMGs prior (aggregated data)
    + Avg number of HTMGs (avg of means across individuals)
    + Std dev of number of HTMGs (of means across individuals)
    + Fraction of time no HTMGs in window (aggregated data)
    + Avg fraction of time no HTMGs in window (of fractions across individuals)
    + Std dev of fraction of time no HTMGs in window (of fractions across individuals)

In [1]:
## Import packages and set directory
import pandas as pd
import numpy as np
import datetime as datetime
import matplotlib.pyplot as plt
import os
import math
# The return value is discarded here; this call does not affect the path set below.
os.getcwd()
# Relative location of the input CSV files read in a later cell.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because the data-loading cell refers to it.
dir = "../final-data"

In [2]:
## Accepted Random EMA answers to the "when did you smoke" question, and a
## lookup that converts each answer to a numeric value in minutes
## (taking the midpoint of the interval).
random_accptresponse = [
    '1 - 19 Minutes',
    '20 - 39 Minutes',
    '40 - 59 Minutes',
    '60 - 79 Minutes',
    '80 - 100 Minutes',
]
random_dictionary = dict(zip(random_accptresponse, (10, 30, 50, 70, 90)))

In [3]:
## Load the two data streams used below: the Random EMA responses and the
## HTMG (puff-probability) events. Both files live in the data directory.
random_ema, htmgs = (
    pd.read_csv(os.path.join(os.path.realpath(dir), csv_name))
    for csv_name in ('random-ema-final.csv', 'puff-probability-final.csv')
)

In [4]:
def _parse_timestamp(stamp):
    '''
    Parse a single timestamp string written in either of the two layouts
    handled by this notebook ('%m/%d/%y %H:%M' or '%Y-%m-%d %H:%M:%S').
    Raises ValueError if the string matches neither layout.
    '''
    try:
        return datetime.datetime.strptime(stamp, '%m/%d/%y %H:%M')
    except ValueError:
        return datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')


def _count_htmgs_near(htmg_times, anchor_time, delta):
    '''
    Count the timestamps in _htmg_times_ that lie within _delta_ minutes
    of _anchor_time_ (absolute difference, i.e. on either side of it).
    '''
    return sum(
        1 for htmg_time in htmg_times
        if abs((htmg_time - anchor_time).total_seconds() / 60.0) <= delta
    )


def random_puff(delta):
    '''
    Counts the HTMGs within _delta_ minutes of each Random EMA, separately
    for 'Yes' and 'No' smoking responses.

    Reads the module-level frames `random_ema` and `htmgs` and only considers
    participants that appear in both.

    For every Random EMA row of a participant:
      * if 'when_smoke' is one of `random_accptresponse`, the EMA time is
        moved back by the matching value in `random_dictionary` and the
        count is stored in the "yes" list;
      * if 'smoke' equals 'No', the count around the (possibly moved)
        EMA time is stored in the "no" list.

    NOTE(review): the window is symmetric (|HTMG time - EMA time| <= delta),
    so HTMGs after the EMA are counted as well, although the notebook text
    speaks of the minutes *prior* to the EMA -- confirm which is intended.

    Parameters
    ----------
    delta : number
        Window half-length in minutes.

    Returns
    -------
    (htm_complete_yes, htm_complete_no) : tuple of two lists
        Each list holds one float32 array per participant with at least one
        qualifying EMA; every array entry is the HTMG count for one EMA.

    Raises
    ------
    ValueError
        If a 'date' string matches neither supported timestamp layout.
    '''
    htm_complete_yes = []
    htm_complete_no = []
    for participant in set(random_ema['participant_id']) & set(htmgs['participant_id']):

        # Each timestamp is parsed on its own, so a participant whose rows
        # mix the two layouts no longer makes the whole list fail.
        htmg_dates = htmgs.loc[htmgs['participant_id'] == participant, 'date']
        htmg_times = [_parse_timestamp(stamp) for stamp in htmg_dates]

        ema_rows = random_ema[random_ema['participant_id'] == participant]

        counts_yes = []
        counts_no = []
        for _, row in ema_rows.iterrows():
            anchor_time = _parse_timestamp(row['date'])
            if row['when_smoke'] in random_accptresponse:
                # Shift the anchor back to the reported smoking time.
                anchor_time = anchor_time - datetime.timedelta(minutes=random_dictionary[row['when_smoke']])
                counts_yes.append(_count_htmgs_near(htmg_times, anchor_time, delta))
            if row['smoke'] == 'No':
                counts_no.append(_count_htmgs_near(htmg_times, anchor_time, delta))

        per_user_yes = np.array(counts_yes, dtype='f')
        per_user_no = np.array(counts_no, dtype='f')
        # Participants without any qualifying EMA are left out entirely.
        if per_user_yes.size > 0:
            htm_complete_yes.append(per_user_yes)
        if per_user_no.size > 0:
            htm_complete_no.append(per_user_no)

    return htm_complete_yes, htm_complete_no

In [5]:
def _window_stats(per_user_counts):
    '''
    Reduce a list of per-participant HTMG count arrays to six statistics:
    (aggregated mean, mean of participant means, std of participant means,
     aggregated zero fraction, mean of participant zero fractions,
     std of participant zero fractions).
    '''
    user_sums = np.asarray([np.sum(arr) for arr in per_user_counts])
    user_means = np.asarray([np.mean(arr) for arr in per_user_counts])
    user_means = user_means[~np.isnan(user_means)]
    user_n = np.asarray([len(arr) for arr in per_user_counts])
    user_zeros = np.asarray([np.count_nonzero(arr == 0) for arr in per_user_counts], dtype='f')
    user_zero_frac = np.divide(user_zeros, user_n)
    user_zero_frac = user_zero_frac[~np.isnan(user_zero_frac)]

    total_n = np.sum(user_n)
    return (
        np.divide(np.sum(user_sums), total_n),
        np.mean(user_means),
        np.std(user_means),
        np.divide(np.sum(user_zeros), total_n),
        np.mean(user_zero_frac),
        np.std(user_zero_frac),
    )


def _print_window_summary(delta, response_label, per_user_counts):
    '''
    Print the six descriptive statistics for one EMA response group,
    each rounded to three decimals, followed by a blank line.
    '''
    labels = (
        'Mean number of HTMGs prior (aggregated data)',
        'Avg number of HTMGs (avg of means across indidivuals)',
        'Std dev of number of HTMGs (of means across indidivuals)',
        'Fraction of time no HTMGs in window (aggregated data)',
        'Avg fraction of time no HTMGs in window (of fractions across individuals)',
        'Std dev of fraction of time no HTMGs in window (of fractions across individuals)',
    )
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('% s minute window:' % (delta))
    print('Prior to EMA Response: %s' % (response_label))
    for label, value in zip(labels, _window_stats(per_user_counts)):
        print('%s: %s' % (label, np.round(value, 3)))
    print('')


def summary_output(delta, include_yes=False):
    '''
    Prints descriptive statistics built from the output of
    random_puff(delta) for a specific choice of delta.

    Parameters
    ----------
    delta : number
        Window length in minutes, passed straight to random_puff.
    include_yes : bool, optional
        When True, the same report is also printed for the 'Yes' group.
        Defaults to False, which prints only the 'No' group.

    Returns
    -------
    None
    '''
    output_delta_yes, output_delta_no = random_puff(delta)

    _print_window_summary(delta, 'No', output_delta_no)
    if include_yes:
        _print_window_summary(delta, 'Yes', output_delta_yes)

    return None


In [6]:
## Report the HTMG coverage for windows of 5, 15, 30, 60, 90 and 120 minutes
## around the event time.
for window_minutes in (5, 15, 30, 60, 90, 120):
    summary_output(window_minutes)

5 minute window:
Prior to EMA Response: No
Mean number of HTMGs prior (aggregated data): 0.586
Avg number of HTMGs (avg of means across indidivuals): 0.569
Std dev of number of HTMGs (of means across indidivuals): 0.531
Fraction of time no HTMGs in window (aggregated data): 0.68
Avg fraction of time no HTMGs in window (of fractions across individuals): 0.696
Std dev of fraction of time no HTMGs in window (of fractions across individuals): 0.226

15 minute window:
Prior to EMA Response: No
Mean number of HTMGs prior (aggregated data): 1.787
Avg number of HTMGs (avg of means across indidivuals): 1.698
Std dev of number of HTMGs (of means across indidivuals): 1.238
Fraction of time no HTMGs in window (aggregated data): 0.437
Avg fraction of time no HTMGs in window (of fractions across individuals): 0.444
Std dev of fraction of time no HTMGs in window (of fractions across individuals): 0.251

30 minute window:
Prior to EMA Response: No
Mean number of HTMGs prior (aggregated data): 3.508
Av

In [11]:
## Compute an anova decomposition using the poisson likelihood.
## This will test if there are significant differences across
## individuals: a per-participant mean is compared against one pooled mean.
from scipy.stats import chi2

delta = 30
llik_delta = 0; agg_llik_delta = 0
output_delta = random_puff(delta)
## Extract only the No list for analysis. random_puff returns (yes, no);
## indexing the tuple directly avoids packing two ragged lists into a
## NumPy array, which needs both lists to be the same length and is
## rejected by recent NumPy releases.
output_no_delta = output_delta[1]
agg_sum_delta = np.asarray([np.sum(arr) for arr in output_no_delta])
agg_mean_delta = np.asarray([np.mean(arr) for arr in output_no_delta])
agg_count_delta = np.asarray([len(arr) for arr in output_no_delta])
aggregate_frac_delta = np.divide(np.sum(agg_sum_delta), np.sum(agg_count_delta))

## Poisson log-likelihood (dropping the term that does not depend on the
## mean) under each participant's own mean and under the pooled mean.
## NOTE(review): participants whose mean is 0 are skipped in BOTH sums,
## although aggregate_frac_delta is computed from all participants --
## confirm this is intended.
for i in range(0, agg_mean_delta.size):
    user_mean = agg_mean_delta[i]
    row = output_no_delta[i]
    if user_mean > 0.0 and not math.isnan(user_mean):
        llik_delta += np.sum(np.subtract(np.multiply(row, np.log(user_mean)), user_mean))
        agg_llik_delta += np.sum(np.subtract(np.multiply(row, np.log(aggregate_frac_delta)), aggregate_frac_delta))

D_delta = -2*agg_llik_delta + 2*llik_delta
print(D_delta)


n = np.sum(agg_count_delta) ## Number of observations
k = len(output_no_delta) ## Number of participants
## NOTE(review): a likelihood-ratio comparison of k participant means with
## one pooled mean is usually referred to k - 1 degrees of freedom; n - k
## is kept so the reported numbers do not change -- confirm.
df = n-k
print(k)

## NOTE(review): the label says "current hour", but this cell uses a
## 30 minute window -- the wording is left as-is; confirm.
print('ANOVA p-value for current hour: %s' % (1-chi2.cdf(D_delta, df)))

1420.1965579986572
67
ANOVA p-value for current hour: 2.55351295663786e-14
