## Self-report and HTMGs: Exploratory Data Analysis

- This notebook is dedicated to understanding the number of HTMGs in the minutes prior to a self-report of smoking
- For multiple window-lengths (Delta), we compute the following descriptive statistics
    + Mean number of HTMGs prior (aggregated data)
    + Avg number of HTMGs (avg of means across individuals)
    + Std dev of number of HTMGs (of means across individuals)
    + Fraction of time no HTMGs in window (aggregated data)
    + Avg fraction of time no HTMGs in window (of fractions across individuals)
    + Std dev of fraction of time no HTMGs in window (of fractions across individuals)

In [14]:
import pandas as pd
import numpy as np
import datetime as datetime
import matplotlib.pyplot as plt
import os
# Directory (relative to the notebook's working directory) holding the input CSVs.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because the data-loading cell reads it under this name.
dir = "../final-data"

In [30]:
# Self-report answers that are treated as a smoking event by the analysis.
sr_accptresponse = ['Smoking Event(15 to 30 minutes)', '5 to 15 minutes', 'Smoking Event(less than 5 minutes ago)']

# Minutes subtracted from the self-report timestamp for each answer, to move
# the report back to the estimated time of the smoking event.
# Every entry of sr_accptresponse must have a key here, otherwise the lookup in
# selfreport_puff raises KeyError; the "to" spellings below are aliases of the
# " - " spellings and carry the same offsets.
# NOTE(review): 17.5 is not the midpoint of the 15-30 minute bucket (22.5);
# the value is kept unchanged -- confirm it is intended.
sr_dictionary = {'Smoking Event(less than 5 minutes ago)': 2.5,
                 'Smoking Event(15 - 30 minutes)': 17.5,
                 'Smoking Event(5 - 15 minutes)': 10,
                 'Smoking Event(15 to 30 minutes)': 17.5,
                 '5 to 15 minutes': 10
                }

In [26]:
# Load the two input tables from the data directory configured in `dir`:
# the puff-probability file into `htmgs` and the smoking self-reports into
# `selfreport`.
data_root = os.path.realpath(dir)
htmgs = pd.read_csv(os.path.join(data_root, 'puff-probability-final.csv'))
selfreport = pd.read_csv(os.path.join(data_root, 'self-report-smoking-final.csv'))

In [93]:
def _parse_timestamp(value):
    '''Parse one 'date' string using the first supported format that matches.'''
    for fmt in ('%m/%d/%y %H:%M', '%Y-%m-%d %H:%M:%S'):
        try:
            return datetime.datetime.strptime(value, fmt)
        except ValueError:
            # Wrong format for this value; try the next one.
            continue
    raise ValueError('unrecognized timestamp format: %r' % (value,))


def selfreport_puff(delta, selfreport_df=None, htmgs_df=None,
                    accepted_responses=None, response_offsets=None):
    '''
    Count, for every accepted smoking self-report, the HTMG records that fall
    within `delta` minutes of the estimated event time.

    The estimated event time is the self-report timestamp minus the offset
    registered for the report's message. The window is symmetric: records up
    to `delta` minutes before OR after that time are counted.
    NOTE(review): the notebook introduction speaks of HTMGs "prior to" a
    self-report, while the code counts on both sides -- confirm which is meant.

    Parameters
    ----------
    delta : number
        Half-width of the window, in minutes.
    selfreport_df, htmgs_df : DataFrame, optional
        Tables with 'participant_id' and 'date' columns (`selfreport_df` also
        needs 'message'). Default to the notebook globals `selfreport` and
        `htmgs`.
    accepted_responses : container of str, optional
        Messages that count as a smoking event. Defaults to the notebook
        global `sr_accptresponse`.
    response_offsets : mapping, optional
        Message -> minutes to subtract from the report time. Defaults to the
        notebook global `sr_dictionary`.

    Returns
    -------
    list of float32 arrays, one per participant that has at least one accepted
    self-report; each entry is the HTMG count for one self-report.

    Raises
    ------
    ValueError if a date string matches neither supported format.
    KeyError if an accepted message has no entry in `response_offsets`.
    '''
    # Fall back to the notebook-level data when nothing is passed explicitly.
    if selfreport_df is None:
        selfreport_df = selfreport
    if htmgs_df is None:
        htmgs_df = htmgs
    if accepted_responses is None:
        accepted_responses = sr_accptresponse
    if response_offsets is None:
        response_offsets = sr_dictionary

    htmg_complete = []
    participants = np.unique(np.concatenate([selfreport_df['participant_id'],
                                             htmgs_df['participant_id']]))

    for participant in participants:
        htmg_rows = htmgs_df[htmgs_df['participant_id'] == participant]
        # Each value is parsed on its own, so a column that mixes the two
        # timestamp formats is handled.
        htmg_times = [_parse_timestamp(stamp) for stamp in htmg_rows['date']]

        sr_rows = selfreport_df[selfreport_df['participant_id'] == participant]

        counts = []
        for _, row in sr_rows.iterrows():
            message = row['message']
            if message not in accepted_responses:
                continue
            offset = datetime.timedelta(minutes=response_offsets[message])
            event_time = _parse_timestamp(row['date']) - offset
            counts.append(sum(
                1 for htmg_time in htmg_times
                if abs((htmg_time - event_time).total_seconds() / 60.0) <= delta
            ))

        counts = np.array(counts, dtype='f')
        # Participants without any accepted self-report are left out.
        if counts.size > 0:
            htmg_complete.append(counts)

    return htmg_complete


18

In [95]:
def summary_output(delta, counts=None):
    '''
    Print descriptive statistics of the per-self-report HTMG counts for one
    window length.

    Parameters
    ----------
    delta : number
        Window length in minutes; passed to selfreport_puff and echoed in the
        header line.
    counts : sequence of 1-D arrays, optional
        Per-participant count arrays. When omitted they are computed with
        selfreport_puff(delta).

    Returns
    -------
    None. The statistics are printed.
    '''
    output_delta = selfreport_puff(delta) if counts is None else counts
    # The per-participant arrays have different lengths, so they are kept as a
    # plain sequence: np.asarray on ragged input raises on recent NumPy.
    agg_sum_delta = np.asarray([np.sum(arr) for arr in output_delta])
    agg_mean_delta = np.asarray([np.mean(arr) for arr in output_delta])
    agg_count_delta = np.asarray([len(arr) for arr in output_delta])
    agg_zero_delta = np.asarray([np.count_nonzero(arr == 0) for arr in output_delta], dtype='f')
    # Per-participant share of self-reports with no HTMG in the window.
    ind_zero_delta = np.divide(agg_zero_delta, agg_count_delta)

    # Pooled statistics: every self-report weighs the same.
    aggregate_frac_delta = np.divide(np.sum(agg_sum_delta), np.sum(agg_count_delta))
    aggregate_fraczero_delta = np.divide(np.sum(agg_zero_delta), np.sum(agg_count_delta))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('% s minute window:' % (delta))
    print('Mean number of HTMGs (aggregated data): %s' % (np.round(aggregate_frac_delta, 3)))
    print('Avg number of HTMGs (avg of means across indidivuals): %s' % (np.round(np.mean(agg_mean_delta), 3)))
    print('Std dev of number of HTMGs (of means across indidivuals): %s' % (np.round(np.std(agg_mean_delta), 3)))
    print('Fraction of time no HTMGs in window (aggregated data): %s' % (np.round(aggregate_fraczero_delta, 3)))
    print('Avg fraction of time no HTMGs in window (of fractions across individuals): %s' % (np.round(np.mean(ind_zero_delta), 3)))
    print('Std dev of fraction of time no HTMGs in window (of fractions across individuals): %s' % (np.round(np.std(ind_zero_delta), 3)))
    print('')

    return None

In [96]:
'''
Calculate the HTMG coverage for 5, 15, 30, 60, 90, and 120 minute
windows around the event time.
'''

for window_minutes in (5, 15, 30, 60, 90, 120):
    summary_output(window_minutes)

5 minute window:
Mean number of HTMGs (aggregated data): 1.525
Avg number of HTMGs (avg of means across indidivuals): 1.852
Std dev of number of HTMGs (of means across indidivuals): 1.663
Fraction of time no HTMGs in window (aggregated data): 0.525
Avg fraction of time no HTMGs in window (of fractions across individuals): 0.459
Std dev of fraction of time no HTMGs in window (of fractions across individuals): 0.277

15 minute window:
Mean number of HTMGs (aggregated data): 2.916
Avg number of HTMGs (avg of means across indidivuals): 3.593
Std dev of number of HTMGs (of means across indidivuals): 2.787
Fraction of time no HTMGs in window (aggregated data): 0.364
Avg fraction of time no HTMGs in window (of fractions across individuals): 0.298
Std dev of fraction of time no HTMGs in window (of fractions across individuals): 0.246

30 minute window:
Mean number of HTMGs (aggregated data): 4.536
Avg number of HTMGs (avg of means across indidivuals): 5.537
Std dev of number of HTMGs (of means

In [97]:
''' 
Compute an anova decomposition using the poisson likelihood
This will test if there are significant differences across
individuals.

The test is a likelihood-ratio test of one Poisson mean per participant
against a single pooled Poisson mean. The log(y!) terms are the same in both
models and are left out.
'''
from scipy.stats import chi2

# Window length (minutes) analysed in this cell. It has to be set here: no
# other cell defines `delta` at notebook level.
# NOTE(review): 60 is assumed from the "current hour" label printed below --
# confirm.
delta = 60

# Kept as a list: the per-participant arrays have different lengths, and
# np.asarray on ragged input raises on recent NumPy.
output_delta = selfreport_puff(delta)
agg_sum_delta = np.asarray([np.sum(arr) for arr in output_delta])
agg_mean_delta = np.asarray([np.mean(arr) for arr in output_delta])
agg_count_delta = np.asarray([len(arr) for arr in output_delta])
aggregate_frac_delta = np.divide(np.sum(agg_sum_delta), np.sum(agg_count_delta))

llik_delta = 0.0       # log-likelihood with one mean per participant
agg_llik_delta = 0.0   # log-likelihood with the pooled mean
for row, user_mean in zip(output_delta, agg_mean_delta):
    # A participant whose counts are all zero has a fitted mean of 0 and
    # contributes exactly 0 to the per-participant log-likelihood
    # (0 * log(0) is taken as its limit, 0).
    if user_mean > 0.0:
        llik_delta += np.sum(row * np.log(user_mean) - user_mean)
    # The pooled mean is estimated from ALL participants, so all of them must
    # enter the pooled log-likelihood too -- including the all-zero ones.
    if aggregate_frac_delta > 0.0:
        agg_llik_delta += np.sum(row * np.log(aggregate_frac_delta) - aggregate_frac_delta)

D_delta = -2*agg_llik_delta + 2*llik_delta
print(D_delta)

n = np.sum(agg_count_delta)  ## Number of observations
k = len(output_delta)        ## Number of participants (means in the full model)
# Degrees of freedom of the likelihood-ratio statistic: difference in the
# number of free parameters between the two models (k means versus 1).
df = k - 1
print(k)

# sf is the upper tail, 1 - cdf, computed without cancellation error.
print('ANOVA p-value for current hour: %s' % (chi2.sf(D_delta, df)))

2435.031967163086
60
ANOVA p-value for current hour: 0.0
