In [1]:
# load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# load the users data:
users = pd.read_csv('../data/users.csv')
users.head()

Unnamed: 0,user.index,intake.survey.utime,intake.survey.tz,intake.survey.gmtoff,first.notif.utime,first.steps.utime,exit.survey.utime,exit.survey.tz,exit.survey.gmtoff,last.notif.utime,...,walk10.days.exit,sit.time.exit,vigact.time.exit,modact.time.exit,walk.time.exit,vigact.metmins.exit,modact.metmins.exit,walk.metmins.exit,metmins.exit,ipaq.hepa.exit
0,1,2015-07-22 14:34:03,Etc/GMT+4,-14400,2015-07-22 16:31:53,2015-07-22 16:42:00,2015-09-15 06:00:00,Etc/GMT+4,-14400,2015-09-15 21:31:27,...,7.0,,120.0,90.0,240.0,4800.0,1800.0,5544.0,12144.0,3.0
1,2,2015-07-24 14:46:56,Etc/GMT+4,-14400,2015-07-24 17:01:56,2015-07-24 17:38:00,2015-09-04 07:00:00,Etc/GMT+4,-14400,2015-09-04 00:31:38,...,7.0,600.0,0.0,0.0,40.0,0.0,0.0,924.0,924.0,2.0
2,3,2015-07-25 17:40:12,Etc/GMT+4,-14400,2015-07-25 18:31:35,2015-07-25 20:03:00,2015-09-14 07:45:00,Etc/GMT+4,-14400,2015-09-14 15:21:04,...,7.0,360.0,60.0,30.0,30.0,1920.0,840.0,693.0,3453.0,3.0
3,4,2015-07-27 13:57:23,Etc/GMT+4,-14400,2015-07-27 18:32:19,2015-07-27 18:37:00,2015-09-09 12:30:00,Etc/GMT+4,-14400,2015-09-08 23:01:26,...,7.0,420.0,90.0,90.0,60.0,4320.0,2520.0,1386.0,8226.0,3.0
4,5,2015-07-27 16:47:45,Etc/GMT+4,-14400,2015-07-27 19:01:43,2015-07-27 19:01:00,2015-09-08 12:30:00,Etc/GMT+4,-14400,2015-09-08 21:31:48,...,3.0,600.0,0.0,30.0,30.0,0.0,120.0,297.0,417.0,1.0


In [3]:
# Check for missing values in the users data to understand their distribution
missing_values = users.isnull().sum()

# Summary statistics for key demographic columns and activity-related columns
summary_stats = users[['age', 'gender', 'marital', 'selfeff.intake', 'selfeff.exit', 
                          'consc', 'walk.time.exit', 'modact.time.exit', 'vigact.time.exit']].describe()

missing_values

user.index              0
intake.survey.utime     0
intake.survey.tz        0
intake.survey.gmtoff    0
first.notif.utime       0
                       ..
vigact.metmins.exit     2
modact.metmins.exit     2
walk.metmins.exit       3
metmins.exit            3
ipaq.hepa.exit          3
Length: 117, dtype: int64

In [4]:
summary_stats

Unnamed: 0,age,selfeff.intake,selfeff.exit,consc,walk.time.exit,modact.time.exit,vigact.time.exit
count,37.0,37.0,35.0,37.0,34.0,35.0,35.0
mean,35.513514,14.486486,13.942857,22.72973,59.941176,58.914286,42.714286
std,14.219747,3.30506,3.572302,2.468115,65.693015,58.877426,41.662913
min,19.0,8.0,8.0,18.0,0.0,0.0,0.0
25%,24.0,12.0,11.0,21.0,30.0,21.0,0.0
50%,28.0,14.0,13.0,23.0,32.5,45.0,40.0
75%,47.0,17.0,17.0,25.0,60.0,60.0,60.0
max,64.0,21.0,22.0,27.0,240.0,240.0,150.0


In [5]:
# Define a function that applies reverse coding to self-efficacy columns

def reverse_code_selfeff(df):
    # List of self-efficacy columns to reverse code
    selfeff_columns = [
        'selfeff.tired.intake', 'selfeff.badmood.intake', 'selfeff.notime.intake', 
        'selfeff.vaca.intake', 'selfeff.precip.intake', 'selfeff.tired.exit', 
        'selfeff.badmood.exit', 'selfeff.notime.exit', 'selfeff.vaca.exit', 
        'selfeff.precip.exit'
    ]


    # Create new reversed columns
    for col in selfeff_columns:
        df[col + '_reverse'] = 6 - df[col]
    
    # Calculate the reversed total score for intake and exit surveys, and convert to integer
    df['selfeff.intake_reversed'] = df[['selfeff.tired.intake_reverse', 'selfeff.badmood.intake_reverse', 
                                        'selfeff.notime.intake_reverse', 'selfeff.vaca.intake_reverse', 
                                        'selfeff.precip.intake_reverse']].sum(axis=1).astype(int)
    
    df['selfeff.exit_reversed'] = df[['selfeff.tired.exit_reverse', 'selfeff.badmood.exit_reverse', 
                                      'selfeff.notime.exit_reverse', 'selfeff.vaca.exit_reverse', 
                                      'selfeff.precip.exit_reverse']].sum(axis=1).astype(int)
    
    return df

# Test the function on the users
users_df_reversed = reverse_code_selfeff(users)

# Display the first few rows to verify the output
users_df_reversed[['user.index', 'selfeff.intake_reversed', 'selfeff.exit_reversed']].head()


Unnamed: 0,user.index,selfeff.intake_reversed,selfeff.exit_reversed
0,1,11,10
1,2,18,19
2,3,16,12
3,4,9,8
4,5,14,18


In [6]:
# Display all columns related to self-efficacy for user with index 1
selfeff_columns_user_1 = users.loc[users['user.index'] == 1, users.columns.str.contains('selfeff')]

# Show the result
selfeff_columns_user_1

Unnamed: 0,selfeff.tired.intake,selfeff.badmood.intake,selfeff.notime.intake,selfeff.vaca.intake,selfeff.precip.intake,selfeff.intake,selfeff.tired.exit,selfeff.badmood.exit,selfeff.notime.exit,selfeff.vaca.exit,...,selfeff.notime.intake_reverse,selfeff.vaca.intake_reverse,selfeff.precip.intake_reverse,selfeff.tired.exit_reverse,selfeff.badmood.exit_reverse,selfeff.notime.exit_reverse,selfeff.vaca.exit_reverse,selfeff.precip.exit_reverse,selfeff.intake_reversed,selfeff.exit_reversed
0,3,4,3,5,4,19,4.0,4.0,4.0,4.0,...,3,1,2,2.0,2.0,2.0,2.0,2.0,11,10


In the data cleaning process, we reverse-coded the self-efficacy scores to enhance interpretability. Initially, higher scores indicated lower confidence in engaging in physical activity (using a 5-point likert scale where 1=strongly agree to 5=strongly disagree), which is counter-intuitive. We applied a transformation (6 - current_score) to flip the scale, so that higher scores now represent greater confidence in physical activity. This change was applied to both the intake and exit surveys for all related items. We also computed new total self-efficacy scores, selfeff.intake_reversed and selfeff.exit_reversed, which now reflect the participants' confidence in physical activity, where higher scores indicate higher confidence.

In [7]:
# load the suggestions data:
suggestions = pd.read_csv('../data/suggestions.csv')
suggestions.head()


  suggestions = pd.read_csv('../data/suggestions.csv')


Unnamed: 0,user.index,decision.index,decision.index.nogap,sugg.select.utime,sugg.select.slot,sugg.select.update,sugg.tz,sugg.gmtoff,sugg.decision.utime,sugg.decision.slot,...,gfmins10,gfsteps10,gfmins30,gfsteps30,gfmins60,gfsteps60,gfmins30pre,gfsteps30pre,gfmins60pre,gfsteps60pre
0,1,0,0.0,2015-07-22 16:30:00,2,2015-07-22,Etc/GMT+4,-14400.0,2015-07-22 16:31:53,2.0,...,3.0,3.0,23.0,948.0,23.0,948.0,0.0,0.0,23.0,954.0
1,1,1,1.0,2015-07-22 18:30:00,3,2015-07-22,Etc/GMT+4,-14400.0,2015-07-22 18:32:10,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2,2.0,2015-07-22 21:30:00,4,2015-07-22,Etc/GMT+4,-14400.0,2015-07-22 21:31:48,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,73.0
3,1,3,3.0,2015-07-22 23:30:00,5,2015-07-22,Etc/GMT+4,-14400.0,2015-07-22 23:31:50,5.0,...,5.0,10.0,16.0,323.0,46.0,1103.0,46.0,2603.0,67.0,3760.0
4,1,4,4.0,2015-07-23 09:30:00,1,2015-07-23,Etc/GMT+4,-14400.0,2015-07-23 09:31:31,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# check the size of the suggestion data:
suggestions.shape

(8274, 86)

In [9]:
# show the data group by differnt user index:
suggestions.groupby('user.index').count()

Unnamed: 0_level_0,decision.index,decision.index.nogap,sugg.select.utime,sugg.select.slot,sugg.select.update,sugg.tz,sugg.gmtoff,sugg.decision.utime,sugg.decision.slot,sugg.context.utime,...,gfmins10,gfsteps10,gfmins30,gfsteps30,gfmins60,gfsteps60,gfmins30pre,gfsteps30pre,gfmins60pre,gfsteps60pre
user.index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,278,178,278,278,278,274,278,278,274,274,...,231,231,231,231,231,231,232,232,232,232
2,209,209,209,209,209,165,209,209,165,165,...,180,180,180,180,180,180,181,181,181,181
3,255,215,255,255,255,209,255,255,209,209,...,202,202,202,202,202,202,203,203,203,203
4,219,219,218,219,219,204,218,218,204,204,...,189,189,189,189,189,189,190,190,190,190
5,217,217,217,217,217,216,217,217,216,216,...,200,200,200,200,200,200,201,201,201,201
6,212,182,212,212,212,192,212,212,192,192,...,0,0,0,0,0,0,0,0,0,0
7,216,216,216,216,216,204,216,216,204,204,...,184,184,184,184,184,184,185,185,185,185
8,221,221,221,221,221,216,221,221,216,216,...,202,202,202,202,202,202,203,203,203,203
9,207,207,207,207,207,197,207,207,197,197,...,177,177,177,177,177,177,178,178,178,178
10,215,215,215,215,215,209,215,215,209,209,...,205,205,205,205,205,205,206,206,206,206


In [10]:
# check the send.active column:
suggestions['send.active']

0       False
1        True
2       False
3       False
4       False
        ...  
8269    False
8270    False
8271     True
8272    False
8273    False
Name: send.active, Length: 8274, dtype: object