## Import Library

In [34]:
import os, errno
import argparse
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt

from datetime import datetime, timedelta
from scipy.stats import kurtosis
from scipy.stats.mstats import moment
from scipy import stats
from scipy.stats import ttest_ind, f_oneway

from util.load_data_basic import *

# date_time format
date_time_format = '%Y-%m-%dT%H:%M:%S.%f'
date_only_date_time_format = '%Y-%m-%d'

# sleep after work duration thereshold
sleep_after_work_duration_threshold = 12
sleep_after_sleep_duration_threshold = 2

# data folder
main_data_directory = '../data/keck_wave1/2_preprocessed_data'
recording_timeline_directory = '../output/recording_timeline'
sleep_timeline_directory = '../output/sleep_timeline'
individual_timeline_directory = 'output/individual_timeline'


## Read Basic Information

In [35]:
# Read sleep data for all participant
sleep_data = pd.read_csv(os.path.join('output', 'sleep_survey_full.csv'))

# Read ID
IDs = getParticipantID(main_data_directory, index=False)

# Read MGT
MGT = read_MGT(main_data_directory)

# Read Pre-Study info
PreStudyInfo = read_pre_study_info(main_data_directory)

# Read IGTB info
IGTB = read_IGTB(main_data_directory)

# Demographic
Demographic = read_Demographic(main_data_directory)

# Day shift data
day_data = sleep_data.loc[sleep_data['shift_type'] == 1]
day_workday_data = day_data.loc[(day_data['is_sleep_before_work'] == 1) & (day_data['is_sleep_after_work'] == 1)]
day_off_day_data = day_data.loc[(day_data['is_sleep_before_work'] != 1) & (day_data['is_sleep_after_work'] != 1)]
day_transition_day_data = day_data.loc[(day_data['is_sleep_transition_before_work'] == 1) | (day_data['is_sleep_transition_after_work'] == 1)]

# Night shift data
night_data = sleep_data.loc[sleep_data['shift_type'] == 2]
night_workday_data = night_data.loc[(night_data['is_sleep_before_work'] == 1) & (night_data['is_sleep_after_work'] == 1)]
night_off_day_data = night_data.loc[(night_data['is_sleep_before_work'] != 1) & (night_data['is_sleep_after_work'] != 1)]
night_transition_day_data = night_data.loc[(night_data['is_sleep_transition_before_work'] == 1) | (night_data['is_sleep_transition_after_work'] == 1)]

# Data array
data_array = [day_data, night_data, 
              day_workday_data, day_off_day_data, 
              night_workday_data, night_off_day_data]

data_type = ['day-shift all', 'night-shift all',
             'day-shift workday', 'day-shift off day', 
             'night-shift workday', 'night-shift off day']

colunm_type = ['duration_in_seconds', 'SleepEfficiency',
               'SleepMinutesStageDeep', 'SleepMinutesStageLight', 'SleepMinutesStageRem',
               'sleep_heart_rate_mean', 'sleep_heart_rate_std',
               'sleep_heart_rate_percentile_10', 'sleep_heart_rate_percentile_90']

UserInfo = pd.merge(IGTB, PreStudyInfo, left_on='uid', right_on='uid', how='outer')
UserInfo = pd.merge(UserInfo, IDs, left_on='uid', right_on='uid', how='outer')
UserInfo = pd.merge(UserInfo, Demographic, left_on='uid', right_on='uid', how='outer')
UserInfo = UserInfo.set_index('uid')


## Get participant with valid sleep data

In [36]:
valid_sleep_stats = pd.DataFrame()
frame_col = ['number_of_sleep', 'number_of_long_sleep', 'number_of_short_sleep', 
             'sleep_rate', 'short_sleep_rate', 'long_sleep_rate']

select_column = ['participant_id', 'shift_pre-study', 
                 'life_satisfaction_pre-study', 'wellbeing_pre-study',
                 'social_functioning_pre-study', 'pain_pre-study', 'general_health_pre-study',
                 'neu_igtb', 'con_igtb', 'ext_igtb', 'agr_igtb', 'ope_igtb']

for participant_id in UserInfo['participant_id']:
    
    user_id = UserInfo.loc[UserInfo['participant_id'] == participant_id].index.values[0]
    participant_MGT = MGT.loc[MGT['uid'] == user_id]
    
    if len(participant_MGT) > 1:
        start_date = np.datetime64(participant_MGT.index.values[0], 'D')
        end_date = np.datetime64(participant_MGT.index.values[-1], 'D')
        days_of_survey = (end_date - start_date) / np.timedelta64(1, 'D') + 1
        
        
        participant_sleep_data = sleep_data.loc[sleep_data['participant_id'] == participant_id]
        participant_sleep_data = participant_sleep_data.set_index('start_recording_time')
        
        start_date = np.datetime64(start_date).astype(datetime).strftime(date_time_format)
        end_date = np.datetime64(end_date).astype(datetime).strftime(date_time_format)
        
        participant_sleep_data = participant_sleep_data[start_date:end_date]
        
    
    if len(participant_sleep_data) > 20:
        frame = pd.DataFrame(columns=frame_col, index=[user_id])
        frame['number_of_sleep'] = len(participant_sleep_data)
        frame['number_of_short_sleep'] = len(participant_sleep_data.loc[participant_sleep_data['duration_in_seconds'] <= 3600 * 4])
        frame['number_of_long_sleep'] = len(participant_sleep_data.loc[participant_sleep_data['duration_in_seconds'] > 3600 * 4])
        frame['sleep_rate'] = len(participant_sleep_data) / days_of_survey
        frame['short_sleep_rate'] = len(participant_sleep_data.loc[participant_sleep_data['duration_in_seconds'] <= 3600 * 4]) / days_of_survey
        frame['long_sleep_rate'] = len(participant_sleep_data.loc[participant_sleep_data['duration_in_seconds'] > 3600 * 4]) / days_of_survey
        
        valid_sleep_stats = valid_sleep_stats.append(frame)
        
# sleep rate and IGTB
temp = UserInfo.loc[:,:].copy()
valid_sleep_stats = pd.concat([valid_sleep_stats, temp], axis=1)
valid_sleep_stats = valid_sleep_stats.dropna(subset=['number_of_sleep'])
        
# Number of sleep
for col in frame_col:
    print(col)
    print('Number of valid participant: %i' % (len(valid_sleep_stats[col])))
    print('Average sleep data per participant: %.3f' % (np.mean(valid_sleep_stats[col])))
    print('Std sleep data per participant: %.3f' % (np.std(valid_sleep_stats[col])))

    print('Min/Max sleep data of participant: %.3f/%.3f\n' % (np.max(valid_sleep_stats[col]), 
                                                          np.min(valid_sleep_stats[col])))
        

number_of_sleep
Number of valid participant: 152
Average sleep data per participant: 52.414
Std sleep data per participant: 19.034
Min/Max sleep data of participant: 137.000/21.000

number_of_long_sleep
Number of valid participant: 152
Average sleep data per participant: 40.322
Std sleep data per participant: 14.570
Min/Max sleep data of participant: 82.000/9.000

number_of_short_sleep
Number of valid participant: 152
Average sleep data per participant: 12.092
Std sleep data per participant: 11.339
Min/Max sleep data of participant: 78.000/0.000

sleep_rate
Number of valid participant: 152
Average sleep data per participant: 0.793
Std sleep data per participant: 0.237
Min/Max sleep data of participant: 1.930/0.310

short_sleep_rate
Number of valid participant: 152
Average sleep data per participant: 0.183
Std sleep data per participant: 0.163
Min/Max sleep data of participant: 1.099/0.000

long_sleep_rate
Number of valid participant: 152
Average sleep data per participant: 0.610
Std sl

## Raw IGTB - PSQI and GATS

In [37]:
raw_IGTB_col = ['psqi_inst', 'psqi1', 'psqi1ampm', 'psqi2', 'psqi3', 'psqi3ampm', 'psqi4',
            'psqi5a', 'psqi5b', 'psqi5c', 'psqi5d', 'psqi5e', 'psqi5f', 'psqi5g', 'psqi5h',
            'psqi5i', 'psqi5ja', 'psqi5jb', 'psqi6', 'psqi7', 'psqi8', 'psqi9', 'psqi_complete', 
            'gats1', 'gats2', 'gats3Week_1', 'gats3Week_2', 'gats3Week_3', 
            'gats3Week_4', 'gats3Week_5', 'gats3Week_6', 'gats3Week_7', 
            'audit1', 'audit2', 'audit3', 'audit4', 'audit5', 
            'audit6', 'audit7', 'audit8', 'audit9', 'audit10', 'Name']

IGTB_RAW = read_IGTB_Raw(main_data_directory)[raw_IGTB_col]

IGTB_RAW_Score = pd.DataFrame()

for user_id in valid_sleep_stats.index.values:
    IGTB_RAW_participant = IGTB_RAW.loc[IGTB_RAW['Name'] == user_id]
    
    # PSQI
    # Contains 7 scores, the lower the score, the better the performance
    frame = pd.DataFrame(index=[user_id])
    # 1st score
    frame['subjective_sleep_quality_psqi'] = IGTB_RAW_participant['psqi6'].values[0]
    
    # 2nd score
    if IGTB_RAW_participant['psqi2'].values[0] <= 15:
        response = 0
    elif 15 < IGTB_RAW_participant['psqi2'].values[0] <= 30:
        response = 1
    elif 30 < IGTB_RAW_participant['psqi2'].values[0] <= 60:
        response = 2
    elif IGTB_RAW_participant['psqi2'].values[0] > 60:
        response = 3
     
    frame['sleep_latency_psqi'] = int((response + IGTB_RAW_participant['psqi5a'].values[0] + 1) / 2)
    
    # 3rd score
    if IGTB_RAW_participant['psqi4'].values[0] >= 7:
        response = 0
    elif 6 <= IGTB_RAW_participant['psqi4'].values[0] < 7:
        response = 1
    elif 5 <= IGTB_RAW_participant['psqi4'].values[0] < 6:
        response = 2
    elif IGTB_RAW_participant['psqi4'].values[0] < 5:
        response = 3
    frame['sleep_duration_psqi'] = response
    
    # 4th score
    in_bed_time = int(IGTB_RAW_participant['psqi1'].values[0] / 100)
    in_bed_time = in_bed_time + int(IGTB_RAW_participant['psqi1'].values[0] % 100) / 60
    
    get_up_time = int(IGTB_RAW_participant['psqi3'].values[0] / 100)
    get_up_time = get_up_time + int(IGTB_RAW_participant['psqi3'].values[0] % 100) / 60
    
    if in_bed_time >= 12:
        in_bed_time = in_bed_time - 12
    
    if get_up_time >= 12:
        get_up_time = get_up_time - 12
    
    if IGTB_RAW_participant['psqi1ampm'].values[0] == 1200 and IGTB_RAW_participant['psqi3ampm'].values[0] == 0:
        time_in_bed = 12 - in_bed_time + get_up_time
    elif IGTB_RAW_participant['psqi1ampm'].values[0] == 0 and IGTB_RAW_participant['psqi3ampm'].values[0] == 1200:
        time_in_bed = 12 - in_bed_time + get_up_time
    else:
        time_in_bed = get_up_time - in_bed_time
    
    efficiency = 100 * IGTB_RAW_participant['psqi4'].values[0] / time_in_bed
    
    if efficiency < 0:
        efficiency = 100
    
    if efficiency >= 85:
        response = 0
    elif 75 <= efficiency < 85:
        response = 1
    elif 65 <= efficiency < 75:
        response = 2
    elif efficiency < 65:
        response = 3
    frame['sleep_efficiency_psqi'] = response
    
    # 5th score
    response = IGTB_RAW_participant['psqi5b'].values[0] + IGTB_RAW_participant['psqi5c'].values[0] + IGTB_RAW_participant['psqi5d'].values[0] + IGTB_RAW_participant['psqi5e'].values[0] + IGTB_RAW_participant['psqi5f'].fillna(0).values[0] + IGTB_RAW_participant['psqi5g'].values[0] + IGTB_RAW_participant['psqi5h'].values[0] + IGTB_RAW_participant['psqi5i'].values[0] 
    response = response + IGTB_RAW_participant['psqi5jb'].fillna(0)
    frame['sleep_distrubance_psqi'] = int((response + 8) / 9)
    
    # 6th score
    frame['sleep_medication_psqi'] = IGTB_RAW_participant['psqi7'].values[0]
    
    # 7th score
    response = IGTB_RAW_participant['psqi8'].values[0] + IGTB_RAW_participant['psqi9'].values[0]
    frame['daytime_dysfunction_psqi'] = int((response + 1) / 2)
    
    # GATS
    frame['current_tobacco_gats'] = IGTB_RAW_participant['gats1'].values[0]
    frame['past_tobacco_gats'] = IGTB_RAW_participant['gats2'].values[0]
    frame['individual_cigarettes'] = IGTB_RAW_participant['gats3Week_1'].fillna(0).values[0]
    frame['individual_clove_cigarettes'] = IGTB_RAW_participant['gats3Week_2'].fillna(0).values[0]
    frame['individual_cigars'] = IGTB_RAW_participant['gats3Week_3'].fillna(0).values[0]
    frame['e_cigarette'] = IGTB_RAW_participant['gats3Week_4'].fillna(0).values[0]
    frame['pipe_session'] = IGTB_RAW_participant['gats3Week_5'].fillna(0).values[0]
    frame['smokeless_session'] = IGTB_RAW_participant['gats3Week_6'].fillna(0).values[0]
    frame['other_cigarettes'] = IGTB_RAW_participant['gats3Week_7'].fillna(0).values[0]
    
    # AUDIT
    frame['driking_frequency'] = IGTB_RAW_participant['audit1'].values[0]
    frame['number_of_drink_per_day'] = IGTB_RAW_participant['audit2'].values[0]
    frame['more_than_six_drink_frequency'] = IGTB_RAW_participant['audit3'].values[0]
    
    IGTB_RAW_Score = IGTB_RAW_Score.append(frame)
    
valid_sleep_stats = pd.concat([valid_sleep_stats, IGTB_RAW_Score], axis=1)
    

## Demographic - Overall

In [38]:
col = 'age'
age_stats = valid_sleep_stats[col].dropna()

# print(age_stats)
print('Number of valid participant: %i' % (len(age_stats)))

# Age:
print('\n' + col)
print('Average age per participant: %.3f' % (np.mean(age_stats)))
print('Std age data per participant: %.3f' % (np.std(age_stats)))
print('Min/Max age of participant: %.3f/%.3f\n' % (np.min(age_stats), np.max(age_stats)))
print('Age between 20-29: %.3f' % (len(valid_sleep_stats.loc[(valid_sleep_stats[col] < 30) & (valid_sleep_stats[col] >= 20)]) / len(age_stats)))
print('Age between 30-39: %.3f' % (len(valid_sleep_stats.loc[(valid_sleep_stats[col] < 40) & (valid_sleep_stats[col] >= 30)]) / len(age_stats)))
print('Age above 40: %.3f' % (len(valid_sleep_stats.loc[(valid_sleep_stats[col] >= 40)]) / len(age_stats)))

# Gender
col = 'gender'
demo_stats = valid_sleep_stats.dropna(subset=['gender'])
print('\n' + col)
print('Male percentage: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] == 1]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] == 1])))
print('Female percentage: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] == 2]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] == 2])))

# Education
col = 'education'
print('\n' + col)
print('High school: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] <= 2]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] <= 2])))
print('College and above: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] >= 3]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] >= 3])))
print('Master Degree: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] == 6]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] == 6])))
print('Advanced degree: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] == 7]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] >= 7])))


# Supervise
col = 'supervise'
print('\n' + col)
print('Supervise: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] == 1]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] == 1])))
print('Not supervise: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] == 2]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] == 2])))


# Job stat
col = 'jobstat'
print('\n' + col)
print('Part-time: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] == 1]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] == 1])))
print('Full-time: %.3f (%.i)' % (len(demo_stats.loc[demo_stats[col] == 2]) / len(demo_stats), len(demo_stats.loc[demo_stats[col] == 2])))

# prestudy info
prestudy_stats = valid_sleep_stats.dropna(subset=['shift_pre-study'])
# Shift
col = 'shift_pre-study'
print('\n' + col)
print('Day shift: %.3f (%.i)' % (len(prestudy_stats.loc[prestudy_stats[col] == 1]) / len(prestudy_stats), len(prestudy_stats.loc[prestudy_stats[col] == 1])))
print('Night shift: %.3f (%.i)' % (len(prestudy_stats.loc[prestudy_stats[col] == 2]) / len(prestudy_stats), len(prestudy_stats.loc[prestudy_stats[col] == 2])))




Number of valid participant: 129

age
Average age per participant: 38.659
Std age data per participant: 9.420
Min/Max age of participant: 24.000/63.000

Age between 20-29: 0.140
Age between 30-39: 0.473
Age above 40: 0.388

gender
Male percentage: 0.233 (30)
Female percentage: 0.767 (99)

education
High school: 0.054 (7)
College and above: 0.946 (122)
Master Degree: 0.109 (14)
Advanced degree: 0.031 (4)

supervise
Supervise: 0.287 (37)
Not supervise: 0.713 (92)

jobstat
Part-time: 0.016 (2)
Full-time: 0.984 (127)

shift_pre-study
Day shift: 0.619 (91)
Night shift: 0.381 (56)


## Demographic - Day, Night

In [39]:
col = 'age'

demo_stats = valid_sleep_stats.dropna(subset=['gender'])
day_sleep_stats = demo_stats.loc[demo_stats['shift_pre-study'] == 1]
night_sleep_stats = demo_stats.loc[demo_stats['shift_pre-study'] == 2]

overall_data = [day_sleep_stats, night_sleep_stats]
data_type = ['day-shift', 'night-shift']

# Age:
print('\n' + col + '\n')
for data in overall_data:
    print(data_type[int(data['shift_pre-study'].unique()[0])-1])
    print('Number of valid participant: %i' % (len(age_stats)))

    
    print('Average age per participant: %.3f' % (np.mean(age_stats)))
    print('Std age data per participant: %.3f' % (np.std(age_stats)))
    print('Min/Max age of participant: %.3f/%.3f\n' % (np.min(age_stats), np.max(age_stats)))
    print('Age between 20-29: %.3f' % (len(data.loc[(data[col] < 30) & (data[col] >= 20)]) / len(age_stats)))
    print('Age between 30-39: %.3f' % (len(data.loc[(data[col] < 40) & (data[col] >= 30)]) / len(age_stats)))
    print('Age above 40: %.3f\n' % (len(data.loc[(data[col] >= 40)]) / len(age_stats)))

stat, p = ttest_ind(day_sleep_stats[col].dropna(), night_sleep_stats[col].dropna())

print('Stats')
print('Statistics = %.3f, p = %.3f \n' % (stat, p))

# Gender
col = 'gender'
print('\n' + col + '\n')
for data in overall_data:
    print(data_type[int(data['shift_pre-study'].unique()[0])-1])
    print('Number of valid participant: %i' % (len(data)))
    print('Male percentage: %.3f (%.i)' % (len(data.loc[data[col] == 1]) / len(data), len(data.loc[data[col] == 1])))
    print('Female percentage: %.3f (%.i)\n' % (len(data.loc[data[col] == 2]) / len(data), len(data.loc[data[col] == 2])))

stat, p = ttest_ind(day_sleep_stats[col].dropna(), night_sleep_stats[col].dropna())

print('Stats')
print('Statistics = %.3f, p = %.3f \n' % (stat, p))

# Education
col = 'education'
print('\n' + col + '\n')

for data in overall_data:
    print(data_type[int(data['shift_pre-study'].unique()[0])-1])
    print('Number of valid participant: %i' % (len(data)))
    print('High school: %.3f (%.i)' % (len(data.loc[data[col] <= 2]) / len(data), len(data.loc[data[col] <= 2])))
    print('College and above: %.3f (%.i)' % (len(data.loc[data[col] >= 3]) / len(data), len(data.loc[data[col] >= 3])))
    print('Master Degree: %.3f (%.i)' % (len(data.loc[data[col] == 6]) / len(data), len(data.loc[data[col] == 6])))
    print('Advanced degree: %.3f (%.i)\n' % (len(data.loc[data[col] == 7]) / len(data), len(data.loc[data[col] >= 7])))

stat, p = ttest_ind(day_sleep_stats[col].dropna(), night_sleep_stats[col].dropna())

print('Stats')
print('Statistics = %.3f, p = %.3f \n' % (stat, p))



# Supervise
col = 'supervise'
print('\n' + col + '\n')

for data in overall_data:
    print(data_type[int(data['shift_pre-study'].unique()[0])-1])
    print('Number of valid participant: %i' % (len(data)))
    print('Supervise: %.3f (%.i)' % (len(data.loc[data[col] == 1]) / len(data), len(data.loc[data[col] == 1])))
    print('Not supervise: %.3f (%.i)\n' % (len(data.loc[data[col] == 2]) / len(data), len(data.loc[data[col] == 2])))

stat, p = ttest_ind(day_sleep_stats[col].dropna(), night_sleep_stats[col].dropna())

print('Stats')
print('Statistics = %.3f, p = %.3f \n' % (stat, p))



# Job stat
col = 'jobstat'
print('\n' + col + '\n')

for data in overall_data:
    print(data_type[int(data['shift_pre-study'].unique()[0])-1])
    print('Number of valid participant: %i' % (len(data)))
    print('Part-time: %.3f (%.i)' % (len(data.loc[data[col] == 1]) / len(data), len(data.loc[data[col] == 1])))
    print('Full-time: %.3f (%.i)\n' % (len(data.loc[data[col] == 2]) / len(data), len(data.loc[data[col] == 2])))

stat, p = ttest_ind(day_sleep_stats[col].dropna(), night_sleep_stats[col].dropna())

print('Stats')
print('Statistics = %.3f, p = %.3f \n' % (stat, p))




age

day-shift
Number of valid participant: 129
Average age per participant: 38.659
Std age data per participant: 9.420
Min/Max age of participant: 24.000/63.000

Age between 20-29: 0.070
Age between 30-39: 0.271
Age above 40: 0.240

night-shift
Number of valid participant: 129
Average age per participant: 38.659
Std age data per participant: 9.420
Min/Max age of participant: 24.000/63.000

Age between 20-29: 0.062
Age between 30-39: 0.186
Age above 40: 0.140

Stats
Statistics = 1.162, p = 0.247 


gender

day-shift
Number of valid participant: 75
Male percentage: 0.200 (15)
Female percentage: 0.800 (60)

night-shift
Number of valid participant: 50
Male percentage: 0.300 (15)
Female percentage: 0.700 (35)

Stats
Statistics = 1.281, p = 0.203 


education

day-shift
Number of valid participant: 75
High school: 0.053 (4)
College and above: 0.947 (71)
Master Degree: 0.120 (9)
Advanced degree: 0.053 (4)

night-shift
Number of valid participant: 50
High school: 0.060 (3)
College and above:

## IGTB - Day, Night shift

In [40]:
ana_col = ['nurse_years_pre-study', 'hours_pre-study', 'housing_pre-study', 'wellbeing_pre-study', 'social_functioning_pre-study', 
            'pain_pre-study', 'general_health_pre-study', 'life_satisfaction_pre-study', 'perceived_stress_pre-study', 
            'psy_flexbility_pre-study', 'psy_inflexbility_pre-study', 'psy_capital_pre-study', 
            'waaq_pre-study', 'challenge_stressor_pre-study', 'hindrance_stressor_pre-study',
            'pos_af_igtb', 'neg_af_igtb', 
            'itp_igtb', 'irb_igtb', 
            'iod_id_igtb', 'iod_od_igtb', 
            'ocb_igtb', 'shipley_abs_igtb', 'shipley_voc_igtb',
            'neu_igtb', 'con_igtb', 'ext_igtb', 'agr_igtb', 'ope_igtb',
            'stai_igtb', 'audit_igtb', 'gats_status_igtb', 'gats_quantity_igtb', 
            'ipaq_igtb', 'psqi_igtb']

# shift_pre-study
day_sleep_stats = valid_sleep_stats.loc[(valid_sleep_stats['shift_pre-study'] == 1) | (valid_sleep_stats['shift'] == 1) & (valid_sleep_stats['shift_pre-study'] != 2)]
night_sleep_stats = valid_sleep_stats.loc[(valid_sleep_stats['shift_pre-study'] == 2) | (valid_sleep_stats['shift'] == 2) & (valid_sleep_stats['shift_pre-study'] != 1)]

overall_data = [day_sleep_stats, night_sleep_stats]
data_type = ['day-shift', 'night-shift']

all_data = pd.concat([day_sleep_stats, night_sleep_stats], axis=1)

for col in ana_col:

    response0 = overall_data[0][col]
    response1 = overall_data[1][col]
    
    print(col + '\n')
    print('Number of valid participant: day: %i; night: %i\n' % (len(response0), len(response1)))
    
    print('Total: mean = %.2f, std = %.2f, range is %.3f - %.3f' % (np.mean(valid_sleep_stats[col]), np.std(valid_sleep_stats[col]), np.min(valid_sleep_stats[col]), np.max(valid_sleep_stats[col])))
    print('Day shift: mean = %.2f, std = %.2f, range is %.3f - %.3f' % (np.mean(response0), np.std(response0), np.min(response0), np.max(response0)))
    print('Night shift: mean = %.2f, std = %.2f, range is %.3f - %.3f \n' % (np.mean(response1), np.std(response1), np.min(response1), np.max(response1)))
    
    stat, p = ttest_ind(overall_data[0][col].dropna(), overall_data[1][col].dropna())
    # print('Type: ' + col)
    print('Statistics = %.3f, p = %.3f' % (stat, p))
    print('\n')

nurse_years_pre-study

Number of valid participant: day: 93; night: 58

Total: mean = 11.12, std = 8.44, range is 1.000 - 40.000
Day shift: mean = 12.35, std = 8.85, range is 1.000 - 40.000
Night shift: mean = 9.16, std = 7.32, range is 1.000 - 33.000 

Statistics = 2.264, p = 0.025


hours_pre-study

Number of valid participant: day: 93; night: 58

Total: mean = 37.50, std = 9.47, range is 3.000 - 90.000
Day shift: mean = 38.21, std = 9.07, range is 12.000 - 90.000
Night shift: mean = 36.39, std = 9.97, range is 3.000 - 73.000 

Statistics = 1.136, p = 0.258


housing_pre-study

Number of valid participant: day: 93; night: 58

Total: mean = 1.69, std = 0.85, range is 1.000 - 4.000
Day shift: mean = 1.64, std = 0.87, range is 1.000 - 4.000
Night shift: mean = 1.77, std = 0.80, range is 1.000 - 4.000 

Statistics = -0.939, p = 0.349


wellbeing_pre-study

Number of valid participant: day: 93; night: 58

Total: mean = 56.04, std = 18.34, range is 0.000 - 90.000
Day shift: mean = 57.36, s

## IGTB (Sleep) - Day, Night shift

In [41]:
sleep_col = ['subjective_sleep_quality_psqi', 'sleep_latency_psqi', 'sleep_duration_psqi',
             'sleep_efficiency_psqi', 'sleep_distrubance_psqi', 'sleep_medication_psqi', 'daytime_dysfunction_psqi']

# shift_pre-study
day_sleep_stats = valid_sleep_stats.loc[(valid_sleep_stats['shift_pre-study'] == 1) | (valid_sleep_stats['shift'] == 1) & (valid_sleep_stats['shift_pre-study'] != 2)]
night_sleep_stats = valid_sleep_stats.loc[((valid_sleep_stats['shift_pre-study'] == 2) | (valid_sleep_stats['shift'] == 2)) & (valid_sleep_stats['shift_pre-study'] != 1)]

overall_data = [day_sleep_stats, night_sleep_stats]
data_type = ['day-shift', 'night-shift']

for col in sleep_col:

    response0 = overall_data[0][col]
    response1 = overall_data[1][col]
    
    print(col + '\n')
    print('Number of valid participant: day: %i; night: %i\n' % (len(response0), len(response1)))
    
    print('Total: mean = %.2f, std = %.2f, range is %.3f - %.3f' % (np.mean(valid_sleep_stats[col]), np.std(valid_sleep_stats[col]), np.min(valid_sleep_stats[col]), np.max(valid_sleep_stats[col])))
    print('Day shift: mean = %.2f, std = %.2f, range is %.3f - %.3f' % (np.mean(response0), np.std(response0), np.min(response0), np.max(response0)))
    print('Night shift: mean = %.2f, std = %.2f, range is %.3f - %.3f \n' % (np.mean(response1), np.std(response1), np.min(response1), np.max(response1)))
    
    stat, p = ttest_ind(overall_data[0][col].dropna(), overall_data[1][col].dropna())
    # print('Type: ' + col)
    print('Statistics = %.3f, p = %.3f' % (stat, p))
    print('\n')

subjective_sleep_quality_psqi

Number of valid participant: day: 93; night: 58

Total: mean = 0.54, std = 1.01, range is 0.000 - 3.000
Day shift: mean = 0.48, std = 0.98, range is 0.000 - 3.000
Night shift: mean = 0.59, std = 1.02, range is 0.000 - 3.000 

Statistics = -0.611, p = 0.542


sleep_latency_psqi

Number of valid participant: day: 93; night: 58

Total: mean = 1.34, std = 0.93, range is 0.000 - 3.000
Day shift: mean = 1.26, std = 0.88, range is 0.000 - 3.000
Night shift: mean = 1.45, std = 0.99, range is 0.000 - 3.000 

Statistics = -1.226, p = 0.222


sleep_duration_psqi

Number of valid participant: day: 93; night: 58

Total: mean = 0.97, std = 0.94, range is 0.000 - 3.000
Day shift: mean = 0.84, std = 0.88, range is 0.000 - 3.000
Night shift: mean = 1.14, std = 0.97, range is 0.000 - 3.000 

Statistics = -1.933, p = 0.055


sleep_efficiency_psqi

Number of valid participant: day: 93; night: 58

Total: mean = 0.62, std = 0.93, range is 0.000 - 3.000
Day shift: mean = 0.35, 

## IGTB (Tobacco) - Day, Night shift

In [42]:
gats_col = ['current_tobacco_gats', 'past_tobacco_gats',]
            #'individual_cigarettes',
            #'individual_clove_cigarettes', 'individual_cigars', 'e_cigarette', 'pipe_session',
            #'smokeless_session', 'other_cigarettes']

# shift_pre-study
day_sleep_stats = valid_sleep_stats.loc[(valid_sleep_stats['shift_pre-study'] == 1) | (valid_sleep_stats['shift'] == 1) & (valid_sleep_stats['shift_pre-study'] != 2)]
night_sleep_stats = valid_sleep_stats.loc[((valid_sleep_stats['shift_pre-study'] == 2) | (valid_sleep_stats['shift'] == 2)) & (valid_sleep_stats['shift_pre-study'] != 1)]

overall_data = [day_sleep_stats, night_sleep_stats]
data_type = ['day-shift', 'night-shift']

for col in gats_col:

    response0 = overall_data[0][col]
    response1 = overall_data[1][col]
    
    print(col + '\n')
    print('Number of valid participant: day: %i; night: %i\n' % (len(response0), len(response1)))
    
    # Status
    if col == 'current_tobacco_gats' or col == 'past_tobacco_gats':
        
        daily = valid_sleep_stats.loc[valid_sleep_stats[col] == 1]
        day_daily = day_sleep_stats.loc[day_sleep_stats[col] == 1]
        night_daily = night_sleep_stats.loc[night_sleep_stats[col] == 1]
        
        less_than_daily = valid_sleep_stats.loc[valid_sleep_stats[col] == 2]
        day_less_than_daily = day_sleep_stats.loc[day_sleep_stats[col] == 2]
        night_less_than_daily = night_sleep_stats.loc[night_sleep_stats[col] == 2]
        
        not_at_all = valid_sleep_stats.loc[valid_sleep_stats[col] == 3]
        day_not_at_all = day_sleep_stats.loc[day_sleep_stats[col] == 3]
        night_not_at_all = night_sleep_stats.loc[night_sleep_stats[col] == 3]
        
        # Daily
        print('Daily \n')
        print('Total: n = %i, %.3f ' % (len(daily), len(daily) / len(valid_sleep_stats)))
        print('Day shift: n = %i, %.3f' % (len(day_daily), len(day_daily) / len(day_sleep_stats)))
        print('Night shift: n = %i, %.3f\n' % (len(night_daily), len(night_daily) / len(night_sleep_stats)))
        
        # Less than daily
        print('Less than daily \n')
        print('Total: n = %i, %.3f ' % (len(less_than_daily), len(less_than_daily) / len(valid_sleep_stats)))
        print('Day shift: n = %i, %.3f' % (len(day_less_than_daily), len(day_less_than_daily) / len(day_sleep_stats)))
        print('Night shift: n = %i, %.3f\n' % (len(night_less_than_daily), len(night_less_than_daily) / len(night_sleep_stats)))
        
        # Not at all
        print('Not at all \n')
        print('Total: n = %i, %.3f ' % (len(not_at_all), len(not_at_all) / len(valid_sleep_stats)))
        print('Day shift: n = %i, %.3f' % (len(day_not_at_all), len(day_not_at_all) / len(day_sleep_stats)))
        print('Night shift: n = %i, %.3f\n' % (len(night_not_at_all), len(night_not_at_all) / len(night_sleep_stats)))
        
        stat, p = ttest_ind(day_sleep_stats[col].dropna(), night_sleep_stats[col].dropna())
        print('Statistics = %.3f, p = %.3f\n' % (stat, p))
        
    else:
        
        print()
    

current_tobacco_gats

Number of valid participant: day: 93; night: 58

Daily 

Total: n = 5, 0.033 
Day shift: n = 2, 0.022
Night shift: n = 2, 0.034

Less than daily 

Total: n = 11, 0.072 
Day shift: n = 6, 0.065
Night shift: n = 5, 0.086

Not at all 

Total: n = 135, 0.888 
Day shift: n = 84, 0.903
Night shift: n = 51, 0.879

Statistics = 0.681, p = 0.497

past_tobacco_gats

Number of valid participant: day: 93; night: 58

Daily 

Total: n = 13, 0.086 
Day shift: n = 9, 0.097
Night shift: n = 3, 0.052

Less than daily 

Total: n = 24, 0.158 
Day shift: n = 15, 0.161
Night shift: n = 9, 0.155

Not at all 

Total: n = 114, 0.750 
Day shift: n = 68, 0.731
Night shift: n = 46, 0.793

Statistics = -0.968, p = 0.335



## IGTB - Alcohol usage, Day, Night shift

In [43]:
audit_col = ['driking_frequency', 'number_of_drink_per_day', 'more_than_six_drink_frequency']

driking_frequency = ['Never', 'Monthly', '2-4 times per month', '2-3 times per week', '4 or more times a week']
number_of_drink_per_day = ['1-2', '3-4', '5-6', '7-8', '9-10']
more_than_six_drink_frequency = ['Never', 'Less than Monthly', 'Monthly', 'Weekly', 'Daily']

# shift_pre-study
day_sleep_stats = valid_sleep_stats.loc[(valid_sleep_stats['shift_pre-study'] == 1) | (valid_sleep_stats['shift'] == 1) & (valid_sleep_stats['shift_pre-study'] != 2)]
night_sleep_stats = valid_sleep_stats.loc[((valid_sleep_stats['shift_pre-study'] == 2) | (valid_sleep_stats['shift'] == 2)) & (valid_sleep_stats['shift_pre-study'] != 1)]
data_type = ['day-shift', 'night-shift']

for col in audit_col:

    print(col + '\n')
    print('Number of valid participant: day: %i; night: %i\n' % (len(response0), len(response1)))
    
    # Status
    if col == 'driking_frequency':
        answer_type = driking_frequency
    elif col == 'number_of_drink_per_day':
        answer_type = number_of_drink_per_day
    elif col == 'more_than_six_drink_frequency':
        answer_type = more_than_six_drink_frequency
        
    for i in range(1, 6, 1):
        overall = valid_sleep_stats.loc[valid_sleep_stats[col] == i]
        day_data = day_sleep_stats.loc[day_sleep_stats[col] == i]
        night_data = night_sleep_stats.loc[night_sleep_stats[col] == i]

        # Print
        print(answer_type[i-1] + '\n')
        print('Total: n = %i, %.3f ' % (len(overall), len(overall) / len(valid_sleep_stats)))
        print('Day shift: n = %i, %.3f' % (len(day_data), len(day_data) / len(day_sleep_stats)))
        print('Night shift: n = %i, %.3f\n' % (len(night_data), len(night_data) / len(night_sleep_stats)))

    stat, p = ttest_ind(day_sleep_stats[col].dropna(), night_sleep_stats[col].dropna())
    print('Statistics = %.3f, p = %.3f\n' % (stat, p))
    

driking_frequency

Number of valid participant: day: 93; night: 58

Never

Total: n = 32, 0.211 
Day shift: n = 20, 0.215
Night shift: n = 11, 0.190

Monthly

Total: n = 66, 0.434 
Day shift: n = 38, 0.409
Night shift: n = 28, 0.483

2-4 times per month

Total: n = 38, 0.250 
Day shift: n = 23, 0.247
Night shift: n = 15, 0.259

2-3 times per week

Total: n = 14, 0.092 
Day shift: n = 11, 0.118
Night shift: n = 3, 0.052

4 or more times a week

Total: n = 2, 0.013 
Day shift: n = 1, 0.011
Night shift: n = 1, 0.017

Statistics = 0.489, p = 0.625

number_of_drink_per_day

Number of valid participant: day: 93; night: 58

1-2

Total: n = 107, 0.704 
Day shift: n = 73, 0.785
Night shift: n = 33, 0.569

3-4

Total: n = 20, 0.132 
Day shift: n = 10, 0.108
Night shift: n = 10, 0.172

5-6

Total: n = 8, 0.053 
Day shift: n = 2, 0.022
Night shift: n = 6, 0.103

7-8

Total: n = 1, 0.007 
Day shift: n = 1, 0.011
Night shift: n = 0, 0.000

9-10

Total: n = 1, 0.007 
Day shift: n = 0, 0.000
Night shi

## Prestudy Info

In [44]:
ana_col = ['nurse_years_pre-study', 'hours_pre-study', 'housing_pre-study', 'wellbeing_pre-study', 'social_functioning_pre-study', 
           'pain_pre-study', 'general_health_pre-study', 'life_satisfaction_pre-study', 'perceived_stress_pre-study', 
           'psy_flexbility_pre-study', 'psy_inflexbility_pre-study', 'psy_capital_pre-study', 
           'waaq_pre-study', 'challenge_stressor_pre-study', 'hindrance_stressor_pre-study',
           'pos_af_igtb', 'neg_af_igtb', 'itp_igtb', 'irb_igtb', 'iod_id_igtb', 'iod_od_igtb', 
           'ocb_igtb', 'shipley_abs_igtb', 'shipley_voc_igtb',
           'neu_igtb', 'con_igtb', 'ext_igtb', 'agr_igtb', 'ope_igtb',
           'stai_igtb', 'audit_igtb', 'gats_status_igtb', 'gats_quantity_igtb', 'ipaq_igtb', 'psqi_igtb']

for col in ana_col:
    print(col)
    print('Number of valid participant: %i' % (len(valid_sleep_stats[col])))
    print('Average data per participant: %.3f' % (np.mean(valid_sleep_stats[col])))
    print('Std data per participant: %.3f' % (np.std(valid_sleep_stats[col])))

    print('Min/Max data of participant: %.3f/%.3f\n' % (np.max(valid_sleep_stats[col]), np.min(valid_sleep_stats[col])))


nurse_years_pre-study
Number of valid participant: 152
Average data per participant: 11.122
Std data per participant: 8.438
Min/Max data of participant: 40.000/1.000

hours_pre-study
Number of valid participant: 152
Average data per participant: 37.503
Std data per participant: 9.469
Min/Max data of participant: 90.000/3.000

housing_pre-study
Number of valid participant: 152
Average data per participant: 1.689
Std data per participant: 0.845
Min/Max data of participant: 4.000/1.000

wellbeing_pre-study
Number of valid participant: 152
Average data per participant: 56.036
Std data per participant: 18.343
Min/Max data of participant: 90.000/0.000

social_functioning_pre-study
Number of valid participant: 152
Average data per participant: 81.250
Std data per participant: 22.347
Min/Max data of participant: 100.000/0.000

pain_pre-study
Number of valid participant: 152
Average data per participant: 81.672
Std data per participant: 18.621
Min/Max data of participant: 100.000/22.500

genera

## Compliance Rate Analysis

In [45]:
ana_col = ['nurse_years_pre-study', 'hours_pre-study', 'housing_pre-study', 'wellbeing_pre-study', 'social_functioning_pre-study', 
           'pain_pre-study', 'general_health_pre-study', 'life_satisfaction_pre-study', 'perceived_stress_pre-study', 
           'psy_flexbility_pre-study', 'psy_inflexbility_pre-study', 'psy_capital_pre-study', 
           'waaq_pre-study', 'challenge_stressor_pre-study', 'hindrance_stressor_pre-study',
           'pos_af_igtb', 'neg_af_igtb', 'itp_igtb', 'irb_igtb', 'iod_id_igtb', 'iod_od_igtb', 
           'ocb_igtb', 'shipley_abs_igtb', 'shipley_voc_igtb',
           'neu_igtb', 'con_igtb', 'ext_igtb', 'agr_igtb', 'ope_igtb',
           'stai_igtb', 'audit_igtb', 'gats_status_igtb', 'gats_quantity_igtb', 'ipaq_igtb', 'psqi_igtb']
high = valid_sleep_stats.loc[(valid_sleep_stats['number_of_sleep'] > 30) & (valid_sleep_stats['sleep_rate'] > 0.55)]
low = valid_sleep_stats.loc[(valid_sleep_stats['number_of_sleep'] <= 30) | (valid_sleep_stats['sleep_rate'] <= 0.55)]

for col in ana_col:
    print(col)
    print('Number of valid participant: %i' % (len(high[col])))
    print('Average sleep data per participant: %.3f' % (np.mean(high[col])))
    print('Std sleep data per participant: %.3f' % (np.std(high[col])))

    print('Min/Max sleep data of participant: %.3f/%.3f\n' % (np.max(high[col]), np.min(high[col])))

    print('Number of valid participant: %i' % (len(low[col])))
    print('Average sleep data per participant: %.3f' % (np.mean(low[col])))
    print('Std sleep data per participant: %.3f' % (np.std(low[col])))

    print('Min/Max sleep data of participant: %.3f/%.3f\n' % (np.max(low[col]), np.min(low[col])))

    stat, p = ttest_ind(high[col].dropna(), low[col].dropna())

    print('Stats')
    print('Statistics = %.3f, p = %.3f \n' % (stat, p))

nurse_years_pre-study
Number of valid participant: 117
Average sleep data per participant: 11.504
Std sleep data per participant: 8.905
Min/Max sleep data of participant: 40.000/1.000

Number of valid participant: 35
Average sleep data per participant: 9.886
Std sleep data per participant: 6.563
Min/Max sleep data of participant: 25.000/1.000

Stats
Statistics = 0.988, p = 0.325 

hours_pre-study
Number of valid participant: 117
Average sleep data per participant: 37.562
Std sleep data per participant: 9.190
Min/Max sleep data of participant: 80.000/3.000

Number of valid participant: 35
Average sleep data per participant: 37.314
Std sleep data per participant: 10.309
Min/Max sleep data of participant: 90.000/12.000

Stats
Statistics = 0.134, p = 0.893 

housing_pre-study
Number of valid participant: 117
Average sleep data per participant: 1.708
Std sleep data per participant: 0.889
Min/Max sleep data of participant: 4.000/1.000

Number of valid participant: 35
Average sleep data per p

Std sleep data per participant: 0.745
Min/Max sleep data of participant: 4.750/1.000

Number of valid participant: 35
Average sleep data per participant: 2.369
Std sleep data per participant: 0.592
Min/Max sleep data of participant: 3.583/1.250

Stats
Statistics = -0.119, p = 0.905 

con_igtb
Number of valid participant: 117
Average sleep data per participant: 4.133
Std sleep data per participant: 0.639
Min/Max sleep data of participant: 5.000/2.333

Number of valid participant: 35
Average sleep data per participant: 4.152
Std sleep data per participant: 0.569
Min/Max sleep data of participant: 5.000/2.917

Stats
Statistics = -0.159, p = 0.874 

ext_igtb
Number of valid participant: 117
Average sleep data per participant: 3.574
Std sleep data per participant: 0.702
Min/Max sleep data of participant: 5.000/1.833

Number of valid participant: 35
Average sleep data per participant: 3.586
Std sleep data per participant: 0.604
Min/Max sleep data of participant: 5.000/2.250

Stats
Statistics

## Shift vs ana_col

In [46]:
for col in ana_col:
    response = valid_sleep_stats[col].dropna()
    print('Number of valid participant: %i' % (len(response)))
    print('Type: %s, mean = %.3f, std = %.3f, range is %.3f - %.3f \n' % (col, np.mean(response), np.std(response), np.min(response), np.max(response)))
    

data_array = []
data_type = ['shift']

# shift test
day_shift = valid_sleep_stats.loc[valid_sleep_stats['shift_pre-study'] == 1]
night_shift = valid_sleep_stats.loc[valid_sleep_stats['shift_pre-study'] == 2]
data_array.append([day_shift, night_shift])

print('T Test \n')

for i in range(len(data_array)):
    
    print(data_type[i] + '\n\n')
    
    for col in ana_col:
        
        response0 = data_array[i][0][col]
        response1 = data_array[i][1][col]
        print(col + '\n')
        print('Number of valid participant: %i' % (len(response0) + len(response1)))
        print('Day shift: mean = %.3f, std = %.3f, range is %.3f - %.3f \n' % (np.mean(response0), np.std(response0), np.min(response0), np.max(response0)))
        print('Night shift: mean = %.3f, std = %.3f, range is %.3f - %.3f \n' % (np.mean(response1), np.std(response1), np.min(response1), np.max(response1)))

        stat, p = ttest_ind(data_array[i][0][info_type].dropna(), data_array[i][1][info_type].dropna())
        print('Type: ' + col)
        print('Statistics = %.3f, p = %.3f' % (stat, p))
        print('\n')
        
corr = valid_sleep_stats[ana_col].corr()

plt.matshow(corr)
plt.show()



Number of valid participant: 148
Type: nurse_years_pre-study, mean = 11.122, std = 8.438, range is 1.000 - 40.000 

Number of valid participant: 147
Type: hours_pre-study, mean = 37.503, std = 9.469, range is 3.000 - 90.000 

Number of valid participant: 148
Type: housing_pre-study, mean = 1.689, std = 0.845, range is 1.000 - 4.000 

Number of valid participant: 148
Type: wellbeing_pre-study, mean = 56.036, std = 18.343, range is 0.000 - 90.000 

Number of valid participant: 148
Type: social_functioning_pre-study, mean = 81.250, std = 22.347, range is 0.000 - 100.000 

Number of valid participant: 148
Type: pain_pre-study, mean = 81.672, std = 18.621, range is 22.500 - 100.000 

Number of valid participant: 148
Type: general_health_pre-study, mean = 74.164, std = 17.732, range is 15.000 - 100.000 

Number of valid participant: 148
Type: life_satisfaction_pre-study, mean = 5.137, std = 1.290, range is 1.000 - 7.000 

Number of valid participant: 147
Type: perceived_stress_pre-study, mea

NameError: name 'info_type' is not defined

## Sleep Strategy

In [None]:
# frame_col = ['nurse_year', 'well_being', 'general_health', 'life_satisfaction', 'perceived_stress', 'overtime']
less_adaptive_data = sleep_data.loc[(sleep_data['night_shift_type'] == 2) | (sleep_data['night_shift_type'] == 4)]
more_adaptive_data = sleep_data.loc[(sleep_data['night_shift_type'] == 3) | (sleep_data['night_shift_type'] == 5)]

less_adaptive_sleep_stats = pd.DataFrame()
more_adaptive_sleep_stats = pd.DataFrame()

for participant_id in less_adaptive_data['participant_id'].unique():
    frame = valid_sleep_stats.loc[valid_sleep_stats['participant_id'] == participant_id]
    if len(frame) > 0:
        less_adaptive_sleep_stats = less_adaptive_sleep_stats.append(frame)

for participant_id in more_adaptive_data['participant_id'].unique():
    frame = valid_sleep_stats.loc[valid_sleep_stats['participant_id'] == participant_id]
    if len(frame) > 0:
        more_adaptive_sleep_stats = more_adaptive_sleep_stats.append(frame)
    

for col in ana_col:
    print(col + '\n')
    print('less_adaptive')
    print('Number of valid participant: %i' % (len(less_adaptive_sleep_stats)))
    print('Average: %.3f' % (np.mean(less_adaptive_sleep_stats[col])))
    print('Std: %.3f\n' % (np.std(less_adaptive_sleep_stats[col])))
    # print(sleep_stats.loc[sleep_stats['adaptive'] == 'less_adaptive'])

    print('more_adaptive')
    print('Number of valid participant: %i' % (len(more_adaptive_sleep_stats)))
    print('Average: %.3f' % (np.mean(more_adaptive_sleep_stats[col])))
    print('Std: %.3f\n' % (np.std(more_adaptive_sleep_stats[col])))
    # print(sleep_stats.loc[sleep_stats['adaptive'] == 'more_adaptive'])

## Plot IGTB

In [None]:
from sklearn.mixture import GMM
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
import time
from ggplot import *

colors = ['r', 'g', 'b']

select_column = ['participant_id', 'shift_pre-study', 'pos_af_igtb', 'neg_af_igtb',
                 'itp_igtb', 'irb_igtb', 'iod_id_igtb', 'iod_od_igtb',
                 'ocb_igtb', 'shipley_abs_igtb', 'shipley_voc_igtb',
                 'life_satisfaction_pre-study', 'wellbeing_pre-study',
                 'social_functioning_pre-study', 'pain_pre-study', 'general_health_pre-study',
                 'neu_igtb', 'con_igtb', 'ext_igtb', 'agr_igtb', 'ope_igtb',
                 'stai_igtb', 'audit_igtb', 'gats_status_igtb', 'gats_quantity_igtb', 'ipaq_igtb', 'psqi_igtb']


select_data_df = UserInfo[select_column].dropna()

# Ground truth
ground_truth = select_data_df[['participant_id', 'shift_pre-study', 'pos_af_igtb', 'neg_af_igtb',
                               'itp_igtb', 'irb_igtb', 'iod_id_igtb', 'iod_od_igtb',
                               'ocb_igtb', 'shipley_abs_igtb', 'shipley_voc_igtb',
                               'stai_igtb', 'audit_igtb', 'gats_status_igtb', 'gats_quantity_igtb', 'ipaq_igtb', 'psqi_igtb'
                               'life_satisfaction_pre-study', 'wellbeing_pre-study',
                               'social_functioning_pre-study', 'pain_pre-study', 'general_health_pre-study']]

big_five = select_data_df[['neu_igtb', 'con_igtb', 'ext_igtb', 'agr_igtb', 'ope_igtb']]

# GMM
# gmm = GaussianMixture(n_components=3, covariance_type='full', max_iter=300).fit(big_five)
gmm = GaussianMixture(n_components=3, max_iter=300).fit(big_five)
labels = gmm.predict(big_five)

bigFiveLabelDf = pd.DataFrame(data=labels, columns=['big_five_label'], index=select_data_df.index)
finalDf = pd.concat([big_five, bigFiveLabelDf], axis=1)
finalDf = pd.concat([finalDf, ground_truth], axis=1)


# t-SNE
tsne = TSNE(n_components=2, random_state=0, n_iter=500)
tsne_results = tsne.fit_transform(big_five)

finalDf = finalDf.loc[:,:].copy()
finalDf['x-tsne'] = tsne_results[:,0]
finalDf['y-tsne'] = tsne_results[:,1]

# Plot t-SNE
fig = plt.figure(figsize = (8, 8))
ax = fig.add_subplot(111)
ax.set_xlabel('x-tsne', fontsize = 15)
ax.set_ylabel('y-tsne', fontsize = 15)
ax.set_title('2 component t-SNE', fontsize = 20)

for label, color in zip(range(3), colors):
    data_to_plot = finalDf.loc[finalDf['big_five_label'] == label]
    ax.scatter(data_to_plot['x-tsne'], data_to_plot['y-tsne'], c = color)
    
ax.set_xlim([-15, 15])
ax.set_ylim([-15, 15])

ax.grid()
plt.show()

# print(finalDf)



## Sleep stats (Mean night of sleep of participant)

In [None]:



ana_col = ['pos_af_igtb', 'neg_af_igtb', 'itp_igtb', 'irb_igtb', 'iod_id_igtb', 'iod_od_igtb', 
           'ocb_igtb', 'shipley_abs_igtb', 'shipley_voc_igtb',
           'neu_igtb', 'con_igtb', 'ext_igtb', 'agr_igtb', 'ope_igtb',
           'stai_igtb', 'audit_igtb', 'gats_status_igtb', 'gats_quantity_igtb', 'ipaq_igtb', 'psqi_igtb']
high = valid_sleep_stats.loc[(valid_sleep_stats['number_of_sleep'] > 30) & (valid_sleep_stats['sleep_rate'] > 0.55)]
low = valid_sleep_stats.loc[(valid_sleep_stats['number_of_sleep'] <= 30) | (valid_sleep_stats['sleep_rate'] <= 0.55)]

for col in ana_col:
    print(col)
    print('Number of valid participant: %i' % (len(high[col])))
    print('Average sleep data per participant: %.3f' % (np.mean(high[col])))
    print('Std sleep data per participant: %.3f' % (np.std(high[col])))

    print('Min/Max sleep data of participant: %.3f/%.3f\n' % (np.max(high[col]), np.min(high[col])))

    print('Number of valid participant: %i' % (len(low[col])))
    print('Average sleep data per participant: %.3f' % (np.mean(low[col])))
    print('Std sleep data per participant: %.3f' % (np.std(low[col])))

    print('Min/Max sleep data of participant: %.3f/%.3f\n' % (np.max(low[col]), np.min(low[col])))

    stat, p = ttest_ind(high[col].dropna(), low[col].dropna())

    print('Stats')
    print('Statistics = %.3f, p = %.3f \n' % (stat, p))


        


## sleep strategy

In [None]:
frame_col = ['nurse_year', 'well_being', 'general_health', 'life_satisfaction', 'perceived_stress', 'overtime']

less_adaptive_data = sleep_data.loc[(sleep_data['night_shift_type'] == 2) | (sleep_data['night_shift_type'] == 4)]
more_adaptive_data = sleep_data.loc[(sleep_data['night_shift_type'] == 3) | (sleep_data['night_shift_type'] == 5)]

sleep_stats = pd.DataFrame()

for participant_id in less_adaptive_data['participant_id'].unique():
    user_id = UserInfo.loc[UserInfo['participant_id'] == participant_id].index.values[0]
    nurse_year = UserInfo.loc[UserInfo['participant_id'] == participant_id]['nurse_years_pre-study'].values[0]
    
    # if len(less_adaptive_data.loc[less_adaptive_data['participant_id'] == participant_id]) > 20:
    frame = pd.DataFrame(columns=frame_col, index=[user_id])
    frame['nurse_year'] = nurse_year
    frame['well_being'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['wellbeing_pre-study'].values[0]
    frame['general_health'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['general_health_pre-study'].values[0]
    frame['life_satisfaction'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['life_satisfaction_pre-study'].values[0]
    frame['perceived_stress'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['perceived_stress_pre-study'].values[0]
    frame['overtime'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['overtime_pre-study'].values[0]

    frame['adaptive'] = 'less_adaptive'
    sleep_stats = sleep_stats.append(frame)
    
for participant_id in more_adaptive_data['participant_id'].unique():
    user_id = UserInfo.loc[UserInfo['participant_id'] == participant_id].index.values[0]
    nurse_year = UserInfo.loc[UserInfo['participant_id'] == participant_id]['nurse_years_pre-study'].values[0]
    
    # if len(more_adaptive_data.loc[more_adaptive_data['participant_id'] == participant_id]) > 20:
    frame = pd.DataFrame(columns=frame_col, index=[user_id])
    frame['nurse_year'] = nurse_year
    frame['well_being'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['wellbeing_pre-study'].values[0]
    frame['general_health'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['general_health_pre-study'].values[0]
    frame['life_satisfaction'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['life_satisfaction_pre-study'].values[0]
    frame['perceived_stress'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['perceived_stress_pre-study'].values[0]
    frame['overtime'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['overtime_pre-study'].values[0]

    frame['adaptive'] = 'more_adaptive'
    sleep_stats = sleep_stats.append(frame)
    
for col in frame_col:
    print(col + '\n')
    print('less_adaptive')
    print('Number of valid participant: %i' % (len(sleep_stats.loc[sleep_stats['adaptive'] == 'less_adaptive'])))
    print('Average: %.3f' % (np.mean(sleep_stats.loc[sleep_stats['adaptive'] == 'less_adaptive'][col])))
    print('Std: %.3f\n' % (np.std(sleep_stats.loc[sleep_stats['adaptive'] == 'less_adaptive'][col])))
    # print(sleep_stats.loc[sleep_stats['adaptive'] == 'less_adaptive'])

    print('more_adaptive')
    print('Number of valid participant: %i' % (len(sleep_stats.loc[sleep_stats['adaptive'] == 'more_adaptive'])))
    print('Average: %.3f' % (np.mean(sleep_stats.loc[sleep_stats['adaptive'] == 'more_adaptive'][col])))
    print('Std: %.3f\n' % (np.std(sleep_stats.loc[sleep_stats['adaptive'] == 'more_adaptive'][col])))
    # print(sleep_stats.loc[sleep_stats['adaptive'] == 'more_adaptive'])



In [None]:
# Number of sleep

In [None]:
frame_col = ['number_of_sleep', 'number_of_long_sleep', 'number_of_short_sleep', 
             'sleep_rate', 'short_sleep_rate', 'long_sleep_rate']

sleep_stat = valid_sleep_stats.dropna(subset=['number_of_sleep'])

print(sleep_stat)

# Number of sleep
for col in frame_col:
    print(col)
    print('Number of valid participant: %i' % (len(sleep_stat[col])))
    print('Average sleep data per participant: %.3f' % (np.mean(sleep_stat[col])))
    print('Std sleep data per participant: %.3f' % (np.std(sleep_stat[col])))

    print('Min/Max sleep data of participant: %.3f/%.3f\n' % (np.max(sleep_stat[col]), np.min(sleep_stat[col])))
    




## Prestudy stats

In [None]:
PreStudyInfo_Type = [# 'nurse_years_pre-study', 'hours_pre-study', 'housing_pre-study',
                     'wellbeing_pre-study', 'social_functioning_pre-study', 
                     'pain_pre-study', 'general_health_pre-study', 
                     'life_satisfaction_pre-study', 'perceived_stress_pre-study', 
                     'psy_flexbility_pre-study', 'psy_inflexbility_pre-study', 'psy_capital_pre-study',
                     'waaq_pre-study', 'challenge_stressor_pre-study', 'hindrance_stressor_pre-study']

for info_type in PreStudyInfo_Type:
    response = UserInfo[info_type].dropna()
    print(' Type: %s \n mean = %.3f, std = %.3f, range is %.3f - %.3f \n' % (info_type, np.mean(response), np.std(response), np.min(response), np.max(response)))
    

data_array = []
data_type = ['commute', 'shift', 'nurse_year']

# commute test
short_commute = UserInfo.loc[UserInfo['commute_time_pre-study'] <= 2]
long_commute = UserInfo.loc[UserInfo['commute_time_pre-study'] > 2]
data_array.append([short_commute, long_commute])

# shift test
day_shift = UserInfo.loc[UserInfo['shift_pre-study'] == 1]
night_shift = UserInfo.loc[UserInfo['shift_pre-study'] == 2]
data_array.append([day_shift, night_shift])

# nurse year test
short_year = UserInfo.loc[UserInfo['nurse_years_pre-study'] >= 10]
long_year = UserInfo.loc[UserInfo['nurse_years_pre-study'] < 10]
data_array.append([short_year, long_year])

print(' T Test \n')

for i in range(len(data_array)):
    
    print(' ' + data_type[i] + '\n\n')
    
    for info_type in PreStudyInfo_Type:
        
        response0 = data_array[i][0][info_type]
        response1 = data_array[i][1][info_type]
        print(' Type 1: %s \n mean = %.3f, std = %.3f, range is %.3f - %.3f \n' % (info_type, np.mean(response0), np.std(response0), np.min(response0), np.max(response0)))
        print(' Type 2: %s \n mean = %.3f, std = %.3f, range is %.3f - %.3f \n' % (info_type, np.mean(response1), np.std(response1), np.min(response1), np.max(response1)))

        stat, p = ttest_ind(data_array[i][0][info_type].dropna(), data_array[i][1][info_type].dropna())
        print(' Type: ' + info_type)
        print(' Statistics = %.3f, p = %.3f' % (stat, p))
        print('\n')
        
corr = UserInfo[PreStudyInfo_Type].corr()

plt.matshow(corr)
plt.show()
# print(corr)


## T-Test

In [None]:
# work day sleep duration test
day_workday_sleep_duration = day_workday_data['duration_in_seconds'] / 3600
night_workday_sleep_duration = night_workday_data['duration_in_seconds'] / 3600

stat, p = ttest_ind(day_workday_sleep_duration, night_workday_sleep_duration)

print('day, night work day sleep')
print('Statistics = %.3f, p = %.3f \n' % (stat, p))

# off day sleep duration test
day_off_sleep_duration = day_off_day_data['duration_in_seconds'] / 3600
night_off_sleep_duration = night_off_day_data['duration_in_seconds'] / 3600

stat, p = ttest_ind(day_off_sleep_duration, night_off_sleep_duration)

print('day, night off day sleep')
print('Statistics = %.3f, p = %.3f \n' % (stat, p))


## Health type

In [None]:
for colunm in colunm_type:
    print('Start processing: ' + colunm + '\n')
    for i in range(len(data_array)):
        # day shift Work day
        poor_health_data = data_array[i].loc[data_array[i]['general_health'] <= 60]
        mid_health_data  = data_array[i].loc[(data_array[i]['general_health'] > 60) & (data_array[i]['general_health'] <= 85)]
        good_health_data = data_array[i].loc[data_array[i]['general_health'] > 85]

        stat, p = f_oneway(np.array(poor_health_data[colunm].dropna()), np.array(mid_health_data[colunm].dropna()), np.array(good_health_data[colunm].dropna()))
        print(data_type[i] + ' Health type, ' + colunm + ' ANOVA')
        print('poor_health_data length = %i, mid_health_data length = %i, good_health_data length = %i' % (len(poor_health_data[colunm]), len(mid_health_data[colunm]), len(good_health_data[colunm])))
        print('Statistics = %.3f, p = %.3f \n' % (stat, p))
    
    print('\n\n')
    

## Nurse Year

In [None]:
for colunm in colunm_type:
    for i in range(len(data_array)):
        low_experienced_data = data_array[i].loc[data_array[i]['nurse_years'] <= 5]
        mid_experienced_data  = data_array[i].loc[(data_array[i]['nurse_years'] > 5) & (data_array[i]['nurse_years'] <= 15)]
        high_experienced_data = data_array[i].loc[data_array[i]['nurse_years'] > 15]

        stat, p = f_oneway(low_experienced_data[colunm].dropna(), mid_experienced_data[colunm].dropna(), high_experienced_data[colunm].dropna())
        
        print(data_type[i] + ' Nurse Year type, ' + colunm + ' ANOVA')
        print('low_experienced_data length = %i, mid_experienced_data length = %i, high_experienced_data length = %i' % (len(low_experienced_data[colunm]), len(mid_experienced_data[colunm]), len(high_experienced_data[colunm])))
        print('Statistics = %.3f, p = %.3f \n' % (stat, p))
    
    print('\n\n')


## Life Satisfication

In [None]:
for colunm in colunm_type:
    for i in range(len(data_array)):
        low_satisfication_data = data_array[i].loc[data_array[i]['life_satisfaction'] <= 4.5]
        mid_satisfication_data  = data_array[i].loc[(data_array[i]['life_satisfaction'] > 4.5) & (data_array[i]['life_satisfaction'] <= 6)]
        high_satisfication_data = data_array[i].loc[data_array[i]['life_satisfaction'] > 6]

        stat, p = f_oneway(low_satisfication_data[colunm].dropna(), mid_satisfication_data[colunm].dropna(), high_satisfication_data[colunm].dropna())
        
        print(data_type[i] + ' Nurse Year type, ' + colunm + ' ANOVA')
        print('low_experienced_data length = %i, mid_experienced_data length = %i, high_experienced_data length = %i' % (len(low_satisfication_data[colunm]), len(mid_satisfication_data[colunm]), len(high_satisfication_data[colunm])))
        print('Statistics = %.3f, p = %.3f \n' % (stat, p))
    
    print('\n\n')
        

In [None]:

for col in frame_col:
    
    ANOVA_array = []
    for i in range(3):
        sleep_stats_per_cluster = valid_sleep_stats.loc[valid_sleep_stats['big_five_label'] == i]
        print(col + ' big_five_label: %i' % (i))
        print('Number of valid participant: %i' % (len(sleep_stats_per_cluster[col])))
        print('Average sleep data per participant: %.3f' % (np.mean(sleep_stats_per_cluster[col])))
        print('Std sleep data per participant: %.3f' % (np.std(sleep_stats_per_cluster[col])))

        print('Min/Max sleep data of participant: %.3f/%.3f\n' % (np.max(sleep_stats_per_cluster[col]), 
                                                                  np.min(sleep_stats_per_cluster[col])))
        
        ANOVA_array.append(sleep_stats_per_cluster[col].dropna())
        
    stat, p = f_oneway(ANOVA_array[0], ANOVA_array[1], ANOVA_array[2])

    # print('\n')
    print('ANOVA Test')
    print('Statistics = %.3f, p = %.3f \n' % (stat, p))
    print('\n')



In [None]:
sleep_stats = pd.DataFrame()

for participant_id in less_adaptive_data['participant_id'].unique():
    user_id = valid_sleep_stats.loc[valid_sleep_stats['participant_id'] == participant_id].index.values[0]
    nurse_year = valid_sleep_stats.loc[valid_sleep_stats['participant_id'] == participant_id]['nurse_years_pre-study'].values[0]
    
    # if len(less_adaptive_data.loc[less_adaptive_data['participant_id'] == participant_id]) > 20:
    frame = pd.DataFrame(columns=frame_col, index=[user_id])
    frame['nurse_year'] = nurse_year
    frame['well_being'] = valid_sleep_stats.loc[valid_sleep_stats['participant_id'] == participant_id]['wellbeing_pre-study'].values[0]
    frame['general_health'] = valid_sleep_stats.loc[UserInfo['participant_id'] == participant_id]['general_health_pre-study'].values[0]
    frame['life_satisfaction'] = valid_sleep_stats.loc[valid_sleep_stats['participant_id'] == participant_id]['life_satisfaction_pre-study'].values[0]
    frame['perceived_stress'] = valid_sleep_stats.loc[valid_sleep_stats['participant_id'] == participant_id]['perceived_stress_pre-study'].values[0]
    frame['overtime'] = valid_sleep_stats.loc[valid_sleep_stats['participant_id'] == participant_id]['overtime_pre-study'].values[0]

    frame['adaptive'] = 'less_adaptive'
    sleep_stats = sleep_stats.append(frame)
    
for participant_id in more_adaptive_data['participant_id'].unique():
    user_id = UserInfo.loc[UserInfo['participant_id'] == participant_id].index.values[0]
    nurse_year = UserInfo.loc[UserInfo['participant_id'] == participant_id]['nurse_years_pre-study'].values[0]
    
    # if len(more_adaptive_data.loc[more_adaptive_data['participant_id'] == participant_id]) > 20:
    frame = pd.DataFrame(columns=frame_col, index=[user_id])
    frame['nurse_year'] = nurse_year
    frame['well_being'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['wellbeing_pre-study'].values[0]
    frame['general_health'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['general_health_pre-study'].values[0]
    frame['life_satisfaction'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['life_satisfaction_pre-study'].values[0]
    frame['perceived_stress'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['perceived_stress_pre-study'].values[0]
    frame['overtime'] = UserInfo.loc[UserInfo['participant_id'] == participant_id]['overtime_pre-study'].values[0]

    frame['adaptive'] = 'more_adaptive'
    sleep_stats = sleep_stats.append(frame)
    