# STAT 306 Group Project

In [14]:
options(warn.conflicts = FALSE)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lmtest)

## Loading the dataset

In [15]:
# Read the raw sleep-efficiency records from the CSV file into a data frame
sleep_data <- read.csv(file = "Sleep_Efficiency.csv")

# Inspect the data: first six rows, last six rows, then per-column summaries
head(sleep_data, n = 6L)
tail(sleep_data, n = 6L)
summary(sleep_data)

Unnamed: 0_level_0,ID,Age,Gender,Bedtime,Wakeup.time,Sleep.duration,Sleep.efficiency,REM.sleep.percentage,Deep.sleep.percentage,Light.sleep.percentage,Awakenings,Caffeine.consumption,Alcohol.consumption,Smoking.status,Exercise.frequency
Unnamed: 0_level_1,<int>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
1,1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0,0.0,0,Yes,3
2,2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3,0.0,3,Yes,3
3,3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1,0.0,0,No,3
4,4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3,50.0,5,Yes,1
5,5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3,0.0,3,No,3
6,6,36,Female,2021-07-01 21:00:00,2021-07-01 04:30:00,7.5,0.9,23,60,17,0,,0,No,1


Unnamed: 0_level_0,ID,Age,Gender,Bedtime,Wakeup.time,Sleep.duration,Sleep.efficiency,REM.sleep.percentage,Deep.sleep.percentage,Light.sleep.percentage,Awakenings,Caffeine.consumption,Alcohol.consumption,Smoking.status,Exercise.frequency
Unnamed: 0_level_1,<int>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
447,447,23,Male,2021-04-21 00:00:00,2021-04-21 07:00:00,7.0,0.5,15,40,45,2,0.0,4,Yes,
448,448,27,Female,2021-11-13 22:00:00,2021-11-13 05:30:00,7.5,0.91,22,57,21,0,0.0,0,No,5.0
449,449,52,Male,2021-03-31 21:00:00,2021-03-31 03:00:00,6.0,0.74,28,57,15,4,25.0,0,No,3.0
450,450,40,Female,2021-09-07 23:00:00,2021-09-07 07:30:00,8.5,0.55,20,32,48,1,,3,Yes,0.0
451,451,45,Male,2021-07-29 21:00:00,2021-07-29 04:00:00,7.0,0.76,18,72,10,3,0.0,0,No,3.0
452,452,18,Male,2021-03-17 02:30:00,2021-03-17 10:00:00,7.5,0.63,22,23,55,1,50.0,0,No,1.0


       ID             Age           Gender            Bedtime         
 Min.   :  1.0   Min.   : 9.00   Length:452         Length:452        
 1st Qu.:113.8   1st Qu.:29.00   Class :character   Class :character  
 Median :226.5   Median :40.00   Mode  :character   Mode  :character  
 Mean   :226.5   Mean   :40.29                                        
 3rd Qu.:339.2   3rd Qu.:52.00                                        
 Max.   :452.0   Max.   :69.00                                        
                                                                      
 Wakeup.time        Sleep.duration   Sleep.efficiency REM.sleep.percentage
 Length:452         Min.   : 5.000   Min.   :0.5000   Min.   :15.00       
 Class :character   1st Qu.: 7.000   1st Qu.:0.6975   1st Qu.:20.00       
 Mode  :character   Median : 7.500   Median :0.8200   Median :22.00       
                    Mean   : 7.466   Mean   :0.7889   Mean   :22.62       
                    3rd Qu.: 8.000   3rd Qu.:0.9000   3rd

## Cleaning the dataset

We removed the "Bedtime" and "Wakeup time" columns because these variables are unlikely to influence sleep efficiency directly, and keeping them would introduce redundancy. Sleep efficiency measures how well someone sleeps during a specific time period, and the length of that period is already captured by another variable, sleep duration (`Sleep.duration`). In addition, these two variables may be closely related to other predictors in the dataset, which could cause problems (such as multicollinearity) in the analysis. Removing "Bedtime" and "Wakeup time" therefore allows for a more focused analysis of the factors that are likely to have a stronger impact on sleep efficiency.

In [16]:
# Drop the raw 'Bedtime' and 'Wakeup.time' columns from the data frame
sleep_data <- sleep_data %>%
  select(-Bedtime, -Wakeup.time)

# Preview both ends of the reduced data frame
head(sleep_data)
tail(sleep_data)

Unnamed: 0_level_0,ID,Age,Gender,Sleep.duration,Sleep.efficiency,REM.sleep.percentage,Deep.sleep.percentage,Light.sleep.percentage,Awakenings,Caffeine.consumption,Alcohol.consumption,Smoking.status,Exercise.frequency
Unnamed: 0_level_1,<int>,<int>,<chr>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
1,1,65,Female,6.0,0.88,18,70,12,0,0.0,0,Yes,3
2,2,69,Male,7.0,0.66,19,28,53,3,0.0,3,Yes,3
3,3,40,Female,8.0,0.89,20,70,10,1,0.0,0,No,3
4,4,40,Female,6.0,0.51,23,25,52,3,50.0,5,Yes,1
5,5,57,Male,8.0,0.76,27,55,18,3,0.0,3,No,3
6,6,36,Female,7.5,0.9,23,60,17,0,,0,No,1


Unnamed: 0_level_0,ID,Age,Gender,Sleep.duration,Sleep.efficiency,REM.sleep.percentage,Deep.sleep.percentage,Light.sleep.percentage,Awakenings,Caffeine.consumption,Alcohol.consumption,Smoking.status,Exercise.frequency
Unnamed: 0_level_1,<int>,<int>,<chr>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
447,447,23,Male,7.0,0.5,15,40,45,2,0.0,4,Yes,
448,448,27,Female,7.5,0.91,22,57,21,0,0.0,0,No,5.0
449,449,52,Male,6.0,0.74,28,57,15,4,25.0,0,No,3.0
450,450,40,Female,8.5,0.55,20,32,48,1,,3,Yes,0.0
451,451,45,Male,7.0,0.76,18,72,10,3,0.0,0,No,3.0
452,452,18,Male,7.5,0.63,22,23,55,1,50.0,0,No,1.0


We set "Male" as the baseline category for the categorical variable "Gender" in the regression analysis.

In [19]:
# Encode Gender as a factor and make "Male" its reference (baseline) level,
# so that model coefficients for Gender are expressed relative to "Male"
sleep_data$Gender <- relevel(as.factor(sleep_data$Gender), ref = "Male")

We set "No" as the baseline category for the categorical variable "Smoking.status".

In [21]:
# Encode Smoking.status as a factor and make "No" its reference (baseline)
# level, so that model coefficients are expressed relative to non-smokers
sleep_data$Smoking.status <- relevel(
  as.factor(sleep_data$Smoking.status),
  ref = "No"
)

We noticed that some values are missing from the dataset. To keep the analysis consistent, we decided to delete every row that has a missing value in any column.

In [23]:
# Keep only complete cases: drop every row that has an NA in any column
sleep_data <- stats::na.omit(sleep_data)

In [24]:
# Preview the first and last six rows of the data after cleaning
head(sleep_data, n = 6L)
tail(sleep_data, n = 6L)

Unnamed: 0_level_0,ID,Age,Gender,Sleep.duration,Sleep.efficiency,REM.sleep.percentage,Deep.sleep.percentage,Light.sleep.percentage,Awakenings,Caffeine.consumption,Alcohol.consumption,Smoking.status,Exercise.frequency
Unnamed: 0_level_1,<int>,<int>,<fct>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>
1,1,65,Female,6,0.88,18,70,12,0,0,0,Yes,3
2,2,69,Male,7,0.66,19,28,53,3,0,3,Yes,3
3,3,40,Female,8,0.89,20,70,10,1,0,0,No,3
4,4,40,Female,6,0.51,23,25,52,3,50,5,Yes,1
5,5,57,Male,8,0.76,27,55,18,3,0,3,No,3
7,7,27,Female,6,0.54,28,25,47,2,50,0,Yes,1


Unnamed: 0_level_0,ID,Age,Gender,Sleep.duration,Sleep.efficiency,REM.sleep.percentage,Deep.sleep.percentage,Light.sleep.percentage,Awakenings,Caffeine.consumption,Alcohol.consumption,Smoking.status,Exercise.frequency
Unnamed: 0_level_1,<int>,<int>,<fct>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>
445,445,24,Male,7.5,0.58,24,28,48,1,0,2,Yes,0
446,446,30,Female,7.5,0.53,28,20,52,4,50,2,Yes,1
448,448,27,Female,7.5,0.91,22,57,21,0,0,0,No,5
449,449,52,Male,6.0,0.74,28,57,15,4,25,0,No,3
451,451,45,Male,7.0,0.76,18,72,10,3,0,0,No,3
452,452,18,Male,7.5,0.63,22,23,55,1,50,0,No,1


       ID             Age           Gender    Sleep.duration   Sleep.efficiency
 Min.   :  1.0   Min.   : 9.00   Male  :194   Min.   : 5.000   Min.   :0.5000  
 1st Qu.:111.8   1st Qu.:29.00   Female:194   1st Qu.: 7.000   1st Qu.:0.7000  
 Median :228.5   Median :41.00                Median : 7.500   Median :0.8200  
 Mean   :226.3   Mean   :40.83                Mean   : 7.451   Mean   :0.7893  
 3rd Qu.:339.2   3rd Qu.:52.00                3rd Qu.: 8.000   3rd Qu.:0.9000  
 Max.   :452.0   Max.   :69.00                Max.   :10.000   Max.   :0.9900  
 REM.sleep.percentage Deep.sleep.percentage Light.sleep.percentage
 Min.   :15.00        Min.   :18.00         Min.   : 7.0          
 1st Qu.:20.00        1st Qu.:51.00         1st Qu.:15.0          
 Median :22.00        Median :58.00         Median :18.0          
 Mean   :22.68        Mean   :52.82         Mean   :24.5          
 3rd Qu.:25.00        3rd Qu.:63.00         3rd Qu.:24.0          
 Max.   :30.00        Max.   :75.00   