<a href="https://colab.research.google.com/github/the-Jabberwocky/The_Debuggers_Project/blob/main/I310D_Data_Science_Project_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Evaluating The Relationship Between Lifestyle Practices and Sleep Quality**

**Group**: The Debuggers

**Names**: Harini Chandrasekhar, Serena Manwani, Taryn Morris, Mona Brown

**Course**: I310D - Introduction to Human-Centered Data Science Project

### **Imported Libraries and File Read In**

In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Load the sleep health and lifestyle dataset from a CSV file located in the
# notebook's working directory (the path is relative, so the file must be
# uploaded / present there before this cell is run).
sleep_quality_df = pd.read_csv(filepath_or_buffer="Sleep_health_and_lifestyle_dataset.csv")
# Preview the first five rows to confirm the file was parsed as expected
sleep_quality_df.head(5)


FileNotFoundError: [Errno 2] No such file or directory: 'Sleep_health_and_lifestyle_dataset.csv'

### **Data Cleaning and Preprocessing**

#### **Initial Cleaning and Statistics**

In [None]:
# Split the combined "systolic/diastolic" Blood Pressure text on the slash
# into two separate columns.
bp_columns = ['Systolic Blood Pressure', 'Diastolic Blood Pressure']
sleep_quality_df[bp_columns] = sleep_quality_df['Blood Pressure'].str.split('/', expand=True)
# Convert both new columns to numbers; anything that cannot be parsed becomes
# NaN (errors='coerce') and is therefore removed by the dropna() below.
sleep_quality_df[bp_columns] = sleep_quality_df[bp_columns].apply(pd.to_numeric, errors='coerce')
# The original 'Blood Pressure' text column is kept in the dataframe.

# Remove every row that has a missing value in ANY column (modifies the
# dataframe in place).
# NOTE(review): if the CSV encodes "no sleep disorder" as the literal text
# "None", pandas >= 2.0 reads that as NaN by default and those rows would be
# dropped here -- confirm the row count before/after this step.
sleep_quality_df.dropna(inplace=True)

# Column names, non-null counts and dtypes of the cleaned dataframe
sleep_quality_df.info()

# Descriptive statistics of the cleaned dataframe (shown as the cell output)
sleep_quality_df.describe()


#### **Exploratory Analysis Visualizations - Categorical Variables**

In [None]:
# Bar chart: mean "Quality of Sleep" for each Gender
gender_column = sleep_quality_df["Gender"]
sleep_quality_column = sleep_quality_df["Quality of Sleep"]

# Show the raw Gender column for reference
display(gender_column)

# Average the sleep-quality scores within each gender group
average_sleep_quality_gender = sleep_quality_column.groupby(gender_column).mean()
# x positions are the group labels, bar heights are the group means
indices = average_sleep_quality_gender.index.tolist()
avg_sleep_data = average_sleep_quality_gender.tolist()

# Draw the bars
plt.bar(x=indices, height=avg_sleep_data, width=0.7, color='green')


In [None]:
# Bar chart: mean "Quality of Sleep" for each Occupation
occupation_column = sleep_quality_df["Occupation"]
sleep_quality_column = sleep_quality_df["Quality of Sleep"]

# Average the sleep-quality scores within each occupation group
average_sleep_quality_occupation = sleep_quality_df.groupby("Occupation")["Quality of Sleep"].mean()
# x positions are the occupation labels, bar heights are the group means
indices = average_sleep_quality_occupation.index.tolist()
avg_sleep_data = average_sleep_quality_occupation.to_list()
# Show the column this cell is about (previously the Gender column was
# displayed here by mistake).
display(occupation_column)

# Now plot our bar graph
plt.bar(indices, avg_sleep_data, color ='pink', width = 0.6)
# Rotate the x-axis labels so the occupation names do not overlap
plt.xticks(rotation = 90)

In [None]:
# Bar chart: mean "Quality of Sleep" for each BMI Category
BMI_column = sleep_quality_df["BMI Category"]
sleep_quality_column = sleep_quality_df["Quality of Sleep"]

# Show the raw BMI Category column for reference
display(BMI_column)

# Average the sleep-quality scores within each BMI category
average_sleep_quality_bmi = sleep_quality_column.groupby(BMI_column).mean()
# x positions are the category labels, bar heights are the group means
indices = average_sleep_quality_bmi.index.tolist()
avg_sleep_data = average_sleep_quality_bmi.tolist()

# Draw the bars, then tilt the x-axis labels for readability
plt.bar(x=indices, height=avg_sleep_data, width=0.5, color='maroon')
plt.xticks(rotation=70)

In [None]:
# Bar chart: mean "Quality of Sleep" for each Sleep Disorder value
sleep_disorder_column = sleep_quality_df["Sleep Disorder"]
sleep_quality_column = sleep_quality_df["Quality of Sleep"]

# Show the raw Sleep Disorder column for reference
display(sleep_disorder_column)

# Average the sleep-quality scores within each sleep-disorder group
average_sleep_quality_disorder = sleep_quality_column.groupby(sleep_disorder_column).mean()
# x positions are the disorder labels, bar heights are the group means
indices = average_sleep_quality_disorder.index.tolist()
avg_sleep_data = average_sleep_quality_disorder.tolist()

# Draw the bars, then tilt the x-axis labels for readability
plt.bar(x=indices, height=avg_sleep_data, width=0.7, color='violet')
plt.xticks(rotation=70)

#### **Exploratory Analysis Visualizations - Continuous Variables**

In [None]:
# Correlation matrix for the continuous variables

# Keep only the numeric columns; correlations are not defined for text columns
numeric_sleep_quality_df = sleep_quality_df.select_dtypes(include='number')
# Pairwise correlation between every pair of numeric attributes, used to judge
# the strength of the relationship between variables
correlation_matrix = numeric_sleep_quality_df.corr()
# Heat map with each coefficient printed in its cell so the strongest
# relationships stand out
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)

#### **One-Hot Encoding Significant Categorical Variable**

In [None]:
# One-hot encode the categorical 'Occupation' attribute so it becomes numeric:
# https://www.geeksforgeeks.org/ml-one-hot-encoding/
occupation_encoder = OneHotEncoder()
# Learn the set of unique occupations and encode the column as 1s and 0s in one step
occupation_encoded = occupation_encoder.fit_transform(sleep_quality_df[["Occupation"]])
# One output column name per unique occupation, prefixed with 'Occupation'
encoded_feature_names = occupation_encoder.get_feature_names_out(input_features=['Occupation'])
# Show the occupation vocabulary
display("Printing Occupation Vocabulary (Unique Occupations):", encoded_feature_names)
# Wrap the encoded matrix in a DataFrame carrying the generated column names
occupation_encoded_df = pd.DataFrame(occupation_encoded.toarray(), columns=encoded_feature_names)
# Replace the original 'Occupation' column with the encoded columns. The index
# is reset first so the rows line up positionally with the encoded frame when
# the two are concatenated side by side.
# NOTE: this cell consumes the 'Occupation' column, so it cannot be re-run
# without re-running the cells above it.
sleep_quality_df = pd.concat(
    [
        sleep_quality_df.drop(columns=["Occupation"]).reset_index(drop=True),
        occupation_encoded_df,
    ],
    axis=1,
)

display(sleep_quality_df)


### **Organized Data with Required Columns**




In [None]:
# Keep only the columns used for modelling, in this order. 'Quality of Sleep'
# is placed last; the remaining columns come before it.
# A KeyError is raised if any listed occupation column was not produced by the
# one-hot encoding step.
selected_columns = [
    'Person ID',
    'Age',
    'Sleep Duration',
    'Occupation_Accountant',
    'Occupation_Doctor',
    'Occupation_Engineer',
    'Occupation_Lawyer',
    'Occupation_Nurse',
    'Occupation_Sales Representative',
    'Occupation_Salesperson',
    'Occupation_Software Engineer',
    'Occupation_Teacher',
    'Stress Level',
    'Heart Rate',
    'Quality of Sleep',
]
sleep_quality_df = sleep_quality_df[selected_columns]
display(sleep_quality_df)

### **Data Split: Training (80%) - Testing (20%)**

In [None]:
# Split the dataset into training and testing sets (80% training : 20% testing).
# random_state fixes the shuffle so the same rows land in each set on every
# run; without it the split (and every downstream result) changes each time
# the notebook is executed.
train_sleep_quality_df, test_sleep_quality_df = train_test_split(
    sleep_quality_df, test_size=0.2, random_state=42
)
display(test_sleep_quality_df)
display(train_sleep_quality_df)

### **Machine Learning Models: Training**


**TODO:** Explain which models are trained in this section and why they were chosen.

#### K-Nearest Neighbors (KNN)

In [None]:
## Split into x and y sets, so the KNN model is not influenced by results
# Feature matrix: the first 13 columns (positions 0-12) of the training frame.
# (A stray trailing "." after the slice made this line a syntax error.)
# NOTE(review): given the column order chosen when the required columns were
# selected, positions 0-12 include 'Person ID' and stop before 'Heart Rate' --
# confirm that this is the intended feature set.
train_x = train_sleep_quality_df.iloc[:, 0:13]
train_x