# Sleep Quality Prediction using Machine Learning
This notebook trains a binary classification model that predicts whether a person has a sleep disorder (insomnia or sleep apnea), based on lifestyle and physiological data.

In [None]:
# Importing required libraries
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import pickle

## Loading the Dataset

In [None]:
# Dataset: Sleep Health and Lifestyle, fetched as a CSV from a raw GitHub URL
url = (
    "https://raw.githubusercontent.com/thiagosnuness/sleep_backend/"
    "refs/heads/main/machine_learning/Sleep_health_and_lifestyle_dataset.csv"
)

# keep_default_na=False switches off pandas' built-in list of NA markers, so
# cell text is loaded verbatim instead of being converted to NaN.
df = pd.read_csv(filepath_or_buffer=url, keep_default_na=False)

# Preview the first five rows
df.head(n=5)

## Preprocessing the Data

In [None]:
# Integer codes for the categorical columns. Both disorder types collapse to 1,
# which makes the target binary: 0 = no disorder, 1 = insomnia or sleep apnea.
category_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Sleep Disorder': {'None': 0, 'Insomnia': 1, 'Sleep Apnea': 1},
}

for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    # Series.map yields NaN for every value that is missing from the dict.
    # That happens for unexpected labels and also when this cell is re-run on
    # an already-encoded column, so fail loudly before overwriting anything.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, column].astype(str).unique())
        raise ValueError(
            f"Column '{column}' contains values with no numeric code: {unexpected}"
        )
    df[column] = encoded

# Selecting features and target: numeric and strongly linked to sleep behavior,
# improving model performance and interpretability.
features = ['Age', 'Heart Rate', 'Stress Level', 'Physical Activity Level', 'Sleep Duration']
X = df[features]
y = df['Sleep Disorder']

## Splitting the Data

In [None]:
# Hold out 20% of the rows as the test partition; stratifying on y keeps the
# class balance comparable between the two partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=7,
    test_size=0.2,
)

# Shuffled, stratified 10-fold splitter used for cross-validation
kfold = StratifiedKFold(shuffle=True, n_splits=10, random_state=7)

## Evaluating Models

In [None]:
# Candidate classifiers, all with default hyperparameters.
# CART is given a fixed seed so its cross-validation scores are repeatable
# from one run to the next.
models = [
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=7)),
    ('Naive Bayes', GaussianNB()),
    ('SVM', SVC()),
]

# Per-model arrays of fold accuracies, and the matching display names
results = []
names = []

# Cross-validation on the training partition only
for name, model in models:
    result = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(result)
    names.append(name)
    print(f"{name}: Mean Accuracy = {result.mean():.4f} (+/- {result.std():.4f})")

# Boxplot of the fold accuracies; the model with the highest mean is highlighted
fig = plt.figure(figsize=(10, 6))
plt.title('Model Comparison')
box = plt.boxplot(results, patch_artist=True)
best = np.argmax([r.mean() for r in results])
for i, patch in enumerate(box['boxes']):
    patch.set_facecolor('lightgreen' if i == best else 'lightgray')
plt.xticks(ticks=range(1, len(names) + 1), labels=names)
plt.ylabel('Accuracy')
plt.grid(True)
# No plt.legend() here: none of the artists carries a label, so a legend call
# would draw nothing.
plt.show()

### Model Evaluation with Scaled and Normalized Data


In [None]:
# Fix NumPy's global random seed for reproducibility
np.random.seed(7)

# Named classifier steps
knn = ('KNN', KNeighborsClassifier())
cart = ('CART', DecisionTreeClassifier())
nb = ('NB', GaussianNB())
svm = ('SVM', SVC())

# Named scaler steps
standard_scaler = ('StandardScaler', StandardScaler())
minmax_scaler = ('MinMaxScaler', MinMaxScaler())

# One pipeline per (preprocessing, classifier) pair, named "<classifier>-<suffix>".
# A scaler of None means the classifier sees the raw features.
variants = [('orig', None), ('std', standard_scaler), ('norm', minmax_scaler)]
pipelines = []
for suffix, scaler_step in variants:
    for clf_step in (knn, cart, nb, svm):
        steps = [clf_step] if scaler_step is None else [scaler_step, clf_step]
        pipelines.append((f"{clf_step[0]}-{suffix}", Pipeline(steps)))

# Cross-validate every pipeline on the training partition
results, names = [], []
for label, pipe in pipelines:
    fold_scores = cross_val_score(pipe, X_train, y_train, cv=kfold, scoring='accuracy')
    names.append(label)
    results.append(fold_scores)
    print(f"{label}: Mean Accuracy = {fold_scores.mean():.3f} (+/- {fold_scores.std():.3f})")

# Boxplot of the fold accuracies; the pipeline with the highest mean is highlighted
fig = plt.figure(figsize=(16, 8))
fig.suptitle('Comparison of Models with and without Scaling')
ax = fig.add_subplot(111)
box = ax.boxplot(results, patch_artist=True)
mean_scores = [scores.mean() for scores in results]
best = np.argmax(mean_scores)
for position, patch in enumerate(box['boxes']):
    patch.set_facecolor('lightgreen' if position == best else 'lightgray')
ax.set_xticklabels(names, rotation=45, ha='right')
plt.ylabel('Accuracy')
plt.grid(True)
plt.tight_layout()
plt.show()

## Hyperparameter Tuning with GridSearchCV

In [None]:
# Named preprocessing steps shared by the pipelines below
standard_scaler = ('Scaler', StandardScaler())
min_max_scaler = ('MinMaxScaler', MinMaxScaler())

# Named classifier steps; each step name is the prefix of its grid keys
# (for example "KNN__n_neighbors"). CART is seeded so the search is repeatable.
knn = ('KNN', KNeighborsClassifier())
cart = ('CART', DecisionTreeClassifier(random_state=7))
svm = ('SVM', SVC())
nb = ('NB', GaussianNB())

# Pipelines to tune: KNN with and without scaling, CART on raw features
# (no scaler needed), SVM and Naive Bayes with each scaler.
pipelines = [
    ('knn-orig', Pipeline([knn])),
    ('knn-std', Pipeline([standard_scaler, knn])),
    ('knn-norm', Pipeline([min_max_scaler, knn])),
    ('cart', Pipeline([cart])),
    ('svm-std', Pipeline([standard_scaler, svm])),
    ('svm-norm', Pipeline([min_max_scaler, svm])),
    ('nb-std', Pipeline([standard_scaler, nb])),
    ('nb-norm', Pipeline([min_max_scaler, nb])),
]

# Hyperparameter grids, defined once per classifier and shared by its variants
knn_grid = {
    'KNN__n_neighbors': list(range(1, 20, 2)),
    'KNN__metric': ['euclidean', 'manhattan', 'minkowski'],
}
svm_grid = {'SVM__C': [0.1, 1, 10], 'SVM__kernel': ['linear', 'rbf']}
param_grids = {
    'knn-orig': knn_grid,
    'knn-std': knn_grid,
    'knn-norm': knn_grid,
    'cart': {'CART__max_depth': [None, 5, 10, 15]},
    'svm-std': svm_grid,
    'svm-norm': svm_grid,
    'nb-std': {},  # Naive Bayes has no hyperparameters to tune
    'nb-norm': {},
}

# Grid search: collect the fold accuracies of each pipeline's best parameters
results = []
names = []
for name, pipeline in pipelines:
    grid = GridSearchCV(pipeline, param_grid=param_grids[name], scoring='accuracy', cv=kfold)
    grid.fit(X_train, y_train)

    # The search already scored the best parameter set on every fold, so read
    # those scores from cv_results_ instead of cross-validating a second time.
    # This also keeps the boxplot consistent with the printed best_score_.
    cv_scores = np.array([
        grid.cv_results_[f'split{fold}_test_score'][grid.best_index_]
        for fold in range(grid.n_splits_)
    ])
    results.append(cv_scores)
    names.append(name)
    print(f"{name} - Best Score: {grid.best_score_:.4f} using {grid.best_params_}")

# Boxplot of the fold accuracies; the pipeline with the highest mean is highlighted
fig = plt.figure(figsize=(14, 7))
fig.suptitle('Comparison of Models - Hyperparameter Tuning with GridSearchCV')
ax = fig.add_subplot(111)
box = ax.boxplot(results, patch_artist=True)
best = np.argmax([r.mean() for r in results])
for i, patch in enumerate(box['boxes']):
    patch.set_facecolor('lightgreen' if i == best else 'lightgray')
ax.set_xticklabels(names, rotation=45, ha='right')
plt.ylabel('Accuracy')
plt.grid(True)
plt.tight_layout()
plt.show()

## Final Model Training and Evaluation

In [None]:
# Standardise with statistics learned from the training partition only, then
# apply that same transformation to the held-out test partition.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM configured with the best parameters taken from the GridSearchCV step
model = SVC(C=10, kernel='rbf')
model.fit(X_train_scaled, y_train)

# Score the fitted model on the test partition
predictions = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy:", test_accuracy)

## Final Training and Prediction with Full Dataset

In [None]:
# Training the final model with all available data
scaler = StandardScaler().fit(X)  # Fit scaler on full dataset
X_scaled = scaler.transform(X)

# Use best parameters found in GridSearchCV
model = SVC(kernel='rbf', C=10)
model.fit(X_scaled, y)

# Simulating 3 new input data ('Age', 'Heart Rate', 'Stress Level', 'Physical Activity Level', 'Sleep Duration')
new_data = [
    [25, 70, 7, 4, 6.5],
    [28, 85, 8, 30, 5.9],
    [60, 65, 3, 5, 7.5]
]

# The scaler was fitted on a DataFrame, so scikit-learn expects the same
# column names at transform time. Wrapping the raw rows in a DataFrame with
# X's columns avoids the "X does not have valid feature names" warning and
# makes the column order explicit.
new_frame = pd.DataFrame(new_data, columns=X.columns)
new_data_scaled = scaler.transform(new_frame)

predictions_final = model.predict(new_data_scaled)
for sample, pred in zip(new_data, predictions_final):
    # The target encodes 1 as "has a disorder" and 0 as "none"
    result = "Disorder" if pred == 1 else "No Disorder"
    print(f"Input: {sample} => Prediction: {result}")

## Saving the Model and Scaler using Pickle

In [None]:
# Persist the fitted model and its scaler as separate pickle files
for filename, artifact in (('sleep_model.pkl', model), ('sleep_scaler.pkl', scaler)):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

## Final Conclusion

After evaluating multiple classification models — **K-Nearest Neighbors (KNN)**, **Decision Tree (CART)**, **Naive Bayes**, and **Support Vector Machine (SVM)** — under different preprocessing conditions (original data, standardized, and normalized), we performed hyperparameter tuning using `GridSearchCV` to optimize each model.

**Key Findings:**

- The best model after hyperparameter tuning was:
  - **Support Vector Machine (SVM)** with **RBF kernel** and **C=10**
  - Achieved a **mean cross-validation accuracy** of **93.31%**
  - Achieved a **final test accuracy** of **96%**

- Standardization (using `StandardScaler`) significantly improved the performance of SVM and KNN models.

**Final Model Selected:**
```python
model = SVC(kernel='rbf', C=10)
```

**Dataset Source**  
The dataset used in this project was obtained from [Kaggle – Sleep Health and Lifestyle Dataset](https://www.kaggle.com/datasets/uom190346a/sleep-health-and-lifestyle-dataset).  
It contains information about age, heart rate, stress levels, physical activity, sleep duration, and more, allowing the prediction of potential sleep disorders.
