# data preprocessing (label encoding, rename, dan handling imbalanced data)

In [828]:
import pandas as pd

In [829]:
# Load dataset
data = pd.read_csv('../data/Sleep_health_and_lifestyle_dataset.csv')

In [830]:
# Mengisi baris yang mengandung nilai kosong
data['Sleep Disorder'] = data['Sleep Disorder'].fillna('Healthy Sleep')
data['Sleep Disorder'].value_counts()

Sleep Disorder
Healthy Sleep    219
Sleep Apnea       78
Insomnia          77
Name: count, dtype: int64

In [831]:
# Memisahkan kolom 'Blood Pressure' menjadi dua kolom 'Systolic' dan 'Diastolic'
data[['Systolic', 'Diastolic']] = data['Blood Pressure'].str.split('/', expand=True)
data['Systolic'] = data['Systolic'].astype(float)
data['Diastolic'] = data['Diastolic'].astype(float)

data.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder,Systolic,Diastolic
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,Healthy Sleep,126.0,83.0
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,Healthy Sleep,125.0,80.0
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,Healthy Sleep,125.0,80.0
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea,140.0,90.0
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea,140.0,90.0


In [832]:
# Menghapus kolom yang tidak diperlukan
data = data.copy()
data = data.drop(['Person ID', 'Blood Pressure', 'Occupation'], axis=1)
data.head()

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,Systolic,Diastolic
0,Male,27,6.1,6,42,6,Overweight,77,4200,Healthy Sleep,126.0,83.0
1,Male,28,6.2,6,60,8,Normal,75,10000,Healthy Sleep,125.0,80.0
2,Male,28,6.2,6,60,8,Normal,75,10000,Healthy Sleep,125.0,80.0
3,Male,28,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140.0,90.0
4,Male,28,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140.0,90.0


In [833]:
from sklearn.preprocessing import LabelEncoder
import joblib
from imblearn.over_sampling import SMOTE

model yang akan digunakan hanya dapat mengolah data numerik

In [834]:
# Inisialisasi label encoder untuk kolom kategorikal
label_encoders = {}
cat_columns = ['BMI Category', 'Sleep Disorder', 'Gender']
for col in cat_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

data.head()

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,Systolic,Diastolic
0,1,27,6.1,6,42,6,3,77,4200,0,126.0,83.0
1,1,28,6.2,6,60,8,0,75,10000,0,125.0,80.0
2,1,28,6.2,6,60,8,0,75,10000,0,125.0,80.0
3,1,28,5.9,4,30,8,2,85,3000,2,140.0,90.0
4,1,28,5.9,4,30,8,2,85,3000,2,140.0,90.0


In [835]:
# Simpan label encoder untuk 'Sleep Disorder'
joblib.dump(label_encoders['Sleep Disorder'], 'sleep_disorder_label_encoder.pkl')

['sleep_disorder_label_encoder.pkl']

pada dasarnya normal weight sama normal sama saja berdasarkan data visualization

In [836]:
# Mengganti nilai 'Normal Weight' menjadi 'Normal' pada kolom 'BMI Category'
data['BMI Category'] = data['BMI Category'].replace("Normal Weight", "Normal")
data['BMI Category'].value_counts()

BMI Category
0    195
3    148
1     21
2     10
Name: count, dtype: int64

In [837]:
# Memisahkan fitur dan label
X = data.drop('Sleep Disorder', axis=1)
y = data['Sleep Disorder']

mengatasi masalah ketidakseimbangan kelas di dataset

In [838]:
# Terapkan SMOTE untuk menangani ketidakseimbangan kelas
smote = SMOTE(random_state=1)
X_resampled, y_resampled = smote.fit_resample(X, y)

# modeling

In [839]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

untuk melatih model (train) dan mengevaluasi kinerja model (test)

In [840]:
# Bagi data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [841]:
# memeriksa akurasi jika menggunakan DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

model_dtc = DecisionTreeClassifier(class_weight='balanced')
model_dtc.fit(X_train, y_train)

y_pred = model_dtc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.8939393939393939
Precision: 0.8930976430976431
Recall: 0.8939393939393939
F1-score: 0.8931583353753165


In [842]:
# Memeriksa akurasi RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.9242424242424242
Precision: 0.9266738956294082
Recall: 0.9242424242424242
F1-score: 0.9245076052683937


In [843]:
# Evaluasi pentingnya fitur
importances = model.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Menampilkan tabel pentingnya fitur
print(importance_df)

                    Feature  Importance
9                  Systolic    0.219984
10                Diastolic    0.200509
6              BMI Category    0.122094
4   Physical Activity Level    0.103911
8               Daily Steps    0.098108
2            Sleep Duration    0.080168
1                       Age    0.069867
7                Heart Rate    0.041973
3          Quality of Sleep    0.032866
5              Stress Level    0.017914
0                    Gender    0.012605


memahami seberapa baik model dapat mengklasifikasikan atau memprediksi data dengan benar.

In [844]:
# Simpan model ke file
joblib.dump(model, 'sleep_disorder_model.pkl')

['sleep_disorder_model.pkl']

# prediction on new data

In [845]:
# Load model dan label encoder
model = joblib.load('sleep_disorder_model.pkl')
sleep_disorder_label_encoder = joblib.load('sleep_disorder_label_encoder.pkl')

In [846]:
# Contoh data baru untuk prediksi
new_data = pd.DataFrame({
    'Gender': [1],
    'Age': [25],
    'Sleep Duration': [0],
    'Quality of Sleep': [0],
    'Physical Activity Level': [0],
    'Stress Level': [4],
    'BMI Category': [0],
    'Heart Rate': [50],
    'Daily Steps': [10],
    'Systolic': [50],
    'Diastolic': [30]
})

In [847]:
# Prediksi gangguan tidur
prediction = model.predict(new_data)
predicted_label = sleep_disorder_label_encoder.inverse_transform(prediction)
print("Predicted Sleep Disorder:", predicted_label[0])

Predicted Sleep Disorder: Healthy Sleep


In [848]:
# Load the trained model
model = joblib.load('sleep_disorder_model.pkl')

# Print feature names
print(model.feature_names_in_)


['Gender' 'Age' 'Sleep Duration' 'Quality of Sleep'
 'Physical Activity Level' 'Stress Level' 'BMI Category' 'Heart Rate'
 'Daily Steps' 'Systolic' 'Diastolic']
