In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
import os

# Specify the path to your CSV file
# Recommended: Place the CSV in the same directory as your script
CSV_PATH = 'Extended_Sleep_Health_Dataset.csv'

# Check if the CSV file exists
if not os.path.exists(CSV_PATH):
    print(f"Error: CSV file not found at {CSV_PATH}")
    print("Please create a CSV file with the following columns:")
    print("Person ID, Gender, Age, Occupation, Stress Level")
    exit(1)

# Load the dataset
df = pd.read_csv(CSV_PATH)

# Preprocessing
# Encode categorical variables
le_gender = LabelEncoder()
le_occupation = LabelEncoder()

df['Gender_Encoded'] = le_gender.fit_transform(df['Gender'])
df['Occupation_Encoded'] = le_occupation.fit_transform(df['Occupation'])

# Prepare features and target
X = df[['Age', 'Gender_Encoded', 'Occupation_Encoded']]
y = df['Stress Level']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Ensure the model directory exists
os.makedirs('models', exist_ok=True)

# Save the model and label encoders
model_path = 'models/random_forest_model.pkl'
with open(model_path, 'wb') as file:
    pickle.dump({
        'model': rf_model,
        'gender_encoder': le_gender,
        'occupation_encoder': le_occupation
    }, file)

print(f"\nModel saved as '{model_path}'")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

Model Accuracy: 0.413953488372093

Classification Report:
              precision    recall  f1-score   support

           1       0.17      0.15      0.16        13
           2       0.07      0.06      0.07        16
           3       0.73      0.63      0.68        35
           4       0.36      0.43      0.39        23
           5       0.37      0.48      0.42        27
           6       0.25      0.14      0.18        21
           7       0.50      0.50      0.50        20
           8       0.52      0.53      0.53        43
           9       0.25      0.29      0.27        17

    accuracy                           0.41       215
   macro avg       0.36      0.36      0.36       215
weighted avg       0.41      0.41      0.41       215


Model saved as 'models/random_forest_model.pkl'

Feature Importance:
              feature  importance
0                 Age    0.629861
2  Occupation_Encoded    0.331913
1      Gender_Encoded    0.038226
