### STEP 1 - DATA LOADING, PREPROCESSING AND TEST-TRAIN DATA SPLITING

In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Loading the dataset
data = pd.read_csv('Impact_of_Remote_Work_on_Mental_Health.csv')

# Step 1: Removing rows with missing values in critical columns
data.dropna(subset=['Mental_Health_Condition', 'Physical_Activity'], inplace=True)

# Step 2: Checking unique values for Physical Activity
print("Unique values in 'Physical_Activity':", data['Physical_Activity'].unique())

# Step 3: Normalizing `Mental_Health_Condition` values to lowercase for mapping consistency
data['Mental_Health_Condition'] = data['Mental_Health_Condition'].str.lower()

# Defining binary mapping: "High Risk" for anxiety and depression, "Low Risk" for burnout
binary_map = {
    'anxiety': 'High Risk',
    'burnout': 'Low Risk',
    'depression': 'High Risk'
}
# Applying the mapping
data['Mental_Health_Condition'] = data['Mental_Health_Condition'].replace(binary_map)

# Filtering data to keep only "High Risk" and "Low Risk"
data = data[data['Mental_Health_Condition'].isin(['High Risk', 'Low Risk'])]

# Checking if there is any data left
if data.empty:
    print("No data left after filtering for 'High Risk' and 'Low Risk'. Please check the dataset values.")
else:
    print(f"Data shape after filtering: {data.shape}")

    # Step 4: Encoding the binary target variable
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(data['Mental_Health_Condition'])

    # Step 5: Handling categorical features
    # Handling the Physical Activity encoding
    physical_activity_mapping = {
        'Daily': 'Physical_Activity_Daily',
        'Weekly': 'Physical_Activity_Weekly'
    }

    # Creating new columns based on mapping
    for activity in physical_activity_mapping.values():
        data[activity] = 0  # Initialize with zeros

    # Setting to 1 for the corresponding activity type
    for original, encoded in physical_activity_mapping.items():
        data.loc[data['Physical_Activity'] == original, encoded] = 1

    # Dropping the original Physical_Activity column
    data.drop(columns=['Physical_Activity'], inplace=True)

    # Separating features and target
    X = data.drop(columns=['Mental_Health_Condition', 'Employee_ID'])  # Drop ID and target column

    # Encoding ordinal features
    ordinal_cols = ['Satisfaction_with_Remote_Work', 'Stress_Level']  # Adjust based on dataset specifics
    for col in ordinal_cols:
        X[col] = LabelEncoder().fit_transform(X[col])

    # One-hot encoding nominal features
    categorical_cols = X.select_dtypes(include='object').columns
    nominal_cols = [col for col in categorical_cols if col not in ordinal_cols]
    X = pd.get_dummies(X, columns=nominal_cols, drop_first=True)

    # Scaling numerical features
    scaler = StandardScaler()
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    # Step 6: Confirming preprocessing steps
    print("Preprocessed Data (X):\n", X.head())
    print("\nTarget Variable (y):\n", y[:5])

    # Step 7: Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 8: Training a classifier
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Step 9: Making predictions
    y_pred = clf.predict(X_test)

    # Step 10: Calculating evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("\nEvaluation Metrics:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)


Unique values in 'Physical_Activity': ['Weekly' 'Daily']
Data shape after filtering: (2577, 20)
Preprocessed Data (X):
          Age  Years_of_Experience  Hours_Worked_Per_Week  \
0  -0.791644            -0.477010               0.619797   
1  -0.080912            -1.479012               1.040538   
6  -0.880486             0.625192               0.956390   
9  -0.969327             1.025992               1.461279   
12 -0.080912            -1.679413              -1.568056   

    Number_of_Virtual_Meetings  Work_Life_Balance_Rating  Stress_Level  \
0                     -0.09727                 -0.697464             2   
1                     -0.75061                 -1.405088             2   
6                     -0.09727                  0.010160             1   
9                     -0.31505                 -1.405088             1   
12                    -0.09727                 -0.697464             0   

    Social_Isolation_Rating  Satisfaction_with_Remote_Work  \
0           

### STEP 2 - SMOTE AND AIMING FOR BETTER ACCURACY SCORES

In [18]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # or any classifier you're using
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Applying SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardizing the features
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fitting model
model = RandomForestClassifier(random_state=42)  
model.fit(X_train_resampled, y_train_resampled)

# Making predictions
y_pred = model.predict(X_test)

# Calculating evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Output the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.6356589147286822
Precision: 0.3333333333333333
Recall: 0.09941520467836257
F1 Score: 0.15315315315315314


### STEP 3 - FINE TUNING via GridSearchCV

In [4]:
from sklearn.model_selection import GridSearchCV

# The model
rf_model = RandomForestClassifier(random_state=42)

# Defining the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced', None]
}

# Setting up the grid search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           scoring='f1', cv=5, verbose=1, n_jobs=-1)

# Fitting the grid search
grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluating on the test set using the best model
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)

# Calculating evaluation metrics for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)

print(f"Best Model - Accuracy: {accuracy_best}")
print(f"Best Model - Precision: {precision_best}")
print(f"Best Model - Recall: {recall_best}")
print(f"Best Model - F1 Score: {f1_best}")


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters found:  {'class_weight': None, 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Best Model - Accuracy: 0.6492248062015504
Best Model - Precision: 0.391304347826087
Best Model - Recall: 0.10526315789473684
Best Model - F1 Score: 0.16589861751152074


In [19]:
from collections import Counter

X_resampled, y_resampled = smote.fit_resample(X, y)

class_distribution = Counter(y_resampled)
print("Class distribution after SMOTE:", class_distribution)


Class distribution after SMOTE: Counter({0: 1723, 1: 1723})


### Step 4 - RETRIEVING THE BEST PARAMETERS AND MODEL

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(n_estimators=200)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Random Forest - Accuracy:", accuracy)
print("Random Forest - Precision:", precision)
print("Random Forest - Recall:", recall)
print("Random Forest - F1 Score:", f1)


Random Forest - Accuracy: 0.7753623188405797
Random Forest - Precision: 0.8377358490566038
Random Forest - Recall: 0.6646706586826348
Random Forest - F1 Score: 0.7412353923205343


### COMPARING WITH OTHER CLASSIFICATION MODELS

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Initializing models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

# Splitting the balanced dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Storing metrics for comparison
results = []

for name, model in models.items():
    # Fitting the model
    model.fit(X_train, y_train)
    
    # Making predictions
    y_pred = model.predict(X_test)
    
    # Calculating metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Appending results
    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

# Creating a DataFrame for better visualization
results_df = pd.DataFrame(results)
print(results_df)


                 Model  Accuracy  Precision    Recall  F1 Score
0        Random Forest  0.784058   0.851711  0.670659  0.750419
1  Logistic Regression  0.668116   0.679181  0.595808  0.634769
2                  SVM  0.702899   0.759036  0.565868  0.648370
3    Gradient Boosting  0.714493   0.789030  0.559880  0.654991


### MODEL SELECTED - RANDOM FOREST 

In [23]:
rf_model = RandomForestClassifier(random_state=42)

# Parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'class_weight': [None, 'balanced'] # Weights associated with classes
}


In [24]:
# Setting up GridSearchCV
grid_search = GridSearchCV(estimator=rf_model,
                           param_grid=param_grid,
                           scoring='f1',  # or 'accuracy', 'precision', 'recall', etc.
                           cv=5,  # Number of cross-validation folds
                           n_jobs=-1,  # Use all available cores
                           verbose=2)  # Increase verbosity to see the progress


In [25]:
# Fitting the model
grid_search.fit(X_train, y_train)  # Use your training data


Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [26]:
# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best CV Score: {best_score}")


Best parameters: {'class_weight': None, 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Best CV Score: 0.7574853359491702


In [28]:
# Getting the best model
best_rf_model = grid_search.best_estimator_

# Predictions on the test set
y_pred = best_rf_model.predict(X_test)

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.7927536231884058
Precision: 0.8715953307392996
Recall: 0.6706586826347305
F1 Score: 0.7580372250423012


### STEP 5 - SAVING THE MODEL

In [39]:
from joblib import dump

dump(best_rf_model, 'mental_health_model.joblib')

['mental_health_model.joblib']

In [40]:
import pickle

with open('mental_health_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)
