# Applying Random Forest to the Impact of Remote Work on Mental Health Dataset

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd 
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV


# Blanket suppression of every warning for the rest of the session.
warnings.filterwarnings("ignore")
df_raw = pd.read_csv("Impact_of_Remote_Work_on_Mental_Health.csv")

# Count missing values BEFORE they are replaced; counting after the fill
# below would always report zero for every column.
missing_per_column = df_raw.isnull().sum()

# Replace every missing cell with the literal string "None".
# NOTE(review): presumably needed because read_csv's default NA list contains
# the text "None", so a genuine "None" category is parsed as NaN — confirm
# against the CSV.
df = df_raw.where(df_raw.notna(), "None")

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
df.info()
print(missing_per_column)
print(df.head())

def combine_conditions_v2(row):
    """Combine a row's stress level and mental-health condition into one label.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['Mental_Health_Condition']`` and
        ``row['Stress_Level']`` (e.g. a DataFrame row or a dict).

    Returns
    -------
    str
        One of "Grave", "Alta vulnerabilidade", "Moderada vulnerabilidade",
        "Baixa vulnerabilidade", or "Dados inválidos" when either value is not
        one of the recognised categories.
    """
    severe_conditions = ('Burnout', 'Depression')
    known_conditions = severe_conditions + ('Anxiety', 'None')
    known_stress = ('High', 'Medium', 'Low')

    # Validate the condition first, then the stress level (same order as the
    # original short-circuit check).
    condition = row['Mental_Health_Condition']
    if condition not in known_conditions:
        return "Dados inválidos"
    stress = row['Stress_Level']
    if stress not in known_stress:
        return "Dados inválidos"

    # Burnout/Depression: worst label only when stress is also high.
    if condition in severe_conditions:
        return "Grave" if stress == 'High' else "Alta vulnerabilidade"

    # High stress on its own, or anxiety at any stress level.
    if stress == 'High' or condition == 'Anxiety':
        return "Alta vulnerabilidade"

    # From here on the condition is 'None' and stress is Medium or Low.
    if stress == 'Medium':
        return "Moderada vulnerabilidade"
    return "Baixa vulnerabilidade"


# Label every row (axis=1 passes one row at a time to the function) and store
# the result as a new column.
vulnerability_labels = df.apply(combine_conditions_v2, axis=1)
df['Vulnerability_Level'] = vulnerability_labels
print(df)

# Number of rows that received each label.
vulnerability_counts = df.loc[:, 'Vulnerability_Level'].value_counts()

print("Contagem de Vulnerabilidades:")
print(vulnerability_counts)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Employee_ID                        5000 non-null   object
 1   Age                                5000 non-null   int64 
 2   Gender                             5000 non-null   object
 3   Job_Role                           5000 non-null   object
 4   Industry                           5000 non-null   object
 5   Years_of_Experience                5000 non-null   int64 
 6   Work_Location                      5000 non-null   object
 7   Hours_Worked_Per_Week              5000 non-null   int64 
 8   Number_of_Virtual_Meetings         5000 non-null   int64 
 9   Work_Life_Balance_Rating           5000 non-null   int64 
 10  Stress_Level                       5000 non-null   object
 11  Mental_Health_Condition            5000 non-null   object
 12  Access

## Transform categorical variables into binary (one-hot) columns

In [2]:
# Target selector read by the split cell:
#   1 -> Stress_Level, 2 -> Mental_Health_Condition, 3 -> Vulnerability_Level
escolha = 1

# Candidate targets, kept out of the feature matrix.
target_columns = ["Stress_Level", "Mental_Health_Condition", "Vulnerability_Level"]
y_stress = df[target_columns[0]]
y_mental = df[target_columns[1]]
y_vulnerability = df[target_columns[2]]

# Columns expanded into dummy indicators.
categorical_variables = [
    "Gender",
    "Job_Role",
    "Industry",
    "Work_Location",
    "Access_to_Mental_Health_Resources",
    "Productivity_Change",
    "Satisfaction_with_Remote_Work",
    "Physical_Activity",
    "Sleep_Quality",
    "Region",
]

# Columns removed after encoding.
colunas_a_remover = ["Employee_ID"]

# drop_first=True drops the first level of each categorical so every variable
# contributes (levels - 1) indicator columns.
predictors_raw = df.drop(target_columns, axis=1)
df_encoded = pd.get_dummies(
    predictors_raw,
    columns=categorical_variables,
    drop_first=True,
)
df_encoded = df_encoded.drop(colunas_a_remover, axis=1)


## Dataset split

In [3]:
# Map each valid `escolha` value to a print label and the matching target.
targets_by_choice = {
    1: ("y_stress", y_stress),
    2: ("y_mental", y_mental),
    3: ("y_vulnerability", y_vulnerability),
}

# Fail immediately on an unsupported selector; previously no branch ran and
# the prints below failed with a NameError on X_train.
if escolha not in targets_by_choice:
    raise ValueError(f"escolha deve ser 1, 2 ou 3; recebido: {escolha!r}")

target_label, y_selected = targets_by_choice[escolha]

# 80/20 split with a fixed seed, identical for every target.
X_train, X_test, y_train, y_test = train_test_split(
    df_encoded, y_selected, test_size=0.2, random_state=42
)

print("Tamanhos dos conjuntos:")
print("X_train:", X_train.shape, "| X_test:", X_test.shape)
# Label the target shapes with the target that was actually split; the old
# code printed the same arrays twice under "y_stress" and "y_mental".
print(f"{target_label}_train:", y_train.shape, f"| {target_label}_test:", y_test.shape)

Tamanhos dos conjuntos:
X_train: (4000, 38) | X_test: (1000, 38)
y_stress_train: (4000,) | y_stress_test: (1000,)
y_mental_train: (4000,) | y_mental_test: (1000,)


## Random Forest grid search for the best hyperparameters

In [4]:
# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],        # number of trees
    'max_depth': [10, 20, 30, None],        # maximum tree depth
    'min_samples_split': [2, 5, 10],        # minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4]           # minimum samples required in a leaf
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,               # 3-fold cross-validation
                           verbose=2,          # show progress
                           n_jobs=-1)          # use all available cores

grid_search.fit(X_train, y_train)

# Take the label from the target that was actually fitted instead of
# hard-coding "Stress_Level", which is wrong when `escolha` is 2 or 3.
target_label = getattr(y_train, "name", "target")
print(f"Melhores parâmetros para {target_label}:", grid_search.best_params_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Melhores parâmetros para Stress_Level: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


## Best Model

In [8]:
# Build the model from the hyperparameters the grid search selected, rather
# than values copy-pasted from an earlier output, which go stale whenever the
# target (`escolha`) or the grid changes.
# random_state=0 is kept from the original cell (the search itself used 42).
best_rf = RandomForestClassifier(
    **grid_search.best_params_,
    random_state=0
)

best_rf.fit(X_train, y_train)

y_pred = best_rf.predict(X_test)

# Label taken from the fitted target so the report header is right for any `escolha`.
target_label = getattr(y_train, "name", "target")

# Accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia no conjunto de teste: {accuracy:.2f}")

# Per-class precision / recall / F1
print(f"Relatório de Classificação para {target_label}:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Matriz de Confusão:")
print(confusion_matrix(y_test, y_pred))


Acurácia no conjunto de teste: 0.34
Relatório de Classificação para Stress_Level:
              precision    recall  f1-score   support

        High       0.34      0.41      0.37       335
         Low       0.33      0.30      0.31       326
      Medium       0.37      0.32      0.34       339

    accuracy                           0.34      1000
   macro avg       0.34      0.34      0.34      1000
weighted avg       0.34      0.34      0.34      1000

Matriz de Confusão:
[[137  97 101]
 [140  98  88]
 [124 106 109]]


In [6]:
features = X_train.columns
importances = best_rf.feature_importances_

# Pair each importance with its column name and rank the pairs, highest
# importance first (ties fall back to the feature name, also descending,
# because the tuples are compared element by element).
sorted_importances = list(zip(importances, features))
sorted_importances.sort(reverse=True)

print("Feature Importances:")
# Prints every feature, not just a top-N subset.
for importance, feature in sorted_importances:
    print("{}: {:.4f}".format(feature, importance))


Feature Importances:
Age: 0.0982
Hours_Worked_Per_Week: 0.0967
Years_of_Experience: 0.0914
Number_of_Virtual_Meetings: 0.0826
Work_Life_Balance_Rating: 0.0516
Company_Support_for_Remote_Work: 0.0489
Social_Isolation_Rating: 0.0459
Access_to_Mental_Health_Resources_Yes: 0.0202
Sleep_Quality_Poor: 0.0197
Sleep_Quality_Good: 0.0195
Productivity_Change_No Change: 0.0188
Physical_Activity_None: 0.0187
Physical_Activity_Weekly: 0.0184
Satisfaction_with_Remote_Work_Satisfied: 0.0183
Productivity_Change_Increase: 0.0183
Satisfaction_with_Remote_Work_Unsatisfied: 0.0181
Work_Location_Remote: 0.0180
Gender_Prefer not to say: 0.0177
Work_Location_Onsite: 0.0175
Gender_Male: 0.0174
Gender_Non-binary: 0.0170
Region_Oceania: 0.0144
Industry_Healthcare: 0.0141
Job_Role_Software Engineer: 0.0141
Region_Europe: 0.0140
Industry_Education: 0.0140
Region_South America: 0.0138
Region_North America: 0.0136
Job_Role_HR: 0.0136
Industry_Finance: 0.0134
Region_Asia: 0.0134
Industry_Manufacturing: 0.0133
Job_Ro