# Random Forest For Classification

## 0. import necessary libraries

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import randint
import joblib
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## 1. Load data

In [2]:
# load data
df = pd.read_csv('../Resources/processed_features_job_satisfaction.csv')
df.dtypes

JobSatisfaction      object
Workload_Binned       int64
SleepHours_Binned     int64
Age_Binned           object
Stress_Binned         int64
Experience_Binned    object
JobLevel             object
Gender               object
MaritalStatus        object
Dept                 object
EmpType              object
haveOT_Binned        object
dtype: object


## 2. Defining Features and Target Variable

In [3]:
# Define all features and target
features = ["Workload_Binned", "Stress_Binned", "SleepHours_Binned"]

target = "JobSatisfaction"  # Make sure that 'JobSatisfaction' is a categorical variable for classification

# Extract X and y
x = df[features]  # Features
y = df[target]    # Target



## 3. Splitting Data into Training and Testing set
### train_test_split randomly splits the dataset into two parts: 
### * 80% for training
### * 20% for testing
### * random_state=42 ensures the split is reproducible

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## 4. Initializing and Training the Rando Forest Classifier
### This block initializes the RandomForestClassifier and trains it using the training data (x_train, y_train).

In [5]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=2, min_samples_leaf=1, max_features='sqrt')

# Train the classifier
rf_classifier.fit(x_train, y_train)

## 5. Making Predictions and Evaluating the Classifier
### After the classifier is trained, predictions are made on the test data (x_test), and the performance is evaluated using metrics like accuracy, classification report, and confusion matrix.

In [6]:
# Make predictions on the test set
y_pred = rf_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.4318936877076412
Classification Report:
                    precision    recall  f1-score   support

     Dissatisfied       0.00      0.00      0.00        60
          Neutral       0.24      0.19      0.21       100
        Satisfied       0.47      0.86      0.61       256
   Very Satisfied       0.37      0.07      0.11       106
Very dissatisfied       0.38      0.19      0.25        80

         accuracy                           0.43       602
        macro avg       0.29      0.26      0.24       602
     weighted avg       0.36      0.43      0.35       602

Confusion Matrix:
 [[  0   7  50   2   1]
 [  0  19  71   0  10]
 [  0  21 219   7   9]
 [  0  10  85   7   4]
 [  1  21  40   3  15]]


## 5. Hyperparameter Tuning using RandomizedSearchCV
### This block performs hyperparameter tuning using RandomizedSearchCV. It randomly searches over a range of hyperparameters to find the best configuration.

### The hyperparameter space is defined for n_estimators, max_depth, min_samples_split, min_samples_leaf, and max_features. RandomizedSearchCV performs 5-fold cross-validation and iterates through 50 different combinations of parameters.

In [7]:
# Initialize the RandomForestClassifier for hyperparameter tuning
rf = RandomForestClassifier( n_estimators=500, max_leaf_nodes=16, max_features='sqrt', random_state=42, n_jobs=-1)


# Define the hyperparameter space
hyperparameters = {
    'n_estimators': randint(100, 500),               # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],                 # Maximum depth of each tree
    'min_samples_split': randint(2, 11),             # Minimum samples required to split a node
    'min_samples_leaf': randint(1, 5),               # Minimum samples required at a leaf node
    'max_features': ['auto', 'sqrt'],                # Number of features to consider when looking for the best split
}

# Set up the RandomizedSearchCV with 5-fold cross-validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=hyperparameters, n_iter=50, cv=5, random_state=42, n_jobs=-1, verbose=2)

# Fit the RandomizedSearchCV to the training data
random_search.fit(x_train, y_train)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


140 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
126 fits failed with the following error:
Traceback (most recent call last):
  File "D:\PythonEnv\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\PythonEnv\.venv\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "D:\PythonEnv\.venv\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "D:\PythonEnv\.venv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils.

## 6. Evaluating the Best Model after Hyperparameter Tuning

In [8]:

# Get the best hyperparameters
print("Best Hyperparameters: ", random_search.best_params_)

# Use the best estimator to make predictions
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(x_test)

# Save the best model to a file
joblib.dump(best_rf, "random_forest_model.pkl")

# Evaluate the model with best hyperparameters
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Best Hyperparameters:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 7, 'n_estimators': 485}
Accuracy: 0.43853820598006643
Classification Report:
                    precision    recall  f1-score   support

     Dissatisfied       0.00      0.00      0.00        60
          Neutral       0.00      0.00      0.00       100
        Satisfied       0.45      0.95      0.61       256
   Very Satisfied       0.62      0.05      0.09       106
Very dissatisfied       0.40      0.20      0.27        80

         accuracy                           0.44       602
        macro avg       0.29      0.24      0.19       602
     weighted avg       0.35      0.44      0.31       602

Confusion Matrix:
 [[  0   0  59   0   1]
 [  0   0  88   0  12]
 [  0   2 243   3   8]
 [  0   2  96   5   3]
 [  0   7  57   0  16]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Output
### Best Hyperparameters:  {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 297}
### Accuracy: 0.4435215946843854

## 7. Save the results to a CSV file

In [9]:

# Get the classification report as a dictionary with zero_division parameter to avoid warning
report = classification_report(y_test, y_pred, output_dict=True, zero_division=1)

# Convert the classification report dictionary into a DataFrame
report_df = pd.DataFrame(report).transpose()

# Get the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Convert the confusion matrix to a DataFrame
conf_matrix_df = pd.DataFrame(conf_matrix, 
                              index=[f"Actual_{i}" for i in range(1, len(conf_matrix) + 1)],
                              columns=[f"Predicted_{i}" for i in range(1, len(conf_matrix) + 1)])

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Create a DataFrame for accuracy
accuracy_df = pd.DataFrame({"Metric": ["Accuracy"], "Value": [accuracy]})

# Concatenate all DataFrames into one
final_df = pd.concat([report_df, conf_matrix_df, accuracy_df], axis=0, ignore_index=False)

# Save the final DataFrame to a CSV file in the "Resource" folder
final_df.to_csv("classification_results.csv", index=True)

print("All results saved to a single CSV file ")


All results saved to a single CSV file 


## 8. Try another Hyperparameter tuning
### Based on the output I got, I will try another hyperparameter tuning to see if I can improve the accuracy.

In [11]:
from sklearn.model_selection import GridSearchCV

# Initialize the RandomForestClassifier for hyperparameter tuning
rf = RandomForestClassifier( n_estimators=500, max_leaf_nodes=16, max_features='sqrt', random_state=42, n_jobs=-1)

# Define the hyperparameter space
hyperparameters = {
    'n_estimators': [100, 200, 300, 400, 500],       # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],                 # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],                 # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],                   # Minimum samples required at a leaf node
    'max_features': ['auto', 'sqrt']                 # Number of features to consider when looking for the best split
}

grid_search = GridSearchCV(estimator=rf, param_grid=hyperparameters, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)


# Set up the RandomizedSearchCV with 5-fold cross-validation
# random_search = RandomizedSearchCV(estimator=rf, param_distributions=hyperparameters, n_iter=50, cv=5, random_state=42, n_jobs=-1, verbose=2)
# 
# # Fit the RandomizedSearchCV to the training data
# random_search.fit(x_train, y_train)
# Get the best hyperparameters and accuracy
print("Best Hyperparameters: ", grid_search.best_params_)
y_pred = grid_search.best_estimator_.predict(x_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


900 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
218 fits failed with the following error:
Traceback (most recent call last):
  File "D:\PythonEnv\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\PythonEnv\.venv\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "D:\PythonEnv\.venv\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "D:\PythonEnv\.venv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils

Best Hyperparameters:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 400}
Accuracy:  0.4435215946843854


## Conclusion 
### Even I try to use GridSearch instead of RandomSearch, I got a quite similar accuracy.
### That imply that this model reach the best accuracy with the given data.
