# Extra Trees Classifier

# Import necessary libraries

In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV
import matplotlib.pyplot as plt
import numpy as np

# # Load the dataset and set display options to show all rows and print first few rows


In [3]:
os.chdir(r'/home/zamlamb/KdG/Data n AI 5/Team-Stress/Resources')
df = pd.read_csv('processed_features_job_satisfaction.csv')

# Set display options to show all rows
pd.set_option('display.max_rows', None)

# Print the first few rows of the DataFrame
print(df.head())

  JobSatisfaction  Workload_Binned  SleepHours_Binned Age_Binned  \
0  Very Satisfied                2                  7      30-39   
1  Very Satisfied                2                  7      30-39   
2  Very Satisfied                5                  6      20-29   
3  Very Satisfied                3                  7      20-29   
4  Very Satisfied                2                  4      20-29   

   Stress_Binned Experience_Binned        JobLevel  Gender MaritalStatus  \
0              1              6-10             Mid    Male       Married   
1              2             11-15             Mid  Female       Married   
2              4               1-5  Intern/Fresher  Female        Single   
3              1              6-10          Junior  Female       Married   
4              1               1-5          Junior   Other        Single   

        Dept    EmpType haveOT_Binned  
0         IT  Full-Time           Yes  
1    Finance  Full-Time            No  
2  Marketing  

# List the significant features, and separate features and the target variable


In [4]:
significant_features = ['Workload_Binned', 'SleepHours_Binned', 'Stress_Binned']

# Separate features and target variable
X = df[significant_features]
le = LabelEncoder()
y = le.fit_transform(df['JobSatisfaction']).ravel()  # Ensure y is a 1D array


# Define the hyperparameter search space for Extra Trees


In [5]:
search_space_extra_trees = {
    'n_estimators': (10, 1000),
    'max_features': (1, X.shape[1]),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20)
}

# Initialize and perform Bayesian optimization for Extra Trees


In [6]:
extra_trees = ExtraTreesClassifier()
opt_extra_trees = BayesSearchCV(extra_trees, search_space_extra_trees, n_iter=16, cv=StratifiedKFold(n_splits=5), n_jobs=-1)
opt_extra_trees.fit(X, y)

  _data = np.array(data, dtype=dtype, copy=copy,


# Evaluate the Extra Trees model


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
opt_extra_trees.best_estimator_.fit(X_train, y_train)
y_test_pred = opt_extra_trees.best_estimator_.predict(X_test)
y_test_prob = opt_extra_trees.best_estimator_.predict_proba(X_test)

# Calculate AUC and accuracy


In [8]:
final_auc = roc_auc_score(y_test, y_test_prob, multi_class='ovr')
final_acc = accuracy_score(y_test, y_test_pred)

print(f"Final Evaluation on Test Set for Extra Trees Model:\nAUC: {final_auc:.4f}, Accuracy: {final_acc:.4f}")


Final Evaluation on Test Set for Extra Trees Model:
AUC: 0.6125, Accuracy: 0.4402


# Save results to a CSV file


In [9]:
results_df = pd.DataFrame({
    'True Label': y_test,
    'Predicted Label': y_test_pred,
    'Probability': y_test_prob.max(axis=1)
})
results_df.to_csv('extra_trees_model_results.csv', index=False)


# Plot ROC curve for the Extra Trees model


In [10]:
def plot_best_roc(model, X, y, model_name):
    y_prob = model.predict_proba(X)
    for i in range(len(model.classes_)):
        fpr, tpr, _ = roc_curve(y, y_prob[:, i], pos_label=i)
        plt.plot(fpr, tpr, label=f'Class {i} (area = {roc_auc_score(y, y_prob[:, i]):.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {model_name}')
    plt.legend(loc='lower right')
    plt.show()

plot_best_roc(opt_extra_trees.best_estimator_, X_test, y_test, "Extra Trees Model")

ValueError: multi_class must be in ('ovo', 'ovr')