### Import all the necessary libraries

In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score
import os
from sklearn.ensemble import VotingClassifier, StackingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV


# Switch to the directory that holds the CSV and the pickled models, because every
# later cell opens those files by bare file name.
# The location can be overridden with the SATISFACTION_RESOURCES_DIR environment
# variable so the notebook also runs on other machines; when the variable is not
# set, the original project path is used unchanged.
RESOURCES_DIR = os.environ.get(
    'SATISFACTION_RESOURCES_DIR',
    r'D:\KDG\2024-2025\Semester 1\DAI5\GroupProject\SatisfactionLevel0\Resources',
)
os.chdir(RESOURCES_DIR)

### Load the dataset

In [2]:
# Read the processed feature table from the current working directory.
df = pd.read_csv('processed_features_job_satisfaction.csv')

# Columns this notebook uses as model inputs (the features it treats as significant).
significant_features = [
    'Workload_Binned',
    'SleepHours_Binned',
    'Stress_Binned',
]

# X: the selected input columns; y: the 'JobSatisfaction' column used as the target.
X = df.loc[:, significant_features]
y = df['JobSatisfaction']


### Split the dataset

In [3]:
# Two-stage split giving 60% train / 20% validation / 20% test.
# Stage 1: hold out 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Stage 2: take 25% of the remaining 80% (= 20% of all rows) as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
)

### Load all models

In [4]:
# Load the five previously saved base models from the working directory.
# NOTE(review): joblib.load unpickles the file contents, so these .pkl files must
# come from a trusted source.
decision_tree, extra_trees, knn, naive_bayes, random_forest = [
    joblib.load(model_file)
    for model_file in (
        'decision_tree_model.pkl',
        'extra_trees_model.pkl',
        'knn_model.pkl',
        'gnb_model.pkl',
        'random_forest_model.pkl',
    )
]


### Create Pipelines

In [5]:
# Wrap each loaded model in its own two-step pipeline: a fresh StandardScaler
# ('scaler') followed by the model itself ('classifier'), keyed by display name.
pipelines = {
    label: Pipeline(steps=[('scaler', StandardScaler()), ('classifier', model)])
    for label, model in (
        ('Decision Tree', decision_tree),
        ('Extra Trees', extra_trees),
        ('KNN', knn),
        ('Naive Bayes', naive_bayes),
        ('Random Forest', random_forest),
    )
}

### Train and evaluate each pipeline

In [6]:
# Fit every pipeline on the training split, then report its accuracy on the
# validation split.
for name in pipelines:
    pipeline = pipelines[name]
    y_pred = pipeline.fit(X_train, y_train).predict(X_val)
    accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
    print(f"{name} Pipeline Accuracy: {accuracy:.4f}")

Decision Tree Pipeline Accuracy: 0.4252
Extra Trees Pipeline Accuracy: 0.4369
KNN Pipeline Accuracy: 0.4286
Naive Bayes Pipeline Accuracy: 0.4169
Random Forest Pipeline Accuracy: 0.4319


### Set Up Ensemble Methods

#### Soft Voting Classifier

In [7]:
# Soft-voting ensemble over the five base pipelines (the hard-voting variant is
# built in its own cell).
voting_soft = VotingClassifier(
    estimators=[
        (alias, pipelines[label])
        for alias, label in (
            ('dt', 'Decision Tree'),
            ('et', 'Extra Trees'),
            ('knn', 'KNN'),
            ('nb', 'Naive Bayes'),
            ('rf', 'Random Forest'),
        )
    ],
    voting='soft',
)


#### Hard Voting Classifier

In [8]:
# Hard-voting ensemble over the same five base pipelines.
voting_hard = VotingClassifier(
    estimators=[
        (alias, pipelines[label])
        for alias, label in (
            ('dt', 'Decision Tree'),
            ('et', 'Extra Trees'),
            ('knn', 'KNN'),
            ('nb', 'Naive Bayes'),
            ('rf', 'Random Forest'),
        )
    ],
    voting='hard',
)


#### Stacking Classifier

In [9]:
# Stacking ensemble: the five base pipelines feed a final estimator.
# The final estimator is the loaded random_forest model, i.e. the same object that
# is also wrapped by the 'Random Forest' pipeline.
stacking_clf = StackingClassifier(
    estimators=[
        (alias, pipelines[label])
        for alias, label in (
            ('dt', 'Decision Tree'),
            ('et', 'Extra Trees'),
            ('knn', 'KNN'),
            ('nb', 'Naive Bayes'),
            ('rf', 'Random Forest'),
        )
    ],
    final_estimator=random_forest,
)


#### Define all the classifiers

### Define all the classifiers

In [10]:
# Collect the three ensembles under display names so they can be fitted and
# compared in a single loop.
classifiers = dict([
    ('Voting Soft', voting_soft),
    ('Voting Hard', voting_hard),
    ('Stacking', stacking_clf),
])

### Fit each classifier and evaluate the performance

In [11]:
# Fit every ensemble on the training split and print its accuracy on the test split.
# NOTE(review): the other evaluation loops in this notebook score on X_val / y_val,
# so these figures are not directly comparable with theirs. Confirm that using the
# test split here is intended.
for name in classifiers:
    clf = classifiers[name]
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")


Voting Soft Accuracy: 0.4269
Voting Hard Accuracy: 0.4369
Stacking Accuracy: 0.4369


## Hyperparameter Tuning

In [12]:
# Grid of candidate values for each pipeline, keyed by the same display names as
# `pipelines`. The 'classifier__' prefix targets the pipeline step named 'classifier'.
param_grids = {
    'Decision Tree': {
        'classifier__max_depth': [None, 10, 20, 30],
    },
    'Extra Trees': {
        'classifier__n_estimators': [50, 100, 200],
    },
    'KNN': {
        'classifier__n_neighbors': [3, 5, 7],
    },
    # Left empty on purpose: nothing is tuned for Naive Bayes in this notebook.
    'Naive Bayes': {},
    'Random Forest': {
        'classifier__n_estimators': [50, 100, 120, 150, 200],
    },
}


In [13]:
# Run a 5-fold grid search for every pipeline on the training split and keep each
# search's best estimator under the pipeline's display name.
best_estimators = {}
for name, pipeline in pipelines.items():
    print(f"Training and tuning {name}...")

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        cv=5,
        n_jobs=-1,
    ).fit(X_train, y_train)

    # Remember the winning pipeline for this model.
    best_estimators[name] = grid_search.best_estimator_

    # Report the selected hyperparameters and the resulting pipeline.
    print(f"Best hyperparameters for {name}: {grid_search.best_params_}")
    print(f"Best estimator for {name}: {best_estimators[name]}")
    print("-" * 50)

# Example usage: best_estimators['Decision Tree'] is the tuned Decision Tree pipeline.

Training and tuning Decision Tree...
Best hyperparameters for Decision Tree: {'classifier__max_depth': None}
Best estimator for Decision Tree: Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 DecisionTreeClassifier(min_samples_leaf=3,
                                        min_samples_split=11,
                                        random_state=42))])
--------------------------------------------------
Training and tuning Extra Trees...
Best hyperparameters for Extra Trees: {'classifier__n_estimators': 100}
Best estimator for Extra Trees: Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 ExtraTreesClassifier(max_features=1, min_samples_leaf=16,
                                      min_samples_split=19))])
--------------------------------------------------
Training and tuning KNN...
Best hyperparameters for KNN: {'classifier__n_neighbors': 7}
Best estimator for KNN: Pipeline(steps=[('scaler', Sta

In [14]:

# Rebuild the soft-voting ensemble, this time from the tuned pipelines.
voting_soft = VotingClassifier(
    estimators=[
        (alias, best_estimators[label])
        for alias, label in (
            ('dt', 'Decision Tree'),
            ('et', 'Extra Trees'),
            ('knn', 'KNN'),
            ('nb', 'Naive Bayes'),
            ('rf', 'Random Forest'),
        )
    ],
    voting='soft',
)



In [15]:
# Rebuild the hard-voting ensemble from the tuned pipelines.
voting_hard = VotingClassifier(
    estimators=[
        (alias, best_estimators[label])
        for alias, label in (
            ('dt', 'Decision Tree'),
            ('et', 'Extra Trees'),
            ('knn', 'KNN'),
            ('nb', 'Naive Bayes'),
            ('rf', 'Random Forest'),
        )
    ],
    voting='hard',
)


In [16]:
# Rebuild the stacking ensemble from the tuned pipelines.
# The final estimator is still the loaded random_forest model, not a tuned pipeline.
stacking_clf = StackingClassifier(
    estimators=[
        (alias, best_estimators[label])
        for alias, label in (
            ('dt', 'Decision Tree'),
            ('et', 'Extra Trees'),
            ('knn', 'KNN'),
            ('nb', 'Naive Bayes'),
            ('rf', 'Random Forest'),
        )
    ],
    final_estimator=random_forest,
)


In [17]:
# Collect the tuned ensembles under display names for a single comparison loop.
ensemble_classifiers = dict([
    ('Voting Soft', voting_soft),
    ('Voting Hard', voting_hard),
    ('Stacking', stacking_clf),
])


In [18]:
# Fit every tuned ensemble on the training split and report its accuracy on the
# validation split.
for name in ensemble_classifiers:
    clf = ensemble_classifiers[name]
    y_pred = clf.fit(X_train, y_train).predict(X_val)
    accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
    print(f"{name} Ensemble Accuracy: {accuracy:.4f}")

Voting Soft Ensemble Accuracy: 0.4336
Voting Hard Ensemble Accuracy: 0.4286
Stacking Ensemble Accuracy: 0.4452


## Conclusion
The best ensemble method is the Stacking classifier, with a validation accuracy of 0.4452, the highest accuracy obtained in this notebook.

After we tuned the hyperparameters of the individual models, some of their accuracies increased by 1 or 2 percent,
while others showed a negligible increase or no increase at all.