# Import necessary libraries


In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV
import matplotlib.pyplot as plt
import numpy as np
import joblib

# Load the dataset and set display options to show all rows and print first few rows


In [2]:
# Load the dataset
os.chdir(r'D:\KDG\2024-2025\Semester 1\DAI5\GroupProject\SatisfactionLevel0\Resources')
df = pd.read_csv('processed_features_job_satisfaction.csv')

# Set display options to show all rows
pd.set_option('display.max_rows', None)

# Print the first few rows of the DataFrame
print(df.head())

  JobSatisfaction  Workload_Binned  SleepHours_Binned Age_Binned  \
0  Very Satisfied                2                  7      30-39   
1  Very Satisfied                2                  7      30-39   
2  Very Satisfied                5                  6      20-29   
3  Very Satisfied                3                  7      20-29   
4  Very Satisfied                2                  4      20-29   

   Stress_Binned Experience_Binned        JobLevel  Gender MaritalStatus  \
0              1              6-10             Mid    Male       Married   
1              2             11-15             Mid  Female       Married   
2              4               1-5  Intern/Fresher  Female        Single   
3              1              6-10          Junior  Female       Married   
4              1               1-5          Junior   Other        Single   

        Dept    EmpType haveOT_Binned  
0         IT  Full-Time           Yes  
1    Finance  Full-Time            No  
2  Marketing  

# List the significant features, and separate features and the target variable

In [3]:

# List of significant features
significant_features = ['Workload_Binned', 'SleepHours_Binned', 'Stress_Binned']

# Separate features and target variable
X = df[significant_features]
le = LabelEncoder()
y = le.fit_transform(df['JobSatisfaction']).ravel()  # Ensure y is a 1D array


# Define the hyperparameter search space for KNN


In [4]:
search_space_knn = {
    'n_neighbors': (1, 30, 50, 100, 125, 150, 200, 225), # Increased range for n_neighbors
    'weights': ['uniform', 'distance'], # Categorical distribution remains the same
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'] # Added more distance metrics
}

# Initialize and perform Bayesian optimization for KNN


In [5]:
knn = KNeighborsClassifier() # Initialize KNN model
opt_knn = BayesSearchCV(knn, search_space_knn, n_iter=16, cv=StratifiedKFold(n_splits=5), n_jobs=-1) # Initialize Bayesian optimization
opt_knn.fit(X, y) # Perform Bayesian optimization

  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


# Print the best hyperparameters

In [6]:
print(f"Best hyperparameters for KNN: {opt_knn.best_params_}")

Best hyperparameters for KNN: OrderedDict([('metric', 'manhattan'), ('n_neighbors', 200), ('weights', 'uniform')])


# Evaluate the KNN model


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Split the dataset into training and testing sets
opt_knn.best_estimator_.fit(X_train, y_train) # Fit the best KNN model on the training set
y_test_pred = opt_knn.best_estimator_.predict(X_test) # Predict the target variable on the testing set
y_test_prob = opt_knn.best_estimator_.predict_proba(X_test) # Predict the probabilities of the target variable on the testing set

# Calculate AUC and accuracy


In [8]:
final_auc = roc_auc_score(y_test, y_test_prob, multi_class='ovr') # Calculate AUC, ovr = one-vs-rest which is used for multi-class classification
final_acc = accuracy_score(y_test, y_test_pred) # Calculate accuracy

print(f"Final Evaluation on Test Set for KNN Model:\nAUC: {final_auc:.4f}, Accuracy: {final_acc:.4f}") # Print the final evaluation metrics


Final Evaluation on Test Set for KNN Model:
AUC: 0.6172, Accuracy: 0.4535


# Accuracy increase from 0.43 to 0.45

# Save results to a CSV file


In [9]:
results_df = pd.DataFrame({ # Create a DataFrame to save the results
    'True Label': y_test,
    'Predicted Label': y_test_pred,
    'Probability': y_test_prob.max(axis=1)
})
results_df.to_csv('knn_model_results.csv', index=False)

# Save the best KNN model to a file


In [10]:
joblib.dump(opt_knn.best_estimator_, 'knn_model.pkl')



['knn_model.pkl']

# Plot ROC curve for the KNN model