### Members - NIU


Dallon Archibald - 100547834\
Keanu Simons - 100545207

In [1]:
import numpy as np

# The student ID (NIU) is reused as the seed for every random component below.
NIU = 100547834
# Seed NumPy's global random generator once, before any other cell runs.
np.random.seed(seed=NIU)

In Assignment 1, we found that XGBoost produced the highest accuracy. We expand on that approach for our model here while incorporating **feature selection**.

## 1. EDA & Preprocessing

First, we will load the data from a pickle (.pkl) file. This should be located at the root of the directory. Otherwise, you must upload the appropriate file, "attrition_available_26.pkl", from your local directory.

In [2]:
from google.colab import files
import io
import os

# Ask for a manual upload only when the training pickle is not already
# present in the working directory.
training_file_missing = not os.path.exists("attrition_available_26.pkl")
if training_file_missing:
  uploaded = files.upload()

Our EDA and preprocessing are consistent with our approach from Assignment 1. However, we have moved some steps into pipelines for consistency and organization.

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Load the labelled employee records.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
employee_data_df = pd.read_pickle("attrition_available_26.pkl")

# Target column on one side, every other column as predictors on the other.
y = employee_data_df['Attrition']
X = employee_data_df.drop(columns='Attrition')

target_names = y.unique()

# Label-encode the non-numeric predictors. The raw values are kept in
# `categorical_cols_copy` so the original categories remain available.
categorical_cols = list(X.select_dtypes(include=['object', 'category', 'bool']).columns)
categorical_cols_copy = X[categorical_cols].copy()
le = LabelEncoder()

# `le` is refit on every column, so after the loop it only holds the
# mapping of the last column that was encoded.
for cat_col in categorical_cols:
    X[cat_col] = le.fit_transform(X[cat_col])

# Columns with a single distinct value carry no information; EmployeeID is
# an identifier rather than a predictor.
sg_val_cols = [
    name for name in employee_data_df.columns
    if employee_data_df[name].nunique() == 1
]
sg_val_cols.append('EmployeeID')
X = X.drop(columns=sg_val_cols)

# Report the resulting feature matrix.
print("Shape of X (features):", X.shape)

feature_names = list(X.columns)
print("Feature names:", feature_names)
print("Target groups:", target_names)

Shape of X (features): (1426, 26)
Feature names: ['hrs', 'absences', 'JobInvolvement', 'PerformanceRating', 'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Target groups: ['Yes' 'No']


Now we can assemble a "preprocessor" pipeline to streamline our workflow and minimize data leakage.

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

# Columns handed to the scaler. The categorical predictors of X were
# label-encoded to integers earlier, so this dtype filter is expected to
# match every remaining column -- TODO confirm.
numerical_cols = list(X.select_dtypes(exclude=['object', 'category', 'bool']).columns)

# FROM CHATGPT: Transformer to convert NumPy array back to DataFrame
# prompted based on errors received when converting our preprocessing to a pipeline
class ToDataFrame(BaseEstimator, TransformerMixin):
    """Stateless transformer that wraps its input in a labelled DataFrame.

    Parameters
    ----------
    columns : list of str
        Column labels assigned to the resulting DataFrame.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` as a ``pandas.DataFrame`` labelled with ``self.columns``."""
        return pd.DataFrame(data=X, columns=self.columns)

preprocessor = Pipeline(steps=[
    # fill missing values before scaling
    ('iterative_imputer', IterativeImputer()),
    # the imputer output is relabelled so the scaler can select columns by name
    ('to_dataframe', ToDataFrame(columns=numerical_cols)),
    # Only the columns in `numerical_cols` are standardized; anything else
    # would pass through untouched. NOTE(review): because the categoricals
    # are already integer-encoded, they appear to be scaled as well.
    ('scale', ColumnTransformer(
        transformers=[('num', StandardScaler(), numerical_cols)],
        remainder='passthrough',
    )),
])

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (seeded with NIU), then recode the
# 'No'/'Yes' target as 0/1 for use with XGBoost.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=NIU,
)

attrition_to_int = {'No': 0, 'Yes': 1}
y_trainXG = y_train.map(attrition_to_int)
y_testXG = y_test.map(attrition_to_int)

## 2. Feature Selection

In our pipeline, we will incorporate **feature selection** using two approaches.

In [6]:
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# Objects shared by the cells of this section.
XGB = XGBClassifier(random_state=NIU)
# NOTE(review): `scaler` does not appear to be referenced by later cells --
# confirm before removing it.
scaler = StandardScaler()

### SelectKBest and criterion f_classif

In [7]:
from sklearn.base import clone

selector_f = SelectKBest(f_classif)

# include model and f selector in pipeline
# Pipeline.fit() fits its steps in place, so the shared `preprocessor` and
# `XGB` templates are cloned: fitting another pipeline built from the same
# templates can then no longer overwrite the fitted state of this one.
xgb_pipe_full_f = Pipeline([
  ('preprocessing', clone(preprocessor)),
  ('select', selector_f),
  ('xgb', clone(XGB))
])

# train and evaluate
xgb_pipe_full_f.fit(X_train, y_trainXG)
y_pred_f = xgb_pipe_full_f.predict(X_test)

# Report which features the selector kept (by mask, by position, and by name).
print("For f_classif:\n")
print(f"Features selected boolean array: {xgb_pipe_full_f.named_steps['select'].get_support()}")
print(f"Locations where features selected: {np.where(xgb_pipe_full_f.named_steps['select'].get_support())}")
# Names are taken from the fitted 'scale' step so they match the columns the
# selector actually received.
feature_names_before_selection = xgb_pipe_full_f.named_steps['preprocessing'].named_steps['scale'].get_feature_names_out()
feature_names_after_selection = xgb_pipe_full_f.named_steps['select'].get_feature_names_out(feature_names_before_selection)
print(f"Number of features selected: {len(feature_names_after_selection)}")
print(f"Feature names after selection: {feature_names_after_selection}")

For f_classif:

Features selected boolean array: [ True False False False  True  True  True  True False False False False
  True False False False  True False False False False  True False  True
 False  True]
Locations where features selected: (array([ 0,  4,  5,  6,  7, 12, 16, 21, 23, 25]),)
Number of features selected: 10
Feature names after selection: ['num__hrs' 'num__EnvironmentSatisfaction' 'num__JobSatisfaction'
 'num__WorkLifeBalance' 'num__Age' 'num__EducationField'
 'num__MaritalStatus' 'num__TotalWorkingYears' 'num__YearsAtCompany'
 'num__YearsWithCurrManager']


### SelectKBest and criterion mutual_info_classif

In [8]:
from sklearn.base import clone

selector_mutual = SelectKBest(mutual_info_classif)

# include model and mutual_info selector in pipeline
# Pipeline.fit() fits its steps in place, so the shared `preprocessor` and
# `XGB` templates are cloned: fitting this pipeline then cannot overwrite
# the fitted steps of any other pipeline built from the same templates.
xgb_pipe_full_mutual = Pipeline([
  ('preprocessing', clone(preprocessor)),
  ('select', selector_mutual),
  ('xgb', clone(XGB))
])

# train and evaluate
xgb_pipe_full_mutual.fit(X_train, y_trainXG)
y_pred_mutual = xgb_pipe_full_mutual.predict(X_test)

# Report which features the selector kept (by mask, by position, and by name).
print("For mutual_info_classif:\n")
print(f"Features selected boolean array: {xgb_pipe_full_mutual.named_steps['select'].get_support()}")
print(f"Locations where features selected: {np.where(xgb_pipe_full_mutual.named_steps['select'].get_support())}")
# Names are taken from the fitted 'scale' step so they match the columns the
# selector actually received.
feature_names_before_selection = xgb_pipe_full_mutual.named_steps['preprocessing'].named_steps['scale'].get_feature_names_out()
feature_names_after_selection = xgb_pipe_full_mutual.named_steps['select'].get_feature_names_out(feature_names_before_selection)
print(f"Number of features selected: {len(feature_names_after_selection)}")
print(f"Feature names after selection: {feature_names_after_selection}")

For mutual_info_classif:

Features selected boolean array: [ True False False False False False False  True  True False  True False
 False False False False False  True False False  True  True False  True
  True  True]
Locations where features selected: (array([ 0,  7,  8, 10, 17, 20, 21, 23, 24, 25]),)
Number of features selected: 10
Feature names after selection: ['num__hrs' 'num__Age' 'num__BusinessTravel' 'num__DistanceFromHome'
 'num__MonthlyIncome' 'num__StockOptionLevel' 'num__TotalWorkingYears'
 'num__YearsAtCompany' 'num__YearsSinceLastPromotion'
 'num__YearsWithCurrManager']


### Evaluation

In [9]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the two default-k (k=10) feature-selection pipelines.
acc_f = accuracy_score(y_true=y_testXG, y_pred=y_pred_f)
acc_mutual = accuracy_score(y_true=y_testXG, y_pred=y_pred_mutual)

print("f_classif:")
print(f"Accuracy of the XGBoost model with f_classif feature selection: {round(acc_f*100,4)}%\n")

print("mutual_info_classif:")
print(f"Accuracy of the XGBoost model with mutual_info_classif feature selection: {round(acc_mutual*100,4)}%")

f_classif:
Accuracy of the XGBoost model with f_classif feature selection: 91.3165%

mutual_info_classif:
Accuracy of the XGBoost model with mutual_info_classif feature selection: 90.1961%


We observe that both models perform well with feature selection, with the `f_classif` criterion yielding a slightly better accuracy.

## 3. Grid Search (HPO)

In this section, the only hyper-parameter we will tune is `k`, the number of features selected, for each approach.

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Candidate values for k: every integer from 1 up to AND INCLUDING the total
# number of features. range() excludes its upper bound, hence the "+ 1";
# without it the "keep every feature" option is never evaluated.
param_grid = {'select__k': list(range(1, len(X.columns) + 1))}

# 3-fold shuffled cross-validation, seeded for reproducibility
inner = KFold(n_splits=3, shuffle=True, random_state=NIU)

### Tuning `k`

In [11]:
# feature selection with criterion f_classif
tune_scale_XGB_f = GridSearchCV(
    estimator=xgb_pipe_full_f,
    param_grid=param_grid,
    cv=inner,
    scoring="accuracy",
)

# run the grid search on the training split only
tune_scale_XGB_f.fit(X_train, y_trainXG)

# the best pipeline found by the search is then scored on the held-out test split
trained_pipeline_f = tune_scale_XGB_f.best_estimator_
y_pred_HPO_f = trained_pipeline_f.predict(X_test)

In [12]:
# feature selection with criterion mutual_info_classif
tune_scale_XGB_mutual = GridSearchCV(
    estimator=xgb_pipe_full_mutual,
    param_grid=param_grid,
    cv=inner,
    scoring="accuracy",
)

# run the grid search on the training split only
tune_scale_XGB_mutual.fit(X_train, y_trainXG)

# the best pipeline found by the search is then scored on the held-out test split
trained_pipeline_mutual = tune_scale_XGB_mutual.best_estimator_
y_pred_HPO_mutual = trained_pipeline_mutual.predict(X_test)

## 4. `k` HPO Evaluation

### Analyzing Features Selected

In [13]:
def _feature_names_around_selection(fitted_pipe):
    """Return (names entering 'select', names kept by 'select') for a fitted pipeline."""
    scale_step = fitted_pipe.named_steps['preprocessing'].named_steps['scale']
    names_in = scale_step.get_feature_names_out()
    names_kept = fitted_pipe.named_steps['select'].get_feature_names_out(names_in)
    return names_in, names_kept


print("For f_classif with Grid Search HPO:\n")
feature_names_before_selection_f, feature_names_after_selection_f = _feature_names_around_selection(trained_pipeline_f)
print(f"Number of features selected: {len(feature_names_after_selection_f)}")
print(f"Names of features selected: {feature_names_after_selection_f}\n")


print("==========\n")

print("For mutual_info_classif with Grid Search HPO:\n")
feature_names_before_selection_mut, feature_names_after_selection_mut = _feature_names_around_selection(trained_pipeline_mutual)
print(f"Number of features selected: {len(feature_names_after_selection_mut)}")
print(f"Names of features selected: {feature_names_after_selection_mut}")

For f_classif with Grid Search HPO:

Number of features selected: 21
Names of features selected: ['num__hrs' 'num__absences' 'num__PerformanceRating'
 'num__EnvironmentSatisfaction' 'num__JobSatisfaction'
 'num__WorkLifeBalance' 'num__Age' 'num__Department'
 'num__DistanceFromHome' 'num__Education' 'num__EducationField'
 'num__JobLevel' 'num__MaritalStatus' 'num__MonthlyIncome'
 'num__NumCompaniesWorked' 'num__StockOptionLevel'
 'num__TotalWorkingYears' 'num__TrainingTimesLastYear'
 'num__YearsAtCompany' 'num__YearsSinceLastPromotion'
 'num__YearsWithCurrManager']


For mutual_info_classif with Grid Search HPO:

Number of features selected: 25
Names of features selected: ['num__hrs' 'num__JobInvolvement' 'num__PerformanceRating'
 'num__EnvironmentSatisfaction' 'num__JobSatisfaction'
 'num__WorkLifeBalance' 'num__Age' 'num__BusinessTravel' 'num__Department'
 'num__DistanceFromHome' 'num__Education' 'num__EducationField'
 'num__Gender' 'num__JobLevel' 'num__JobRole' 'num__MaritalStatus'


### Comparison

In [14]:
# Test-set accuracy (in percent) of the two pipelines after tuning k.
acc_HPO_f_pct = round(accuracy_score(y_testXG, y_pred_HPO_f)*100,4)
acc_HPO_mutual_pct = round(accuracy_score(y_testXG, y_pred_HPO_mutual)*100,4)

print(f"Accuracy of the XGBoost model with f_classif feature selection and HPO for k: {acc_HPO_f_pct}%\n")

print(f"Accuracy of the XGBoost model with mutual_info_classif feature selection and HPO for k: {acc_HPO_mutual_pct}%")

Accuracy of the XGBoost model with f_classif feature selection and HPO for k: 93.8375%

Accuracy of the XGBoost model with mutual_info_classif feature selection and HPO for k: 91.5966%


Both models with HPO for the number of features selected (`k`) produce an improved accuracy from the best approach without HPO. Using accuracy as our metric, we find that the `f_classif` criterion continues to give the highest score.

Compared to the previous assignment, our results so far are promising. Our model with feature selection and tuning for `k` beats the default XGBoost model from the previous assignment: the accuracy increased from 92.7171% to 93.8375%, an **improvement of 1.1204 percentage points**. However, our model is still outperformed by the final XGBoost model with HPO on its parameters, which had an accuracy of 94.1176%.

## 5. XGBoost with Optuna HPO

Now we will perform HPO on the hyper-parameters for XGBoost.

### Pipeline Set-up

In [15]:
# Freeze k at the value the f_classif grid search settled on; the tuned
# selector keeps exactly that many features.
best_k = tune_scale_XGB_f.best_params_['select__k']
selector_f = SelectKBest(score_func=f_classif, k=best_k)

In [16]:
!pip install optuna
!pip install optuna-integration --user
!pip install optuna-integration[sklearn]



In [17]:
# FROM ASSIGNMENT 1
from optuna.distributions import IntDistribution as IntDist, FloatDistribution as FltDist

# Search space for the XGBoost step; the 'xgb__' prefix routes each value
# to the pipeline step named 'xgb'.
param_optuna_xgb = dict(
    xgb__n_estimators=IntDist(50, 300),     # Boosting rounds
    xgb__max_depth=IntDist(3, 10),          # Controls model complexity
    xgb__learning_rate=FltDist(0.01, 0.3),  # Lower values require more boosting rounds
    xgb__subsample=FltDist(0.6, 1),         # Helps prevent overfitting by using a subset of data
    xgb__colsample_bytree=FltDist(0.6, 1),  # Controls feature sampling
)

In [18]:
# Rebuild the model pipeline so it picks up the selector with the fixed k.
xgb_pipe_full_f = Pipeline(steps=[
  ('preprocessing', preprocessor),
  ('select', selector_f),
  ('xgb', XGB),
])

### Optuna Approach

In [19]:
import optuna

# perform Optuna search with 50 trials
# NOTE(review): a 600 s timeout is also set, so fewer than 50 trials may run
# on a slow machine. No `cv` is passed, so the library default is used
# rather than the 3-fold `inner` splitter -- confirm this is intended.
opt_budget = 50
optuna_xgb_clf = optuna.integration.OptunaSearchCV(
    xgb_pipe_full_f,
    param_optuna_xgb,
    n_trials=opt_budget,
    timeout=600,
    scoring='accuracy',
    random_state=NIU,
    refit=True,
    n_jobs=1,
    verbose=1,
)

# fit the search on the training split, then predict the held-out test split
optuna_xgb_clf.fit(X_train, y_trainXG)
y_pred = optuna_xgb_clf.predict(X_test)

  optuna_xgb_clf = optuna.integration.OptunaSearchCV(xgb_pipe_full_f,
[I 2024-12-13 22:52:22,171] A new study created in memory with name: no-name-03af3771-3c5a-4cfa-a653-e8b0cb8532aa
INFO:optuna_integration.sklearn.sklearn:Searching the best hyperparameters using 1069 samples...
[I 2024-12-13 22:52:30,141] Trial 0 finished with value: 0.7512131981922688 and parameters: {'xgb__n_estimators': 109, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.018824661554805402, 'xgb__subsample': 0.6544935572451392, 'xgb__colsample_bytree': 0.8298472582221841}. Best is trial 0 with value: 0.7512131981922688.
[I 2024-12-13 22:52:48,081] Trial 1 finished with value: 0.8999341845465315 and parameters: {'xgb__n_estimators': 122, 'xgb__max_depth': 10, 'xgb__learning_rate': 0.07049977690311085, 'xgb__subsample': 0.8418091483754582, 'xgb__colsample_bytree': 0.7940711376847844}. Best is trial 1 with value: 0.8999341845465315.
[I 2024-12-13 22:53:01,621] Trial 2 finished with value: 0.8896581984116538 and paramet

In [20]:
# view the features selected
# The kept positions are looked up in `feature_names`; this assumes the
# selector receives the columns in that same order -- TODO confirm.
fitted_selector = optuna_xgb_clf.best_estimator_.named_steps['select']
Locations = fitted_selector.get_support(indices=True)
selected_features = [feature_names[position] for position in Locations]

print("Selected Features:", selected_features)
print("Number of features selected:", len(selected_features))

Selected Features: ['hrs', 'absences', 'PerformanceRating', 'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'JobLevel', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Number of features selected: 21


## 6. Examining Results

In [21]:
# Test-set accuracy of the Optuna-tuned pipeline.
final_acc = accuracy_score(y_true=y_testXG, y_pred=y_pred)
final_acc_pct = round(final_acc*100, 4)
print(f"XGBoost model accuracy with f_classif criterion and HPO: {final_acc_pct}%")

XGBoost model accuracy with f_classif criterion and HPO: 94.1176%


After additionally applying Optuna HPO, we obtain an improved accuracy for our XGBoost model compared to using default hyper-parameters. On the testing dataset, our accuracy is now 94.1176%, up 0.2801 percentage points from 93.8375%.

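However, compared to our final model produced from Assignment 1, we observe that our accuracy scores are exactly the same. The confidence intervals therefore coincide as well, because the Wilson interval computed below depends only on the accuracy and the size of the test set.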

In [22]:
from statsmodels.stats.proportion import proportion_confint

# Wilson interval for the test accuracy; the number of correct predictions
# is recovered as accuracy * test-set size.
n_test_samples = len(y_testXG)
conf_int_XGB = proportion_confint(
    count=n_test_samples * final_acc,
    nobs=n_test_samples,
    method="wilson",
)
# The Assignment 1 interval is quoted verbatim for comparison.
print("XGB with HPO (Assignment 1) -- (0.9117523092058187, 0.9612072476851393)")
print(f"XGB with feature selection and HPO (Assignment 2) -- {conf_int_XGB}")

XGB with HPO (Assignment 1) -- (0.9117523092058187, 0.9612072476851393)
XGB with feature selection and HPO (Assignment 2) -- (0.9117523092058187, 0.9612072476851393)


Interestingly, these values were obtained with different hyper-parameters for our XGBoost model.

In [23]:
# Assignment 1 values are quoted verbatim; the Assignment 2 values are shown
# as the cell's rich output (last expression).
print("Assignment 1, best_params_:")
print(
    "{'n_estimators': 157,\n"
    "'max_depth': 9,\n"
    "'learning_rate': 0.09811338700649763,\n"
    "'subsample': 0.9248027925331067,\n"
    "'colsample_bytree': 0.7464780338351065}\n"
)
print("Assignment 2, best_params_:")
optuna_xgb_clf.best_params_

Assignment 1, best_params_:
{'n_estimators': 157,
'max_depth': 9,
'learning_rate': 0.09811338700649763,
'subsample': 0.9248027925331067,
'colsample_bytree': 0.7464780338351065}

Assignment 2, best_params_:


{'xgb__n_estimators': 69,
 'xgb__max_depth': 10,
 'xgb__learning_rate': 0.07694005923017846,
 'xgb__subsample': 0.9949307290731089,
 'xgb__colsample_bytree': 0.611334480883694}

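Acknowledging this, the identical accuracies arise because both models are evaluated on the same test split: they classify the same number of test samples correctly, even though they do so with different hyper-parameters. While the accuracy was not improved, the new model relies on fewer features, so we expect it to generalize better and perform better on new data, although this remains to be verified.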

## 7. Competition Set and Final Model

We can now train our final model and make predictions against the competition dataset.

In [24]:
from sklearn.base import clone

# creating our final model from the hyper-parameters found by Optuna
tuned_params = optuna_xgb_clf.best_params_
final_model = XGBClassifier(
  n_estimators=tuned_params['xgb__n_estimators'],
  max_depth=tuned_params['xgb__max_depth'],
  learning_rate=tuned_params['xgb__learning_rate'],
  subsample=tuned_params['xgb__subsample'],
  colsample_bytree=tuned_params['xgb__colsample_bytree'],
  random_state=NIU
)

# reassembling the pipeline
# Pipeline.fit() fits its steps in place. The preprocessor and selector are
# cloned so the final pipeline owns its own copies and fitting it does not
# modify the instances still held by the earlier pipelines.
final_pipe = Pipeline([
  ('preprocessing', clone(preprocessor)),
  ('select', clone(selector_f)),
  ('xgb', final_model)
])

# train using the entire dataset (train + test), with the target recoded to 0/1
y_XG = y.map({'No': 0, 'Yes': 1})
final_pipe.fit(X, y_XG)

Upload the competition dataset, if not already at the root of the directory, called "attrition_compet_26.pkl".

In [25]:
# Ask for a manual upload only when the competition pickle is not already
# present in the working directory.
competition_file_missing = not os.path.exists("attrition_compet_26.pkl")
if competition_file_missing:
  uploaded = files.upload()

Saving attrition_compet_26.pkl to attrition_compet_26.pkl


We will preprocess our data so that it is in the same structure as before.

In [26]:
# Read the competition dataset
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
X_compet = pd.read_pickle("attrition_compet_26.pkl")

# Convert categorical columns to numeric representations.
# Each encoder is fit on the raw TRAINING values (kept in
# `categorical_cols_copy`) and only applied to the competition data, so a
# category gets the same integer code the model was trained with. Refitting
# on the competition data could assign different codes whenever its set of
# categories differs from the training set. A category never seen during
# training raises a ValueError instead of being silently mis-coded.
for col in categorical_cols:
    train_encoder = LabelEncoder().fit(categorical_cols_copy[col])
    X_compet[col] = train_encoder.transform(X_compet[col])

# Drop constant and irrelevant columns
X_compet = X_compet.drop(columns=sg_val_cols)

# The fitted pipeline expects the training columns in the training order.
if list(X_compet.columns) != list(X.columns):
    raise ValueError("Competition columns do not match the training features")

# Display the shape of the features
print("Shape of X (features):", X_compet.shape)

# Feature names and target options
feature_names = X_compet.columns.tolist()
print("Feature names:", feature_names)
print("Target groups:", target_names)

Shape of X (features): (200, 26)
Feature names: ['hrs', 'absences', 'JobInvolvement', 'PerformanceRating', 'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Target groups: ['Yes' 'No']


Finally, we can make predictions, saving our model using joblib and the predictions in csv format.

In [27]:
# Predict the competition rows and translate the 0/1 classes back to the
# original 'No'/'Yes' labels.
class_to_label = {0: 'No', 1: 'Yes'}
y_competition_pred = pd.Series(final_pipe.predict(X_compet)).map(class_to_label)

# Write one row per prediction to a .csv file; 'Id' is the row position in
# the competition set.
competition_results = pd.DataFrame({
    'Id': range(len(y_competition_pred)),
    'Prediction': y_competition_pred,
})
competition_results.to_csv('assignment2_competition_predictions.csv', index=False)

In [28]:
from joblib import dump, load

# Persist the fitted final pipeline (preprocessing + selection + model) to disk.
dump(value=final_pipe, filename='assignment2_final_model.joblib')

['assignment2_final_model.joblib']

In [29]:
# Restore the saved pipeline from disk.
# NOTE(review): joblib files are pickle-based -- only load files you trust.
final_pipe_reloaded = load(filename='assignment2_final_model.joblib')