In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Colab installation cell (run once)
!pip install xgboost==1.7.6 ngboost==0.5.2 joblib --quiet

# ======================================
# 0) Imports & helper functions
# ======================================
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ngboost import NGBClassifier, NGBoost
from ngboost.distns import k_categorical  # for discrete multiclass via NGBoost
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report, hamming_loss)
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')
print("Imports ready.")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m200.3/200.3 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m349.3/349.3 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m117.3/117.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
Imports ready.


In [None]:
# ======================================
# 1) Load dataset (update path if necessary)
# ======================================
# Location of the raw CSV on the mounted Drive; point this at your own copy.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/SIS3044 - Sustainable Information System/Extended_Employee_Performance_and_Productivity_Data.csv'

# Read the whole file into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(f"Loaded: {df.shape}")

# Show the first two records as the cell's rich output.
df.head(n=2)

Loaded: (100000, 20)


Unnamed: 0,Employee_ID,Department,Gender,Age,Job_Title,Hire_Date,Years_At_Company,Education_Level,Performance_Score,Monthly_Salary,Work_Hours_Per_Week,Projects_Handled,Overtime_Hours,Sick_Days,Remote_Work_Frequency,Team_Size,Training_Hours,Promotions,Employee_Satisfaction_Score,Resigned
0,1,IT,Male,55,Specialist,2022-01-19 08:03:05.556036,2,High School,5,6750.0,33,32,22,2,0,14,66,0,2.63,False
1,2,Finance,Male,29,Developer,2024-04-18 08:03:05.556036,0,High School,5,7500.0,34,34,13,14,100,12,61,2,1.72,False


In [None]:
# ======================================
# 2) Minimal (reproducible) cleaning & feature engineering
#    (This mirrors what you've been doing; edit if you already have cleaned df)
# ======================================
# Keep a snapshot of the frame as it was before cleaning, then work on a separate copy.
df_original = df.copy()
df = df.copy()

# The raw target must never be imputed: filling a missing label with the
# median/mode fabricates a class for that row. Rows without a label are left
# missing here so they can be dropped when the target is prepared.
target_col = 'Performance_Score'

# Numeric / categorical lists
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Fill numeric with median (target excluded)
for c in numeric_cols:
    if c == target_col:
        continue
    df[c] = df[c].fillna(df[c].median())

# Fill categorical with mode (target excluded)
for c in cat_cols:
    if c == target_col:
        continue
    modes = df[c].mode()
    # mode() is empty when the whole column is missing; there is nothing to fill with then.
    if not modes.empty:
        df[c] = df[c].fillna(modes.iloc[0])

# Feature engineering (salary per hour, attendance rate) - only if missing
if 'Salary_Per_Hour' not in df.columns and {'Monthly_Salary','Work_Hours_Per_Week'}.issubset(df.columns):
    # Zero weekly hours would produce +/-inf, which StandardScaler and the linear
    # models reject. Turn those into NaN so the feature-matrix imputation handles them.
    weekly_hours = df['Work_Hours_Per_Week'].replace(0, np.nan)
    df['Salary_Per_Hour'] = df['Monthly_Salary'] / weekly_hours

if 'Attendance_Rate' not in df.columns and 'Sick_Days' in df.columns:
    # 260 is used as the number of working days per year; result is a percentage in [0, 100].
    df['Attendance_Rate'] = (1 - (df['Sick_Days'] / 260)) * 100
    df['Attendance_Rate'] = df['Attendance_Rate'].clip(0, 100)

# If Performance_Score is textual, normalize and map ordinal if needed.
# We will treat Performance_Score as class labels. If it's numeric 1-5, keep as-is.
print("Columns after FE:", df.columns.tolist())

Columns after FE: ['Employee_ID', 'Department', 'Gender', 'Age', 'Job_Title', 'Hire_Date', 'Years_At_Company', 'Education_Level', 'Performance_Score', 'Monthly_Salary', 'Work_Hours_Per_Week', 'Projects_Handled', 'Overtime_Hours', 'Sick_Days', 'Remote_Work_Frequency', 'Team_Size', 'Training_Hours', 'Promotions', 'Employee_Satisfaction_Score', 'Resigned', 'Salary_Per_Hour', 'Attendance_Rate']


In [None]:
# ======================================
# 3) Target / Label preparation
#    Ensure we have a classification target named 'Performance_Class' (integers starting at 0)
# ======================================
# Fail fast when the raw target column is absent; nothing below can work without it.
if 'Performance_Score' not in df.columns:
    raise ValueError("No column named 'Performance_Score' found. Please provide target column.")

if df['Performance_Score'].dtype == 'object':
    # Textual scores: normalise whitespace and case before looking them up.
    df['Performance_Score'] = df['Performance_Score'].astype(str).str.strip().str.lower()
    # Known spellings -> ordinal 1..5; extend this table if your labels differ.
    mapping = {
        'low':1, 'below average':2, 'average':3, 'high':4, 'outstanding':5,
        '1':1, '2':2, '3':3, '4':4, '5':5
    }
    df['Performance_Score_Mapped'] = df['Performance_Score'].map(mapping)
    if df['Performance_Score_Mapped'].notna().all():
        # Every label was recognised: shift the ordinal 1..5 down to 0..4.
        df['Performance_Class'] = df['Performance_Score_Mapped'].astype(int) - 1
    else:
        # At least one unknown label: label-encode the normalised strings instead.
        le = LabelEncoder()
        df['Performance_Class'] = le.fit_transform(df['Performance_Score'].astype(str))
else:
    # Numeric scores.
    vals = sorted(df['Performance_Score'].unique())
    if set(vals) <= {1,2,3,4,5}:
        # Values drawn from 1..5 are shifted down by one.
        df['Performance_Class'] = df['Performance_Score'].astype(int) - 1
    else:
        # Any other numeric coding: map the distinct values to 0..K-1.
        le = LabelEncoder()
        df['Performance_Class'] = le.fit_transform(df['Performance_Score'])

print("Target classes:", sorted(df['Performance_Class'].unique()))

Target classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]


In [None]:
# ======================================
# 4) Feature selection for modeling
#    Choose numeric features; you may include encoded categorical features if desired.
# ======================================
# Columns we would like to model on (extend this list for extra engineered features).
candidate_features = [
    'Projects_Handled','Overtime_Hours','Attendance_Rate','Salary_Per_Hour',
    'Training_Hours','Employee_Satisfaction_Score','Work_Hours_Per_Week','Monthly_Salary'
]
# Retain, in the same order, only the candidates that exist in the frame.
features = [col for col in candidate_features if col in df.columns]
print("Features used:", features)

# Rows without a target cannot be used for supervised training.
df = df.dropna(subset=['Performance_Class'])

# Label vector and feature matrix, copied so later edits do not touch df.
y = df['Performance_Class'].astype(int).copy()
X = df.loc[:, features].copy()

# Safety net: any remaining gaps are filled with the per-column median.
X = X.fillna(value=X.median())

Features used: ['Projects_Handled', 'Overtime_Hours', 'Attendance_Rate', 'Salary_Per_Hour', 'Training_Hours', 'Employee_Satisfaction_Score', 'Work_Hours_Per_Week', 'Monthly_Salary']


In [None]:
# ======================================
# 5) Train-test split (stratified)
# ======================================
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.25, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Train/test sizes: {X_train.shape} {X_test.shape}")

Train/test sizes: (75000, 8) (25000, 8)


In [None]:
# ======================================
# 6) Scaling (for models that need it)
#    Scaled copies are used by the logistic regression; tree models use the raw features.
# ======================================
# Learn mean/variance on the training split only, then apply to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler next to the models so inference can reuse it.
os.makedirs('models', exist_ok=True)
joblib.dump(scaler, 'models/scaler.joblib')
print("Scaler saved.")

Scaler saved.


In [None]:
# ======================================
# 7) Model training & evaluation helpers
# ======================================
def evaluate_model(name, model, X_tr, X_te, y_tr, y_te, use_scaled=False):
    """Fit ``model`` on the training split, score it on the test split, print a report.

    Parameters
    ----------
    name : str
        Label printed in the report header and stored in the returned dict.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_tr, X_te : array-like
        Training / test feature matrices. This function never scales anything
        itself: pass already-scaled matrices for models that need scaling.
    y_tr, y_te : array-like
        Training / test labels.
    use_scaled : bool, default False
        Accepted for backward compatibility only. It has no effect; both
        settings fit and predict on exactly the matrices that were passed in.

    Returns
    -------
    dict
        Keys ``name``, ``model`` (the fitted estimator), ``accuracy``,
        ``precision``, ``recall``, ``f1`` (all macro-averaged except accuracy)
        and ``hamming``.
    """
    # Fit once and predict once; the previous scaled/unscaled branches were identical.
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    # Macro averaging weights every class equally; zero_division=0 scores a
    # class with no predicted samples as 0.
    acc = accuracy_score(y_te, y_pred)
    prec = precision_score(y_te, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_te, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_te, y_pred, average='macro', zero_division=0)
    ham = hamming_loss(y_te, y_pred)

    print(f"---- {name} ----")
    print("Accuracy: {:.4f}  Precision(macro): {:.4f}  Recall(macro): {:.4f}  F1(macro): {:.4f}  Hamming: {:.4f}".format(acc,prec,rec,f1,ham))
    print("Confusion matrix:")
    print(confusion_matrix(y_te, y_pred))
    print("Classification report:")
    print(classification_report(y_te, y_pred, zero_division=0))
    print()
    return {'name':name, 'model':model, 'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1, 'hamming':ham}

In [None]:
# ======================================
# 8) Train models (Logistic Regression, RandomForest, XGBoost, NGBoost, QDA-as-QRM)
# ======================================
# Each entry is the dict returned by evaluate_model (name, fitted model, metrics).
results = []

# 8.1 Logistic Regression (scaled)
# class_weight='balanced' re-weights classes inversely to their frequency.
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
results.append(evaluate_model("LogisticRegression", lr, X_train_scaled, X_test_scaled, y_train, y_test, use_scaled=True))
joblib.dump(lr, 'models/logistic_regression.joblib')

# 8.2 Random Forest (no scaling required)
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced', n_jobs=-1)
results.append(evaluate_model("RandomForest", rf, X_train, X_test, y_train, y_test, use_scaled=False))
joblib.dump(rf, 'models/random_forest.joblib')

# 8.3 XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, verbosity=0)
results.append(evaluate_model("XGBoost", xgb, X_train, X_test, y_train, y_test, use_scaled=False))
joblib.dump(xgb, 'models/xgboost.joblib')

# 8.4 NGBoost (probabilistic)
# Multiclass NGBoost needs the classifier wrapper with a k_categorical(K)
# distribution. The plain NGBoost class defaults to a Normal (regression)
# distribution, whose continuous predictions cannot be scored with
# classification metrics.
try:
    # Labels are integers starting at 0, so K = max label + 1.
    n_classes = int(y_train.max()) + 1
    ngb = NGBClassifier(Dist=k_categorical(n_classes), n_estimators=200, verbose=False, random_state=42)
    # Plain numpy arrays are passed for fitting/prediction, as before; scoring
    # goes through the shared helper so every model is reported the same way.
    results.append(evaluate_model("NGBoost", ngb, X_train.values, X_test.values, y_train.values, y_test))
    joblib.dump(ngb, 'models/ngboost.joblib')
except Exception as e:
    # Best effort: NGBoost is optional, so a failure here must not abort the other models.
    # repr() keeps the exception type visible in the message.
    print("NGBoost training failed:", repr(e))

# 8.5 QRM approximation using QuadraticDiscriminantAnalysis (QDA)
# NOTE: QRM (Quadratic Risk Minimization) is not a standard sklearn classifier.
# QDA is a quadratic classifier (Gaussian-based) and used here as a defendable quadratic-method baseline.
qda = QuadraticDiscriminantAnalysis()
results.append(evaluate_model("QDA (QRM-approx)", qda, X_train, X_test, y_train, y_test))
joblib.dump(qda, 'models/qda_qrm_approx.joblib')

# Save results summary (best macro-F1 first)
res_df = pd.DataFrame(results).sort_values(by='f1', ascending=False)
display(res_df)
res_df.to_csv('models/model_results_summary.csv', index=False)
print("All trained models saved to /content/models/")

---- LogisticRegression ----
Accuracy: 0.3091  Precision(macro): 0.2806  Recall(macro): 0.3089  F1(macro): 0.2768  Hamming: 0.6909
Confusion matrix:
[[2865  612 1141  412    0]
 [2152  636  571  515 1129]
 [1428  674  694  432 1772]
 [1228  264  670  718 2105]
 [ 702  642  153  670 2815]]
Classification report:
              precision    recall  f1-score   support

           0       0.34      0.57      0.43      5030
           1       0.22      0.13      0.16      5003
           2       0.21      0.14      0.17      5000
           3       0.26      0.14      0.19      4985
           4       0.36      0.57      0.44      4982

    accuracy                           0.31     25000
   macro avg       0.28      0.31      0.28     25000
weighted avg       0.28      0.31      0.28     25000


---- RandomForest ----
Accuracy: 0.9355  Precision(macro): 0.9365  Recall(macro): 0.9355  F1(macro): 0.9350  Hamming: 0.0645
Confusion matrix:
[[4918  101    7    4    0]
 [ 708 4025    5    0  265

Unnamed: 0,name,model,accuracy,precision,recall,f1,hamming
2,XGBoost,"XGBClassifier(base_score=None, booster=None, c...",0.94092,0.941328,0.940922,0.940197,0.05908
1,RandomForest,"(DecisionTreeClassifier(max_features='sqrt', r...",0.93552,0.936532,0.935499,0.934962,0.06448
0,LogisticRegression,"LogisticRegression(class_weight='balanced', ma...",0.30912,0.280643,0.308914,0.276804,0.69088
3,QDA (QRM-approx),QuadraticDiscriminantAnalysis(),0.2892,0.252511,0.28889,0.254757,0.7108


All trained models saved to /content/models/


In [None]:
import joblib
import pandas as pd

def test_xgboost_return_df(new_data_path, model_path):
    """Score a new dataset with a saved model and return it with predictions attached.

    Parameters
    ----------
    new_data_path : str or path-like
        File with the rows to score. Paths ending in ``.csv`` are read with
        ``pandas.read_csv``; anything else is read with ``pandas.read_excel``.
    model_path : str or path-like
        Path to a joblib-serialised model exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        The loaded data plus any engineered feature columns that were missing
        and a ``Predicted_Performance`` column (model output shifted by +1).

    Raises
    ------
    KeyError
        If a column required by the model is missing and cannot be derived.
    """
    # Load model
    print("‚ö° Loading model...")
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only load model files you created or otherwise trust.
    model = joblib.load(model_path)

    # Load new dataset
    print("üì• Loading new data...")
    if str(new_data_path).lower().endswith('.csv'):
        df_new = pd.read_csv(new_data_path)
    else:
        df_new = pd.read_excel(new_data_path)

    # Compute engineered features if missing
    if 'Salary_Per_Hour' not in df_new.columns and {'Monthly_Salary', 'Work_Hours_Per_Week'}.issubset(df_new.columns):
        # Zero weekly hours would give +/-inf; use NaN so the median fill below handles it.
        weekly_hours = df_new['Work_Hours_Per_Week'].replace(0, float('nan'))
        df_new['Salary_Per_Hour'] = df_new['Monthly_Salary'] / weekly_hours

    if 'Attendance_Rate' not in df_new.columns and 'Sick_Days' in df_new.columns:
        df_new['Attendance_Rate'] = (1 - df_new['Sick_Days'] / 260) * 100
        df_new['Attendance_Rate'] = df_new['Attendance_Rate'].clip(0, 100)

    # Features used in training (same names and order as the training cell)
    features = [
        'Projects_Handled', 'Overtime_Hours', 'Attendance_Rate', 'Salary_Per_Hour',
        'Training_Hours', 'Employee_Satisfaction_Score', 'Work_Hours_Per_Week', 'Monthly_Salary'
    ]

    # Report every missing column at once instead of failing on the first lookup.
    missing = [col for col in features if col not in df_new.columns]
    if missing:
        raise KeyError(f"New data is missing required columns: {missing}")

    # Prepare features: fill gaps with the per-column median of the new data
    X_new = df_new[features].fillna(df_new[features].median())

    # Predict
    y_pred = model.predict(X_new)

    # Shift predictions from 0-4 to 1-5
    df_new['Predicted_Performance'] = y_pred + 1

    # Return full dataframe for inspection
    return df_new

In [None]:
# Score the spreadsheet with the saved XGBoost model.
# NOTE(review): this file shares its base name with the training CSV - confirm
# these rows were not part of training, otherwise the metrics will be optimistic.
result_df = test_xgboost_return_df(
    '/content/drive/MyDrive/Colab Notebooks/SIS3044 - Sustainable Information System/Extended_Employee_Performance_and_Productivity_Data.xlsx',
    '/content/models/xgboost.joblib',
)

# Actual vs. predicted score for the first ten employees
result_df.loc[:, ['Employee_ID', 'Performance_Score', 'Predicted_Performance']].head(n=10)

‚ö° Loading model...
üì• Loading new data...


Unnamed: 0,Employee_ID,Performance_Score,Predicted_Performance
0,1,5,5
1,2,5,5
2,3,3,3
3,4,2,2
4,5,2,2
5,6,3,3
6,7,5,5
7,8,2,2
8,9,2,2
9,10,1,1


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Cast the actual and predicted scores to plain integers so they compare cleanly.
y_true = result_df['Performance_Score'].astype(int)
y_pred = result_df['Predicted_Performance'].astype(int)

# Support-weighted averages; a class with no predictions scores 0 instead of warning.
weighted = dict(average='weighted', zero_division=0)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, **weighted)
recall = recall_score(y_true, y_pred, **weighted)
f1 = f1_score(y_true, y_pred, **weighted)

# Summary block (one print, same text as printing each line separately).
print(
    "Model Evaluation Metrics on New Dataset:\n"
    "---------------------------------------\n"
    f"Accuracy: {accuracy:.4f}\n"
    f"Precision: {precision:.4f}\n"
    f"Recall: {recall:.4f}\n"
    f"F1 Score: {f1:.4f}\n"
)

print("Classification Report:\n")
print(classification_report(y_true, y_pred, zero_division=0))

# Signed miss per row: positive means the model predicted a higher score than the actual one.
result_df['Error'] = result_df['Predicted_Performance'].sub(result_df['Performance_Score'])

Model Evaluation Metrics on New Dataset:
---------------------------------------
Accuracy: 0.9818
Precision: 0.9838
Recall: 0.9818
F1 Score: 0.9821

Classification Report:

              precision    recall  f1-score   support

           1       0.89      1.00      0.94         8
           2       1.00      0.95      0.97        20
           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00         8
           5       1.00      1.00      1.00        10

    accuracy                           0.98        55
   macro avg       0.98      0.99      0.98        55
weighted avg       0.98      0.98      0.98        55



In [None]:
from google.colab import files

# Download the fitted XGBoost model first, then the scaler.
for artifact_path in ("/content/models/xgboost.joblib", "/content/models/scaler.joblib"):
    files.download(artifact_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>