In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error
import pickle
import os

# Load dataset
try:
    data = pd.read_csv('INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.csv')
except FileNotFoundError:
    print("Error: 'INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.csv' not found.")
    exit()

# Drop non-useful columns
X = data.drop(columns=['PerformanceRating', 'EmpNumber'])
y = data['PerformanceRating']

# Convert categorical features to numerical with one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Save feature names for app.py
feature_names = X.columns.tolist()
os.makedirs('backend', exist_ok=True)
with open('backend/feature_names.pkl', 'wb') as f:
    pickle.dump(feature_names, f)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling (for X)
feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# Target Scaling (for y, to 1-100 range)
target_scaler = MinMaxScaler(feature_range=(1, 100))
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1)).ravel()

# Debug: Verify target scaling
print("Original y (first 5):", y.head().tolist())
print("y_train_scaled (first 5):", y_train_scaled[:5])
print("y_test_scaled (first 5):", y_test_scaled[:5])

# Define and train Random Forest with increased flexibility
model_rf = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)  # Increased max_depth
model_rf.fit(X_train_scaled, y_train_scaled)

# Define and train XGBoost
model_xg = XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
model_xg.fit(X_train_scaled, y_train_scaled)

# Evaluate models
rf_pred = model_rf.predict(X_test_scaled)
xg_pred = model_xg.predict(X_test_scaled)
print("Random Forest R²:", r2_score(y_test_scaled, rf_pred) * 100)
print("Random Forest MAE:", mean_absolute_error(y_test_scaled, rf_pred))
print("Random Forest Predictions (first 5):", rf_pred[:5])
print("XGBoost R²:", r2_score(y_test_scaled, xg_pred) * 100)
print("XGBoost MAE:", mean_absolute_error(y_test_scaled, xg_pred))
print("XGBoost Predictions (first 5):", xg_pred[:5])

# Feature Importance
rf_importance = pd.Series(model_rf.feature_importances_, index=feature_names).sort_values(ascending=False)
xg_importance = pd.Series(model_xg.feature_importances_, index=feature_names).sort_values(ascending=False)
print("Random Forest Top 10 Features:\n", rf_importance.head(10))
print("XGBoost Top 10 Features:\n", xg_importance.head(10))

# Save models and scalers
with open('backend/model_RF.pkl', 'wb') as f:
    pickle.dump(model_rf, f)
with open('backend/model_xg.pkl', 'wb') as f:
    pickle.dump(model_xg, f)
with open('backend/feature_scaler.pkl', 'wb') as f:
    pickle.dump(feature_scaler, f)
with open('backend/target_scaler.pkl', 'wb') as f:
    pickle.dump(target_scaler, f)

# Save mappings
mappings = {
    'Gender': {'Male': 1, 'Female': 0},
    'EducationBackground': {'Life Sciences': 1, 'Marketing': 2, 'Medical': 3, 'Other': 4, 'Technical Degree': 5, 'Human Resources': 6},
    'MaritalStatus': {'Single': 1, 'Married': 2, 'Divorced': 3},
    'EmpDepartment': {'Sales': 1, 'Human Resources': 2, 'Development': 3, 'Data Science': 4, 'Research & Development': 5, 'Finance': 6},
    'EmpJobRole': {'Sales Executive': 1, 'Developer': 2, 'Manager': 3, 'Research Scientist': 4, 'Human Resources': 5, 'Senior Developer': 6, 'Data Scientist': 7, 'Sales Representative': 8, 'Laboratory Technician': 9, 'Senior Manager R&D': 10, 'Finance Manager': 11, 'Technical Architect': 12, 'Business Analyst': 13, 'Technical Lead': 14, 'Research Director': 15, 'Delivery Manager': 16, 'Manager R&D': 17, 'Healthcare Representative': 18, 'Manufacturing Director': 19},
    'BusinessTravelFrequency': {'Travel_Rarely': 1, 'Travel_Frequently': 2, 'Non-Travel': 3},
    'OverTime': {'Yes': 1, 'No': 0},
    'Attrition': {'Yes': 1, 'No': 0}
}
with open('backend/mappings.pkl', 'wb') as f:
    pickle.dump(mappings, f)

print("Models, scalers, and mappings saved to 'backend/' directory.")

Original y (first 5): [3, 3, 4, 3, 3]
y_train_scaled (first 5): [ 1.  50.5 50.5 50.5 50.5]
y_test_scaled (first 5): [50.5  1.  50.5 50.5 50.5]
Random Forest R²: 73.2609448329315
Random Forest MAE: 5.509595366379311
Random Forest Predictions (first 5): [32.185  3.97  50.995 50.5   50.5  ]
XGBoost R²: 72.06781411308548
XGBoost MAE: 6.242448676214553
XGBoost Predictions (first 5): [38.334854  6.728392 49.7944   50.621037 50.476856]
Random Forest Top 10 Features:
 EmpLastSalaryHikePercent        0.233694
EmpEnvironmentSatisfaction      0.225229
YearsSinceLastPromotion         0.187019
ExperienceYearsInCurrentRole    0.067274
EmpDepartment_Development       0.050266
EmpWorkLifeBalance              0.043151
EmpJobRole_Developer            0.023623
Age                             0.018357
YearsWithCurrManager            0.017079
EmpHourlyRate                   0.014326
dtype: float64
XGBoost Top 10 Features:
 EmpDepartment_Development               0.148851
YearsSinceLastPromotion            