In [3]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from tpot import TPOTClassifier

df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

low_variance_cols = [col for col in df.columns if df[col].nunique() == 1]
df = df.drop(columns=low_variance_cols)

categorical_cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

numeric_cols = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 
                'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 
                'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 
                'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 
                'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 
                'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

X = df.drop(columns=['Attrition'])
y = LabelEncoder().fit_transform(df['Attrition'])

smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

pca = PCA(n_components=10, random_state=42)
X_pca = pca.fit_transform(X_balanced)
X_meta = pd.DataFrame(X_pca, columns=[f"PCA_{i}" for i in range(X_pca.shape[1])])

X_combined = pd.concat([pd.DataFrame(X_balanced), X_meta], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_combined, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=42,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
tpot.fit(X_train, y_train)

y_pred_tpot = tpot.predict(X_test)
print("TPOT Accuracy:", accuracy_score(y_test, y_pred_tpot))
print("Classification Report:\n", classification_report(y_test, y_pred_tpot))

# Save the best pipeline
tpot.export('best_pipeline.py')

mlp = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42)
stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', XGBClassifier(eval_metric='logloss', random_state=42)),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('mlp', mlp)
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)
stacking_clf.fit(X_train, y_train)

y_pred_stacking = stacking_clf.predict(X_test)
print("Stacking Accuracy:", accuracy_score(y_test, y_pred_stacking))
print("Classification Report:\n", classification_report(y_test, y_pred_stacking))


Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9159889419452124

Generation 2 - Current best internal CV score: 0.9165602747759067

Generation 3 - Current best internal CV score: 0.9206165703275531

Generation 4 - Current best internal CV score: 0.9206165703275531

Generation 5 - Current best internal CV score: 0.9229320599815699

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.1, max_depth=7, min_child_weight=7, n_estimators=100, n_jobs=1, subsample=0.9000000000000001, verbosity=0)
TPOT Accuracy: 0.918918918918919
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.92       370
           1       0.96      0.88      0.92       370

    accuracy                           0.92       740
   macro avg       0.92      0.92      0.92       740
weighted avg       0.92      0.92      0.92       740

Stacking Accuracy: 0.9297297297297298
Classification Report:
               precision    recall  f1-score   suppor