In [2]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Load dataset
df = pd.read_csv("../dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")


In [4]:
features = [
    'Age','DailyRate','DistanceFromHome','Education',
    'EnvironmentSatisfaction','JobInvolvement','JobLevel',
    'JobSatisfaction','MonthlyIncome','NumCompaniesWorked',
    'PercentSalaryHike','PerformanceRating','TotalWorkingYears',
    'WorkLifeBalance','YearsAtCompany'
]

In [5]:
X = df[features]
y = df['Attrition']


In [6]:
# Encode target
le = LabelEncoder()
y = le.fit_transform(y)  # Yes=1, No=0


In [7]:
# Handle missing values (safety)
X.fillna(X.median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)


In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [9]:

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
# Model
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

In [11]:
model.fit(X_train_scaled, y_train)

In [12]:
# Accuracy
y_pred = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.8707482993197279


In [13]:
# Save model & scaler
pickle.dump(model, open("../model/model.pkl", "wb"))
pickle.dump(scaler, open("../model/scaler.pkl", "wb"))

In [14]:
print("Model & Scaler saved successfully")

Model & Scaler saved successfully
