In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('../Dataset/Employee-Attrition.csv')

In [4]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [5]:
df.drop(columns=["EmployeeCount","StandardHours","Over18","EmployeeNumber"], inplace=True)

In [6]:
col = ['Age','DistanceFromHome','Education','JobLevel','MonthlyIncome','NumCompaniesWorked','PercentSalaryHike',
       'JobSatisfaction','WorkLifeBalance','TotalWorkingYears','YearsAtCompany','YearsWithCurrManager','BusinessTravel',
       'Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']

X = df[col]
y = df['Attrition'].map({"Yes":1,"No":0})

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [8]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
num_cols = X.select_dtypes(exclude="object").columns
cat_cols = X.select_dtypes(include="object").columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

In [10]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=260,
    max_depth=9,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

In [11]:
pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", model)
])

In [12]:
pipeline.fit(X_train, y_train)

In [13]:
from sklearn.metrics import accuracy_score, classification_report

In [14]:
y_pred = pipeline.predict(X_test)

In [15]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8571428571428571
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       380
           1       0.42      0.08      0.14        61

    accuracy                           0.86       441
   macro avg       0.64      0.53      0.53       441
weighted avg       0.81      0.86      0.81       441



In [16]:
test_input = {
    'Age': 29,
    'DistanceFromHome': 10,
    'Education': 3,
    'JobLevel': 2,
    'MonthlyIncome': 5500,
    'NumCompaniesWorked': 1,
    'PercentSalaryHike': 13,
    'JobSatisfaction': 3,
    'WorkLifeBalance': 2,
    'TotalWorkingYears': 6,
    'YearsAtCompany': 3,
    'YearsWithCurrManager': 2,
    'BusinessTravel': 'Travel_Rarely',
    'Department': 'Sales',
    'EducationField': 'Life Sciences',
    'Gender': 'Female',
    'JobRole': 'Sales Executive',
    'MaritalStatus': 'Single',
    'OverTime': 'Yes'
}

In [17]:
test_df = pd.DataFrame([test_input])
prediction = pipeline.predict(test_df)

print("Prediction:", "Attrition" if prediction[0] == 1 else "No Attrition")

Prediction: No Attrition


In [18]:
import joblib
joblib.dump(pipeline, "attrition_model.pkl")

print("✅ Model saved successfully")

✅ Model saved successfully
