In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [6]:
# Read the employee records CSV (path is relative to the notebook's working directory)
dataset_path = 'Dataset/employeeData.csv'
data = pd.read_csv(dataset_path)

In [7]:
# Preprocessing
# Drop columns that are not used as model features.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises KeyError.
data = data.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'],
    errors='ignore',
)

In [8]:
# Convert categorical (object-dtype) columns to integer codes using Label Encoding.
# A separate LabelEncoder is fitted for every column and kept in
# `label_encoders` (column name -> fitted encoder). Re-fitting a single shared
# encoder would leave only the last column's mapping available, making it
# impossible to decode the codes or to encode new records the same way later.
# NOTE: re-running this cell on already-encoded data resets `label_encoders`
# to an empty dict; re-run from the data-loading cell to rebuild it.
label_encoders = {}
for column in data.columns:
    if data[column].dtype == 'object':
        label_encoder = LabelEncoder()
        data[column] = label_encoder.fit_transform(data[column])
        label_encoders[column] = label_encoder

In [9]:
# Separate the target column from the predictor columns
target_column = 'Attrition'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Build a 100-tree random forest with a fixed seed, then fit it on the training split
forest_params = {'n_estimators': 100, 'random_state': 42}
rf_classifier = RandomForestClassifier(**forest_params)
rf_classifier.fit(X_train, y_train)

In [11]:
# Predict labels for the held-out test rows
y_pred = rf_classifier.predict(X_test)

# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8809523809523809


In [12]:
# Persist the fitted classifier to disk in the current working directory
joblib.dump(value=rf_classifier, filename='employee_attrition_model.pkl')

['employee_attrition_model.pkl']

In [13]:
# Read the persisted classifier back from the .pkl file written above
loaded_model = joblib.load(filename='employee_attrition_model.pkl')

# Show the estimator's repr to confirm what was restored
print(loaded_model)

RandomForestClassifier(random_state=42)


In [14]:
# Raw importances, one value per feature column the model was trained on.
print(loaded_model.feature_importances_)

# The bare array cannot be interpreted without column names, so pair each
# importance with its feature name (the columns of X, which the model was
# fitted on) and list them from most to least important.
feature_ranking = pd.Series(
    loaded_model.feature_importances_,
    index=X.columns,
    name='importance',
).sort_values(ascending=False)
print(feature_ranking)

[0.06156931 0.01304853 0.05408678 0.01277683 0.0459989  0.01943436
 0.02347133 0.02616171 0.00939511 0.04736942 0.02345759 0.02520962
 0.03139227 0.02742481 0.02953681 0.07821165 0.0483734  0.03795134
 0.06666901 0.03445207 0.00430341 0.02313979 0.02933773 0.05249112
 0.02495238 0.02090786 0.04134737 0.0290104  0.02773798 0.03078111]


In [15]:
# Higher values: A feature with a higher importance value is considered more important by the model in predicting employee attrition.
# Relative importance: The values are relative to each other within the model (they are normalized to sum to 1). A higher value for a feature indicates that it has a larger impact on the model's predictions than features with lower values.
# Ranking: You can rank the features by importance, from most to least important. This helps identify which features have the most significant influence on employee attrition according to the model.