# Create Synthetic HR Data

In [1]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples
num_samples = 1000

# Generate synthetic data
data = {
    'Age': np.random.randint(18, 65, size=num_samples),
    'Gender': np.random.choice(['Male', 'Female'], size=num_samples),
    'Department': np.random.choice(['Sales', 'Engineering', 'HR', 'Marketing'], size=num_samples),
    'JobSatisfaction': np.random.randint(1, 5, size=num_samples),  # 1 to 4 scale
    'YearsAtCompany': np.random.randint(0, 40, size=num_samples),
    'Attrition': np.random.choice([0, 1], size=num_samples, p=[0.8, 0.2])  # 0: No, 1: Yes
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('employee_attrition.csv', index=False)

# Display the first few rows
df.head()


Unnamed: 0,Age,Gender,Department,JobSatisfaction,YearsAtCompany,Attrition
0,56,Male,Engineering,2,22,0
1,46,Female,HR,4,21,0
2,32,Female,Engineering,1,18,0
3,60,Female,Engineering,3,2,0
4,25,Male,Engineering,2,18,0


# Load and Explore the Dataset

In [2]:
import pandas as pd

# Load the dataset
data_path = 'employee_attrition.csv'
df = pd.read_csv(data_path)

# Display the first few rows of the dataframe
print(df.head())


   Age  Gender   Department  JobSatisfaction  YearsAtCompany  Attrition
0   56    Male  Engineering                2              22          0
1   46  Female           HR                4              21          0
2   32  Female  Engineering                1              18          0
3   60  Female  Engineering                3               2          0
4   25    Male  Engineering                2              18          0


# Data Preprocessing

In [3]:
# Display basic information about the dataset
df.info()

# Check for missing values
print(df.isnull().sum())

# Handle missing values if any (example: filling missing values with the median)
# df.fillna(df.median(), inplace=True)

# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
le = LabelEncoder()

# Example of encoding a categorical column
df['Gender'] = le.fit_transform(df['Gender'])
df['Department'] = le.fit_transform(df['Department'])
# Repeat for other categorical columns as needed

# Alternatively, use one-hot encoding for nominal categorical variables
df = pd.get_dummies(df, drop_first=True)

# Display the first few rows of the dataframe after encoding
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              1000 non-null   int64 
 1   Gender           1000 non-null   object
 2   Department       1000 non-null   object
 3   JobSatisfaction  1000 non-null   int64 
 4   YearsAtCompany   1000 non-null   int64 
 5   Attrition        1000 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 47.0+ KB
Age                0
Gender             0
Department         0
JobSatisfaction    0
YearsAtCompany     0
Attrition          0
dtype: int64


Unnamed: 0,Age,Gender,Department,JobSatisfaction,YearsAtCompany,Attrition
0,56,1,0,2,22,0
1,46,0,1,4,21,0
2,32,0,0,1,18,0
3,60,0,0,3,2,0
4,25,1,0,2,18,0


 # Feature Engineering

In [4]:
# Select relevant features (example feature columns)
features = ['Age', 'Gender', 'Department', 'JobSatisfaction', 'YearsAtCompany']
target = 'Attrition'  # Assuming 'Attrition' is the target variable

# Define feature matrix X and target vector y
X = df[features]
y = df[target]

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Model Training

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Initialize the models
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the models
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)


# Model Evaluation

In [6]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions on the test set
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)

# Evaluation
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.81      1.00      0.89       161
           1       0.00      0.00      0.00        39

    accuracy                           0.81       200
   macro avg       0.40      0.50      0.45       200
weighted avg       0.65      0.81      0.72       200

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.94      0.88       161
           1       0.36      0.13      0.19        39

    accuracy                           0.79       200
   macro avg       0.59      0.54      0.53       200
weighted avg       0.73      0.79      0.74       200

Logistic Regression Confusion Matrix:
 [[161   0]
 [ 39   0]]
Random Forest Confusion Matrix:
 [[152   9]
 [ 34   5]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Hyperparameter Tuning

In [7]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for random forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the best model
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
print("Best Random Forest Classification Report:\n", classification_report(y_test, y_pred_best_rf))


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters found:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Best Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.81      1.00      0.89       161
           1       1.00      0.03      0.05        39

    accuracy                           0.81       200
   macro avg       0.90      0.51      0.47       200
weighted avg       0.85      0.81      0.73       200

