# SageMaker Demo: Employee Attrition Prediction Using Feature Store and XGBoost

This notebook demonstrates how to use Amazon SageMaker's Feature Store and XGBoost built-in algorithm to predict employee attrition.

In [None]:
import pandas as pd

# Read the employee records from CSV into a DataFrame, then preview the
# first five rows as the cell output.
file_path = 'Employee.csv'  # Replace with your actual file path in S3 if needed
employee_df = pd.read_csv(filepath_or_buffer=file_path)
employee_df.head(n=5)

In [None]:
# Step 2: Data Preparation
# Encodes the categorical columns as integers, removes rows with missing
# values, and narrows the frame to the target followed by the model features.
#
# NOTE: this cell overwrites `employee_df` with its encoded form, so it is not
# safe to re-run on its own: a second run would map the already-encoded
# EverBenched values (0/1) through the 'Yes'/'No' lookup, producing NaN for
# every row, and the final dropna() would then empty the frame. Re-run the
# load cell first.

# Define features and target
feature_columns = [
    'Education', 'JoiningYear', 'City', 'PaymentTier', 'Age',
    'Gender', 'EverBenched', 'ExperienceInCurrentDomain'
]
target_column = 'LeaveOrNot'
categorical_columns = ['Education', 'City', 'Gender']

# Drop rows with a missing target or a missing categorical value BEFORE encoding.
# - dropna() returns a new frame, so the result must be assigned back
#   (otherwise astype(int) below raises on NaN targets).
# - `.cat.codes` encodes NaN as -1, which a later dropna() cannot detect.
# `.copy()` makes the column assignments below operate on an independent frame.
employee_df = employee_df.dropna(subset=[target_column] + categorical_columns).copy()

# Convert categorical columns to numeric codes
for column in categorical_columns:
    employee_df[column] = employee_df[column].astype('category').cat.codes
employee_df['EverBenched'] = employee_df['EverBenched'].map({'Yes': 1, 'No': 0})

# Convert target column to numeric if needed
employee_df[target_column] = employee_df[target_column].astype(int)

# Ensure no missing values remain in any column. This also removes rows whose
# EverBenched value was neither 'Yes' nor 'No' (map() turns those into NaN).
employee_df = employee_df.dropna()

# Verify all columns are numeric
print(employee_df.dtypes)

# Keep only the target followed by the feature columns, in that order
employee_df = employee_df[[target_column] + feature_columns]

# Display the transformed dataset
employee_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Fail fast when there is nothing to split. Merely printing a message would
# leave `train_df` / `test_df` undefined and make a later cell fail with an
# unrelated NameError.
# NOTE(review): the messages below mention Feature Store, but in the cells
# visible here the data is loaded from a CSV file - confirm the wording.
if employee_df.empty:
    raise ValueError("No records retrieved. Please check the feature group and identifiers.")

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
train_df, test_df = train_test_split(employee_df, test_size=0.2, random_state=42)
print("Training and test data split after retrieval from Feature Store.")

## Train the Model Using Local Data with S3 Mode (Default)

In [None]:
# Save the data locally first
# Write both splits to local CSV files without the index column.
# The held-out test split is stored under the validation file name.
train_file = 'train.csv'
validation_file = 'validation.csv'
for split_df, csv_path in ((train_df, train_file), (test_df, validation_file)):
    split_df.to_csv(csv_path, index=False)


In [None]:
# Reload the saved splits from disk and separate each one into a feature
# matrix (every column except the target) and a target vector.
target_col = "LeaveOrNot"

train_df, valid_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "validation.csv")
)

X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_valid, y_valid = valid_df.drop(columns=[target_col]), valid_df[target_col]

In [None]:
# Install the xgboost package into the running kernel's environment so the
# import in the following cell can succeed.
# NOTE(review): the version is not pinned; consider pinning it so the notebook
# stays reproducible.
%pip install xgboost


In [None]:
from xgboost import XGBClassifier

# Hyperparameters for the gradient-boosted classifier, gathered in one place
# so they are easy to inspect and tune.
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
}

model = XGBClassifier(**xgb_params)

# Fit on the training split; the fitted estimator is the cell's output.
model.fit(X_train, y_train)
