# Setup: Install MLflow

In [None]:
pip install mlflow==2.13.2 sagemaker-mlflow==0.1.0 

# Setup: Import libraries

In [None]:
import xgboost as xgb
import mlflow
import mlflow.pyfunc
import mlflow.xgboost
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder


# Load and Prepare Data

In [None]:
# Read the employee records from the CSV file in the working directory
employee_df = pd.read_csv('Employee.csv')

# Cast these feature columns to the pandas 'category' dtype in a single pass;
# the DMatrix objects built below are created with enable_categorical=True
employee_df = employee_df.astype({
    'Education': 'category',
    'City': 'category',
    'Gender': 'category',
    'EverBenched': 'category',
})

# Separate the target from the features
# ('LeaveOrNot' is assumed to be the target column)
y = employee_df['LeaveOrNot']
X = employee_df.drop(columns=['LeaveOrNot'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across re-runs
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap the training and validation splits as DMatrix objects, both with
# categorical support switched on
train_dmatrix, val_dmatrix = (
    xgb.DMatrix(features, label=labels, enable_categorical=True)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

# Setting up MLflow Tracking

In [None]:


# ARN of the SageMaker MLflow tracking server that receives the runs
tracking_server_arn = 'arn:aws:sagemaker:us-east-1:448049810900:mlflow-tracking-server/training-tracking-server'
# Name of the experiment that is made active for the runs started afterwards
experiment_name = "test-experiment-1"

mlflow.set_tracking_uri(tracking_server_arn)
mlflow.set_experiment(experiment_name)



# Start MLflow run & log parameters/metrics

In [None]:
# Train one XGBoost model inside a single MLflow run and log its
# hyperparameters, the model artifact and the validation metrics.

# Number of boosting rounds. Held in a variable so that the value used for
# training is exactly the value recorded with the run.
num_boost_round = 50

with mlflow.start_run():
    # Booster hyperparameters.
    # NOTE(review): 'reg:squarederror' is a regression objective, while the
    # metrics below round the predictions and score them as class labels.
    # If 'LeaveOrNot' is a 0/1 label, 'binary:logistic' may be the better
    # choice - confirm before changing, as it alters the trained model.
    params = {
        'max_depth': 5,
        'eta': 0.2,
        'gamma': 4,
        'min_child_weight': 6,
        'subsample': 0.7,
        'objective': 'reg:squarederror'
    }

    # Train the model
    bst = xgb.train(params, train_dmatrix, num_boost_round=num_boost_round)

    # Log every setting that shaped the model. num_boost_round is not part of
    # `params`, so it has to be logged explicitly or the run cannot be
    # reproduced from its tracked parameters alone.
    mlflow.log_params(params)
    mlflow.log_param("num_boost_round", num_boost_round)

    # Log the trained booster as a run artifact
    mlflow.xgboost.log_model(bst, "xgboost_model")

    # Predict on the validation set (continuous outputs)
    preds = bst.predict(val_dmatrix)

    # Root-mean-squared error between the raw predictions and the targets
    rmse = ((preds - y_val) ** 2).mean() ** 0.5
    mlflow.log_metric("validation_rmse", rmse)

    # Round the continuous predictions to the nearest integer once and reuse
    # the result for all label-based metrics
    pred_labels = preds.round()
    accuracy = accuracy_score(y_val, pred_labels)
    precision = precision_score(y_val, pred_labels, average='macro')
    recall = recall_score(y_val, pred_labels, average='macro')

    # Log the label-based metrics
    mlflow.log_metric("validation_accuracy", accuracy)
    mlflow.log_metric("validation_precision", precision)
    mlflow.log_metric("validation_recall", recall)

    print(f"Logged model with validation RMSE: {rmse}")
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}")

# Start MLflow run & log parameters/metrics with Hyperparameter Tuning

In [None]:
import itertools

# Candidate values for each hyperparameter: two per entry, so the full grid
# contains 2**5 = 32 combinations
param_grid = {
    'max_depth': [3, 5],
    'eta': [0.1, 0.2],
    'gamma': [0, 1],
    'min_child_weight': [1, 3],
    'subsample': [0.8, 1.0],
}

# Cartesian product of all value lists. The dict keeps insertion order, so each
# tuple is ordered (max_depth, eta, gamma, min_child_weight, subsample).
param_combinations = list(itertools.product(*param_grid.values()))


In [None]:
import mlflow.xgboost
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Grid search: train one model per hyperparameter combination, each in its own
# MLflow run, and remember the combination with the lowest validation RMSE.

# Number of boosting rounds shared by every run. Held in a variable so that
# the value used for training is exactly the value recorded with each run.
num_boost_round = 50

best_rmse = float("inf")
best_params = None

for i, (max_depth, eta, gamma, min_child_weight, subsample) in enumerate(param_combinations):
    # 1-based run number used in the run name, artifact path and progress output
    run_number = i + 1

    with mlflow.start_run(run_name=f"run_{run_number}"):
        # Booster hyperparameters for this combination.
        # NOTE(review): 'reg:squarederror' is a regression objective, while the
        # metrics below round the predictions and score them as class labels.
        # If 'LeaveOrNot' is a 0/1 label, 'binary:logistic' may be the better
        # choice - confirm before changing, as it alters the trained models.
        params = {
            'max_depth': max_depth,
            'eta': eta,
            'gamma': gamma,
            'min_child_weight': min_child_weight,
            'subsample': subsample,
            'objective': 'reg:squarederror'
        }

        # Train the model
        bst = xgb.train(params, train_dmatrix, num_boost_round=num_boost_round)

        # Log every setting that shaped the model. num_boost_round is not part
        # of `params`, so it is logged explicitly to keep the run reproducible
        # from its tracked parameters alone.
        mlflow.log_params(params)
        mlflow.log_param("num_boost_round", num_boost_round)
        mlflow.xgboost.log_model(bst, f"xgboost_model_run_{run_number}")

        # Predict on the validation set (continuous outputs)
        preds = bst.predict(val_dmatrix)

        # Root-mean-squared error between the raw predictions and the targets
        rmse = ((preds - y_val) ** 2).mean() ** 0.5
        mlflow.log_metric("validation_rmse", rmse)

        # Round the continuous predictions to the nearest integer once and
        # reuse the result for all label-based metrics
        pred_labels = preds.round()
        accuracy = accuracy_score(y_val, pred_labels)
        precision = precision_score(y_val, pred_labels, average='macro')
        recall = recall_score(y_val, pred_labels, average='macro')

        # Log the label-based metrics
        mlflow.log_metric("validation_accuracy", accuracy)
        mlflow.log_metric("validation_precision", precision)
        mlflow.log_metric("validation_recall", recall)

        print(f"Run {run_number} logged with RMSE: {rmse}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}")

        # Keep the best (lowest) RMSE and its parameters. `params` is a fresh
        # dict on every iteration, so storing the reference is safe.
        if rmse < best_rmse:
            best_rmse = rmse
            best_params = params
