In [1]:
import datetime
import os

import certifi
import fsspec
import gcsfs
import joblib
import pandas as pd
from google.cloud import aiplatform as ai
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, OneHotEncoder
from tqdm import tqdm
from xgboost import XGBClassifier

# Process-wide environment tweaks: use the CA bundle path reported by certifi
# for SSL_CERT_FILE and restrict gRPC log output to the ERROR level.
os.environ.update({
    'SSL_CERT_FILE': certifi.where(),
    'GRPC_VERBOSITY': 'ERROR',
})

In [2]:
# --- Project configuration ---
# Google Cloud project and region
PROJECT_ID = 'vertextutorial-429522'
REGION = 'us-central1'

# Storage locations: staging bucket and the folder that receives model artifacts
BUCKET_URI = 'gs://mlops_churn_bucket'
ARTIFACTS_FOLDER = 'artifacts_mlops_project'

# Vertex AI experiment that groups every run logged by this notebook
EXPERIMENT_NAME = 'experiment-mlops-project'

# Bind the Vertex AI SDK to the project, region, staging bucket and experiment
ai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
    experiment=EXPERIMENT_NAME,
)

# GCS filesystem handle (used by load_and_split_data to read the dataset)
fs = gcsfs.GCSFileSystem(project=PROJECT_ID)

In [3]:
def load_and_split_data(file_path, test_size=0.25, random_state=42):
    """Load the dataset, clean it, and split it into train and validation sets.

    Args:
        file_path: Path of the CSV file, opened through the module-level
            ``fs`` filesystem handle.
        test_size: Fraction of the rows held out for validation.
        random_state: Seed that makes the split reproducible.

    Returns:
        Tuple ``(X_train, X_valid, y_train, y_valid)``. The split is
        stratified on the ``Exited`` target column.

    Raises:
        ValueError: If ``Gender`` contains a non-null value other than
            ``'Male'`` or ``'Female'``.
    """
    # Load the dataset
    with fs.open(file_path) as f:
        raw_df = pd.read_csv(f)

    # Drop the columns that are not used as features (non-mutating drop keeps
    # the freshly loaded frame untouched)
    columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']
    df = raw_df.drop(columns=columns_to_drop)

    # Convert Gender to binary (0 and 1). `Series.map` silently turns any label
    # missing from the mapping into NaN, so unexpected labels are rejected
    # explicitly; values that are already missing stay missing.
    gender_codes = {'Male': 1, 'Female': 0}
    unexpected_labels = sorted(
        (label for label in df['Gender'].dropna().unique() if label not in gender_codes),
        key=str,
    )
    if unexpected_labels:
        raise ValueError(f"Unexpected Gender values: {unexpected_labels}")
    df['Gender'] = df['Gender'].map(gender_codes)

    # Define features and target
    X = df.drop(columns=['Exited'])
    y = df['Exited']

    # Split the data into train and validation sets, stratified on the target
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Verify the sizes
    print(f"Training set size: {X_train.shape[0]} records")
    print(f"Validation set size: {X_valid.shape[0]} records")

    return X_train, X_valid, y_train, y_valid


# Build the dataset location from BUCKET_URI so the bucket name is defined in
# exactly one place (the configuration cell); the resulting path is unchanged.
data_path = f'{BUCKET_URI}/Churn_Modeling_Main.csv'
X_train, X_valid, y_train, y_valid = load_and_split_data(data_path)

Training set size: 6750 records
Validation set size: 2250 records


In [4]:
def transform_data(X_train, X_valid):
    """Fit the preprocessing transformer on the training data and apply it to both splits.

    Numeric columns are standardized, binary columns pass through unchanged,
    ``Geography`` is one-hot encoded (first category dropped) and
    ``CreditScore`` is discretized into 10 uniform-width ordinal bins. Any
    column not listed below is passed through as-is.

    Args:
        X_train: Training features (DataFrame); the transformer is fitted on it.
        X_valid: Validation features (DataFrame); only transformed.

    Returns:
        Tuple ``(X_train_preprocessed_df, X_valid_preprocessed_df)`` of
        DataFrames that keep the row index of their respective inputs.

    Note:
        The fitted transformer is local to this function and is not returned,
        so it has to be refitted to preprocess any other data.
    """
    # Identify numeric, binary, and categorical columns
    numeric_features = ['Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    binary_features = ['HasCrCard', 'IsActiveMember', 'Gender']
    categorical_features = ['Geography']
    binning_features = ['CreditScore']

    # Create the preprocessing transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('bin', 'passthrough', binary_features),  # Pass through binary features without change
            ('cat', OneHotEncoder(drop='first'), categorical_features),  # One-hot encode categorical features
            ('credit_bin', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform'), binning_features),  # Binning for CreditScore
        ],
        remainder='passthrough',  # Ensure passthrough for any remaining columns
        sparse_threshold=0,  # Always return a dense array so it can be wrapped in a DataFrame
        verbose_feature_names_out=False,  # Keep plain column names (no "num__" style prefixes)
    )

    # Fit on the training data only, then apply the same fitted transform to validation
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_valid_preprocessed = preprocessor.transform(X_valid)

    # Print the shape of the transformed training data
    print(f"Transformed training data shape: {X_train_preprocessed.shape}")

    # Ask the fitted transformer for its output column names instead of rebuilding
    # them by hand: this stays correct even when remainder columns pass through.
    preprocessor_feature_names = preprocessor.get_feature_names_out().tolist()

    # Convert the preprocessed data back to DataFrames, keeping the original row
    # index so the features stay aligned with the target series by label.
    X_train_preprocessed_df = pd.DataFrame(
        X_train_preprocessed, columns=preprocessor_feature_names, index=X_train.index
    )
    X_valid_preprocessed_df = pd.DataFrame(
        X_valid_preprocessed, columns=preprocessor_feature_names, index=X_valid.index
    )

    return X_train_preprocessed_df, X_valid_preprocessed_df


# Preprocess both splits; the transformer is fitted on the training split only.
X_train_preprocessed, X_valid_preprocessed = transform_data(X_train, X_valid)

Transformed training data shape: (6750, 11)


In [5]:
def train_and_evaluate_model(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training split and score it on the validation split.

    The model passed in is fitted in place and handed back to the caller.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels.

    Returns:
        Tuple ``(model, accuracy, precision, recall, f1)`` where the four
        scores are computed on the validation predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)

    # Evaluate the metrics in the same order as they appear in the returned tuple.
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [metric(y_valid, predictions) for metric in metric_functions]

    return (model, *scores)

In [6]:
def upload_artifact_to_gcs(local_path, bucket_uri, artifacts_folder):
    """Upload a local file to Google Cloud Storage and return its ``gs://`` URI.

    Args:
        local_path: Path of the local file to upload; only its base name is
            used for the object name.
        bucket_uri: Bucket location, e.g. ``'gs://my-bucket'``. A trailing
            slash or a path prefix (``'gs://my-bucket/some/prefix'``) is
            accepted; the prefix becomes part of the object name.
        artifacts_folder: Folder (object-name prefix) inside the bucket.

    Returns:
        The ``gs://<bucket>/<object name>`` URI of the uploaded object.

    Raises:
        ValueError: If no bucket name can be extracted from ``bucket_uri``.
    """
    from google.cloud import storage

    # Split "gs://bucket/optional/prefix/" into the bucket name and the prefix.
    # Blindly removing "gs://" would leave slashes and prefixes inside the
    # bucket name and produce double slashes in the returned URI.
    scheme = 'gs://'
    location = bucket_uri[len(scheme):] if bucket_uri.startswith(scheme) else bucket_uri
    bucket_name, _, prefix = location.strip('/').partition('/')
    if not bucket_name:
        raise ValueError(f"Cannot extract a bucket name from {bucket_uri!r}")

    # Build the object name once, skipping empty segments to avoid "//".
    segments = (prefix, artifacts_folder, os.path.basename(local_path))
    blob_name = '/'.join(part.strip('/') for part in segments if part.strip('/'))

    client = storage.Client()
    blob = client.bucket(bucket_name).blob(blob_name)
    blob.upload_from_filename(local_path)
    return f'{scheme}{bucket_name}/{blob_name}'

In [7]:
# F1-based scorer used to rank the grid-search candidates
scorer = make_scorer(f1_score)

# Active XGBoost search space: only max_depth varies (two candidates)
xgb_param_grid = dict(
    n_estimators=[200],
    max_depth=[5, 7],
    learning_rate=[0.1],
    subsample=[0.8],
    colsample_bytree=[0.8],
    gamma=[0],
    random_state=[42],
)
# Disabled alternative: a wider search space
# xgb_param_grid = {
#     'n_estimators': [50, 100],
#     'max_depth': [3, 5],
#     'learning_rate': [0.01, 0.1],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0],
#     'gamma': [0, 0.1],
#     'random_state': [42]
# }

# 3-fold cross-validated grid search over the XGBoost candidates
print("Starting XGBoost grid search...")
xgb_grid_search = GridSearchCV(
    estimator=XGBClassifier(eval_metric='logloss'),
    param_grid=xgb_param_grid,
    scoring=scorer,
    cv=3,
    n_jobs=-1,
)
xgb_grid_search.fit(X_train_preprocessed, y_train)
print("XGBoost grid search completed.")

# Log one Vertex AI experiment run per grid-search candidate and remember
# which run scored the best F1 on the validation set.
best_xgb_run_name = None
best_xgb_run_metrics = None

print("Starting XGBoost experiment runs...")
cv_results = xgb_grid_search.cv_results_
with tqdm(total=len(cv_results['params'])) as pbar:
    for i, (params, mean_score, std_score) in enumerate(
        zip(cv_results['params'], cv_results['mean_test_score'], cv_results['std_test_score'])
    ):
        run_name = f'xgb-run-{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}-{i}'
        print(f"Starting run: {run_name}")
        with ai.start_run(run_name) as run:
            # Work on a fresh clone for every candidate: calling set_params on
            # xgb_grid_search.estimator directly would mutate the grid search's
            # shared base estimator and make all runs reuse one model object.
            model = clone(xgb_grid_search.estimator).set_params(**params)
            model, accuracy, precision, recall, f1 = train_and_evaluate_model(model, X_train_preprocessed, y_train, X_valid_preprocessed, y_valid)
            run.log_params(params)
            # Validation metrics plus the cross-validation statistics of this candidate
            metrics = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'mean_test_score': mean_score,
                'std_test_score': std_score
            }
            run.log_metrics(metrics)
            print(f"Completed run: {run_name} with F1 score: {f1}")
            # Track the run with the highest validation F1 seen so far
            if best_xgb_run_name is None or f1 > best_xgb_run_metrics['f1_score']:
                best_xgb_run_name = run_name
                best_xgb_run_metrics = metrics
        pbar.update(1)

print("XGBoost experiment runs completed.")

# Save the best XGBoost model artifact.
# The model saved here is GridSearchCV's best_estimator_ (selected by
# cross-validation score); best_xgb_run_name / best_xgb_run_metrics from the
# loop above are tracked by validation F1 and are not used for this selection.
print("Saving the best XGBoost model artifact...")
best_xgb_model = xgb_grid_search.best_estimator_
best_xgb_params = xgb_grid_search.best_params_
run_name = f'best-xgb-run-{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}'
with ai.start_run(run_name) as run:
    # train_and_evaluate_model fits the estimator again before scoring it
    model, accuracy, precision, recall, f1 = train_and_evaluate_model(best_xgb_model, X_train_preprocessed, y_train, X_valid_preprocessed, y_valid)
    run.log_params(best_xgb_params)
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }
    run.log_metrics(metrics)
    artifact_path = f'{run_name}.joblib'
    try:
        # Serialize locally, upload, then record the GCS location on the run
        joblib.dump(model, artifact_path)
        artifact_gcs_path = upload_artifact_to_gcs(artifact_path, BUCKET_URI, ARTIFACTS_FOLDER)
        run.log_params({'model_artifact': artifact_gcs_path})
    finally:
        # Always remove the local copy, even when the upload or logging fails
        if os.path.exists(artifact_path):
            os.remove(artifact_path)
print("Best XGBoost model artifact saved.")

Starting XGBoost grid search...
XGBoost grid search completed.
Starting XGBoost experiment runs...


  0%|          | 0/2 [00:00<?, ?it/s]

Starting run: xgb-run-20240730165311-0
Associating projects/68763299890/locations/us-central1/metadataStores/default/contexts/experiment-mlops-project-xgb-run-20240730165311-0 to Experiment: experiment-mlops-project


Completed run: xgb-run-20240730165311-0 with F1 score: 0.5721784776902887


 50%|█████     | 1/2 [00:03<00:03,  3.03s/it]

Starting run: xgb-run-20240730165314-1
Associating projects/68763299890/locations/us-central1/metadataStores/default/contexts/experiment-mlops-project-xgb-run-20240730165314-1 to Experiment: experiment-mlops-project


Completed run: xgb-run-20240730165314-1 with F1 score: 0.5548387096774193


100%|██████████| 2/2 [00:06<00:00,  3.13s/it]


XGBoost experiment runs completed.
Saving the best XGBoost model artifact...
Associating projects/68763299890/locations/us-central1/metadataStores/default/contexts/experiment-mlops-project-best-xgb-run-20240730165317 to Experiment: experiment-mlops-project


Best XGBoost model artifact saved.
