In [None]:
# In this notebook we use Ray Tune to tune the hyperparameters of XGBoost.
# We first load a sample classification dataset and split it into train and test sets.
# Then we define the Ray Tune search space, assuming an sklearn pipeline with these steps:
## 1. impute missing values
## 2. scale the data
## 3. encode categorical features
## 4. train the model
# We use the tune.run() function to run the hyperparameter search.
# Older Ray releases logged metrics with tune.track.log(); that API is not used in this notebook.
# We use the tune.report() function to report the metric we want to optimize.


In [None]:
# first we import the libraries we need
import pandas as pd
import numpy as np
import xgboost as xgb
import ray
from ray import tune
# from ray.tune import track
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the telecom customer-churn dataset (a free Kaggle dataset) from the
# local data directory. The classification task is to predict whether a
# customer will churn or not.

data_df = pd.read_csv(filepath_or_buffer="../data/churn-bigml-80.csv")

In [None]:
# Preview the first five rows of the raw frame.
data_df.head(n=5)


In [None]:
# Now we split the data into train (80%) and test (20%) partitions; the fixed
# random_state keeps the split identical across runs.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Columns that are one-hot encoded by the preprocessing step.
categories = ['Area code', 'International plan']
# Columns that are treated as numeric features by the preprocessing step.
numerical = ['Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes', 'Total eve calls',
                'Total eve charge', 'Total night minutes', 'Total night calls', 'Total night charge', 'Total intl minutes',
                'Total intl calls', 'Total intl charge', 'Customer service calls', 'Number vmail messages', 'Account length', ]

# Name of the target column.
label = 'Churn'


# Features: every column except the target.
X_train = train_df.drop(label, axis=1)
# Binary 0/1 target. The raw value is normalised (stringified, stripped,
# trailing '.' removed, lower-cased) before comparison so that boolean True as
# well as the strings 'True' and 'True.' all map to 1. Comparing directly with
# the string 'True.' maps every row to 0 when the column holds booleans.
y_train = (
    train_df[label]
    .astype(str)
    .str.strip()
    .str.rstrip('.')
    .str.lower()
    .eq('true')
    .astype(int)
)

In [None]:
# Features: every column of the test frame except the target.
X_test = test_df.drop(label, axis=1)
# Binary 0/1 target. The raw value is normalised (stringified, stripped,
# trailing '.' removed, lower-cased) before comparison so that boolean True as
# well as the strings 'True' and 'True.' all map to 1. Comparing directly with
# the string 'True.' maps every row to 0 when the column holds booleans.
y_test = (
    test_df[label]
    .astype(str)
    .str.strip()
    .str.rstrip('.')
    .str.lower()
    .eq('true')
    .astype(int)
)

In [None]:
# Search space for the preprocessing step: the imputation strategy and the two
# StandardScaler switches are each sampled from a fixed set of options.

transformer_space = dict(
    simple_imputer__strategy=tune.choice(['mean', 'median', 'most_frequent']),
    standard_scaler__with_mean=tune.choice([True, False]),
    standard_scaler__with_std=tune.choice([True, False]),
)


In [None]:
# Search space for the XGBoost classifier. Every hyperparameter is sampled
# with tune.choice() from a small discrete grid of candidate values.

model_space = dict(
    n_estimators=tune.choice(list(range(100, 501, 100))),
    max_depth=tune.choice(list(range(3, 11))),
    learning_rate=tune.choice([0.01, 0.05, 0.1, 0.15, 0.2]),
    gamma=tune.choice([0, 0.1, 0.2, 0.3, 0.4, 0.5]),
    min_child_weight=tune.choice(list(range(0, 6))),
    subsample=tune.choice([0.5, 0.6, 0.7, 0.8, 0.9, 1]),
    colsample_bytree=tune.choice([0.5, 0.6, 0.7, 0.8, 0.9, 1]),
    colsample_bylevel=tune.choice([0.5, 0.6, 0.7, 0.8, 0.9, 1]),
    colsample_bynode=tune.choice([0.5, 0.6, 0.7, 0.8, 0.9, 1]),
    reg_alpha=tune.choice([0, 0.1, 0.2, 0.3, 0.4, 0.5]),
    reg_lambda=tune.choice([0, 0.1, 0.2, 0.3, 0.4, 0.5]),
    scale_pos_weight=tune.choice(list(range(1, 6))),
)


In [None]:
def make_transformer(config):
    """Build the (unfitted) preprocessing transformer for one trial.

    Args:
        config: Trial configuration. ``config['transformer']`` must contain
            the keys ``simple_imputer__strategy``,
            ``standard_scaler__with_mean`` and ``standard_scaler__with_std``.

    Returns:
        A ``ColumnTransformer`` that one-hot encodes the ``categories``
        columns, imputes and then scales the ``numerical`` columns, and
        passes every remaining column through unchanged.

    Raises:
        KeyError: If ``config`` lacks ``'transformer'`` or one of its keys.
    """
    transformer_config = config['transformer']

    # ColumnTransformer applies each entry independently to the raw input and
    # concatenates the outputs. Listing the imputer and the scaler as separate
    # entries would therefore emit the numerical columns twice and scale
    # un-imputed values. Chaining them in a Pipeline makes scaling run on the
    # imputed data and yields one copy of each numerical column.
    numerical_pipeline = Pipeline([
        (
            'simple_imputer',
            SimpleImputer(strategy=transformer_config['simple_imputer__strategy']),
        ),
        (
            'standard_scaler',
            StandardScaler(
                with_mean=transformer_config['standard_scaler__with_mean'],
                with_std=transformer_config['standard_scaler__with_std'],
            ),
        ),
    ])

    # NOTE(review): remainder='passthrough' forwards any column not listed in
    # `categories` or `numerical` as-is; if the frame contains other
    # non-numeric columns the downstream model may reject them -- confirm
    # against the dataset schema.
    return ColumnTransformer([
        (
            'one_hot_encoder',
            OneHotEncoder(handle_unknown='ignore'),
            categories,
        ),
        (
            'numerical',
            numerical_pipeline,
            numerical,
        ),
    ], remainder='passthrough')

def make_classifier(config):
    """Create an unfitted XGBoost classifier from a trial configuration.

    Args:
        config: Trial configuration; the hyperparameters are read from
            ``config['model']``.

    Returns:
        An ``xgb.XGBClassifier`` built from the twelve tuned hyperparameters.
    """
    model_params = config['model']
    tuned_names = (
        'n_estimators',
        'max_depth',
        'learning_rate',
        'gamma',
        'min_child_weight',
        'subsample',
        'colsample_bytree',
        'colsample_bylevel',
        'colsample_bynode',
        'reg_alpha',
        'reg_lambda',
        'scale_pos_weight',
    )
    # Forward only the tuned hyperparameters, looked up in declaration order.
    classifier_kwargs = {name: model_params[name] for name in tuned_names}
    return xgb.XGBClassifier(**classifier_kwargs)


In [None]:

def train_model(config):
    """Ray Tune trainable: fit one pipeline and report its accuracy.

    Builds the preprocessing transformer and the classifier from ``config``,
    fits them as a single sklearn ``Pipeline`` on ``X_train``/``y_train``,
    scores the predictions on ``X_test``/``y_test`` and reports the result to
    Tune under the key ``accuracy``.

    Args:
        config: Sampled trial configuration with the sub-dicts
            ``'transformer'`` and ``'model'``.
    """
    # Create transformer with given configuration
    preprocessing = make_transformer(config)

    # make_classifier looks up config['model'] itself, so it needs the full
    # trial config; handing it config['model'] raised KeyError: 'model'.
    model = make_classifier(config)

    # Combine preprocessing and model into a single pipeline
    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('model', model)
    ])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # NOTE(review): trials are scored on the test split, so the best reported
    # accuracy is an optimistic estimate; consider a separate validation split
    # or cross-validation on the training data.
    predictions = pipeline.predict(X_test)

    # Calculate accuracy
    acc = accuracy_score(y_test, predictions)

    # Send the accuracy to Tune to track the performance of this set of hyperparameters
    tune.report(accuracy=acc)


In [None]:


# Full search space handed to Tune: one sub-dict for the preprocessing step
# and one for the XGBoost model, matching the keys read by make_transformer
# and make_classifier.
config = {
    "transformer": transformer_space,
    "model": model_space
}

# Declaring metric/mode tells Tune which reported value ranks the trials, so
# the returned analysis object has a default criterion for its best-trial
# lookups.
analysis = tune.run(
    train_model,
    config=config,
    metric="accuracy",  # key reported by train_model via tune.report
    mode="max",  # higher accuracy is better
    num_samples=10,  # number of times to sample from the configuration space
    resources_per_trial={"cpu": 4}
)
