In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_dataset = pd.read_csv("/kaggle/input/playground-series-s4e8/train.csv")
test_dataset = pd.read_csv("/kaggle/input/playground-series-s4e8/test.csv")

pd.options.display.float_format = '{:.3f}'.format # Set display option for 3 decimal places

In [3]:
def preprocess(dataset, training=True):
    if(not training):
        dataset.drop_duplicates(inplace=True) #Dropping duplicates

    # Impute missing numerical values
    dataset['cap-diameter'].fillna(dataset['cap-diameter'].mean(), inplace=True)
    dataset['stem-height'].fillna(dataset['stem-height'].mean(), inplace=True)
    dataset['stem-width'].fillna(dataset['stem-width'].mean(), inplace=True)
    
    #Impute missing Categorical columns
    categorical_cols = dataset.select_dtypes(include="object").columns
    for col in categorical_cols:
        dataset[col].fillna('na', inplace=True)
        #print(f"{col} : {len(dataset[col].unique())}") #to check cardinality
        
    return dataset
    

In [4]:
train_dataset = preprocess(dataset=train_dataset)
test_dataset = preprocess(dataset=test_dataset, training=False)

X_train = train_dataset.drop(['id', 'class'], axis=1)
y_train = train_dataset['class']

X_test = test_dataset.drop(['id'], axis=1)

In [5]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [6]:
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

label_encoder = LabelEncoder()

def encode_y(y_train, y_test=None):
    
    y_train = label_encoder.fit_transform(y_train)
    if(y_test is not None): 
        y_test = label_encoder.transform(y_test)
        
    return (y_train, y_test)
    
    
def encode_X(X_train, y_train, X_test=None, categorical_cols=None):
    
    #target_encoder = ce.TargetEncoder(cols= categorical_cols.drop('class'))
    target_encoder = ce.TargetEncoder(cols= categorical_cols)
    X_train = target_encoder.fit_transform(X_train, y_train)
    if(X_test is not None):
        X_test = target_encoder.transform(X_test)
    
    return (X_train, X_test)
    

In [7]:
y_train, _ = encode_y(y_train=y_train)

X_train, X_test = encode_X(X_train=X_train, y_train=y_train, X_test=X_test, categorical_cols=X_train.select_dtypes(include="object").columns)

In [8]:
# correlation_matrix = X_train.corr().round(2)

# plt.figure(figsize=(12, 8))
# sb.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
# plt.show()
# stem-width and cap-diameter are highly correlated 

In [9]:
#Feature scaling is not necessary for a random forest classifier. But it slighly increase the accuracy
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

def feature_scale(num_cols, X_train, X_test=None):

    #numerical_cols = ['cap-diameter','stem-height', 'stem-width']
    ct = ColumnTransformer(transformers=[('feature_scaler', StandardScaler(), num_cols)] , remainder='passthrough' )

    X_train = ct.fit_transform(X_train)
    if(X_test is not None):
        X_test = ct.transform(X_test)
        
    return (X_train, X_test)

In [10]:
num_cols = ['cap-diameter','stem-height', 'stem-width']
X_train, X_test = feature_scale(num_cols=num_cols, X_train=X_train, X_test=X_test)

## XGB Model

In [11]:
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.metrics import matthews_corrcoef, make_scorer
# from xgboost import XGBClassifier


# Xparams_s1 = {'n_estimators': 948,
#            'max_depth': 12,
#            'learning_rate': 0.025559161851111477, 
#            'reg_alpha': 0.7178566258816612, 
#            'reg_lambda': 0.00029868510908985876,
#            'subsample': 0.7997054056983265, 
#            'colsample_bytree': 0.5013225770330585}

# Xparams_s2 = {'n_estimators': 1396, 
#               'max_depth': 19, 
#               'learning_rate': 0.010455050159676566, 
#               'subsample': 0.8006842727555243, 
#               'colsample_bytree': 0.5001438770455072, 
#               'colsample_bylevel': 0.8027576507794217, 
#               'min_child_weight': 5,
#               'reg_alpha': 1.1586967014672253e-08, 
#               'reg_lambda': 3.3517458803447213e-06, 
#               'gamma': 0.01841032988451454}

# Define the model
# xgb_clf = XGBClassifier(**Xparams_s2, tree_method='gpu_hist', random_state=42)

# # Define the k-fold cross-validator
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Create a custom scorer for MCC
# mcc_scorer = make_scorer(matthews_corrcoef)

# # Perform cross-validation with MCC as the scoring metric
# scores = cross_val_score(xgb_clf, X_train, y_train, cv=kfold, scoring=mcc_scorer)

# # Print the MCC for each fold
# print("MCC for each fold: ", scores)

# # Print the mean MCC and standard deviation
# print("Mean MCC: {:.2f}".format(scores.mean()))
# print("Standard deviation of MCC: {:.2f}".format(scores.std()))


In [12]:
# xgb_clf.fit(X_train, y_train)

In [13]:
# import xgboost as xgb

# xgb_clf.save_model('xgb_classifier_model.bin')

# # Load the model back from the file
# loaded_model = XGBClassifier()
# loaded_model.load_model('xgb_classifier_model.bin')


## Fine tune XGB model 

In [14]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, matthews_corrcoef
from xgboost import XGBClassifier
from skopt import BayesSearchCV
import numpy as np

Xparams_s2 = {'n_estimators': 1396, 
              'max_depth': 19, 
              'learning_rate': 0.010455050159676566, 
              'subsample': 0.8006842727555243, 
              'colsample_bytree': 0.5001438770455072, 
              'colsample_bylevel': 0.8027576507794217, 
              'min_child_weight': 5,
              'reg_alpha': 1.1586967014672253e-08, 
              'reg_lambda': 3.3517458803447213e-06, 
              'gamma': 0.01841032988451454}
Xparams_s3 = {'subsample': 0.7888888888888889, 
              'reg_lambda': 0.00046415888336127817, 
              'reg_alpha': 1.6681005372000592e-08, 
              'n_estimators': 1340, 
              'min_child_weight': 1, 
              'max_depth': 20, 
              'learning_rate': 0.009444444444444443, 
              'gamma': 0.0, 
              'enable_categorical': False, 
              'colsample_bytree': 0.4222222222222222, 
              'colsample_bylevel': 0.8111111111111111}

param_grid = {
    'n_estimators': np.arange(1200, 1500, 1),  # Tuning around the current value
    'max_depth': np.arange(15, 22, 1),  # Fine-tuning max_depth
    'learning_rate': np.linspace(0.001, 0.02, 20),  # Narrow down around current learning rate
    'subsample': np.linspace(0.7, 0.9, 10),  # Adjust subsample range
    'colsample_bytree': np.linspace(0.4, 0.6, 10),  # Tuning colsample_bytree
    'colsample_bylevel': np.linspace(0.7, 0.9, 10),  # Tuning colsample_bylevel
    'min_child_weight': np.arange(1, 7, 1),  # Tuning min_child_weight
    'reg_alpha': np.logspace(-10, 0, 10),  # Regularization alpha tuning
    'reg_lambda': np.logspace(-10, 0, 10),  # Regularization lambda tuning
    'gamma': np.linspace(0, 0.1, 10),  # Tuning gamma
    'enable_categorical': [True, False]
}


xgb_clf = XGBClassifier(
    tree_method='hist',
    device='cuda',
#     enable_categorical=True,
    random_state=42,
    **Xparams_s3,
#     predictor='gpu_predictor'
)
# xgb_clf.set_params(predictor='gpu_predictor')

mcc_scorer = make_scorer(matthews_corrcoef)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=xgb_clf,
    search_spaces=param_grid,
    n_iter=10,  # Number of parameter settings that are sampled
    cv=kfold,
    scoring=mcc_scorer,
    verbose=3,
    n_jobs=-1
)

# Fit BayesSearch
bayes_search.fit(X_train, y_train)




Fitting 5 folds for each of 1 candidates, totalling 5 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 3/5] END colsample_bylevel=0.8777777777777778, colsample_bytree=0.5111111111111111, enable_categorical=False, gamma=0.07777777777777778, learning_rate=0.013000000000000001, max_depth=21, min_child_weight=2, n_estimators=1418, reg_alpha=9.999999999999999e-11, reg_lambda=1.6681005372000592e-08, subsample=0.7222222222222222;, score=0.985 total time= 9.8min
[CV 2/5] END colsample_bylevel=0.8111111111111111, colsample_bytree=0.5111111111111111, enable_categorical=True, gamma=0.06666666666666667, learning_rate=0.008, max_depth=21, min_child_weight=1, n_estimators=1393, reg_alpha=0.00046415888336127817, reg_lambda=1.0, subsample=0.8333333333333334;, score=0.985 total time=11.1min
[CV 4/5] END colsample_bylevel=0.8777777777777778, colsample_bytree=0.5111111111111111, enable_categorical=False, gamma=0.07777777777777778, learning_rate=0.013000000000000001, max_depth=21, min_child_weight=2, n_estimators=1418, reg_alpha=9.9999999999999

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 4/5] END colsample_bylevel=0.7, colsample_bytree=0.4, enable_categorical=True, gamma=0.1, learning_rate=0.015, max_depth=20, min_child_weight=6, n_estimators=1292, reg_alpha=1.0, reg_lambda=9.999999999999999e-11, subsample=0.7222222222222222;, score=0.985 total time= 4.0min
[CV 2/5] END colsample_bylevel=0.8555555555555556, colsample_bytree=0.4444444444444445, enable_categorical=False, gamma=0.1, learning_rate=0.010000000000000002, max_depth=17, min_child_weight=3, n_estimators=1217, reg_alpha=1.6681005372000592e-08, reg_lambda=3.5938136638046256e-05, subsample=0.8111111111111111;, score=0.985 total time= 5.8min
[CV 3/5] END colsample_bylevel=0.7, colsample_bytree=0.4, enable_categorical=True, gamma=0.1, learning_rate=0.015, max_depth=20, min_child_weight=6, n_estimators=1292, reg_alpha=1.0, reg_lambda=9.999999999999999e-11, subsample=0.7222222222222222;, score=0.985 total time= 4.0min
[CV 1/5] END colsample_bylevel=0.85555

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 3/5] END colsample_bylevel=0.7, colsample_bytree=0.5555555555555556, enable_categorical=True, gamma=0.044444444444444446, learning_rate=0.013000000000000001, max_depth=19, min_child_weight=6, n_estimators=1279, reg_alpha=2.1544346900318867e-07, reg_lambda=1.6681005372000592e-08, subsample=0.8111111111111111;, score=0.985 total time= 5.6min
[CV 2/5] END colsample_bylevel=0.9, colsample_bytree=0.4444444444444445, enable_categorical=True, gamma=0.0, learning_rate=0.009000000000000001, max_depth=15, min_child_weight=6, n_estimators=1284, reg_alpha=1.2915496650148826e-09, reg_lambda=3.5938136638046256e-05, subsample=0.7666666666666666;, score=0.985 total time= 5.2min
[CV 1/5] END colsample_bylevel=0.7888888888888889, colsample_bytree=0.4222222222222222, e

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [15]:
# Get the best parameters and the best MCC score
print("Best Parameters: ", bayes_search.best_params_)
print("Best MCC Score: {:.5f}".format(bayes_search.best_score_))

Best Parameters:  OrderedDict([('colsample_bylevel', 0.8555555555555556), ('colsample_bytree', 0.4444444444444445), ('enable_categorical', False), ('gamma', 0.1), ('learning_rate', 0.010000000000000002), ('max_depth', 17), ('min_child_weight', 3), ('n_estimators', 1217), ('reg_alpha', 1.6681005372000592e-08), ('reg_lambda', 3.5938136638046256e-05), ('subsample', 0.8111111111111111)])
Best MCC Score: 0.98485


In [16]:
# from joblib import dump, load

# # Save the RandomizedSearchCV object to a file
# dump(random_search, 'random_search_xgb.joblib')

# # Later on, load the RandomizedSearchCV object from the file
# loaded_random_search = load('random_search_xgb.joblib')

# # Access the best model or other attributes
# print("Best parameters found: ", loaded_random_search.best_params_)
# print("Best score achieved: ", loaded_random_search.best_score_)

# {'subsample': 0.8111111111111111, 'reg_lambda': 1.0, 'reg_alpha': 1.6681005372000592e-08, 'n_estimators': 1200, 'min_child_weight': 5, 'max_depth': 21, 'learning_rate': 0.009444444444444443, 'gamma': 0.03333333333333333, 'colsample_bytree': 0.4888888888888889, 'colsample_bylevel': 0.7444444444444445}

## Submission set

In [17]:
# y_pred = xgb_clf.predict(X_test)
y_pred = bayes_search.predict(X_test)

In [18]:
predictions = label_encoder.inverse_transform(y_pred)
y_pred = pd.DataFrame({'id': test_dataset['id'], 'class': predictions})
y_pred.to_csv('submission_xgboost_bayseyanSearch.csv', index=False)

[CV 3/5] END colsample_bylevel=0.7, colsample_bytree=0.4444444444444445, enable_categorical=True, gamma=0.1, learning_rate=0.005, max_depth=21, min_child_weight=6, n_estimators=1464, reg_alpha=2.1544346900318867e-07, reg_lambda=0.005994842503189421, subsample=0.7666666666666666;, score=0.985 total time= 7.6min
[CV 2/5] END colsample_bylevel=0.7, colsample_bytree=0.4444444444444445, enable_categorical=True, gamma=0.1, learning_rate=0.005, max_depth=21, min_child_weight=6, n_estimators=1464, reg_alpha=2.1544346900318867e-07, reg_lambda=0.005994842503189421, subsample=0.7666666666666666;, score=0.985 total time= 7.7min
[CV 4/5] END colsample_bylevel=0.7, colsample_bytree=0.4444444444444445, enable_categorical=True, gamma=0.1, learning_rate=0.005, max_depth=21, min_child_weight=6, n_estimators=1464, reg_alpha=2.1544346900318867e-07, reg_lambda=0.005994842503189421, subsample=0.7666666666666666;, score=0.985 total time= 7.7min
[CV 5/5] END colsample_bylevel=0.8111111111111111, colsample_byt