In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation / chained-assignment warnings
# from pandas and device warnings from XGBoost — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Render floats with three decimal places whenever a frame is displayed
pd.set_option('display.float_format', '{:.3f}'.format)

# Competition files as mounted in the Kaggle input directory
train_dataset = pd.read_csv("/kaggle/input/playground-series-s4e8/train.csv")
test_dataset = pd.read_csv("/kaggle/input/playground-series-s4e8/test.csv")

In [3]:
def preprocess(dataset, training=True):
    """Clean a mushroom data frame: de-duplicate (train only) and impute gaps.

    The frame is modified in place and also returned, so callers may either
    keep using their own reference or rebind the result.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the numeric columns 'cap-diameter', 'stem-height'
        and 'stem-width'; any object-dtype columns are treated as categorical.
    training : bool, default True
        True for the training frame (duplicate rows are removed).
        False for the test/submission frame: no row is ever dropped, because
        every test row needs a prediction.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with missing values filled.
    """
    if training:
        # Duplicates only add weight to repeated training samples; the test
        # frame must keep all of its rows so the submission stays complete.
        dataset.drop_duplicates(inplace=True)

    # Impute missing numerical values with the column mean of this frame.
    # Assign the result back instead of `dataset[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary under pandas Copy-on-Write
    # and would leave the frame unchanged.
    for col in ('cap-diameter', 'stem-height', 'stem-width'):
        dataset[col] = dataset[col].fillna(dataset[col].mean())

    # Impute missing categorical values with the explicit placeholder 'na'
    categorical_cols = dataset.select_dtypes(include="object").columns
    for col in categorical_cols:
        dataset[col] = dataset[col].fillna('na')

    return dataset
    

In [49]:
# Run the shared cleaning routine on both frames
train_dataset = preprocess(dataset=train_dataset)
test_dataset = preprocess(dataset=test_dataset, training=False)

# Target column first, then the feature matrices without id / label columns
y_train = train_dataset['class']
X_train = train_dataset.drop(columns=['id', 'class'])
X_test = test_dataset.drop(columns=['id'])

## Removing noise

In [44]:
# Column under inspection; later cells read `com_name` as well
com_name = 'season'
print(train_dataset[com_name].unique())

# Frequency of every category. Computed once: value_counts() scans the whole
# column, so the second identical call was pure overhead.
counts = train_dataset[com_name].value_counts()

print(type(counts))  # This will print <class 'pandas.core.series.Series'>

# Iterate over the counts and print each unique value with its count
for category, count in counts.items():
    print(f"Category: {category}, Count: {count}")

['a' 'w' 'u' 's']
<class 'pandas.core.series.Series'>
Category: a, Count: 1543321
Category: u, Count: 1153588
Category: w, Count: 278189
Category: s, Count: 141847


In [42]:
# Print unique values to inspect anomalies in the column selected by `com_name`
print(train_dataset[com_name].unique())

# Step 1: whitelist of category codes accepted as valid for this column
# ('na' is the placeholder written for missing values during preprocessing).
valid_categories = ['d','l','g','h','p','m','u','w','y','na','n','a','s','k','z','b','t','c','e','r','f','o','x','i']

# Step 2: replace every value outside the whitelist with the placeholder 'unknown'.
# Each frame is cleaned from ITS OWN column: the test frame must never be
# overwritten with values taken from the training frame.
for frame in (train_dataset, test_dataset):
    is_valid = frame[com_name].isin(valid_categories)
    frame[com_name] = frame[com_name].where(is_valid, 'unknown')

# Check again after replacing anomalies
print(train_dataset[com_name].unique())


['d' 'l' 'g' 'h' 'p' 'm' 'u' 'w' 'y' 'na' 'n' 'a' 's' 'k' 'habitat' 'z'
 '8.09' '17.1' 'b' 't' 'c' '9.28' 'ring-type' 'e' 'r'
 'does-bruise-or-bleed' 'f' 'is w' 'o' '2.94' 'x' '4' 'is h' '5.56'
 'class' 'i' '10.07' '7.31' '5.62' 'spore-print-color' 'cap-diameter'
 '3.11' '16.46' '7.37' 'veil-type' '17.38' '1.66' '6.63' '18.35' '6.75'
 '2.44' '3.68' '2.25']
['d' 'l' 'g' 'h' 'p' 'm' 'u' 'w' 'y' 'na' 'n' 'a' 's' 'k' 'unknown' 'z'
 'b' 't' 'c' 'e' 'r' 'f' 'o' 'x' 'i']


In [48]:
# Destination files for the cleaned frames
train_file_path = 'train_dataset_preprocessed.csv'
test_file_path = 'test_dataset_preprocessed.csv'

# Write both frames without the row index, training frame first
for frame, destination in ((train_dataset, train_file_path), (test_dataset, test_file_path)):
    frame.to_csv(destination, index=False)

# Confirm where each file went
print(f"Train dataset saved to {train_file_path}")
print(f"Test dataset saved to {test_file_path}")

Train dataset saved to train_dataset_preprocessed.csv
Test dataset saved to test_dataset_preprocessed.csv


In [43]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

## Encoding

In [50]:
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

# Module-level encoder: fitted by encode_y and reused later to map
# predictions back to the original class labels.
label_encoder = LabelEncoder()


def encode_y(y_train, y_test=None):
    """Label-encode the target values.

    The shared `label_encoder` is (re)fitted on `y_train`; `y_test`, when
    supplied, is transformed with that same fitted mapping.

    Returns
    -------
    tuple
        (encoded y_train, encoded y_test or None when y_test was not given).
    """
    encoded_train = label_encoder.fit_transform(y_train)
    encoded_test = None if y_test is None else label_encoder.transform(y_test)
    return (encoded_train, encoded_test)
    
    
def encode_X(X_train, y_train, X_test=None, categorical_cols=None):
    """Target-encode the categorical feature columns.

    A `ce.TargetEncoder` restricted to `categorical_cols` is fitted on the
    training features and targets; the fitted encoder is then applied to
    `X_test` when one is provided.

    Returns
    -------
    tuple
        (encoded X_train, encoded X_test or None when X_test was not given).
    """
    encoder = ce.TargetEncoder(cols=categorical_cols)
    train_encoded = encoder.fit_transform(X_train, y_train)

    if X_test is None:
        return (train_encoded, None)

    return (train_encoded, encoder.transform(X_test))
    

In [51]:
# No y_test is passed, so the second element of the result is None and is discarded
y_train, _ = encode_y(y_train)

# The object-dtype columns are the ones handed to the target encoder
X_train, X_test = encode_X(
    X_train,
    y_train,
    X_test=X_test,
    categorical_cols=X_train.select_dtypes(include="object").columns,
)

In [None]:
# correlation_matrix = X_train.corr().round(2)

# plt.figure(figsize=(12, 8))
# sb.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
# plt.show()
# stem-width and cap-diameter are highly correlated 

In [52]:
# Feature scaling is not necessary for a random forest classifier, but it slightly increases the accuracy.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

def feature_scale(num_cols, X_train, X_test=None):
    """Standardise the numeric columns and pass the remaining ones through.

    A `ColumnTransformer` applying `StandardScaler` to `num_cols` is fitted
    on `X_train`; the fitted transformer is reused for `X_test` when given.

    Returns
    -------
    tuple
        (transformed X_train, transformed X_test or None when X_test was not given).
    """
    scaler = ColumnTransformer(
        transformers=[('feature_scaler', StandardScaler(), num_cols)],
        remainder='passthrough',
    )

    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test) if X_test is not None else None

    return (scaled_train, scaled_test)

In [53]:
# Continuous measurement columns that get standardised
num_cols = ['cap-diameter', 'stem-height', 'stem-width']
X_train, X_test = feature_scale(num_cols, X_train, X_test)

## XGB Model

In [None]:
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.metrics import matthews_corrcoef, make_scorer
# from xgboost import XGBClassifier


# Xparams_s1 = {'n_estimators': 948,
#            'max_depth': 12,
#            'learning_rate': 0.025559161851111477, 
#            'reg_alpha': 0.7178566258816612, 
#            'reg_lambda': 0.00029868510908985876,
#            'subsample': 0.7997054056983265, 
#            'colsample_bytree': 0.5013225770330585}

# Xparams_s2 = {'n_estimators': 1396, 
#               'max_depth': 19, 
#               'learning_rate': 0.010455050159676566, 
#               'subsample': 0.8006842727555243, 
#               'colsample_bytree': 0.5001438770455072, 
#               'colsample_bylevel': 0.8027576507794217, 
#               'min_child_weight': 5,
#               'reg_alpha': 1.1586967014672253e-08, 
#               'reg_lambda': 3.3517458803447213e-06, 
#               'gamma': 0.01841032988451454}

# Define the model
# xgb_clf = XGBClassifier(**Xparams_s2, tree_method='gpu_hist', random_state=42)

# # Define the k-fold cross-validator
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Create a custom scorer for MCC
# mcc_scorer = make_scorer(matthews_corrcoef)

# # Perform cross-validation with MCC as the scoring metric
# scores = cross_val_score(xgb_clf, X_train, y_train, cv=kfold, scoring=mcc_scorer)

# # Print the MCC for each fold
# print("MCC for each fold: ", scores)

# # Print the mean MCC and standard deviation
# print("Mean MCC: {:.2f}".format(scores.mean()))
# print("Standard deviation of MCC: {:.2f}".format(scores.std()))


In [None]:
# xgb_clf.fit(X_train, y_train)

In [None]:
# import xgboost as xgb

# xgb_clf.save_model('xgb_classifier_model.bin')

# # Load the model back from the file
# loaded_model = XGBClassifier()
# loaded_model.load_model('xgb_classifier_model.bin')


## Fine tune XGB model 

In [55]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, matthews_corrcoef
from xgboost import XGBClassifier, DMatrix
from skopt import BayesSearchCV
import numpy as np

# Hyper-parameter sets recorded from earlier tuning rounds.
# Xparams_s2 is kept for reference only; Xparams_s3 seeds the estimator below.
Xparams_s2 = {'n_estimators': 1396, 
              'max_depth': 19, 
              'learning_rate': 0.010455050159676566, 
              'subsample': 0.8006842727555243, 
              'colsample_bytree': 0.5001438770455072, 
              'colsample_bylevel': 0.8027576507794217, 
              'min_child_weight': 5,
              'reg_alpha': 1.1586967014672253e-08, 
              'reg_lambda': 3.3517458803447213e-06, 
              'gamma': 0.01841032988451454}
Xparams_s3 = {'subsample': 0.7888888888888889, 
              'reg_lambda': 0.00046415888336127817, 
              'reg_alpha': 1.6681005372000592e-08, 
              'n_estimators': 1340, 
              'min_child_weight': 1, 
              'max_depth': 20, 
              'learning_rate': 0.009444444444444443, 
              'gamma': 0.0, 
              'enable_categorical': False, 
              'colsample_bytree': 0.4222222222222222, 
              'colsample_bylevel': 0.8111111111111111}

# Candidate values explored around the current best configuration.
# NOTE(review): the logged candidates are exact grid points, so these arrays
# appear to be treated as discrete choices rather than continuous ranges —
# consider skopt.space Integer/Real dimensions if a true range search is wanted.
# NOTE(review): the features were numerically encoded and scaled in earlier
# cells, so searching over 'enable_categorical' presumably has no effect — confirm.
param_grid = {
    'n_estimators': np.arange(1200, 1500, 1),  # Tuning around the current value
    'max_depth': np.arange(15, 22, 1),  # Fine-tuning max_depth
    'learning_rate': np.linspace(0.001, 0.02, 20),  # Narrow down around current learning rate
    'subsample': np.linspace(0.7, 0.9, 10),  # Adjust subsample range
    'colsample_bytree': np.linspace(0.4, 0.6, 10),  # Tuning colsample_bytree
    'colsample_bylevel': np.linspace(0.7, 0.9, 10),  # Tuning colsample_bylevel
    'min_child_weight': np.arange(1, 7, 1),  # Tuning min_child_weight
    'reg_alpha': np.logspace(-10, 0, 10),  # Regularization alpha tuning
    'reg_lambda': np.logspace(-10, 0, 10),  # Regularization lambda tuning
    'gamma': np.linspace(0, 0.1, 10),  # Tuning gamma
    'enable_categorical': [True, False]
}

# The scikit-learn wrapper is fed the arrays directly, so no DMatrix is built
# here: the previous unused DMatrix only held a second copy of the training data.
xgb_clf = XGBClassifier(
    tree_method='hist',
    device='cuda',
    random_state=42,
    **Xparams_s3,
)

# Matthews correlation coefficient as the model-selection metric
mcc_scorer = make_scorer(matthews_corrcoef)

bayes_search = BayesSearchCV(
    estimator=xgb_clf,
    search_spaces=param_grid,
    n_iter=10,  # Number of parameter settings that are sampled
    cv=None,  # default splitter; the run log reports 5 folds per candidate
    scoring=mcc_scorer,
    verbose=3,
    # NOTE(review): folds are fitted in parallel while the estimator targets a
    # single CUDA device — confirm the GPU has enough memory for concurrent fits.
    n_jobs=-1,
    random_state=42,  # seed the candidate sampling so the search is reproducible
)

# Fit BayesSearch
bayes_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 4/5] END colsample_bylevel=0.8555555555555556, colsample_bytree=0.4444444444444445, enable_categorical=False, gamma=0.044444444444444446, learning_rate=0.009000000000000001, max_depth=16, min_child_weight=5, n_estimators=1424, reg_alpha=0.005994842503189421, reg_lambda=2.1544346900318867e-07, subsample=0.7888888888888889;, score=0.985 total time= 5.5min
[CV 5/5] END colsample_bylevel=0.8555555555555556, colsample_bytree=0.4444444444444445, enable_categorical=False, gamma=0.044444444444444446, learning_rate=0.009000000000000001, max_depth=16, min_child_weight=5, n_estimators=1424, reg_alpha=0.005994842503189421, reg_lambda=2.1544346900318867e-07, subsample=0.7888888888888889;, score=0.985 total time= 1.5min
[CV 4/5] END colsample_bylevel=0.7222222222222222, colsample_bytree=0.5111111111111111, enable_categori

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 2/5] END colsample_bylevel=0.9, colsample_bytree=0.5333333333333333, enable_categorical=True, gamma=0.06666666666666667, learning_rate=0.007, max_depth=16, min_child_weight=6, n_estimators=1332, reg_alpha=0.005994842503189421, reg_lambda=9.999999999999999e-11, subsample=0.8333333333333334;, score=0.985 total time= 5.9min
[CV 3/5] END colsample_bylevel=0.7, colsample_bytree=0.4888888888888889, enable_categorical=False, gamma=0.05555555555555556, learning_rate=0.003, max_depth=17, min_child_weight=6, n_estimators=1498, reg_alpha=3.5938136638046256e-05, reg_lambda=1.6681005372000592e-08, subsample=0.8333333333333334;, score=0.985 total time= 8.1min
[CV 3/5] END colsample_bylevel=0.9, colsample_bytree=0.5333333333333333, enable_categorical=True, gamma=0.06666666666666667, learning_rate=0.007, max_depth=16, min_child_weight=6, n_estimators=1332, reg_alpha=0.005994842503189421, reg_lambda=9.999999999999999e-11, subsample=0.833333

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 3/5] END colsample_bylevel=0.7222222222222222, colsample_bytree=0.4666666666666667, enable_categorical=False, gamma=0.05555555555555556, learning_rate=0.009000000000000001, max_depth=21, min_child_weight=2, n_estimators=1361, reg_alpha=9.999999999999999e-11, reg_lambda=2.782559402207126e-06, subsample=0.7222222222222222;, score=0.985 total time=10.0min
[CV 2/5] END colsample_bylevel=0.7, colsample_bytree=0.4888888888888889, enable_categorical=False, gamma=0.06666666666666667, learning_rate=0.005, max_depth=17, min_child_weight=2, n_estimators=1288, reg_alpha=0.00046415888336127817, reg_lambda=9.999999999999999e-11, subsample=0.7444444444444445;, score=0.985 total time= 7.9min
[CV 2/5] END colsample_bylevel=0.7222222222222222, colsample_bytree=0.5333333333333333, enable_categorical=False, gamma=0.05555555555555556, learning_rate=0.012, max_depth=18, min_child_weight=

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV 5/5] END colsample_bylevel=0.7222222222222222, colsample_bytree=0.4666666666666667, enable_categorical=False, gamma=0.05555555555555556, learning_rate=0.009000000000000001, max_depth=21, min_child_weight=2, n_estimators=1361, reg_alpha=9.999999999999999e-11, reg_lambda=2.782559402207126e-06, subsample=0.7222222222222222;, score=0.985 total time= 2.5min
[CV 4/5] END colsample_bylevel=0.7, colsample_bytree=0.4888888888888889, enable_categorical=False, gamma=0.06666666666666667, learning_rate=0.005, max_depth=17, min_child_weight=2, n_estimators=1288, reg_alpha=0.00046415888336127817, reg_lambda=9.999999999999999e-11, subsample=0.7444444444444445;, score=0.984 total time= 7.8min
[CV 1/5] END colsample_bylevel=0.7222222222222222, colsample_bytree=0.5333333333333333, enable_categorical=False, gamma=0.05555555555555556, learning_rate=0.012, max_depth=18, min_child_weight=1, n_estimators=1354, reg_alpha=0.005994842503189421, reg_lambda=3.5938136638046256e-05, subsample=0.7888888888888889;

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV 4/5] END colsample_bylevel=0.8111111111111111, colsample_bytree=0.4444444444444445, enable_categorical=True, gamma=0.011111111111111112, learning_rate=0.006, max_depth=20, min_child_weight=1, n_estimators=1422, reg_alpha=3.5938136638046256e-05, reg_lambda=2.1544346900318867e-07, subsample=0.7444444444444445;, score=0.985 total time=15.9min


In [57]:
# Pull the winning configuration and its score from the fitted search, then report both
best_params, best_mcc = bayes_search.best_params_, bayes_search.best_score_
print("Best Parameters: ", best_params)
print("Best MCC Score: {:.5f}".format(best_mcc))

Best Parameters:  OrderedDict([('colsample_bylevel', 0.8111111111111111), ('colsample_bytree', 0.4444444444444445), ('enable_categorical', True), ('gamma', 0.011111111111111112), ('learning_rate', 0.006), ('max_depth', 20), ('min_child_weight', 1), ('n_estimators', 1422), ('reg_alpha', 3.5938136638046256e-05), ('reg_lambda', 2.1544346900318867e-07), ('subsample', 0.7444444444444445)])
Best MCC Score: 0.98486
[CV 5/5] END colsample_bylevel=0.8111111111111111, colsample_bytree=0.4444444444444445, enable_categorical=True, gamma=0.011111111111111112, learning_rate=0.006, max_depth=20, min_child_weight=1, n_estimators=1422, reg_alpha=3.5938136638046256e-05, reg_lambda=2.1544346900318867e-07, subsample=0.7444444444444445;, score=0.985 total time= 4.0min


In [59]:
from joblib import dump, load

# Persist the fitted BayesSearchCV object (including its search results) to disk
dump(bayes_search, 'bayes_search.joblib')

# To restore it later, load the same file. Only load joblib files you created
# yourself: loading unpickles arbitrary objects.
# loaded_search = load('bayes_search.joblib')

# Access the best model or other attributes
# print("Best parameters found: ", loaded_search.best_params_)
# print("Best score achieved: ", loaded_search.best_score_)

# Parameter set kept for reference (presumably the best result of an earlier search run):
# {'subsample': 0.8111111111111111, 'reg_lambda': 1.0, 'reg_alpha': 1.6681005372000592e-08, 'n_estimators': 1200, 'min_child_weight': 5, 'max_depth': 21, 'learning_rate': 0.009444444444444443, 'gamma': 0.03333333333333333, 'colsample_bytree': 0.4888888888888889, 'colsample_bylevel': 0.7444444444444445}

['bayes_search.joblib']

## Submission set

In [60]:
# Predict the (label-encoded) class for every row of the submission features
# using the fitted search object.
# Earlier alternative, kept for reference: y_pred = xgb_clf.predict(X_test)
y_pred = bayes_search.predict(X_test)

In [61]:
# Map the encoded predictions back to the original class labels
predictions = label_encoder.inverse_transform(y_pred)

# Build the submission under its own name. Rebinding `y_pred` to a DataFrame
# made this cell fail on a second run, because inverse_transform would then
# receive the frame instead of the prediction array.
submission = pd.DataFrame({'id': test_dataset['id'], 'class': predictions})
submission.to_csv('submission_xgboost_bayseyanSearch_removeNoise.csv', index=False)