In [35]:
# Import the necessary libraries, packages and modules

import warnings
warnings.filterwarnings("ignore")

import category_encoders as ce
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import time

from catboost import CatBoostClassifier
from collections import Counter
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Apply seaborn's default plotting theme to subsequent matplotlib figures.
sns.set()
%matplotlib inline

# Emit INFO-level records; `logger` is the notebook-wide logger used for progress messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Reference timestamp: later cells report elapsed time as time.time() - start_time.
start_time = time.time()

In [36]:
# Sanity check: report how many GPU devices TensorFlow can see
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [37]:
# Load both CSV files and drop the 'id' column from each frame in one chained step
train_df = pd.read_csv('train.csv').drop(columns=['id'])
test_df = pd.read_csv('test.csv').drop(columns=['id'])

logger.info(f"Train data load completed. Time elapsed: {time.time() - start_time:.2f} seconds")

# Display the first two training rows
train_df.head(2)

INFO:__main__:Train data load completed. Time elapsed: 7.11 seconds


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,8.8,f,s,u,f,a,c,w,4.51,15.39,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,6.48,,y,o,,,t,z,,d,w


In [38]:
def handle_missing_values(train_df, test_df, seed=None):
    """Impute missing values in the train and test frames.

    Numerical columns (``float64``/``int64`` dtype in ``train_df``) are imputed
    with an ``IterativeImputer`` that is fitted on the training data only and
    then applied to the test data. Object-dtype columns of ``train_df`` other
    than ``'class'`` have their missing values replaced by the literal string
    ``'Not Available'`` in both frames.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; may contain the ``'class'`` target column.
    test_df : pandas.DataFrame
        Test data; must contain every numerical and categorical column that is
        selected from ``train_df``.
    seed : int or None, default None
        ``random_state`` for the imputer. ``None`` keeps the historical
        default of 42 so existing callers get the same results as before.

    Returns
    -------
    tuple
        ``(train_df, test_df, categorical_cols)`` where both frames are imputed
        copies of the inputs and ``categorical_cols`` is the list of
        object-dtype column names (excluding ``'class'``).
    """
    # Work on copies so the caller's frames are not modified and the function
    # can be re-run on the same inputs.
    train_df = train_df.copy()
    test_df = test_df.copy()

    # Identify numerical and categorical columns
    numerical_cols = list(train_df.select_dtypes(include=['float64', 'int64']).columns)
    categorical_cols = list(train_df.select_dtypes(include=['object']).columns)

    # The target must not be treated as a feature to fill/encode.
    if 'class' in categorical_cols:
        categorical_cols.remove('class')

    # Skip the imputer entirely when there is nothing numerical to impute.
    if numerical_cols:
        imputer = IterativeImputer(random_state = 42 if seed is None else seed)
        train_df[numerical_cols] = imputer.fit_transform(train_df[numerical_cols])
        test_df[numerical_cols] = imputer.transform(test_df[numerical_cols])

    # Assign the filled column back instead of using a chained in-place fillna,
    # which does not update the frame when pandas Copy-on-Write is active.
    for col in categorical_cols:
        train_df[col] = train_df[col].fillna('Not Available')
        test_df[col] = test_df[col].fillna('Not Available')

    return train_df, test_df, categorical_cols

def align_columns(train_df, test_df):
    """Restrict both frames to the columns they have in common.

    The shared columns are the intersection of ``train_df.columns`` with
    ``test_df.columns``; the same column selection is applied to both frames.

    Returns
    -------
    tuple
        ``(train_df, test_df)`` limited to the shared columns.
    """
    shared = train_df.columns.intersection(test_df.columns)
    return train_df[shared], test_df[shared]

In [39]:
# Preprocessing

# Impute missing values. The returned frames replace train_df/test_df, and
# categorical_cols lists the object-dtype feature columns (excluding 'class').
train_df, test_df, categorical_cols = handle_missing_values(train_df, test_df, seed = 42)

# Separate the target column from the training features.
target = train_df['class']
train_features = train_df.drop(columns = ['class'], errors = 'ignore')

# Keep only the columns that the training features and the test frame share.
train_features_aligned, test_features_aligned = align_columns(train_features, test_df)

logger.info(f"Missing values treatment completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Missing values treatment completed. Time elapsed: 15.26 seconds


In [40]:
# Encoding

# Ordinal-encode the categorical features: the encoder is fitted on the training
# features only and then applied to the test features.
# handle_unknown must be one of category_encoders' documented options
# ('error', 'return_nan', 'value'); 'ignore' is not among them. 'return_nan'
# leaves categories that were not seen during fit as NaN in the encoded test
# frame, so the affected test columns are float-typed.
encoder = ce.OrdinalEncoder(cols = categorical_cols, handle_unknown='return_nan')
train_df = encoder.fit_transform(train_features_aligned)
test_df = encoder.transform(test_features_aligned)

# Integer-encode the target. `le` is kept so that predictions can be mapped back
# to the original class labels with le.inverse_transform.
le = LabelEncoder()
train_df['class'] = le.fit_transform(target)

logger.info(f"Categorical columns encoding completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Categorical columns encoding completed. Time elapsed: 31.64 seconds


In [41]:
# Inspect the encoded training frame (encoded features plus the integer 'class' column)
train_df.head(2)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,class
0,8.8,1,1,1,1,1,1,1,4.51,15.39,1,1,1,1,1,1,1,1,1,1,0
1,4.51,2,2,2,1,1,1,2,4.79,6.48,1,2,2,1,1,2,2,1,1,2,1


In [42]:
# Inspect the encoded test frame (features only, no 'class' column)
test_df.head(2)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,8.64,2.0,8.0,6.0,2.0,6.0,2.0,1.0,11.13,17.12,2.0,1.0,1.0,2.0,3.0,2.0,7.0,1.0,1.0,1
1,6.9,5.0,5.0,2.0,1.0,6.0,1.0,5.0,1.27,10.75,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [48]:
def _fold_test_scores(model, data):
    """Return one score per row of ``data`` that is meaningful to average across folds.

    When ``model`` exposes ``predict_proba`` and was fitted on exactly the
    labels 0 and 1, the probability of class 1 is returned, so the mean over
    folds is a soft vote on the same 0-1 scale as the hard labels. In every
    other case the output of ``model.predict`` is returned unchanged.
    """
    classes = getattr(model, "classes_", None)
    if hasattr(model, "predict_proba") and classes is not None and np.array_equal(classes, [0, 1]):
        proba = np.asarray(model.predict_proba(data))
        if proba.ndim == 2 and proba.shape[1] == 2:
            return proba[:, 1]
    return model.predict(data)


def evaluate(Model, X, y, test_data, name):
    """Run 10-fold stratified cross-validation and score the test set with every fold model.

    For each fold a fresh clone of ``Model`` is fitted on the training split,
    the Matthews correlation coefficient (MCC) is computed on the training and
    validation splits, and the fitted fold model scores ``test_data``.
    Per-fold and mean MCC values are logged and printed.

    Parameters
    ----------
    Model : estimator
        Estimator template providing ``fit`` and ``predict``. It is cloned for
        every fold, so the object passed in is never fitted or modified.
    X : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Training target; rows are selected positionally with ``.iloc``.
    test_data : pandas.DataFrame
        Test features handed to each fitted fold model.
    name : str
        Label used in the log and print output.

    Returns
    -------
    tuple
        ``(model, test_predictions)``: the model fitted on the last fold and a
        list holding one array of test-set scores per fold (class-1
        probabilities for 0/1 classifiers with ``predict_proba``, otherwise
        the output of ``predict``).
    """
    logger.info(f"Evaluating {name}")

    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    train_scores = []
    val_scores = []
    test_predictions = []

    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fit an independent copy per fold; safe=False falls back to a deep
        # copy for objects that are not scikit-learn estimators.
        model = clone(Model, safe=False)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        train_mcc = matthews_corrcoef(y_train, y_train_pred)
        train_scores.append(train_mcc)

        y_val_pred = model.predict(X_val)
        val_mcc = matthews_corrcoef(y_val, y_val_pred)
        val_scores.append(val_mcc)

        logger.info(f"{name} (Fold {fold}) - Train MCC Score: {train_mcc:.4f}")
        logger.info(f"{name} (Fold {fold}) - Validation MCC Score: {val_mcc:.4f}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        # Collect scores that can be averaged across folds by the caller.
        y_test_pred_proba = _fold_test_scores(model, test_data)
        test_predictions.append(y_test_pred_proba)
        logger.info(f"{name} (Predictions for Fold {fold}) completed")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        print(f"Fold {fold}: Train MCC = {train_mcc:.6f}, Validation MCC = {val_mcc:.6f}")

    mean_train_mcc = np.mean(train_scores)
    mean_val_mcc = np.mean(val_scores)

    logger.info(f"{name} completed - Mean Train MCC Score: {mean_train_mcc:.4f}")
    logger.info(f"{name} completed - Mean Validation MCC Score: {mean_val_mcc:.4f}")
    logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")
    print(f"Mean Train MCC: {mean_train_mcc:.6f}")
    print(f"Mean Validation MCC: {mean_val_mcc:.6f}")

    return model,test_predictions

In [49]:
# Split the encoded training frame into the feature matrix and the target vector
X = train_df.drop('class', axis = 1)
y = train_df['class']

# Hyperparameters obtained from a prior search (the search itself is not shown here)
Xparams_s1 = {'n_estimators': 1396, 'max_depth': 19, 'learning_rate': 0.010455050159676566, 'subsample': 0.8006842727555243, 
              'colsample_bytree': 0.5001438770455072, 'colsample_bylevel': 0.8027576507794217, 'min_child_weight': 5,
              'reg_alpha': 1.1586967014672253e-08, 'reg_lambda': 3.3517458803447213e-06, 'gamma': 0.01841032988451454}

# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build and is deprecated
# as of XGBoost 2.0 in favour of tree_method='hist' with device='cuda' - confirm the
# installed version before changing it.
xgb_s1 = XGBClassifier(**Xparams_s1, random_state = 42, tree_method = 'gpu_hist')   
# Cross-validate and collect one array of test-set predictions per fold
xgb_model_s1, xgb_preds_s1 = evaluate(xgb_s1, X, y ,test_df, "XGBoost")

INFO:__main__:Evaluating XGBoost
INFO:__main__:XGBoost (Fold 1) - Train MCC Score: 0.9873
INFO:__main__:XGBoost (Fold 1) - Validation MCC Score: 0.9844
INFO:__main__:Time elapsed: 303.42 seconds
INFO:__main__:XGBoost (Predictions for Fold 1) completed
INFO:__main__:Time elapsed: 306.62 seconds


Fold 1: Train MCC = 0.987311, Validation MCC = 0.984390


INFO:__main__:XGBoost (Fold 2) - Train MCC Score: 0.9872
INFO:__main__:XGBoost (Fold 2) - Validation MCC Score: 0.9852
INFO:__main__:Time elapsed: 426.08 seconds
INFO:__main__:XGBoost (Predictions for Fold 2) completed
INFO:__main__:Time elapsed: 429.19 seconds


Fold 2: Train MCC = 0.987234, Validation MCC = 0.985166


INFO:__main__:XGBoost (Fold 3) - Train MCC Score: 0.9873
INFO:__main__:XGBoost (Fold 3) - Validation MCC Score: 0.9846
INFO:__main__:Time elapsed: 551.86 seconds
INFO:__main__:XGBoost (Predictions for Fold 3) completed
INFO:__main__:Time elapsed: 555.11 seconds


Fold 3: Train MCC = 0.987266, Validation MCC = 0.984573


INFO:__main__:XGBoost (Fold 4) - Train MCC Score: 0.9873
INFO:__main__:XGBoost (Fold 4) - Validation MCC Score: 0.9852
INFO:__main__:Time elapsed: 742.24 seconds
INFO:__main__:XGBoost (Predictions for Fold 4) completed
INFO:__main__:Time elapsed: 745.91 seconds


Fold 4: Train MCC = 0.987261, Validation MCC = 0.985179


INFO:__main__:XGBoost (Fold 5) - Train MCC Score: 0.9873
INFO:__main__:XGBoost (Fold 5) - Validation MCC Score: 0.9849
INFO:__main__:Time elapsed: 927.31 seconds
INFO:__main__:XGBoost (Predictions for Fold 5) completed
INFO:__main__:Time elapsed: 930.91 seconds


Fold 5: Train MCC = 0.987267, Validation MCC = 0.984856


INFO:__main__:XGBoost (Fold 6) - Train MCC Score: 0.9873
INFO:__main__:XGBoost (Fold 6) - Validation MCC Score: 0.9849
INFO:__main__:Time elapsed: 1113.60 seconds
INFO:__main__:XGBoost (Predictions for Fold 6) completed
INFO:__main__:Time elapsed: 1117.22 seconds


Fold 6: Train MCC = 0.987265, Validation MCC = 0.984947


INFO:__main__:XGBoost (Fold 7) - Train MCC Score: 0.9872
INFO:__main__:XGBoost (Fold 7) - Validation MCC Score: 0.9849
INFO:__main__:Time elapsed: 1300.16 seconds
INFO:__main__:XGBoost (Predictions for Fold 7) completed
INFO:__main__:Time elapsed: 1303.82 seconds


Fold 7: Train MCC = 0.987244, Validation MCC = 0.984926


INFO:__main__:XGBoost (Fold 8) - Train MCC Score: 0.9873
INFO:__main__:XGBoost (Fold 8) - Validation MCC Score: 0.9845
INFO:__main__:Time elapsed: 1489.73 seconds
INFO:__main__:XGBoost (Predictions for Fold 8) completed
INFO:__main__:Time elapsed: 1493.42 seconds


Fold 8: Train MCC = 0.987330, Validation MCC = 0.984510


INFO:__main__:XGBoost (Fold 9) - Train MCC Score: 0.9873
INFO:__main__:XGBoost (Fold 9) - Validation MCC Score: 0.9849
INFO:__main__:Time elapsed: 1676.63 seconds
INFO:__main__:XGBoost (Predictions for Fold 9) completed
INFO:__main__:Time elapsed: 1679.79 seconds


Fold 9: Train MCC = 0.987264, Validation MCC = 0.984882


INFO:__main__:XGBoost (Fold 10) - Train MCC Score: 0.9873
INFO:__main__:XGBoost (Fold 10) - Validation MCC Score: 0.9850
INFO:__main__:Time elapsed: 1850.48 seconds
INFO:__main__:XGBoost (Predictions for Fold 10) completed
INFO:__main__:Time elapsed: 1854.05 seconds
INFO:__main__:XGBoost completed - Mean Train MCC Score: 0.9873
INFO:__main__:XGBoost completed - Mean Validation MCC Score: 0.9848
INFO:__main__:Time elapsed: 1854.05 seconds


Fold 10: Train MCC = 0.987260, Validation MCC = 0.984965
Mean Train MCC: 0.987270
Mean Validation MCC: 0.984840


In [51]:
# Re-read the test file to recover the 'id' column, which was dropped from test_df after loading
sub_test_df = pd.read_csv('test.csv')

# Average the per-fold test predictions element-wise
xgb_test = np.mean(xgb_preds_s1, axis=0)

# Threshold the averaged predictions at 0.5, then map the integer labels back
# to the original class labels with the fitted LabelEncoder
xgbpreds = (xgb_test >= 0.5).astype(int)
x_f = le.inverse_transform(xgbpreds)

# Assemble the submission table: one predicted class per test id
submision_df = pd.DataFrame({
    'id' : sub_test_df['id'],
    'class': x_f
})

submision_df.head(2)

Unnamed: 0,id,class
0,3116945,e
1,3116946,p


In [52]:
# Write the submission table to disk without the DataFrame index column
submision_df.to_csv('Submission.csv', index=False)
logger.info(f"Submission file created - Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Submission file created - Time elapsed: 1960.34 seconds
