In [1]:
# Import the necessary libraries, packages and modules

import warnings
warnings.filterwarnings("ignore")

import category_encoders as ce
import lightgbm as lgb
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import time

from catboost import CatBoostClassifier
from collections import Counter
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier

# Do not truncate wide DataFrames when they are displayed
pd.set_option('display.max_columns', None)
# Apply seaborn's default plotting theme
sns.set()
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Emit INFO-level records; `logger` is used by later cells for progress messages
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Reference timestamp: later cells log elapsed time relative to this value
start_time = time.time()

In [2]:
# Report how many GPU devices TensorFlow can see.
# NOTE(review): this only reflects TensorFlow's view of the hardware; whether
# LightGBM's device_type='gpu' works presumably depends on the installed
# LightGBM build — confirm separately.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [3]:
# Read both CSV files and drop the 'id' column from each as part of the load
train_df = pd.read_csv('train.csv').drop(columns=['id'])
test_df = pd.read_csv('test.csv').drop(columns=['id'])

logger.info(f"Train data load completed. Time elapsed: {time.time() - start_time:.2f} seconds")

# Preview the first two training rows
train_df.head(2)

INFO:__main__:Train data load completed. Time elapsed: 4.42 seconds


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,8.8,f,s,u,f,a,c,w,4.51,15.39,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,6.48,,y,o,,,t,z,,d,w


In [4]:
def handle_missing_values(train_df, test_df, seed=None):
    """Impute missing values in the train and test frames.

    Numerical columns (``float64``/``int64`` dtype in ``train_df``) are imputed
    with an ``IterativeImputer`` that is fitted on the training frame only and
    then applied to the test frame. Categorical columns (``object`` dtype in
    ``train_df``, excluding the target column ``'class'``) have their missing
    entries replaced by the literal string ``'Not Available'``.

    Both input frames are modified in place and are also returned.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; may contain the target column ``'class'``.
    test_df : pandas.DataFrame
        Test data; must contain every numerical and categorical feature column
        selected from ``train_df``.
    seed : int or None, default None
        Random state for the imputer. ``None`` keeps the previously hard-coded
        value of 42, so existing callers get the same result as before.

    Returns
    -------
    tuple
        ``(train_df, test_df, categorical_cols)`` where ``categorical_cols`` is
        the list of categorical feature names (without ``'class'``).
    """
    # Column roles are decided from the training frame only
    numerical_cols = list(train_df.select_dtypes(include=['float64', 'int64']).columns)
    categorical_cols = [
        col for col in train_df.select_dtypes(include=['object']).columns
        if col != 'class'
    ]

    # Skip the imputer when there is nothing numeric to impute
    # (fitting it on a zero-column frame would fail).
    if numerical_cols:
        imputer = IterativeImputer(random_state=42 if seed is None else seed)
        train_df[numerical_cols] = imputer.fit_transform(train_df[numerical_cols])
        test_df[numerical_cols] = imputer.transform(test_df[numerical_cols])

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the in-place form operates on a temporary object and
    # does not update the frame when pandas Copy-on-Write is active.
    for col in categorical_cols:
        train_df[col] = train_df[col].fillna('Not Available')
        test_df[col] = test_df[col].fillna('Not Available')

    return train_df, test_df, categorical_cols

def align_columns(train_df, test_df):
    """Restrict both frames to the columns they have in common.

    Returns a ``(train_df, test_df)`` tuple in which each frame contains only
    the shared columns, selected in the same order for both.
    """
    shared = train_df.columns.intersection(test_df.columns)
    return train_df[shared], test_df[shared]

In [5]:
# Preprocessing: impute missing values, split off the target, align feature columns.
# NOTE: this cell rebinds `train_df`/`test_df`, and the encoding cell rebinds them
# again to the encoded frames, so re-running this cell after the encoding cell
# would operate on already-encoded data. Run top to bottom on a fresh kernel.

# Impute both frames; also returns the categorical feature names used by the encoder
train_df, test_df, categorical_cols = handle_missing_values(train_df, test_df, seed = 42)

# Separate the label column from the features
target = train_df['class']
train_features = train_df.drop(columns = ['class'], errors = 'ignore')

# Keep only the columns present in both the training features and the test frame
train_features_aligned, test_features_aligned = align_columns(train_features, test_df)

logger.info(f"Missing values treatment completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Missing values treatment completed. Time elapsed: 9.55 seconds


In [6]:
# Encoding: integer-code the categorical features, then label-encode the target

# The encoder is fitted on the training features only and re-applied to the test features.
# NOTE(review): handle_unknown='ignore' does not appear among the values listed in
# recent category_encoders documentation ('error', 'return_nan', 'value') —
# confirm it is honoured by the installed version.
encoder = ce.OrdinalEncoder(cols=categorical_cols, handle_unknown='ignore')
train_df = encoder.fit_transform(train_features_aligned)
test_df = encoder.transform(test_features_aligned)

# Re-attach the target as integer labels (classes are numbered in sorted order)
le = LabelEncoder()
train_df['class'] = le.fit_transform(target)

logger.info(f"Categorical columns encoding completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Categorical columns encoding completed. Time elapsed: 20.17 seconds


In [7]:
# Preview the encoded training frame (features plus the label-encoded 'class' column)
train_df.head()

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,class
0,8.8,1,1,1,1,1,1,1,4.51,15.39,1,1,1,1,1,1,1,1,1,1,0
1,4.51,2,2,2,1,1,1,2,4.79,6.48,1,2,2,1,1,2,2,1,1,2,1
2,6.94,1,1,3,1,2,1,1,6.85,9.93,1,3,3,1,1,1,1,1,2,2,0
3,3.88,1,3,4,1,3,2,3,4.16,6.53,1,1,1,1,1,1,1,1,1,3,0
4,5.85,2,4,5,1,4,2,1,3.37,8.36,1,1,1,1,1,1,1,1,3,1,0


In [9]:
# Feature matrix (explicit column list) and encoded target
X = train_df[['cap-diameter', 'stem-height', 'stem-width', 'cap-shape', 'cap-surface', 'cap-color',
              'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root',
              'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type',
              'spore-print-color', 'habitat', 'season']]
y = train_df['class']

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


def _make_bagger(base_model):
    """Wrap ``base_model`` in the bagging configuration shared by both ensembles."""
    # The base model is passed positionally: scikit-learn renamed the keyword
    # from `base_estimator` to `estimator` (1.2) and later removed the old
    # name, while the first positional argument works under either name.
    return BaggingClassifier(
        base_model,
        n_estimators=25,
        max_samples=0.5,
        max_features=0.5,
        bootstrap=True,
        n_jobs=-1,
        random_state=42,
    )


def _predict_and_score(model, X_tr, y_tr, X_te, y_te):
    """Return (train predictions, test predictions, train MCC, test MCC) for a fitted model."""
    pred_tr = model.predict(X_tr)
    pred_te = model.predict(X_te)
    return pred_tr, pred_te, mcc(y_tr, pred_tr), mcc(y_te, pred_te)


# Base LightGBM model with GPU support.
# It is defined BEFORE the bagging wrapper that uses it; previously the wrapper
# was built first, which raises NameError on a fresh kernel.
# `metric='mcc'` was dropped: it is not one of LightGBM's built-in metric names
# and no eval_set is passed to fit(), so it had no effect. MCC is computed with
# scikit-learn below instead.
lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', device_type='gpu')

# Bagging with standard LightGBM using GPU
bagging_model = _make_bagger(lgbm_model)

# Base "LightGBMXT" model with GPU support.
# NOTE(review): boosting_type='rf' selects LightGBM's random-forest mode; the
# "XT" label suggests extra-trees, which LightGBM exposes through
# `extra_trees=True`. Behaviour is left unchanged here — confirm which was intended.
lgbm_xt_model = lgb.LGBMClassifier(
    boosting_type='rf',
    objective='binary',
    device_type='gpu',
    bagging_freq=1,           # Bagging enabled (frequency > 0)
    bagging_fraction=0.8,     # A fraction of the data is used (between 0.0 and 1.0)
    feature_fraction=0.8      # A fraction of features is used (between 0.0 and 1.0)
)

# Bagging with LightGBMXT using GPU
bagging_xt_model = _make_bagger(lgbm_xt_model)

# Train the standard LightGBM ensemble, then the LightGBMXT ensemble
bagging_model.fit(X_train, y_train)
bagging_xt_model.fit(X_train, y_train)

# Predictions and evaluation for standard LightGBM
y_pred_train, y_pred_test, train_mcc, test_mcc = _predict_and_score(
    bagging_model, X_train, y_train, X_test, y_test
)

print(f"Standard LightGBM - Train MCC Score: {train_mcc:.4f}")
print(f"Standard LightGBM - Test MCC Score: {test_mcc:.4f}")

logger.info(f"Standard LGBM completed. Time elapsed: {time.time() - start_time:.2f} seconds")

# Predictions and evaluation for LightGBMXT
y_pred_train_xt, y_pred_test_xt, train_mcc_xt, test_mcc_xt = _predict_and_score(
    bagging_xt_model, X_train, y_train, X_test, y_test
)

print(f"LightGBMXT - Train MCC Score: {train_mcc_xt:.4f}")
print(f"LightGBMXT - Test MCC Score: {test_mcc_xt:.4f}")

logger.info(f"LGBM with XT completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Standard LGBM completed. Time elapsed: 571.69 seconds


Standard LightGBM - Train MCC Score: 0.9810
Standard LightGBM - Test MCC Score: 0.9806


INFO:__main__:LGBM with XT completed. Time elapsed: 597.20 seconds


LightGBMXT - Train MCC Score: 0.8832
LightGBMXT - Test MCC Score: 0.8827
