In [14]:
# Import the necessary libraries, packages and modules

import warnings
warnings.filterwarnings("ignore")

import category_encoders as ce
import lightgbm as lgb
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import time

from catboost import CatBoostClassifier
from collections import Counter
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder, StandardScaler
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)
sns.set()
%matplotlib inline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
start_time = time.time()

In [15]:
# Report how many GPU devices TensorFlow can see
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [16]:
# Read the train and test CSV files and discard the 'id' column from each;
# chaining .drop() avoids in-place mutation of the freshly loaded frames.
train_df = pd.read_csv('train.csv').drop(columns=['id'])
test_df = pd.read_csv('test.csv').drop(columns=['id'])

logger.info(f"Train data load completed. Time elapsed: {time.time() - start_time:.2f} seconds")

# Preview the first two training rows
train_df.head(2)

INFO:__main__:Train data load completed. Time elapsed: 6.23 seconds


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,8.8,f,s,u,f,a,c,w,4.51,15.39,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,6.48,,y,o,,,t,z,,d,w


In [17]:
def handle_missing_values(train_df, test_df, seed=None):
    """Impute missing values in the train and test frames.

    Numerical columns (``float64``/``int64`` dtype in ``train_df``) are imputed
    with an ``IterativeImputer`` that is fitted on ``train_df`` only and then
    applied to ``test_df``. Categorical columns (``object`` dtype in
    ``train_df``, excluding the ``'class'`` column) have their missing entries
    replaced by the literal string ``'Not Available'``.

    Both frames are modified in place (by column assignment) and also returned.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; column dtypes are taken from this frame.
    test_df : pandas.DataFrame
        Test frame; must contain every numerical and categorical column
        selected from ``train_df``.
    seed : int or None, default None
        ``random_state`` for the imputer. ``None`` falls back to 42, the value
        that used to be hard-coded, so existing callers see no change.

    Returns
    -------
    tuple
        ``(train_df, test_df, categorical_cols)`` where ``categorical_cols`` is
        the list of object-dtype feature columns (without ``'class'``).
    """
    # Identify numerical and categorical columns
    numerical_cols = list(train_df.select_dtypes(include=['float64', 'int64']).columns)
    categorical_cols = list(train_df.select_dtypes(include=['object']).columns)

    # The target is a label, not a feature to impute
    if 'class' in categorical_cols:
        categorical_cols.remove('class')

    # Skip the imputer entirely when there is nothing numeric to fit on
    if numerical_cols:
        imputer = IterativeImputer(random_state=42 if seed is None else seed)
        train_df[numerical_cols] = imputer.fit_transform(train_df[numerical_cols])
        test_df[numerical_cols] = imputer.transform(test_df[numerical_cols])

    # Assign the filled columns back instead of calling fillna(inplace=True)
    # on a column selection: that chained in-place form does not update the
    # parent frame when pandas Copy-on-Write is active.
    if categorical_cols:
        train_df[categorical_cols] = train_df[categorical_cols].fillna('Not Available')
        test_df[categorical_cols] = test_df[categorical_cols].fillna('Not Available')

    return train_df, test_df, categorical_cols

def align_columns(train_df, test_df):
    """Restrict both frames to the columns they have in common.

    The shared labels are computed as
    ``train_df.columns.intersection(test_df.columns)`` and the same selection
    is applied to each frame.

    Returns
    -------
    tuple
        ``(train_df, test_df)`` reduced to the shared columns.
    """
    shared = train_df.columns.intersection(test_df.columns)
    return train_df[shared], test_df[shared]

In [18]:
# Preprocessing

# Impute missing values; the call also yields the categorical feature names
train_df, test_df, categorical_cols = handle_missing_values(train_df, test_df, seed=42)

# Separate the label column from the training features
target = train_df['class']
train_features = train_df.drop(columns='class', errors='ignore')

# Keep only the columns present in both feature frames
train_features_aligned, test_features_aligned = align_columns(train_features, test_df)

logger.info(f"Missing values treatment completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Missing values treatment completed. Time elapsed: 11.40 seconds


In [19]:
# Encoding

# Ordinal-encode the categorical features: fit on the training features and
# reuse the fitted mapping for the test features.
# NOTE(review): 'ignore' is not among the handle_unknown options I know
# category_encoders to document ('error', 'return_nan', 'value') - confirm how
# unseen test categories are encoded with this setting.
encoder = ce.OrdinalEncoder(cols=categorical_cols, handle_unknown='ignore')
train_df = encoder.fit_transform(train_features_aligned)
test_df = encoder.transform(test_features_aligned)

# Re-attach the target as integer labels; `le` is kept so predictions can be
# mapped back to the original class labels later.
le = LabelEncoder()
train_df['class'] = target
train_df['class'] = le.fit_transform(train_df['class'])

logger.info(f"Categorical columns encoding completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Categorical columns encoding completed. Time elapsed: 22.38 seconds


In [20]:
# Columns to standardise: the three numeric measurements ('cap-diameter',
# 'stem-height', 'stem-width') plus every encoded categorical feature.
col_scale = [
    'cap-diameter', 'stem-height', 'stem-width',
    'cap-shape', 'cap-surface', 'cap-color',
    'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
    'stem-root', 'stem-surface', 'stem-color',
    'veil-type', 'veil-color', 'has-ring', 'ring-type',
    'spore-print-color', 'habitat', 'season',
]

# Learn the scaling statistics from the training rows only ...
std = StandardScaler()
std.fit(train_df[col_scale])

# ... then apply the same transformation to both train_df and test_df
train_df[col_scale] = std.transform(train_df[col_scale])
test_df[col_scale] = std.transform(test_df[col_scale])

In [21]:
# Preview the first five rows of the encoded, scaled training frame
train_df.head(n=5)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,class
0,0.534605,-0.948906,-1.502766,-2.409792,-0.451541,-1.3178,-0.946718,-0.982769,-0.680926,0.523282,-0.304972,-0.579907,-0.844119,-0.224831,-0.331898,-0.556229,-0.429,-0.25638,-0.540961,-0.944448,0
1,-0.386405,-0.435849,-1.190524,-1.929372,-0.451541,-1.3178,-0.946718,-0.643885,-0.577213,-0.577333,-0.304972,-0.108063,-0.414189,-0.224831,-0.331898,1.7578,0.158882,-0.25638,-0.540961,0.033262,1
2,0.135286,-0.948906,-1.502766,-1.448952,-0.451541,-0.872817,-0.946718,-0.982769,0.185819,-0.151169,-0.304972,0.363782,0.01574,-0.224831,-0.331898,-0.556229,-0.429,-0.25638,0.151327,0.033262,0
3,-0.521659,-0.948906,-0.878282,-0.968533,-0.451541,-0.427834,0.265216,-0.305001,-0.810568,-0.571157,-0.304972,-0.579907,-0.844119,-0.224831,-0.331898,-0.556229,-0.429,-0.25638,-0.540961,1.010971,0
4,-0.098724,-0.435849,-0.566041,-0.488113,-0.451541,0.017149,0.265216,-0.982769,-1.103187,-0.345105,-0.304972,-0.579907,-0.844119,-0.224831,-0.331898,-0.556229,-0.429,-0.25638,0.843616,-0.944448,0


In [22]:
# Feature matrix and target taken from the preprocessed training frame
feature_columns = [
    'cap-diameter', 'stem-height', 'stem-width',
    'cap-shape', 'cap-surface', 'cap-color',
    'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
    'stem-root', 'stem-surface', 'stem-color',
    'veil-type', 'veil-color', 'has-ring', 'ring-type',
    'spore-print-color', 'habitat', 'season',
]
X = train_df[feature_columns]
y = train_df['class']

# Hold out 30% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

# MCC metric returned as a (name, value, is_higher_better) triple
def mcc_eval(y_true, y_pred):
    """Compute the Matthews correlation coefficient of rounded predictions.

    ``y_pred`` is rounded with ``np.round`` before being compared with
    ``y_true`` (presumably it holds probabilities of the positive class -
    confirm against the caller).

    Returns
    -------
    tuple
        ``('mcc', score, True)``: metric name, metric value and a flag saying
        that higher values are better.
    """
    hard_labels = np.round(y_pred)
    return 'mcc', mcc(y_true, hard_labels), True

# Custom callback to use MCC as an evaluation metric
class MCC_Eval_Callback:
    """Callable that prints the validation MCC each time it is invoked.

    Parameters
    ----------
    valid_data : sequence
        ``valid_data[0]`` holds the validation features passed to
        ``env.model.predict`` and ``valid_data[1]`` the matching true labels.
    """

    def __init__(self, valid_data):
        self.valid_data = valid_data

    def __call__(self, env):
        """Predict on the validation features and print the MCC.

        ``env`` must expose a ``model`` attribute with a ``predict`` method
        (presumably LightGBM's callback environment - confirm at the call site).
        """
        valid_preds = env.model.predict(self.valid_data[0])
        # mcc_eval returns (name, score, is_higher_better); only the numeric
        # score can be formatted with ':.4f'.
        _, mcc_score, _ = mcc_eval(self.valid_data[1], valid_preds)
        print(f'Validation MCC: {mcc_score:.4f}')

# Define Early Stopping Callback
early_stopping = lgb.callback.early_stopping(stopping_rounds=50)

# Hyper-parameters shared by the early-stopped model and the bagged base model
lgbm_params = dict(boosting_type='gbdt',
                   objective='binary',
                   device_type='gpu')

# Base LightGBM model with GPU support
lgbm_model = lgb.LGBMClassifier(n_estimators=1000,  # large number for early stopping
                                **lgbm_params)

# Train the LightGBM model with early stopping
lgbm_model.fit(X_train, y_train,
               eval_set=[(X_val, y_val)],
               eval_metric=mcc_eval,  # Use custom MCC evaluation function
               callbacks=[early_stopping]
               )

# BaggingClassifier fits fresh clones of its base estimator, so the fitted
# state of lgbm_model (including where early stopping halted) would be thrown
# away and every bagged member would train all 1000 rounds. Carry the result
# over explicitly by sizing the base model to the best iteration found above.
best_rounds = lgbm_model.best_iteration_ or lgbm_model.n_estimators
bagging_base = lgb.LGBMClassifier(n_estimators=best_rounds, **lgbm_params)

# Bagging with the tuned LightGBM configuration. The base estimator is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` across scikit-learn releases.
# NOTE(review): n_jobs=-1 runs parallel workers that each train a
# device_type='gpu' model - confirm the available GPU copes with that,
# otherwise lower n_jobs.
bagging_model = BaggingClassifier(bagging_base,
                                  n_estimators=25,
                                  max_samples=0.5,
                                  max_features=0.5,
                                  bootstrap=True,
                                  n_jobs=-1,
                                  random_state=42)

# Train the Bagging model; use a local timer so the notebook-wide start_time
# keeps measuring from the start of the run for later log messages.
bagging_start = time.time()
bagging_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred_train = bagging_model.predict(X_train)
y_pred_val = bagging_model.predict(X_val)

train_mcc = mcc(y_train, y_pred_train)
val_mcc = mcc(y_val, y_pred_val)

print(f"Standard LightGBM with Bagging - Train MCC Score: {train_mcc:.4f}")
print(f"Standard LightGBM with Bagging - Validation MCC Score: {val_mcc:.4f}")

logger.info(f"Standard LGBM with Bagging and Callbacks completed. "
            f"Time elapsed: {time.time() - bagging_start:.2f} seconds")


[LightGBM] [Info] Number of positive: 1193768, number of negative: 988093
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 997
[LightGBM] [Info] Number of data points in the train set: 2181861, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 13 dense feature groups (33.29 MB) transferred to GPU in 0.024073 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547133 -> initscore=0.189093
[LightGBM] [Info] Start training from score 0.189093
Training until validation scores don't improve for 50 rounds


KeyboardInterrupt: 

In [None]:
# Preview the first two rows of the preprocessed test frame
test_df.head(n=2)

In [None]:
# Select the model's feature columns from the preprocessed test frame
test_feature_columns = [
    'cap-diameter', 'stem-height', 'stem-width',
    'cap-shape', 'cap-surface', 'cap-color',
    'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
    'stem-root', 'stem-surface', 'stem-color',
    'veil-type', 'veil-color', 'has-ring', 'ring-type',
    'spore-print-color', 'habitat', 'season',
]
X_test = test_df[test_feature_columns]

# Predict encoded labels with the bagged model
y_test_pred = bagging_model.predict(X_test)

# Re-read the raw test file: the 'id' column was dropped from test_df earlier
sub_test_df = pd.read_csv('test.csv')

# Map the integer predictions back to the original class labels
x_f = le.inverse_transform(y_test_pred)

submision_df = pd.DataFrame({'id': sub_test_df['id'], 'class': x_f})

submision_df.head(2)

In [None]:
# Write the id/class predictions to disk without the DataFrame index
submision_df.to_csv(path_or_buf='submission.csv', index=False)
logger.info(f"Submission file created - Time elapsed: {time.time() - start_time:.2f} seconds")