In [1]:
import warnings
warnings.filterwarnings("ignore")

import logging
import time
from collections import Counter

import category_encoders as ce
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import IterativeImputer
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Apply seaborn's default plot styling
sns.set()
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# Emit INFO-level records; the cells below report progress through `logger`
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Reference timestamp for every "Time elapsed" message logged by later cells
start_time = time.time()

In [2]:
# Report how many GPU devices TensorFlow can see in this environment
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [3]:
# Load the raw competition files and discard the 'id' column right away,
# so it is never treated as a feature by the later cells.
train_df = pd.read_csv('train.csv').drop(columns=['id'])
test_df = pd.read_csv('test.csv').drop(columns=['id'])

logger.info(f"Train data load completed. Time elapsed: {time.time() - start_time:.2f} seconds")

# Cell output: quick look at the first two training rows
train_df.head(2)

INFO:__main__:Train data load completed. Time elapsed: 4.81 seconds


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,8.8,f,s,u,f,a,c,w,4.51,15.39,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,6.48,,y,o,,,t,z,,d,w


In [4]:
def handle_missing_values(train_df, test_df, seed=None):
    """Fill missing values in the train and test frames.

    Numerical columns (float64/int64 in ``train_df``) are imputed with an
    ``IterativeImputer`` that is fitted on the training data only and then
    applied to the test data. Categorical columns (object dtype in
    ``train_df``, excluding the target column ``'class'``) have their missing
    entries replaced by the literal string ``'Not Available'``.

    Both frames are modified in place (columns are reassigned) and also
    returned, so callers may use either the arguments or the return value.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; may contain the target column ``'class'``.
    test_df : pandas.DataFrame
        Test data; must contain every numerical and categorical feature
        column found in ``train_df``.
    seed : int or None, default None
        Passed to ``IterativeImputer(random_state=...)``.

    Returns
    -------
    tuple
        ``(train_df, test_df, categorical_cols)`` where ``categorical_cols``
        is the list of object-dtype feature column names (without ``'class'``).
    """
    # Column roles are decided from the training frame only
    numerical_cols = list(train_df.select_dtypes(include=['float64', 'int64']).columns)
    categorical_cols = [
        col for col in train_df.select_dtypes(include=['object']).columns
        if col != 'class'  # the target is never treated as a feature
    ]

    # Skip the imputer entirely when there is nothing numeric to impute;
    # fitting it on a zero-column frame would fail.
    if numerical_cols:
        imputer = IterativeImputer(random_state=seed)
        train_df[numerical_cols] = imputer.fit_transform(train_df[numerical_cols])
        test_df[numerical_cols] = imputer.transform(test_df[numerical_cols])

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the `df[col]` selection: the chained in-place form is deprecated and
    # does not update the parent frame when pandas Copy-on-Write is active.
    for col in categorical_cols:
        train_df[col] = train_df[col].fillna('Not Available')
        test_df[col] = test_df[col].fillna('Not Available')

    return train_df, test_df, categorical_cols

def align_columns(train_df, test_df):
    """Restrict both frames to the columns they have in common.

    Returns a ``(train, test)`` tuple in which each frame keeps only the
    columns present in both inputs, selected with the same column index so
    the two results share an identical column order.
    """
    shared = train_df.columns.intersection(test_df.columns)
    return train_df[shared], test_df[shared]

In [5]:
# Preprocessing
# Impute numeric gaps and fill categorical gaps; the helper also reports
# which feature columns are categorical.
train_df, test_df, categorical_cols = handle_missing_values(
    train_df,
    test_df,
    seed=42,
)

# Split the target off from the training features
target = train_df['class']
train_features = train_df.drop(columns='class', errors='ignore')

# Keep only the columns shared by the train features and the test frame
train_features_aligned, test_features_aligned = align_columns(train_features, test_df)

logger.info(f"Missing values treatment completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Missing values treatment completed. Time elapsed: 13.71 seconds


In [6]:
# Encoding
# Ordinal-encode the categorical feature columns; the encoder is fitted on the
# training features and then reused unchanged on the test features.
# NOTE(review): recent category_encoders releases document 'error',
# 'return_nan' and 'value' for handle_unknown; confirm how the installed
# version treats 'ignore'.
encoder = ce.OrdinalEncoder(
    cols=categorical_cols,
    handle_unknown='ignore',
)
train_features_encoded = encoder.fit_transform(train_features_aligned)
test_features_encoded = encoder.transform(test_features_aligned)

# Re-attach the target, then replace its string labels with integer codes.
# `le` is kept so predictions can be mapped back to the original labels.
le = LabelEncoder()
train_features_encoded['class'] = target
train_features_encoded['class'] = le.fit_transform(train_features_encoded['class'])

logger.info(f"Categorical columns encoding completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Categorical columns encoding completed. Time elapsed: 30.71 seconds


In [11]:
# Feature Selection
# Fit an XGBoost model on all encoded training rows and keep the features
# that SelectFromModel marks as selected.
# NOTE(review): the selector is fitted on the full training set before
# cross-validation, so the fold validation scores computed later have already
# "seen" those rows through feature selection.
# NOTE(review): tree_method='gpu_hist' is reported as deprecated by newer
# XGBoost releases; confirm against the installed version.
feature_frame = train_features_encoded.drop(columns='class')
y = train_features_encoded['class']

selector = SelectFromModel(
    estimator=XGBClassifier(
        n_estimators=1396,
        max_depth=19,
        learning_rate=0.010455050159676566,
        subsample=0.8006842727555243,
        colsample_bytree=0.5001438770455072,
        colsample_bylevel=0.8027576507794217,
        min_child_weight=5,
        reg_alpha=1.1586967014672253e-08,
        reg_lambda=3.3517458803447213e-06,
        gamma=0.01841032988451454,
        tree_method='gpu_hist',
        random_state=42,
    )
)
selector.fit(feature_frame, y)

# Boolean support mask -> names of the retained feature columns
selected_features = feature_frame.columns[selector.get_support()]

X = train_features_encoded[selected_features]
test_df_selected = test_features_encoded[selected_features]

logger.info(f"Feature selection completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Feature selection completed. Time elapsed: 416.07 seconds


In [12]:
# Model Evaluation
def evaluate(Model, X, y, test_data, name, n_splits=10, random_state=42):
    """Cross-validate an estimator and collect per-fold test predictions.

    For each stratified fold a fresh, unfitted copy of ``Model`` is trained on
    the fold's training part, scored with the Matthews correlation coefficient
    on both the training and validation parts, and used to predict
    ``test_data``.

    Parameters
    ----------
    Model : estimator
        A scikit-learn compatible estimator instance. It is used as a
        template only: every fold fits its own ``clone(Model)``, so ``Model``
        itself is left unfitted.
    X : pandas.DataFrame
        Training features (rows are selected with ``.iloc``).
    y : pandas.Series
        Training target (rows are selected with ``.iloc``).
    test_data : pandas.DataFrame
        Features to predict with every fold model.
    name : str
        Label used in log and print messages.
    n_splits : int, default 10
        Number of stratified folds.
    random_state : int, default 42
        Seed for the shuffled fold assignment.

    Returns
    -------
    tuple
        ``(models, test_predictions)``: the list of fitted fold models (one
        distinct object per fold) and the list of ``model.predict(test_data)``
        outputs, in fold order.
    """
    logger.info(f"Evaluating {name}")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    train_scores = []
    val_scores = []
    test_predictions = []
    models = []

    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Clone so each fold trains an independent estimator. Re-fitting the
        # single shared instance would make every entry of `models` point to
        # the same object, holding only the last fold's fit.
        model = clone(Model)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        train_mcc = matthews_corrcoef(y_train, y_train_pred)
        train_scores.append(train_mcc)

        y_val_pred = model.predict(X_val)
        val_mcc = matthews_corrcoef(y_val, y_val_pred)
        val_scores.append(val_mcc)

        logger.info(f"{name} (Fold {fold}) - Train MCC Score: {train_mcc:.4f}")
        logger.info(f"{name} (Fold {fold}) - Validation MCC Score: {val_mcc:.4f}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        # Output of predict(), not predict_proba(): one prediction per test row
        y_test_pred = model.predict(test_data)
        test_predictions.append(y_test_pred)
        logger.info(f"{name} (Predictions for Fold {fold}) completed")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        models.append(model)

        print(f"Fold {fold}: Train MCC = {train_mcc:.6f}, Validation MCC = {val_mcc:.6f}")

    mean_train_mcc = np.mean(train_scores)
    mean_val_mcc = np.mean(val_scores)

    logger.info(f"{name} completed - Mean Train MCC Score: {mean_train_mcc:.4f}")
    logger.info(f"{name} completed - Mean Validation MCC Score: {mean_val_mcc:.4f}")
    logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")
    print(f"Mean Train MCC: {mean_train_mcc:.6f}")
    print(f"Mean Validation MCC: {mean_val_mcc:.6f}")

    return models, test_predictions

In [13]:
# Hyperparameters for XGBoost
# Same values as the estimator used for feature selection above.
Xparams_s1 = dict(
    n_estimators=1396,
    max_depth=19,
    learning_rate=0.010455050159676566,
    subsample=0.8006842727555243,
    colsample_bytree=0.5001438770455072,
    colsample_bylevel=0.8027576507794217,
    min_child_weight=5,
    reg_alpha=1.1586967014672253e-08,
    reg_lambda=3.3517458803447213e-06,
    gamma=0.01841032988451454,
)

# NOTE(review): tree_method='gpu_hist' is reported as deprecated by newer
# XGBoost releases; confirm against the installed version.
xgb_s1 = XGBClassifier(tree_method='gpu_hist', random_state=42, **Xparams_s1)

# Cross-validate and collect the per-fold models and test predictions
xgb_models_s1, xgb_preds_s1 = evaluate(xgb_s1, X, y, test_df_selected, "XGBoost")

INFO:__main__:Evaluating XGBoost
INFO:__main__:XGBoost (Fold 1) - Train MCC Score: 0.8882
INFO:__main__:XGBoost (Fold 1) - Validation MCC Score: 0.8872
INFO:__main__:Time elapsed: 446.29 seconds
INFO:__main__:XGBoost (Predictions for Fold 1) completed
INFO:__main__:Time elapsed: 447.06 seconds


Fold 1: Train MCC = 0.888180, Validation MCC = 0.887174


KeyboardInterrupt: 

In [None]:
# Preparing Submission
# NOTE(review): this cell needs `xgb_preds_s1` from a complete run of the
# evaluation cell; the recorded run of that cell ended in KeyboardInterrupt.

# 'id' was dropped from test_df when the data was loaded, so it is read again
# here; only that one column is needed, so the rest of the file is skipped.
sub_test_df = pd.read_csv('test.csv', usecols=['id'])

# Average the per-fold predictions row-wise; a mean of at least 0.5 maps to
# encoded class 1, anything lower to encoded class 0.
xgb_test = np.mean(xgb_preds_s1, axis=0)
xgbpreds = (xgb_test >= 0.5).astype(int)

# Translate the integer codes back to the original class labels
x_f = le.inverse_transform(xgbpreds)

submission_df = pd.DataFrame({
    'id': sub_test_df['id'],
    'class': x_f
})

submission_df.head(2)

In [None]:
# Persist the submission without the DataFrame index column
submission_df.to_csv(path_or_buf='Submission.csv', index=False)
logger.info(f"Submission file created - Time elapsed: {time.time() - start_time:.2f} seconds")