In [1]:
import warnings
warnings.filterwarnings("ignore")

import category_encoders as ce
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import time

from catboost import CatBoostClassifier
from collections import Counter
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.impute import IterativeImputer
from sklearn.linear_model import Lasso
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

pd.set_option('display.max_columns', None)
sns.set()
%matplotlib inline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
start_time = time.time()

In [2]:
# Test to see if TensorFlow can utilize the GPU
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df.drop(columns=['id'], inplace=True)
test_df.drop(columns=['id'], inplace=True)

logger.info(f"Train data load completed. Time elapsed: {time.time() - start_time:.2f} seconds")

train_df.head(2)

INFO:__main__:Train data load completed. Time elapsed: 4.53 seconds


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,8.8,f,s,u,f,a,c,w,4.51,15.39,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,6.48,,y,o,,,t,z,,d,w


In [4]:
def handle_missing_values(train_df, test_df, seed=None):
    
    # Identify numerical and categorical columns
    numerical_cols = list(train_df.select_dtypes(include=['float64', 'int64']).columns)
    categorical_cols = list(train_df.select_dtypes(include=['object']).columns)
    
    if 'class' in categorical_cols:
        categorical_cols.remove('class')
    
    I = IterativeImputer(random_state=seed)
    train_df[numerical_cols] = I.fit_transform(train_df[numerical_cols])
    test_df[numerical_cols] = I.transform(test_df[numerical_cols])
    
    for col in categorical_cols:
        train_df[col].fillna('Not Available', inplace=True)
        test_df[col].fillna('Not Available', inplace=True)
    
    return train_df, test_df, categorical_cols

def align_columns(train_df, test_df):

    common_cols = train_df.columns.intersection(test_df.columns)
    train_df = train_df[common_cols]
    test_df = test_df[common_cols]
    return train_df, test_df

In [5]:
# Preprocessing
train_df, test_df, categorical_cols = handle_missing_values(train_df, test_df, seed=42)

target = train_df['class']
train_features = train_df.drop(columns=['class'], errors='ignore')

train_features_aligned, test_features_aligned = align_columns(train_features, test_df)

logger.info(f"Missing values treatment completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Missing values treatment completed. Time elapsed: 9.75 seconds


In [6]:
# Encoding
encoder = ce.OrdinalEncoder(cols=categorical_cols, handle_unknown='ignore')
train_features_encoded = encoder.fit_transform(train_features_aligned)
test_features_encoded = encoder.transform(test_features_aligned)

train_features_encoded['class'] = target
le = LabelEncoder()
train_features_encoded['class'] = le.fit_transform(train_features_encoded['class'])

logger.info(f"Categorical columns encoding completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Categorical columns encoding completed. Time elapsed: 20.26 seconds


In [7]:
train_features_encoded.head(2)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,class
0,8.8,1,1,1,1,1,1,1,4.51,15.39,1,1,1,1,1,1,1,1,1,1,0
1,4.51,2,2,2,1,1,1,2,4.79,6.48,1,2,2,1,1,2,2,1,1,2,1


In [8]:
test_features_encoded.head(2)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,8.64,2.0,8.0,6.0,2.0,6.0,2.0,1.0,11.13,17.12,2.0,1.0,1.0,2.0,3.0,2.0,7.0,1.0,1.0,1
1,6.9,5.0,5.0,2.0,1.0,6.0,1.0,5.0,1.27,10.75,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [9]:
# col_scale = ['cap-diameter', 'stem-height', 'stem-width']
col_scale =  ['cap-diameter', 'stem-height', 'stem-width', 'cap-shape', 'cap-surface', 'cap-color',
                  'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root',
                  'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type',
                  'spore-print-color', 'habitat', 'season']

mmx = MinMaxScaler()

# Fit the scaler on the training data and transform both train_df and test_df
train_df[col_scale] = mmx.fit_transform(train_features_encoded[col_scale])
test_df[col_scale] = mmx.transform(test_features_encoded[col_scale])

train_df.head(2)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,0.108755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050834,0.149563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,p,0.055556,0.013514,0.012048,0.012821,0.0,0.0,0.0,0.015873,0.05399,0.062974,0.0,0.016667,0.016949,0.0,0.0,0.043478,0.025,0.0,0.0,0.333333


In [10]:
test_df.head(2)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0.106771,0.013514,0.084337,0.064103,0.038462,0.064103,0.020833,0.0,0.125451,0.166375,0.026316,0.0,0.0,0.045455,0.083333,0.043478,0.15,0.0,0.0,0.0
1,0.085193,0.054054,0.048193,0.012821,0.0,0.064103,0.0,0.063492,0.014315,0.10447,0.0,0.0,0.033898,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Load your dataset
X = train_df[['cap-diameter', 'stem-height', 'stem-width', 'cap-shape', 'cap-surface', 'cap-color',
              'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root',
              'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type',
              'spore-print-color', 'habitat', 'season']]
y = train_features_encoded['class']

# Split the data
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
y = train_features_encoded['class']

In [13]:
# 1. Filter Method: Univariate Selection (ANOVA F-value)
select_k_best = SelectKBest(score_func=f_classif, k=10)
X_selected = select_k_best.fit_transform(X, y)

In [31]:
# Get feature scores and corresponding feature names
scores_k_best = select_k_best.scores_
selected_features_k_best = X.columns[select_k_best.get_support()]
selected_features_k_best

Index(['cap-diameter', 'stem-width', 'cap-surface', 'cap-color',
       'gill-attachment', 'gill-spacing', 'stem-root', 'stem-surface',
       'stem-color', 'spore-print-color'],
      dtype='object')

In [32]:
# Create a Series to map feature names to their scores
feature_scores_k_best = pd.Series(scores_k_best, index=X.columns).sort_values(ascending=False)
feature_scores_k_best

cap-surface             113682.315604
stem-width               92543.628283
cap-diameter             84674.260642
gill-attachment          55498.710057
stem-color               45431.158009
stem-surface             36449.487801
spore-print-color        31872.388273
stem-root                28329.777048
cap-color                23627.838827
gill-spacing             19765.756608
ring-type                18854.789485
cap-shape                14024.737643
gill-color               12630.724071
stem-height               7730.791302
has-ring                  7502.544901
veil-type                 7417.861159
does-bruise-or-bleed      4192.799089
habitat                   3029.049890
season                    2189.595450
veil-color                   0.177901
dtype: float64

In [15]:
# 2. Wrapper Method: Recursive Feature Elimination (RFE)
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=10)
X_rfe = rfe.fit_transform(X, y)

In [16]:
X_rfe

array([[0.10875496, 0.05083408, 0.14956268, ..., 0.        , 0.        ,
        0.        ],
       [0.05555556, 0.05399008, 0.06297376, ..., 0.01587302, 0.01666667,
        0.01694915],
       [0.08568948, 0.0772092 , 0.09650146, ..., 0.        , 0.03333333,
        0.03389831],
       ...,
       [0.09660218, 0.10719116, 0.10748299, ..., 0.        , 0.        ,
        0.05084746],
       [0.11681548, 0.10290803, 0.17269193, ..., 0.0952381 , 0.01666667,
        0.        ],
       [0.03931052, 0.03178539, 0.07570457, ..., 0.        , 0.        ,
        0.        ]])

In [35]:
# Get the ranking of features and their importance scores from the RandomForestClassifier
feature_scores_rfe = rfe.ranking_
feature_importances_rfe = model.feature_importances_
feature_scores_rfe

array([ 1,  1,  1,  4,  1,  1,  5,  1,  1,  1,  3,  1,  1, 10,  8,  7,  2,
        9,  6, 11])

In [36]:
feature_importances_rfe

array([0.0767181 , 0.06527829, 0.11363605, 0.04016347, 0.09514485,
       0.04631307, 0.0245947 , 0.09750215, 0.07232326, 0.06302854,
       0.04326646, 0.05966782, 0.07973201, 0.0076079 , 0.01416604,
       0.02084782, 0.03653516, 0.0130371 , 0.02603076, 0.00440646])

In [39]:
# Map the rankings to the original feature names
ranking_df_rfe = pd.DataFrame({
    'Feature': X.columns,
    'Ranking': feature_scores_rfe,
    'Importance': feature_importances_rfe
})

# Sort the DataFrame by Ranking and then by Importance within each rank
ranking_df_sorted = ranking_df_rfe.sort_values(by=['Ranking', 'Importance'], ascending=[True, False])

ranking_df_sorted

Unnamed: 0,Feature,Ranking,Importance
2,stem-width,1,0.113636
7,gill-attachment,1,0.097502
4,cap-surface,1,0.095145
12,stem-color,1,0.079732
0,cap-diameter,1,0.076718
8,gill-spacing,1,0.072323
1,stem-height,1,0.065278
9,gill-color,1,0.063029
11,stem-surface,1,0.059668
5,cap-color,1,0.046313


In [41]:
# 3. Embedded Method: Lasso (L1 Regularization)
lasso = Lasso(alpha=0.01, random_state=42)
lasso.fit(X, y)
lasso_selected_features = X.columns[(lasso.coef_ != 0)]

In [42]:
lasso_selected_features

Index([], dtype='object')

In [43]:
# Get the coefficients of the features
lasso_coef = lasso.coef_

# Create a DataFrame to map the features with their coefficients
lasso_ranking_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso_coef
})

# Filter to only include features that were selected (non-zero coefficients)
lasso_ranking_df_selected = lasso_ranking_df[lasso_ranking_df['Coefficient'] != 0]

# Sort the DataFrame by the absolute value of the coefficients
lasso_ranking_df_sorted = lasso_ranking_df_selected.sort_values(by='Coefficient', ascending=False)

lasso_ranking_df_sorted

Unnamed: 0,Feature,Coefficient


In [19]:
# 4. Dimensionality Reduction: PCA
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)

In [20]:
X_pca

array([[-0.3223378 ,  0.04287978, -0.03248251, ..., -0.00334311,
        -0.02026103, -0.04131191],
       [ 0.01171539, -0.05230309, -0.04695859, ..., -0.00829209,
        -0.00946108, -0.02515635],
       [ 0.01149627, -0.00858452, -0.03589085, ...,  0.00841145,
        -0.01277603, -0.02940544],
       ...,
       [-0.32226504,  0.01440325, -0.06114671, ...,  0.02743542,
        -0.03157548, -0.00916262],
       [ 0.34410136,  0.08754685, -0.0083449 , ..., -0.03317175,
         0.02758625,  0.00038886],
       [ 0.34522135, -0.05092833, -0.03235208, ..., -0.00980215,
        -0.00659993, -0.01028683]])

In [46]:
# Get the loading scores for each principal component
loading_scores = pd.DataFrame(pca.components_.T, columns=[f'PC{i+1}' for i in range(10)], index=X.columns)

# Calculate the contribution of each feature to the first principal component (or others)
pca_feature_importance = np.abs(loading_scores).sum(axis=1)

# Sort the features by their importance in contributing to the PCA
pca_ranking_df_sorted = pca_feature_importance.sort_values(ascending=False)

pca_ranking_df_sorted

stem-surface            2.518358
cap-diameter            2.276629
cap-surface             2.262800
stem-color              2.253139
ring-type               2.186859
gill-color              2.137434
veil-color              2.057481
stem-width              2.028651
habitat                 1.242473
gill-attachment         1.160974
stem-height             1.108087
season                  1.033768
cap-color               1.004122
has-ring                0.831663
stem-root               0.827159
cap-shape               0.785957
spore-print-color       0.561012
gill-spacing            0.368038
veil-type               0.352017
does-bruise-or-bleed    0.334350
dtype: float64

In [47]:
# 5. Tree-Based Feature Importance
model.fit(X, y)
importances = model.feature_importances_
important_features = pd.Series(importances, index=X.columns).sort_values(ascending=False)
selected_features = important_features.head(10).index

In [48]:
selected_features

Index(['stem-width', 'gill-attachment', 'cap-surface', 'stem-color',
       'cap-diameter', 'gill-spacing', 'stem-height', 'gill-color',
       'stem-surface', 'cap-color'],
      dtype='object')

In [49]:
important_features

stem-width              0.113636
gill-attachment         0.097502
cap-surface             0.095145
stem-color              0.079732
cap-diameter            0.076718
gill-spacing            0.072323
stem-height             0.065278
gill-color              0.063029
stem-surface            0.059668
cap-color               0.046313
stem-root               0.043266
cap-shape               0.040163
ring-type               0.036535
habitat                 0.026031
does-bruise-or-bleed    0.024595
has-ring                0.020848
veil-color              0.014166
spore-print-color       0.013037
veil-type               0.007608
season                  0.004406
dtype: float64

In [23]:
def evaluate(Model, X, y, test_data, name):

    logger.info(f"Evaluating {name}")
    
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    train_scores = []
    val_scores = []
    test_predictions = [] 
    models = []

    for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        
        model = Model
        
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        train_mcc = matthews_corrcoef(y_train, y_train_pred)
        train_scores.append(train_mcc)

        y_val_pred = model.predict(X_val)
        val_mcc = matthews_corrcoef(y_val, y_val_pred)
        val_scores.append(val_mcc)

        logger.info(f"{name} (Fold {fold}) - Train MCC Score: {train_mcc:.4f}")
        logger.info(f"{name} (Fold {fold}) - Validation MCC Score: {val_mcc:.4f}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")
        
        y_test_pred_proba = model.predict(test_data)
        test_predictions.append(y_test_pred_proba)
        logger.info(f"{name} (Predictions for Fold {fold}) completed")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")
        
        models.append(model)

        print(f"Fold {fold}: Train MCC = {train_mcc:.6f}, Validation MCC = {val_mcc:.6f}")

    mean_train_mcc = np.mean(train_scores)
    mean_val_mcc = np.mean(val_scores)

    logger.info(f"{name} completed - Mean Train MCC Score: {mean_train_mcc:.4f}")
    logger.info(f"{name} completed - Mean Validation MCC Score: {mean_val_mcc:.4f}")
    logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")
    print(f"Mean Train MCC: {mean_train_mcc:.6f}")
    print(f"Mean Validation MCC: {mean_val_mcc:.6f}")

    return model,test_predictions