In [1]:
# Import the necessary libraries, packages and modules

import warnings
warnings.filterwarnings("ignore")

import category_encoders as ce
import lightgbm as lgb
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import time

from catboost import CatBoostClassifier
from collections import Counter
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)
sns.set()
%matplotlib inline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
start_time = time.time()

In [2]:
# Test to see if TensorFlow can utilize the GPU
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df.drop(columns=['id'], inplace=True)
test_df.drop(columns=['id'], inplace=True)

logger.info(f"Train data load completed. Time elapsed: {time.time() - start_time:.2f} seconds")

train_df.head(2)

INFO:__main__:Train data load completed. Time elapsed: 4.44 seconds


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,8.8,f,s,u,f,a,c,w,4.51,15.39,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,6.48,,y,o,,,t,z,,d,w


In [4]:
def handle_missing_values(train_df, test_df, seed=None):
    
    # Identify numerical and categorical columns
    numerical_cols = list(train_df.select_dtypes(include=['float64', 'int64']).columns)
    categorical_cols = list(train_df.select_dtypes(include=['object']).columns)
    
    if 'class' in categorical_cols:
        categorical_cols.remove('class')
    
    I = IterativeImputer(random_state = 42)
    train_df[numerical_cols] = I.fit_transform(train_df[numerical_cols])
    test_df[numerical_cols] = I.transform(test_df[numerical_cols])
    
    for col in categorical_cols:
        train_df[col].fillna('Not Available', inplace=True)
        test_df[col].fillna('Not Available', inplace=True)
    
    return train_df, test_df, categorical_cols

def align_columns(train_df, test_df):

    common_cols = train_df.columns.intersection(test_df.columns)
    train_df = train_df[common_cols]
    test_df = test_df[common_cols]
    return train_df, test_df

In [5]:
# Preprocessing

train_df, test_df, categorical_cols = handle_missing_values(train_df, test_df, seed = 42)

target = train_df['class']
train_features = train_df.drop(columns = ['class'], errors = 'ignore')

train_features_aligned, test_features_aligned = align_columns(train_features, test_df)

logger.info(f"Missing values treatment completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Missing values treatment completed. Time elapsed: 9.55 seconds


In [6]:
# Encoding

encoder = ce.OrdinalEncoder(cols = categorical_cols, handle_unknown='ignore')
train_df = encoder.fit_transform(train_features_aligned)
test_df = encoder.transform(test_features_aligned)

train_df['class'] = target

le = LabelEncoder()

train_df['class'] = le.fit_transform(train_df['class'])

logger.info(f"Categorical columns encoding completed. Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Categorical columns encoding completed. Time elapsed: 20.01 seconds


In [7]:
# col_scale = ['cap-diameter', 'stem-height', 'stem-width']
col_scale =  ['cap-diameter', 'stem-height', 'stem-width', 'cap-shape', 'cap-surface', 'cap-color',
                  'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root',
                  'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type',
                  'spore-print-color', 'habitat', 'season']
mmx = MinMaxScaler()

# Fit the scaler on the training data and transform both train_df and test_df
train_df[col_scale] = mmx.fit_transform(train_df[col_scale])
test_df[col_scale] = mmx.transform(test_df[col_scale])

In [8]:
train_df.head()

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,class
0,0.108755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050834,0.149563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.055556,0.013514,0.012048,0.012821,0.0,0.0,0.0,0.015873,0.05399,0.062974,0.0,0.016667,0.016949,0.0,0.0,0.043478,0.025,0.0,0.0,0.333333,1
2,0.085689,0.0,0.0,0.025641,0.0,0.012821,0.0,0.0,0.077209,0.096501,0.0,0.033333,0.033898,0.0,0.0,0.0,0.0,0.0,0.019231,0.333333,0
3,0.047743,0.0,0.024096,0.038462,0.0,0.025641,0.020833,0.031746,0.046889,0.06346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0
4,0.072173,0.013514,0.036145,0.051282,0.0,0.038462,0.020833,0.0,0.037985,0.081244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0


In [9]:
train_df.head(2)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,class
0,0.108755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050834,0.149563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.055556,0.013514,0.012048,0.012821,0.0,0.0,0.0,0.015873,0.05399,0.062974,0.0,0.016667,0.016949,0.0,0.0,0.043478,0.025,0.0,0.0,0.333333,1


In [10]:
# Load your dataset
X = train_df[['cap-diameter', 'stem-height', 'stem-width', 'cap-shape', 'cap-surface', 'cap-color',
              'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root',
              'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type',
              'spore-print-color', 'habitat', 'season']]
y = train_df['class']

# Define MCC evaluation function
def mcc_eval(y_true, y_pred):
    y_pred_labels = np.round(y_pred)  # Convert probabilities to binary labels
    score = mcc(y_true, y_pred_labels)
    return 'mcc', score, True  # Name of the metric, score, and whether higher is better

# Define Stratified K-Fold Cross-Validation
def evaluate_lgbm_with_bagging(X, y, test_data, n_splits=10):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    train_scores = []
    val_scores = []
    test_predictions = []
    models = []
    # Define Early Stopping Callback
    early_stopping = lgb.callback.early_stopping(stopping_rounds=50)

    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        # LightGBM model with GPU support
        lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt',
                                        objective='binary',
                                        device_type='gpu',
                                        n_estimators=1000)  # Large number for early stopping

        # Train the LightGBM model with early stopping
        lgbm_model.fit(X_train, y_train,
                       eval_set=[(X_val, y_val)],
                       eval_metric=mcc_eval,
                       callbacks=[early_stopping],
                       )  # Adjust verbosity level as needed
        
        # Bagging with the trained LightGBM model
        bagging_model = BaggingClassifier(base_estimator=lgbm_model,
                                          n_estimators=25,
                                          max_samples=0.5,
                                          max_features=0.5,
                                          bootstrap=True,
                                          n_jobs=-1,
                                          random_state=42)
        
        # Train the Bagging model
        start_time = time.time()
        bagging_model.fit(X_train, y_train)
        
        # Predictions and evaluation
        y_train_pred = bagging_model.predict(X_train)
        y_val_pred = bagging_model.predict(X_val)
        
        train_mcc = mcc(y_train, y_train_pred)
        val_mcc = mcc(y_val, y_val_pred)
        
        train_scores.append(train_mcc)
        val_scores.append(val_mcc)

        logger.info(f"Fold {fold} - Train MCC Score: {train_mcc:.4f}")
        logger.info(f"Fold {fold} - Validation MCC Score: {val_mcc:.4f}")
        logger.info(f"Time elapsed: {time.time() - start_time:.2f} seconds")

        # Predictions on test data
        y_test_pred_proba = bagging_model.predict_proba(test_data)[:, 1]
        test_predictions.append(y_test_pred_proba)

        logger.info(f"Fold {fold} - Predictions for Test Data completed")

        models.append(bagging_model)

    mean_train_mcc = np.mean(train_scores)
    mean_val_mcc = np.mean(val_scores)

    logger.info(f"Evaluation completed - Mean Train MCC Score: {mean_train_mcc:.4f}")
    logger.info(f"Evaluation completed - Mean Validation MCC Score: {mean_val_mcc:.4f}")

    # Averaging predictions from all folds
    mean_test_predictions = np.mean(test_predictions, axis=0)
    
    return models, mean_test_predictions

In [11]:
# Load the test dataset
# sub_test_df = pd.read_csv('test.csv')
test_data = test_df[['cap-diameter', 'stem-height', 'stem-width', 'cap-shape', 'cap-surface', 'cap-color',
                         'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root',
                         'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type',
                         'spore-print-color', 'habitat', 'season']]

# Evaluate the model
lgbm_models, lgbm_preds = evaluate_lgbm_with_bagging(X, y, test_data)

[LightGBM] [Info] Number of positive: 1534856, number of negative: 1270394
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 978
[LightGBM] [Info] Number of data points in the train set: 2805250, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.024867 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189109
[LightGBM] [Info] Start training from score 0.189109
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[463]	valid_0's binary_logloss: 0.039062	valid_0's mcc: 0.98285


INFO:__main__:Fold 1 - Train MCC Score: 0.9844
INFO:__main__:Fold 1 - Validation MCC Score: 0.9837
INFO:__main__:Time elapsed: 1072.15 seconds
INFO:__main__:Fold 1 - Predictions for Test Data completed


[LightGBM] [Info] Number of positive: 1534856, number of negative: 1270394
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 981
[LightGBM] [Info] Number of data points in the train set: 2805250, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.024565 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189109
[LightGBM] [Info] Start training from score 0.189109
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[679]	valid_0's binary_logloss: 0.0375939	valid_0's mcc: 0.983613


INFO:__main__:Fold 2 - Train MCC Score: 0.9844
INFO:__main__:Fold 2 - Validation MCC Score: 0.9843
INFO:__main__:Time elapsed: 1074.18 seconds
INFO:__main__:Fold 2 - Predictions for Test Data completed


[LightGBM] [Info] Number of positive: 1534856, number of negative: 1270394
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 977
[LightGBM] [Info] Number of data points in the train set: 2805250, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.025545 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189109
[LightGBM] [Info] Start training from score 0.189109
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[509]	valid_0's binary_logloss: 0.039167	valid_0's mcc: 0.982871


INFO:__main__:Fold 3 - Train MCC Score: 0.9845
INFO:__main__:Fold 3 - Validation MCC Score: 0.9838
INFO:__main__:Time elapsed: 1066.72 seconds
INFO:__main__:Fold 3 - Predictions for Test Data completed


[LightGBM] [Info] Number of positive: 1534856, number of negative: 1270394
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 980
[LightGBM] [Info] Number of data points in the train set: 2805250, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.028298 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189109
[LightGBM] [Info] Start training from score 0.189109
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[543]	valid_0's binary_logloss: 0.0377974	valid_0's mcc: 0.9836


INFO:__main__:Fold 4 - Train MCC Score: 0.9844
INFO:__main__:Fold 4 - Validation MCC Score: 0.9843
INFO:__main__:Time elapsed: 1061.77 seconds
INFO:__main__:Fold 4 - Predictions for Test Data completed


[LightGBM] [Info] Number of positive: 1534856, number of negative: 1270394
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 975
[LightGBM] [Info] Number of data points in the train set: 2805250, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.023853 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189109
[LightGBM] [Info] Start training from score 0.189109
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[436]	valid_0's binary_logloss: 0.0385795	valid_0's mcc: 0.982889


INFO:__main__:Fold 5 - Train MCC Score: 0.9844
INFO:__main__:Fold 5 - Validation MCC Score: 0.9839
INFO:__main__:Time elapsed: 1067.15 seconds
INFO:__main__:Fold 5 - Predictions for Test Data completed


[LightGBM] [Info] Number of positive: 1534857, number of negative: 1270394
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 975
[LightGBM] [Info] Number of data points in the train set: 2805251, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.027431 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189110
[LightGBM] [Info] Start training from score 0.189110
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[507]	valid_0's binary_logloss: 0.0383343	valid_0's mcc: 0.983239


INFO:__main__:Fold 6 - Train MCC Score: 0.9844
INFO:__main__:Fold 6 - Validation MCC Score: 0.9840
INFO:__main__:Time elapsed: 1070.94 seconds
INFO:__main__:Fold 6 - Predictions for Test Data completed


[LightGBM] [Info] Number of positive: 1534857, number of negative: 1270394
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 974
[LightGBM] [Info] Number of data points in the train set: 2805251, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.027973 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189110
[LightGBM] [Info] Start training from score 0.189110
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[366]	valid_0's binary_logloss: 0.0388045	valid_0's mcc: 0.982454


INFO:__main__:Fold 7 - Train MCC Score: 0.9844
INFO:__main__:Fold 7 - Validation MCC Score: 0.9840
INFO:__main__:Time elapsed: 1067.86 seconds
INFO:__main__:Fold 7 - Predictions for Test Data completed


[LightGBM] [Info] Number of positive: 1534857, number of negative: 1270394
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 984
[LightGBM] [Info] Number of data points in the train set: 2805251, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.025388 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189110
[LightGBM] [Info] Start training from score 0.189110
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[607]	valid_0's binary_logloss: 0.0391178	valid_0's mcc: 0.982957


INFO:__main__:Fold 8 - Train MCC Score: 0.9845
INFO:__main__:Fold 8 - Validation MCC Score: 0.9835
INFO:__main__:Time elapsed: 1075.06 seconds
INFO:__main__:Fold 8 - Predictions for Test Data completed


[LightGBM] [Info] Number of positive: 1534857, number of negative: 1270394
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 974
[LightGBM] [Info] Number of data points in the train set: 2805251, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.027709 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189110
[LightGBM] [Info] Start training from score 0.189110
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[462]	valid_0's binary_logloss: 0.0382597	valid_0's mcc: 0.983044


INFO:__main__:Fold 9 - Train MCC Score: 0.9844
INFO:__main__:Fold 9 - Validation MCC Score: 0.9839
INFO:__main__:Time elapsed: 1068.96 seconds
INFO:__main__:Fold 9 - Predictions for Test Data completed


[LightGBM] [Info] Number of positive: 1534856, number of negative: 1270395
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 978
[LightGBM] [Info] Number of data points in the train set: 2805251, number of used features: 20
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (32.10 MB) transferred to GPU in 0.032721 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547137 -> initscore=0.189109
[LightGBM] [Info] Start training from score 0.189109
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[490]	valid_0's binary_logloss: 0.0385436	valid_0's mcc: 0.983095


INFO:__main__:Fold 10 - Train MCC Score: 0.9843
INFO:__main__:Fold 10 - Validation MCC Score: 0.9843
INFO:__main__:Time elapsed: 1065.92 seconds
INFO:__main__:Fold 10 - Predictions for Test Data completed
INFO:__main__:Evaluation completed - Mean Train MCC Score: 0.9844
INFO:__main__:Evaluation completed - Mean Validation MCC Score: 0.9840


In [12]:
test_df.head(2)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0.106771,0.013514,0.084337,0.064103,0.038462,0.064103,0.020833,0.0,0.125451,0.166375,0.026316,0.0,0.0,0.045455,0.083333,0.043478,0.15,0.0,0.0,0.0
1,0.085193,0.054054,0.048193,0.012821,0.0,0.064103,0.0,0.063492,0.014315,0.10447,0.0,0.0,0.033898,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Post-processing predictions
threshold = 0.5
lgbm_test_preds = (lgbm_preds >= threshold).astype(int)
sub_test_df = pd.read_csv('test.csv')

x_f = le.inverse_transform(lgbm_test_preds)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': sub_test_df['id'],
    'class': x_f
})

print(submission_df.head(2))

        id  class
0  3116945      0
1  3116946      1


In [15]:
submission_df.to_csv('Submission.csv', index=False)
logger.info(f"Submission file created - Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Submission file created - Time elapsed: 21431.65 seconds


In [10]:
final_sub = pd.read_csv('Submission.csv')
final_sub

Unnamed: 0,id,class
0,3116945,0
1,3116946,1
2,3116947,1
3,3116948,1
4,3116949,0
...,...,...
2077959,5194904,1
2077960,5194905,1
2077961,5194906,1
2077962,5194907,0


In [11]:
final_sub['class'] = le.inverse_transform(final_sub['class'])
final_sub.head(2)

Unnamed: 0,id,class
0,3116945,e
1,3116946,p


In [12]:
final_sub.to_csv('submission.csv', index=False)
logger.info(f"Submission file created - Time elapsed: {time.time() - start_time:.2f} seconds")

INFO:__main__:Submission file created - Time elapsed: 122.23 seconds
