## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
import xgboost as xgb
from xgboost import XGBClassifier, DMatrix
from sklearn.ensemble import RandomForestClassifier
import optuna
import catboost
import lightgbm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('/kaggle/input/playground-series-s3e17/train.csv')
print(f"-----------Train:----------- \n {train_df} \n")

test_df = pd.read_csv('/kaggle/input/playground-series-s3e17/test.csv')
print(f"-----------Test:------------ \n {test_df} \n")

ss = pd.read_csv('/kaggle/input/playground-series-s3e17/sample_submission.csv')
print(f"-----------Sample Submission:----------- \n {ss}")

-----------Train:----------- 
             id Product ID Type  Air temperature [K]  Process temperature [K]  \
0            0     L50096    L                300.6                    309.6   
1            1     M20343    M                302.6                    312.1   
2            2     L49454    L                299.3                    308.5   
3            3     L53355    L                301.0                    310.9   
4            4     M24050    M                298.0                    309.0   
...        ...        ...  ...                  ...                      ...   
136424  136424     M22284    M                300.1                    311.4   
136425  136425     H38017    H                297.5                    308.5   
136426  136426     L54690    L                300.5                    311.8   
136427  136427     L53876    L                301.7                    310.9   
136428  136428     L47937    L                296.9                    308.1   

        

In [3]:
train_df.columns

Index(['id', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF'],
      dtype='object')

In [4]:
train_df.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [5]:
# train_df.describe()
def basic_analysis(df):
    desc = pd.DataFrame()
    desc['nunqiue'] = df.nunique()
    desc['perc_nunique'] = np.round(df.nunique()/df.shape[0] * 100, 2)
    # Count infinite values in each column
    desc['is_infinity'] = df.apply(lambda col: col.isin([float('inf'), float('-inf')]).sum(), axis=0) # columns wise sum

    desc = pd.concat([desc, train_df.describe().T], axis=1)
    return desc

In [6]:
print(f"Train Basic Analysis: \n {basic_analysis(train_df)} \n")
print(f"Test Basic Analysis: \n{basic_analysis(test_df)}")

Train Basic Analysis: 
                          nunqiue  perc_nunique  is_infinity     count  \
id                        136429        100.00            0  136429.0   
Product ID                  9976          7.31            0       NaN   
Type                           3          0.00            0       NaN   
Air temperature [K]           95          0.07            0  136429.0   
Process temperature [K]       81          0.06            0  136429.0   
Rotational speed [rpm]       952          0.70            0  136429.0   
Torque [Nm]                  611          0.45            0  136429.0   
Tool wear [min]              246          0.18            0  136429.0   
Machine failure                2          0.00            0  136429.0   
TWF                            2          0.00            0  136429.0   
HDF                            2          0.00            0  136429.0   
PWF                            2          0.00            0  136429.0   
OSF                        

In [7]:
## Dataframe ko list mein daalke list of columns aajate h
list(train_df)

['id',
 'Product ID',
 'Type',
 'Air temperature [K]',
 'Process temperature [K]',
 'Rotational speed [rpm]',
 'Torque [Nm]',
 'Tool wear [min]',
 'Machine failure',
 'TWF',
 'HDF',
 'PWF',
 'OSF',
 'RNF']

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136429 entries, 0 to 136428
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       136429 non-null  int64  
 1   Product ID               136429 non-null  object 
 2   Type                     136429 non-null  object 
 3   Air temperature [K]      136429 non-null  float64
 4   Process temperature [K]  136429 non-null  float64
 5   Rotational speed [rpm]   136429 non-null  int64  
 6   Torque [Nm]              136429 non-null  float64
 7   Tool wear [min]          136429 non-null  int64  
 8   Machine failure          136429 non-null  int64  
 9   TWF                      136429 non-null  int64  
 10  HDF                      136429 non-null  int64  
 11  PWF                      136429 non-null  int64  
 12  OSF                      136429 non-null  int64  
 13  RNF                      136429 non-null  int64  
dtypes: f

In [9]:
train_df.columns

Index(['id', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF'],
      dtype='object')

In [10]:
new_columns=[]
for i in train_df.columns:
    new_columns.append(i.split("[")[0].strip())

In [11]:
train_df.columns = new_columns
target_columns = list(filter(lambda x: x!='Machine failure', new_columns)) # drop Machine Failures
test_df.columns = target_columns

In [12]:
test_df

Unnamed: 0,id,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,TWF,HDF,PWF,OSF,RNF
0,136429,L50896,L,302.3,311.5,1499,38.0,60,0,0,0,0,0
1,136430,L53866,L,301.7,311.0,1713,28.8,17,0,0,0,0,0
2,136431,L50498,L,301.3,310.4,1525,37.7,96,0,0,0,0,0
3,136432,M21232,M,300.1,309.6,1479,47.6,5,0,0,0,0,0
4,136433,M19751,M,303.4,312.3,1515,41.3,114,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90949,227378,L51130,L,302.3,311.4,1484,40.4,15,0,0,0,0,0
90950,227379,L47783,L,297.9,309.8,1542,33.8,31,0,0,0,0,0
90951,227380,L48097,L,295.6,306.2,1501,41.4,187,0,0,0,0,0
90952,227381,L48969,L,298.1,307.8,1534,40.3,69,0,0,0,0,0


In [13]:
### Categricala and Numerical Featutres
categorical_feat = train_df.select_dtypes(include=['object'])
numerical_feat = train_df.select_dtypes(include=['float', 'int'])

In [14]:
# Since the correlation between Product Id and target variable is really less. Target encoding would not be a good option.
from scipy.stats import spearmanr

# Calculate Spearman's rank correlation coefficient
spearman_corr, p_value = spearmanr(train_df['Product ID'], train_df['Machine failure'])

print("Spearman's Rank Correlation Coefficient:", spearman_corr)
print("P-value:", p_value)

Spearman's Rank Correlation Coefficient: -0.012963931145135155
P-value: 1.6798788164708942e-06


In [15]:
def convert_columns_types(df):
    df['Product ID'] = df['Product ID'].astype('category')
    df['Type'] = df['Type'].astype('category')
    
convert_columns_types(train_df)
convert_columns_types(test_df)

In [16]:
X = train_df.drop('Machine failure', axis=1)
y = train_df['Machine failure']

score_list, oof_list, test_list = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
models=[]

In [17]:
def cross_val_score(model, X, y, test, cv):
    val_predictions = np.zeros(len(X))
    test_predictions = np.zeros(len(test))
    
    train_score_list = []
    val_score_list = []
    
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx].copy()
        y_train = y.iloc[train_idx].copy()
        X_val = X.iloc[val_idx].copy()
        y_val = y.iloc[val_idx].copy()
        
        encoder = model.steps[0][1]
        
        if len(model.steps) > 1 and isinstance(encoder, (OrdinalEncoder, OneHotEncoder)):
            categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
            enc = model.steps[0][1]
            
            X_train_enc = X_train.copy()
            X_val_enc = X_val.copy()
            test_enc = test.copy()

            X_train_enc[categorical_cols] = enc.fit_transform(X_train[categorical_cols])
            X_val_enc[categorical_cols] = enc.transform(X_val[categorical_cols])
            test_enc[categorical_cols] = enc.transform(test[categorical_cols])
            
        else:
            X_train_enc = X_train.copy()
            X_val_enc = X_val.copy()
            test_enc = test.copy()
        
        classifier_step_name = model.steps[-1][0]  # Dynamically get the classifier step name
        classifier = model.steps[-1][1]  # Get the classifier instance

        if isinstance(classifier, (CatBoostClassifier, XGBClassifier, LGBMClassifier)):
            # Prepare the fit parameters for classifiers that support early stopping
            if isinstance(classifier, LGBMClassifier):
                fit_params = {
                f'{classifier_step_name}__eval_set': [(X_val_enc, y_val)],
            }
            else:
                fit_params = {
                    f'{classifier_step_name}__eval_set': [(X_val_enc, y_val)],
                    f'{classifier_step_name}__early_stopping_rounds': 100,
                    f'{classifier_step_name}__verbose': False,
                }
            # Fit the model with early stopping
            model.fit(X_train_enc, y_train, **fit_params)
        else:
            model.fit(X_train_enc, y_train)
        
        train_preds = model.predict(X_train_enc)
        val_preds = model.predict(X_val_enc)
        
        val_predictions[val_idx] = val_preds
        
        train_score = roc_auc_score(y_train, train_preds)
        val_score = roc_auc_score(y_val, val_preds)
        
        train_score_list.append(train_score)
        val_score_list.append(val_score)
        
        test_predictions += model.predict_proba(test_enc)[:, 1] / cv

    print(f"Train AUC: {np.mean(train_score_list):.3f} ± {np.std(train_score_list):.3f} || Val AUC: {np.mean(val_score_list):.3f} ± {np.std(val_score_list):.3f}")
    
    return val_score_list, val_predictions, test_predictions

### XGBOOST

In [18]:
seed=42

In [19]:
def xgb_objective(trial):
    params = {
        'eta': trial.suggest_float('eta', .001, .1, log=True),
        'max_depth' : trial.suggest_int('max_depth', 2, 20),
        'subsample' : trial.suggest_float('subsample', .5, 1),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', .1, 1),
        'min_child_weight' : trial.suggest_int('min_child_weight', 1, 15),
        'reg_lambda' : trial.suggest_float('reg_lambda', 0, 1),
        'reg_alpha' : trial.suggest_float('reg_alpha', 0, 1),
        'n_estimators' : trial.suggest_int('n_estimators', 100, 1000),
        'random_state' : seed,
        'tree_method' : 'hist',
        'device': 'cuda'
    }
    
    optuna_model_xgb = make_pipeline(XGBClassifier(tree_method = 'hist', enable_categorical=True))
    optuna_score_xgb, optuna_val_xgb, optuna_test_xgb = cross_val_score(model=optuna_model_xgb, X=X, y=y, test=test_df, cv=5)
    
    return np.mean(optuna_score_xgb)

xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(xgb_objective, n_trials=20)

print(f" The best trial:{xgb_study.best_trial}, the best params is:{xgb_study.best_params}, the best value is:{xgb_study.best_value}")

XGB = XGBClassifier(tree_method = 'hist', enable_categorical=True, **xgb_study.best_params)
XGB = make_pipeline(XGB)
score_list['XGB'], oof_list['XGB'], test_list['CGB'] = cross_val_score(XGB, X=X, y=y, test=test_df, cv=5)

[I 2024-05-24 20:29:42,414] A new study created in memory with name: no-name-84948528-0030-4e58-9b89-96c008550d71
[I 2024-05-24 20:29:52,737] Trial 0 finished with value: 0.8846492440106293 and parameters: {'eta': 0.035440282976428684, 'max_depth': 7, 'subsample': 0.6225657781224709, 'colsample_bytree': 0.9926942986952393, 'min_child_weight': 6, 'reg_lambda': 0.5369835218047554, 'reg_alpha': 0.15248967670848912, 'n_estimators': 903}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:30:02,694] Trial 1 finished with value: 0.8846492440106293 and parameters: {'eta': 0.0017907080610661177, 'max_depth': 17, 'subsample': 0.9549313730713964, 'colsample_bytree': 0.16143299561513452, 'min_child_weight': 13, 'reg_lambda': 0.7250343853898819, 'reg_alpha': 0.8256450792649679, 'n_estimators': 558}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:30:12,911] Trial 2 finished with value: 0.8846492440106293 and parameters: {'eta': 0.001097515370319281, 'max_depth': 2, 'subsample': 0.6032571807647147, 'colsample_bytree': 0.18657136363454396, 'min_child_weight': 9, 'reg_lambda': 0.5337234282166514, 'reg_alpha': 0.9461662278419596, 'n_estimators': 732}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:30:23,642] Trial 3 finished with value: 0.8846492440106293 and parameters: {'eta': 0.01381846404281577, 'max_depth': 11, 'subsample': 0.9381881460793311, 'colsample_bytree': 0.4198601470570281, 'min_child_weight': 12, 'reg_lambda': 0.8029440382367034, 'reg_alpha': 0.5117124716251755, 'n_estimators': 920}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:30:33,505] Trial 4 finished with value: 0.8846492440106293 and parameters: {'eta': 0.0015101824228622989, 'max_depth': 8, 'subsample': 0.8689995745133022, 'colsample_bytree': 0.9039383602723895, 'min_child_weight': 10, 'reg_lambda': 0.29952963507130115, 'reg_alpha': 0.2704988506759305, 'n_estimators': 769}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:30:43,403] Trial 5 finished with value: 0.8846492440106293 and parameters: {'eta': 0.013450441114712762, 'max_depth': 12, 'subsample': 0.5582025633996006, 'colsample_bytree': 0.7359695045461301, 'min_child_weight': 10, 'reg_lambda': 0.7346352202578019, 'reg_alpha': 0.26284271266726755, 'n_estimators': 727}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:30:54,688] Trial 6 finished with value: 0.8846492440106293 and parameters: {'eta': 0.03928976216740406, 'max_depth': 13, 'subsample': 0.7813969889277064, 'colsample_bytree': 0.4339368431195101, 'min_child_weight': 8, 'reg_lambda': 0.9211672475664233, 'reg_alpha': 0.01724445189462498, 'n_estimators': 498}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:31:04,605] Trial 7 finished with value: 0.8846492440106293 and parameters: {'eta': 0.011466111871364646, 'max_depth': 13, 'subsample': 0.5350740219857476, 'colsample_bytree': 0.8806473701937934, 'min_child_weight': 12, 'reg_lambda': 0.8486234670897701, 'reg_alpha': 0.005616986071947938, 'n_estimators': 504}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:31:14,726] Trial 8 finished with value: 0.8846492440106293 and parameters: {'eta': 0.009964616864361115, 'max_depth': 8, 'subsample': 0.9808363999094369, 'colsample_bytree': 0.801589212170385, 'min_child_weight': 5, 'reg_lambda': 0.12486746772768531, 'reg_alpha': 0.657230454232435, 'n_estimators': 989}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:31:25,851] Trial 9 finished with value: 0.8846492440106293 and parameters: {'eta': 0.0941482373563317, 'max_depth': 13, 'subsample': 0.7927426414749852, 'colsample_bytree': 0.8603990728744985, 'min_child_weight': 1, 'reg_lambda': 0.05086123694399092, 'reg_alpha': 0.35168901273015585, 'n_estimators': 197}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:31:35,871] Trial 10 finished with value: 0.8846492440106293 and parameters: {'eta': 0.04104847199597115, 'max_depth': 2, 'subsample': 0.6574376777312528, 'colsample_bytree': 0.6236527616937964, 'min_child_weight': 5, 'reg_lambda': 0.465525517550349, 'reg_alpha': 0.1761326692975205, 'n_estimators': 123}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:31:45,915] Trial 11 finished with value: 0.8846492440106293 and parameters: {'eta': 0.0032803603197335282, 'max_depth': 18, 'subsample': 0.6784343012369441, 'colsample_bytree': 0.12841310655317728, 'min_child_weight': 15, 'reg_lambda': 0.6295682945246548, 'reg_alpha': 0.855458465232691, 'n_estimators': 328}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:31:57,215] Trial 12 finished with value: 0.8846492440106293 and parameters: {'eta': 0.0031161934140149735, 'max_depth': 20, 'subsample': 0.6836905628649351, 'colsample_bytree': 0.3192992236149773, 'min_child_weight': 6, 'reg_lambda': 0.39966293701936495, 'reg_alpha': 0.7303334248485543, 'n_estimators': 357}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:32:07,301] Trial 13 finished with value: 0.8846492440106293 and parameters: {'eta': 0.005196885588506944, 'max_depth': 7, 'subsample': 0.8796993148227824, 'colsample_bytree': 0.6216807825840129, 'min_child_weight': 15, 'reg_lambda': 0.9979032167380453, 'reg_alpha': 0.5049963614710891, 'n_estimators': 855}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:32:17,490] Trial 14 finished with value: 0.8846492440106293 and parameters: {'eta': 0.03564930549370486, 'max_depth': 16, 'subsample': 0.741176314045436, 'colsample_bytree': 0.9797177081688346, 'min_child_weight': 3, 'reg_lambda': 0.6305731433181728, 'reg_alpha': 0.7198910577343238, 'n_estimators': 636}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:32:28,710] Trial 15 finished with value: 0.8846492440106293 and parameters: {'eta': 0.08223293656061158, 'max_depth': 5, 'subsample': 0.8555287101058184, 'colsample_bytree': 0.27412758300666284, 'min_child_weight': 12, 'reg_lambda': 0.25819677503151006, 'reg_alpha': 0.40992181609955514, 'n_estimators': 614}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:32:38,802] Trial 16 finished with value: 0.8846492440106293 and parameters: {'eta': 0.02646915072856747, 'max_depth': 16, 'subsample': 0.6167965341935344, 'colsample_bytree': 0.5007308413286674, 'min_child_weight': 7, 'reg_lambda': 0.6747321916545509, 'reg_alpha': 0.6121948821965855, 'n_estimators': 397}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:32:48,926] Trial 17 finished with value: 0.8846492440106293 and parameters: {'eta': 0.006378088897243096, 'max_depth': 5, 'subsample': 0.7240286012070382, 'colsample_bytree': 0.6945880180347445, 'min_child_weight': 4, 'reg_lambda': 0.5464898172898359, 'reg_alpha': 0.1409164905475362, 'n_estimators': 587}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:33:00,303] Trial 18 finished with value: 0.8846492440106293 and parameters: {'eta': 0.002188108302762883, 'max_depth': 10, 'subsample': 0.5067905611952792, 'colsample_bytree': 0.2986623617690314, 'min_child_weight': 2, 'reg_lambda': 0.3477753902930563, 'reg_alpha': 0.9772151295381301, 'n_estimators': 832}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:33:10,441] Trial 19 finished with value: 0.8846492440106293 and parameters: {'eta': 0.02159900461959042, 'max_depth': 16, 'subsample': 0.9385687315762087, 'colsample_bytree': 0.9981123143561919, 'min_child_weight': 14, 'reg_lambda': 0.7535946324333088, 'reg_alpha': 0.8022823373670437, 'n_estimators': 990}. Best is trial 0 with value: 0.8846492440106293.


Train AUC: 0.897 ± 0.006 || Val AUC: 0.885 ± 0.010
 The best trial:FrozenTrial(number=0, state=TrialState.COMPLETE, values=[0.8846492440106293], datetime_start=datetime.datetime(2024, 5, 24, 20, 29, 42, 415220), datetime_complete=datetime.datetime(2024, 5, 24, 20, 29, 52, 736594), params={'eta': 0.035440282976428684, 'max_depth': 7, 'subsample': 0.6225657781224709, 'colsample_bytree': 0.9926942986952393, 'min_child_weight': 6, 'reg_lambda': 0.5369835218047554, 'reg_alpha': 0.15248967670848912, 'n_estimators': 903}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'eta': FloatDistribution(high=0.1, log=True, low=0.001, step=None), 'max_depth': IntDistribution(high=20, log=False, low=2, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'min_child_weight': IntDistribution(high=15, log=False, low=1, step=1), 'reg_lambda': FloatDistribution(high=1.0, log=Fals

### CATBOOST

In [20]:
def cb_objective(trial):
    params = {
        'learning_rate' : trial.suggest_float('learning_rate', .001, .1, log=True),
        'max_depth' : trial.suggest_int('max_depth', 2, 10),
        'n_estimators' : trial.suggest_int('n_estimators', 100, 1000),
        'random_state' : seed,
        'cat_features' : ['Product ID', 'Type'],
        'task_type': 'GPU'
    }
    
    optuna_model_cb = make_pipeline(CatBoostClassifier(**params))
    optuna_score_cb, optuna_val_cb, optuna_test_cb = cross_val_score(optuna_model_cb, X=X, y=y, test=test_df, cv=5)
    
    return np.mean(optuna_score_cb)

cb_study = optuna.create_study(direction='maximize')

cb_study.optimize(cb_objective, n_trials=10)

print(f"The best trails: {cb_study.best_trial}, best params is: {cb_study.best_params}, best value is: {cb_study.best_value}")

CBoost = CatBoostClassifier(cat_features= ['Product ID', 'Type'], **cb_study.best_params)
CBoost = make_pipeline(CBoost)
score_list['CBoost'], oof_list['CBoost'], test_list['CBoost'] = cross_val_score(CBoost, X=X, y=y, test=test_df, cv=5)

[I 2024-05-24 20:33:44,167] A new study created in memory with name: no-name-0a5e4f2d-9133-4fbb-bf42-717f0006761e
[I 2024-05-24 20:34:32,293] Trial 0 finished with value: 0.8817020143619269 and parameters: {'learning_rate': 0.004101368694837233, 'max_depth': 5, 'n_estimators': 475}. Best is trial 0 with value: 0.8817020143619269.


Train AUC: 0.882 ± 0.003 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:37:00,167] Trial 1 finished with value: 0.8817020143619269 and parameters: {'learning_rate': 0.0013615346665086176, 'max_depth': 8, 'n_estimators': 922}. Best is trial 0 with value: 0.8817020143619269.


Train AUC: 0.882 ± 0.003 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:37:11,682] Trial 2 finished with value: 0.8817020143619269 and parameters: {'learning_rate': 0.005652965573230791, 'max_depth': 9, 'n_estimators': 112}. Best is trial 0 with value: 0.8817020143619269.


Train AUC: 0.882 ± 0.003 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:37:40,274] Trial 3 finished with value: 0.8817020143619269 and parameters: {'learning_rate': 0.00144516973008682, 'max_depth': 5, 'n_estimators': 272}. Best is trial 0 with value: 0.8817020143619269.


Train AUC: 0.882 ± 0.003 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:37:54,754] Trial 4 finished with value: 0.8817020143619269 and parameters: {'learning_rate': 0.03221694657338411, 'max_depth': 7, 'n_estimators': 162}. Best is trial 0 with value: 0.8817020143619269.


Train AUC: 0.882 ± 0.003 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:38:17,256] Trial 5 finished with value: 0.8823853369005658 and parameters: {'learning_rate': 0.02235370012365637, 'max_depth': 2, 'n_estimators': 371}. Best is trial 5 with value: 0.8823853369005658.


Train AUC: 0.883 ± 0.002 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:38:29,499] Trial 6 finished with value: 0.8817020143619269 and parameters: {'learning_rate': 0.02984488670358962, 'max_depth': 9, 'n_estimators': 106}. Best is trial 5 with value: 0.8823853369005658.


Train AUC: 0.882 ± 0.003 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:39:28,521] Trial 7 finished with value: 0.8816982907992221 and parameters: {'learning_rate': 0.008514093971427624, 'max_depth': 7, 'n_estimators': 400}. Best is trial 5 with value: 0.8823853369005658.


Train AUC: 0.882 ± 0.003 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:40:27,739] Trial 8 finished with value: 0.8817020143619269 and parameters: {'learning_rate': 0.004663650891440681, 'max_depth': 6, 'n_estimators': 502}. Best is trial 5 with value: 0.8823853369005658.


Train AUC: 0.882 ± 0.003 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:41:07,326] Trial 9 finished with value: 0.8817020143619269 and parameters: {'learning_rate': 0.006423624231446773, 'max_depth': 5, 'n_estimators': 387}. Best is trial 5 with value: 0.8823853369005658.


Train AUC: 0.882 ± 0.003 || Val AUC: 0.882 ± 0.011
The best trails: FrozenTrial(number=5, state=TrialState.COMPLETE, values=[0.8823853369005658], datetime_start=datetime.datetime(2024, 5, 24, 20, 37, 54, 755578), datetime_complete=datetime.datetime(2024, 5, 24, 20, 38, 17, 255814), params={'learning_rate': 0.02235370012365637, 'max_depth': 2, 'n_estimators': 371}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=True, low=0.001, step=None), 'max_depth': IntDistribution(high=10, log=False, low=2, step=1), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1)}, trial_id=5, value=None), best params is: {'learning_rate': 0.02235370012365637, 'max_depth': 2, 'n_estimators': 371}, best value is: 0.8823853369005658
Train AUC: 0.883 ± 0.002 || Val AUC: 0.882 ± 0.011


### LGBM

In [21]:
def lgbm_objective(trial):
    params = {
        'learning_rate' : trial.suggest_float('learning_rate', .001, .1, log = True),
        'max_depth' : trial.suggest_int('max_depth', 2, 20),
        'n_estimators' : trial.suggest_int('n_estimators', 100, 1000),
        'random_state' : seed,
        'categorical_features': ['Product ID', 'Type'],
        'early_stopping': trial.suggest_int('early_stopping', 10, 100),
        'verbose': -1
    }
    
    optuna_model_lgbm = make_pipeline(LGBMClassifier(**params))
    
    optuna_score_lgbm, optuna_val_lgbm, optuna_test_lgbm = cross_val_score(optuna_model_lgbm, X=X, y=y, test=test_df, cv=5)
    
    return np.mean(optuna_score_lgbm)

lgbm_study = optuna.create_study(direction="maximize")

lgbm_study.optimize(lgbm_objective, n_trials=20)

print(f"The best trails: {lgbm_study.best_trial}, best params is: {lgbm_study.best_params}, best value is: {lgbm_study.best_value}")

LGBM = LGBMClassifier(verbosity=-1, **lgbm_study.best_params)
LGBM = make_pipeline(LGBM)
score_list['LGBM'], oof_list['LGBM'], test_list['LGBM'] = cross_val_score(LGBM, X=X, y=y, test=test_df, cv=5)

[I 2024-05-24 20:42:14,920] A new study created in memory with name: no-name-9a620ced-b5f6-42f5-ade6-ca624e2cc71e
[I 2024-05-24 20:43:02,848] Trial 0 finished with value: 0.881469456222392 and parameters: {'learning_rate': 0.0019801575847825663, 'max_depth': 18, 'n_estimators': 408, 'early_stopping': 81}. Best is trial 0 with value: 0.881469456222392.


Train AUC: 0.881 ± 0.003 || Val AUC: 0.881 ± 0.010


[I 2024-05-24 20:44:24,587] Trial 1 finished with value: 0.8866449854371782 and parameters: {'learning_rate': 0.0057748556949256255, 'max_depth': 10, 'n_estimators': 760, 'early_stopping': 54}. Best is trial 1 with value: 0.8866449854371782.


Train AUC: 0.929 ± 0.004 || Val AUC: 0.887 ± 0.011


[I 2024-05-24 20:44:34,347] Trial 2 finished with value: 0.5 and parameters: {'learning_rate': 0.0036736541741204153, 'max_depth': 5, 'n_estimators': 131, 'early_stopping': 57}. Best is trial 1 with value: 0.8866449854371782.


Train AUC: 0.500 ± 0.000 || Val AUC: 0.500 ± 0.000


[I 2024-05-24 20:45:24,366] Trial 3 finished with value: 0.8554204206203163 and parameters: {'learning_rate': 0.001485497693355658, 'max_depth': 6, 'n_estimators': 593, 'early_stopping': 25}. Best is trial 1 with value: 0.8866449854371782.


Train AUC: 0.865 ± 0.007 || Val AUC: 0.855 ± 0.014


[I 2024-05-24 20:45:33,414] Trial 4 finished with value: 0.887757815047179 and parameters: {'learning_rate': 0.06970169914073139, 'max_depth': 8, 'n_estimators': 140, 'early_stopping': 60}. Best is trial 4 with value: 0.887757815047179.


Train AUC: 0.928 ± 0.007 || Val AUC: 0.888 ± 0.010


[I 2024-05-24 20:46:07,310] Trial 5 finished with value: 0.8852602646399417 and parameters: {'learning_rate': 0.013620574546008186, 'max_depth': 5, 'n_estimators': 629, 'early_stopping': 60}. Best is trial 4 with value: 0.887757815047179.


Train AUC: 0.917 ± 0.004 || Val AUC: 0.885 ± 0.010


[I 2024-05-24 20:46:30,535] Trial 6 finished with value: 0.8868770013445036 and parameters: {'learning_rate': 0.016081270552332785, 'max_depth': 9, 'n_estimators': 216, 'early_stopping': 64}. Best is trial 4 with value: 0.887757815047179.


Train AUC: 0.925 ± 0.003 || Val AUC: 0.887 ± 0.011


[I 2024-05-24 20:47:16,475] Trial 7 finished with value: 0.8864076194091638 and parameters: {'learning_rate': 0.008831352515092825, 'max_depth': 9, 'n_estimators': 445, 'early_stopping': 97}. Best is trial 4 with value: 0.887757815047179.


Train AUC: 0.927 ± 0.005 || Val AUC: 0.886 ± 0.011


[I 2024-05-24 20:48:08,095] Trial 8 finished with value: 0.8816796731243421 and parameters: {'learning_rate': 0.0029610023253279403, 'max_depth': 13, 'n_estimators': 433, 'early_stopping': 63}. Best is trial 4 with value: 0.887757815047179.


Train AUC: 0.887 ± 0.003 || Val AUC: 0.882 ± 0.011


[I 2024-05-24 20:49:04,172] Trial 9 finished with value: 0.8882979441193338 and parameters: {'learning_rate': 0.007597261954630366, 'max_depth': 14, 'n_estimators': 730, 'early_stopping': 38}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.925 ± 0.005 || Val AUC: 0.888 ± 0.011


[I 2024-05-24 20:49:15,689] Trial 10 finished with value: 0.8868887139875394 and parameters: {'learning_rate': 0.037288643247299934, 'max_depth': 15, 'n_estimators': 996, 'early_stopping': 10}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.926 ± 0.009 || Val AUC: 0.887 ± 0.011


[I 2024-05-24 20:49:22,945] Trial 11 finished with value: 0.888184611789358 and parameters: {'learning_rate': 0.08706836668048215, 'max_depth': 20, 'n_estimators': 815, 'early_stopping': 36}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.924 ± 0.003 || Val AUC: 0.888 ± 0.009


[I 2024-05-24 20:49:37,712] Trial 12 finished with value: 0.8859451426443121 and parameters: {'learning_rate': 0.03162438257469674, 'max_depth': 20, 'n_estimators': 880, 'early_stopping': 35}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.924 ± 0.005 || Val AUC: 0.886 ± 0.011


[I 2024-05-24 20:49:46,062] Trial 13 finished with value: 0.886406534806101 and parameters: {'learning_rate': 0.06591854799638618, 'max_depth': 16, 'n_estimators': 757, 'early_stopping': 37}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.920 ± 0.006 || Val AUC: 0.886 ± 0.010


[I 2024-05-24 20:50:05,033] Trial 14 finished with value: 0.8873368391497133 and parameters: {'learning_rate': 0.024717879464755114, 'max_depth': 13, 'n_estimators': 750, 'early_stopping': 42}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.927 ± 0.006 || Val AUC: 0.887 ± 0.010


[I 2024-05-24 20:51:10,354] Trial 15 finished with value: 0.887372448080133 and parameters: {'learning_rate': 0.0065828563786061315, 'max_depth': 20, 'n_estimators': 912, 'early_stopping': 18}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.926 ± 0.004 || Val AUC: 0.887 ± 0.011


[I 2024-05-24 20:51:20,041] Trial 16 finished with value: 0.8861804105584536 and parameters: {'learning_rate': 0.08944902131742416, 'max_depth': 2, 'n_estimators': 670, 'early_stopping': 45}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.897 ± 0.001 || Val AUC: 0.886 ± 0.009


[I 2024-05-24 20:51:41,173] Trial 17 finished with value: 0.8875752181339618 and parameters: {'learning_rate': 0.01978020090980694, 'max_depth': 17, 'n_estimators': 878, 'early_stopping': 27}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.923 ± 0.004 || Val AUC: 0.888 ± 0.011


[I 2024-05-24 20:51:53,141] Trial 18 finished with value: 0.8870925678125665 and parameters: {'learning_rate': 0.044852235606847306, 'max_depth': 13, 'n_estimators': 512, 'early_stopping': 48}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.923 ± 0.007 || Val AUC: 0.887 ± 0.011


[I 2024-05-24 20:53:30,742] Trial 19 finished with value: 0.8796079069189823 and parameters: {'learning_rate': 0.0010290615562364206, 'max_depth': 18, 'n_estimators': 701, 'early_stopping': 31}. Best is trial 9 with value: 0.8882979441193338.


Train AUC: 0.878 ± 0.002 || Val AUC: 0.880 ± 0.012
The best trails: FrozenTrial(number=9, state=TrialState.COMPLETE, values=[0.8882979441193338], datetime_start=datetime.datetime(2024, 5, 24, 20, 48, 8, 96042), datetime_complete=datetime.datetime(2024, 5, 24, 20, 49, 4, 171950), params={'learning_rate': 0.007597261954630366, 'max_depth': 14, 'n_estimators': 730, 'early_stopping': 38}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=True, low=0.001, step=None), 'max_depth': IntDistribution(high=20, log=False, low=2, step=1), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'early_stopping': IntDistribution(high=100, log=False, low=10, step=1)}, trial_id=9, value=None), best params is: {'learning_rate': 0.007597261954630366, 'max_depth': 14, 'n_estimators': 730, 'early_stopping': 38}, best value is: 0.8882979441193338
Train AUC: 0.925 ± 0.005 || Val AUC: 0.888 ± 0.011


### META MODEL

In [22]:
meta_model = LogisticRegression().fit(X=oof_list, y=train_df['Machine failure'])
weights = meta_model.coef_[0]

weights = meta_model.coef_[0]

weights = weights/np.sum(weights)

In [23]:
# Make the final prediction by applying the weights
final_predictions = np.dot(test_list, weights)

binary_predictions = (final_predictions >= 0.5).astype(int)

In [24]:
binary_predictions.shape

(90954,)

In [25]:
ss['Machine failure'] = binary_predictions

In [26]:
ss.head()

Unnamed: 0,id,Machine failure
0,136429,0
1,136430,0
2,136431,0
3,136432,0
4,136433,0


In [27]:
ss.to_csv('submission.csv', index = False)