<a href="https://www.kaggle.com/code/awesomeharris/ps3e17-baseline-models?scriptVersionId=133406600" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

# Getting started

In [2]:
# Kaggle input directory that holds the competition files.
path1 = "/kaggle/input/playground-series-s3e17/"
#path2 = "/kaggle/input//"

# Read the training table, the test table and the sample submission.
train = pd.read_csv(path1 + "train.csv")
test = pd.read_csv(path1 + "test.csv")
sample = pd.read_csv(path1 + "sample_submission.csv")
#extra = pd.read_csv(path2+"synthetic_data.csv")

# Disabled experiment: combine the playground dataset with a generated synthetic dataset.
#train = pd.concat([train,extra]).reset_index(drop=True).drop_duplicates()

# Name of the label column, and the test ids that the submission file needs.
target = 'Machine failure'
id_test = test['id']

# Preprocessing the data

In [3]:
# checkpoint 1
# Drop the "id" and "Product ID" columns from both the train and test tables.
# The target column is removed from the training features as well.
# (`columns=` already selects the axis, so no extra `axis=1` is needed.)
train1 = train.drop(columns=['id', 'Product ID', target])
test1 = test.drop(columns=['id', 'Product ID'])

# Rename columns containing '[' or ']' as they don't work well in XGBoost.
# The pattern is a raw string: in a normal string literal '\[' and '\]' are
# invalid escape sequences, which Python reports with a warning.
BRACKET_PATTERN = r'[\[\]]'
train1.columns = train1.columns.str.replace(BRACKET_PATTERN, '', regex=True)
test1.columns = test1.columns.str.replace(BRACKET_PATTERN, '', regex=True)

In [4]:
def preprocess(df):
    """Feature-engineering hook; currently a pass-through.

    Every experimental step in the body is commented out, so the input is
    handed back unchanged.

    Parameters
    ----------
    df
        The feature table to process.

    Returns
    -------
    The very same object that was passed in.
    """
    # --- Disabled experiment: outlier handling -------------------------------
    # Cap "Rotational speed rpm" at 2000 by replacing larger values with the
    # column mean.
    #df["Rotational speed rpm"] = np.where(df["Rotational speed rpm"] > 2000 , \
    #                                      df["Rotational speed rpm"].mean() , \
    #                                      df["Rotational speed rpm"])

    # --- Disabled experiment: engineered features ----------------------------
    # "Power" = torque times rotational speed.
    #df["Power"] = df["Torque Nm"] * df["Rotational speed rpm"]

    # "Temp ratio" = process temperature divided by air temperature.
    #df["Temp ratio"] = df["Process temperature K"] / df["Air temperature K"]

    return df

# Building data pipeline

In [5]:
# Column groups referenced when building the preprocessing pipeline.

# Categorical column(s).
cat_features = ['Type']

# Numeric columns.
num_features = [
    'Air temperature K',
    'Process temperature K',
    'Rotational speed rpm',
    'Torque Nm',
    'Tool wear min',
    # 'Power', 'Temp ratio',  # engineered features, currently disabled
]

# Presumably the 0/1 indicator columns (going by the list name) - TODO confirm.
bin_features = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, FunctionTransformer

# Define transformers for numerical and categorical columns
# NOTE: numerical_transformer is defined but not wired into the
# ColumnTransformer below (its entry is commented out).
numerical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later releases dropped the old name. Try the new spelling first and fall
# back to the old one so the cell runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', onehot_encoder),
    #('ordinal', OrdinalEncoder()),
])

# Combine transformers using ColumnTransformer.
# Only the categorical columns are transformed; every other column is passed
# through unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        #('num', numerical_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ],remainder = 'passthrough')

# Create a pipeline with the preprocessor:
# first the feature-engineering hook, then the column-wise preprocessing.
pipeline = Pipeline(steps=[
    ('create_features', FunctionTransformer(preprocess, validate=False)),
    ('preprocessor', preprocessor),
])

# Fit the pipeline on the training features and keep the transformed matrix.
# The fitted `pipeline` must later be applied to new data with transform(),
# not fit_transform(), so the same encoding is reused.
X = train1.copy()
y = train[target]
X_preprocessed = pipeline.fit_transform(X)

# Training some models

In [7]:
# Split into training and validation datasets
# 80% of the preprocessed rows go to training and the remainder to validation;
# random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified on y - if the target is heavily
# imbalanced, consider passing stratify=y (this would change the reported scores).
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_preprocessed, y, train_size = 0.8, random_state=42)

In [8]:
%%time
from xgboost import XGBClassifier
# Early stopping is configured on the estimator itself: passing
# `early_stopping_rounds` to fit() is deprecated since XGBoost 1.6 and is no
# longer accepted by newer releases. Training stops once the validation
# log-loss has not improved for 100 rounds.
xgb = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    random_state=42,
    early_stopping_rounds=100,
)
# The validation split is the evaluation set; progress is logged every 100 rounds.
xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

[0]	validation_0-logloss:0.68335
[100]	validation_0-logloss:0.20889
[200]	validation_0-logloss:0.08187
[300]	validation_0-logloss:0.04060
[400]	validation_0-logloss:0.02687
[500]	validation_0-logloss:0.02251
[600]	validation_0-logloss:0.02118
[700]	validation_0-logloss:0.02077
[800]	validation_0-logloss:0.02063
[900]	validation_0-logloss:0.02058
[999]	validation_0-logloss:0.02060
CPU times: user 3min 39s, sys: 438 ms, total: 3min 39s
Wall time: 56.1 s


In [9]:
%%time
from catboost import CatBoostClassifier
# CatBoost classifier: up to 2000 boosting rounds at a small learning rate.
cb = CatBoostClassifier(
    n_estimators=2000,
    learning_rate=0.01,
    random_state=42,
)

# Evaluate on the validation split, stop after 100 rounds without improvement,
# and log progress every 100 rounds.
cb.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=100,
    verbose=100,
)

0:	learn: 0.6689868	test: 0.6690456	best: 0.6690456 (0)	total: 74.2ms	remaining: 2m 28s
100:	learn: 0.0406767	test: 0.0408580	best: 0.0408580 (100)	total: 1.69s	remaining: 31.8s
200:	learn: 0.0235270	test: 0.0239583	best: 0.0239583 (200)	total: 3.32s	remaining: 29.8s
300:	learn: 0.0212863	test: 0.0220066	best: 0.0220066 (300)	total: 5.05s	remaining: 28.5s
400:	learn: 0.0206090	test: 0.0215718	best: 0.0215718 (400)	total: 6.73s	remaining: 26.9s
500:	learn: 0.0201670	test: 0.0214002	best: 0.0214001 (499)	total: 8.42s	remaining: 25.2s
600:	learn: 0.0198406	test: 0.0213137	best: 0.0213126 (599)	total: 10.1s	remaining: 23.5s
700:	learn: 0.0195393	test: 0.0212414	best: 0.0212414 (700)	total: 11.8s	remaining: 21.8s
800:	learn: 0.0192551	test: 0.0211894	best: 0.0211894 (800)	total: 13.4s	remaining: 20.1s
900:	learn: 0.0189723	test: 0.0211430	best: 0.0211430 (900)	total: 15.2s	remaining: 18.5s
1000:	learn: 0.0187385	test: 0.0211130	best: 0.0211114 (991)	total: 16.9s	remaining: 16.8s
1100:	learn

<catboost.core.CatBoostClassifier at 0x7a9098b51000>

In [10]:
%%time
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

lgbm = LGBMClassifier(n_estimators=2000, learning_rate=0.01, random_state=42)

# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` arguments of
# fit(); the callback API below is the supported replacement:
#   - early_stopping: stop after 100 rounds without validation improvement
#   - log_evaluation: print the validation metric every 100 rounds
lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)],
)

[100]	valid_0's binary_logloss: 0.0286105
[200]	valid_0's binary_logloss: 0.0228313
[300]	valid_0's binary_logloss: 0.0211826
[400]	valid_0's binary_logloss: 0.0207523
[500]	valid_0's binary_logloss: 0.0206797
[600]	valid_0's binary_logloss: 0.0206791
CPU times: user 17 s, sys: 3.14 s, total: 20.2 s
Wall time: 9.37 s


In [11]:
%%time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import log_loss
# AdaBoost's fit() takes no early-stopping argument here, so all 1000
# estimators are trained and the validation log-loss is computed afterwards.
ada = AdaBoostClassifier(n_estimators=1000, learning_rate=0.01, random_state=42)
ada.fit(X_train, y_train)

# predict_proba returns one column per class; column 1 is taken as the
# probability of the positive class (assumes a binary 0/1 target).
# Slicing the array replaces the row-by-row Python loop.
y_pred_prob = ada.predict_proba(X_val)
y_pred_proba = y_pred_prob[:, 1]

logloss = log_loss(y_val, y_pred_proba)
print("Log Loss:", logloss)

Log Loss: 0.4376504315382091
CPU times: user 1min 39s, sys: 67.9 ms, total: 1min 39s
Wall time: 1min 39s


Small lesson learned here:  
- The `fit()` method of some models (XGBoost, CatBoost and LightGBM above) accepts an early-stopping parameter that halts training once the validation score stops improving, but this is not available on all models (AdaBoost, for example, does not offer it).
- `cross_val_score` does not provide an early-stopping parameter, so every fold trains for the full number of rounds specified by `n_estimators`.
- That is why many authors write their own cross-validation code: they perform the K-fold (or other) splits manually and then call `fit()` on each fold, which gives more control over the cross-validation procedure, including specifying `early_stopping_rounds` to save time and prevent overfitting.

# Final Prediction and Submission

In [12]:
# Apply the preprocessing that was fitted on the training data.
# Using transform() (not fit_transform()) keeps the encoder learned from the
# training set; re-fitting on the test set could produce a different column
# layout than the one the models were trained on.
X_test = pipeline.transform(test1)

# Sanity check: the models expect the same number of columns as in training.
assert X_test.shape[1] == X_preprocessed.shape[1], "test/train feature count mismatch"

In [13]:
# Fitted models paired with the short names used for their submission files.
models = [xgb, cb, lgbm, ada]
model_names = ['xgb', 'cb', 'lgbm', 'ada']

for model, model_name in zip(models, model_names):
    # predict_proba returns one column per class; column 1 is taken as the
    # predicted probability of the positive class (assumes a binary 0/1 target).
    y_pred = model.predict_proba(X_test)
    y_pred_proba = y_pred[:, 1]

    # Submission table: test ids next to the predicted probability, stored
    # under the target column name.
    output = pd.DataFrame({'id': id_test, target: y_pred_proba})

    # Show a preview and the distribution of predicted values as two separate
    # statements (instead of a throwaway `print(...), display(...)` tuple).
    print(f"{model_name} model:\n",output.head(3),"\n")
    display(output[target].value_counts())

    # One CSV per model, written to the working directory.
    output.to_csv(f'{model_name}_submission.csv', index=False)
    print(f"\nYour {model_name} submission was successfully saved!\n")
    print("======================================================================")

xgb model:
        id  Machine failure
0  136429         0.001998
1  136430         0.002327
2  136431         0.000754 



0.000754    302
0.000757    218
0.000804    202
0.000798    202
0.000769    198
           ... 
0.005259      1
0.001911      1
0.001174      1
0.007075      1
0.001037      1
Name: Machine failure, Length: 54385, dtype: int64


Your xgb submission was successfully saved!

cb model:
        id  Machine failure
0  136429         0.001728
1  136430         0.002154
2  136431         0.000848 



0.011065    16
0.001752    13
0.003080     8
0.159063     5
0.001763     5
            ..
0.001832     1
0.017773     1
0.001732     1
0.001199     1
0.001409     1
Name: Machine failure, Length: 90350, dtype: int64


Your cb submission was successfully saved!

lgbm model:
        id  Machine failure
0  136429         0.001498
1  136430         0.001293
2  136431         0.000700 



0.000748    195
0.000700    144
0.000562    127
0.000738    107
0.000731     98
           ... 
0.000855      1
0.001017      1
0.002077      1
0.016894      1
0.001212      1
Name: Machine failure, Length: 64332, dtype: int64


Your lgbm submission was successfully saved!

ada model:
        id  Machine failure
0  136429         0.352806
1  136430         0.347262
2  136431         0.338604 



0.338604    25922
0.338952     8389
0.352806     4914
0.340606     4289
0.351086     2450
            ...  
0.429943        1
0.462483        1
0.609724        1
0.382990        1
0.448968        1
Name: Machine failure, Length: 1466, dtype: int64


Your ada submission was successfully saved!

