<a href="https://www.kaggle.com/code/awesomeharris/ps3e17-baseline-models?scriptVersionId=133639290" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

# Getting started

In [2]:
# Directory holding the competition files.
path1 = "/kaggle/input/playground-series-s3e17/"
#path2 = "/kaggle/input//"

# Name of the label column.
target = 'Machine failure'

# Read the competition CSVs.
train = pd.read_csv(path1 + "train.csv")
test = pd.read_csv(path1 + "test.csv")
sample = pd.read_csv(path1 + "sample_submission.csv")
#extra = pd.read_csv(path2+"synthetic_data.csv")

# Disabled: combine playground dataset and synthetic dataset generated
#train = pd.concat([train,extra]).reset_index(drop=True).drop_duplicates()

# Keep the test ids aside; the submission file needs them.
id_test = test['id']

# Preprocessing the data

In [3]:
# checkpoint 1
# Build the feature frames: drop "id" from both datasets and the label from train.
# `columns=` already selects the axis, so no extra `axis=1` is needed.
train1 = train.drop(columns=['id', target])
test1 = test.drop(columns=['id'])

# Strip the '[' and ']' symbols from column names as they don't work well in XGBoost.
# The pattern is a raw string: in a normal literal '\[' and '\]' are invalid
# escape sequences and trigger a warning on recent Python versions.
train1.columns = train1.columns.str.replace(r'[\[\]]', '', regex=True)
test1.columns = test1.columns.str.replace(r'[\[\]]', '', regex=True)

In [4]:
def preprocess(df):
    """Return a copy of ``df`` with engineered features added.

    The input frame is left untouched: all new columns are written to a copy,
    so calling this function repeatedly (e.g. from a ``FunctionTransformer``)
    has no side effects on the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Torque Nm", "Rotational speed rpm",
        "Process temperature K", "Air temperature K", "Tool wear min",
        "TWF", "HDF", "PWF", "OSF", "RNF" and "Product ID".

    Returns
    -------
    pandas.DataFrame
        The original columns (minus "Product ID") followed by the new
        features, in the order they are created below.
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    df = df.copy()

    # Disabled experiment: limit "Rotational speed rpm" to 2000 by replacing
    # outliers > 2000 with the mean value.
    #df["Rotational speed rpm"] = np.where(df["Rotational speed rpm"] > 2000 , \
    #                                      df["Rotational speed rpm"].mean() , \
    #                                      df["Rotational speed rpm"])

    # Feature engineering
    # "Power": torque times rotational speed
    df["Power"] = df["Torque Nm"] * df["Rotational speed rpm"]

    # "Temp ratio": process temperature over air temperature (Kelvin columns)
    df["Temp ratio"] = df["Process temperature K"] / df["Air temperature K"]

    # Temperatures shifted by 273.15 (Kelvin -> Celsius)
    df["Process temperature C"] = df["Process temperature K"] - 273.15
    df["Air temperature C"] = df["Air temperature K"] - 273.15

    # "Temp ratio C": same ratio computed on the shifted temperatures
    df["Temp ratio C"] = df["Process temperature C"] / df["Air temperature C"]

    # "Failure sum": sum of the five failure-mode columns
    df["Failure sum"] = (df["TWF"] +
                         df["HDF"] +
                         df["PWF"] +
                         df["OSF"] +
                         df["RNF"])

    # "Tool wear speed": tool wear times rotational speed
    df["Tool wear speed"] = df["Tool wear min"] * df["Rotational speed rpm"]

    # "Torque wear ratio": the small constant avoids division by zero
    # when tool wear is 0
    df["Torque wear ratio"] = df["Torque Nm"] / (df["Tool wear min"] + 0.0001)

    # "Torque x Wear": torque times tool wear
    df["Torque x Wear"] = df["Torque Nm"] * df["Tool wear min"]

    # "Product_id_num": "Product ID" without its first character, as a number
    df["Product_id_num"] = pd.to_numeric(df["Product ID"].str.slice(start=1))

    # The raw string id is no longer needed once its numeric part is extracted.
    return df.drop(columns=['Product ID'])

# Building data pipeline

In [5]:
# Column groups as they exist after feature engineering
# (names already have the '[' / ']' symbols stripped).

# Categorical column(s)
cat_features = ['Type']

# Continuous columns: raw measurements followed by the engineered features
num_features = [
    'Air temperature K',
    'Process temperature K',
    'Rotational speed rpm',
    'Torque Nm',
    'Tool wear min',
    'Power',
    'Temp ratio',
    'Process temperature C',
    'Air temperature C',
    'Temp ratio C',
    'Failure sum',
    'Tool wear speed',
    'Torque wear ratio',
    'Torque x Wear',
    'Product_id_num',
]

# 0/1 failure-mode flags
bin_features = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, FunctionTransformer

# Transformer for the numerical columns: standard scaling only.
# It is defined here but NOT plugged into `preprocessor` below (its entry is
# disabled), so numeric columns currently pass through unscaled.
# Disabled option: ('imputer', SimpleImputer(strategy='mean'))
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Transformer for the categorical columns: ordinal encoding.
# Disabled options:
#   ('imputer', SimpleImputer(strategy='constant', fill_value='missing'))
#   ('onehot', OneHotEncoder(handle_unknown='ignore', sparse = False))
categorical_transformer = Pipeline(steps=[('ordinal', OrdinalEncoder())])

# Column-wise preprocessing: encode `cat_features`, pass every other column
# through unchanged.
# Disabled entry: ('num', numerical_transformer, num_features)
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, cat_features)],
    remainder='passthrough',
)

# Full pipeline: feature engineering first, then the column preprocessing.
pipeline = Pipeline(steps=[
    ('create_features', FunctionTransformer(preprocess, validate=False)),
    ('preprocessor', preprocessor),
])

# Fit the pipeline on a copy of the training features and transform them.
y = train[target]
X = train1.copy()
X_preprocessed = pipeline.fit_transform(X)

In [7]:
# Column names for the array produced by the pipeline: the encoded 'Type'
# column comes first, then the passed-through columns in their original order
# (raw inputs, failure flags, engineered features).
cols = [
    'Type',
    'Air temperature K', 'Process temperature K', 'Rotational speed rpm',
    'Torque Nm', 'Tool wear min',
    'TWF', 'HDF', 'PWF', 'OSF', 'RNF',
    'Power', 'Temp ratio', 'Process temperature C', 'Air temperature C',
    'Temp ratio C', 'Failure sum', 'Tool wear speed', 'Torque wear ratio',
    'Torque x Wear', 'Product_id_num',
]

In [8]:
# Wrap the transformed array in a labelled DataFrame for a quick visual check.
X_reverse = pd.DataFrame(data=X_preprocessed, columns=cols)
X_reverse.head(n=3)

Unnamed: 0,Type,Air temperature K,Process temperature K,Rotational speed rpm,Torque Nm,Tool wear min,TWF,HDF,PWF,OSF,RNF,Power,Temp ratio,Process temperature C,Air temperature C,Temp ratio C,Failure sum,Tool wear speed,Torque wear ratio,Torque x Wear,Product_id_num
0,1.0,300.6,309.6,1596.0,36.1,140.0,0.0,0.0,0.0,0.0,0.0,57615.6,1.02994,36.45,27.45,1.327869,0.0,223440.0,0.257857,5054.0,50096.0
1,2.0,302.6,312.1,1759.0,29.1,200.0,0.0,0.0,0.0,0.0,0.0,51186.9,1.031395,38.95,29.45,1.322581,0.0,351800.0,0.1455,5820.0,20343.0
2,1.0,299.3,308.5,1805.0,26.5,25.0,0.0,0.0,0.0,0.0,0.0,47832.5,1.030738,35.35,26.15,1.351816,0.0,45125.0,1.059996,662.5,49454.0


# Training some models

In [9]:
# Split into training and validation datasets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_preprocessed, y, train_size = 0.8, random_state=42)

In [10]:
%%time
from xgboost import XGBClassifier

# XGBoost with a small learning rate; training stops once the validation
# metric has not improved for 100 rounds.
# NOTE(review): newer xgboost releases deprecate passing `early_stopping_rounds`
# / `eval_metric` to fit(). Moving them to the constructor would also apply to
# the clone that VotingClassifier refits later without an eval_set - confirm
# before changing.
xgb = XGBClassifier(
    n_estimators=2000,
    learning_rate=0.01,
    random_state=42,
)
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric=["logloss", "auc"],
    early_stopping_rounds=100,
    verbose=200,
)

[0]	validation_0-logloss:0.68335	validation_0-auc:0.91076
[200]	validation_0-logloss:0.08152	validation_0-auc:0.94917
[326]	validation_0-logloss:0.03508	validation_0-auc:0.95176
CPU times: user 2min 18s, sys: 235 ms, total: 2min 19s
Wall time: 35.6 s


In [11]:
%%time
from catboost import CatBoostClassifier

# CatBoost tracked on validation AUC; stops after 500 rounds without
# improvement and logs every 200 iterations.
cb = CatBoostClassifier(
    n_estimators=5000,
    learning_rate=0.01,
    eval_metric='AUC',
    random_state=42,
)
cb.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=500,
    verbose=200,
)

0:	test: 0.9003753	best: 0.9003753 (0)	total: 81.9ms	remaining: 6m 49s
200:	test: 0.9556624	best: 0.9556624 (200)	total: 4.38s	remaining: 1m 44s
400:	test: 0.9580293	best: 0.9580401 (382)	total: 8.68s	remaining: 1m 39s
600:	test: 0.9587754	best: 0.9588037 (596)	total: 13s	remaining: 1m 35s
800:	test: 0.9590778	best: 0.9591375 (696)	total: 17.4s	remaining: 1m 31s
1000:	test: 0.9599115	best: 0.9599115 (1000)	total: 22.1s	remaining: 1m 28s
1200:	test: 0.9601899	best: 0.9601899 (1200)	total: 26.5s	remaining: 1m 23s
1400:	test: 0.9603492	best: 0.9603492 (1400)	total: 30.8s	remaining: 1m 19s
1600:	test: 0.9603493	best: 0.9604299 (1518)	total: 35.3s	remaining: 1m 14s
1800:	test: 0.9602585	best: 0.9604299 (1518)	total: 39.6s	remaining: 1m 10s
2000:	test: 0.9600833	best: 0.9604299 (1518)	total: 44s	remaining: 1m 5s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.9604298646
bestIteration = 1518

Shrink model to first 1519 iterations.
CPU times: user 2min 41s, sys: 10.5 s, to

<catboost.core.CatBoostClassifier at 0x7cc174948130>

In [12]:
%%time
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# LightGBM tracked on validation AUC.
# Early stopping and logging are configured through callbacks: the
# `early_stopping_rounds=` / `verbose=` arguments of fit() are deprecated and
# no longer accepted by LightGBM 4.x.
lgbm = LGBMClassifier(n_estimators=2000, learning_rate=0.01, random_state=42)
lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric=['AUC'],
    callbacks=[
        early_stopping(stopping_rounds=300),  # stop after 300 rounds without improvement
        log_evaluation(period=100),           # print metrics every 100 iterations
    ],
)

[100]	valid_0's auc: 0.959142	valid_0's binary_logloss: 0.028794
[200]	valid_0's auc: 0.96004	valid_0's binary_logloss: 0.0231022
[300]	valid_0's auc: 0.960014	valid_0's binary_logloss: 0.0214136
[400]	valid_0's auc: 0.959739	valid_0's binary_logloss: 0.0208154
[500]	valid_0's auc: 0.959114	valid_0's binary_logloss: 0.0206059
CPU times: user 19.9 s, sys: 2.3 s, total: 22.2 s
Wall time: 10.2 s


In [13]:
%%time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import log_loss

# AdaBoost has no early stopping in fit(), so all 1000 estimators are trained.
ada = AdaBoostClassifier(n_estimators=1000, learning_rate=0.01, random_state=42)
ada.fit(X_train, y_train)

# predict_proba returns one column per class; take the second column
# (index 1) with an array slice instead of looping over every row in Python.
y_pred_prob = ada.predict_proba(X_val)
y_pred_proba = y_pred_prob[:, 1]

logloss = log_loss(y_val, y_pred_proba)
print("Log Loss:", logloss)

Log Loss: 0.43494006740158936
CPU times: user 4min 15s, sys: 84.4 ms, total: 4min 15s
Wall time: 4min 15s


Small lessons learned here:  
- The `fit()` method of some models accepts an early-stopping parameter that stops training once the validation score no longer improves, but this is not available for all models!
- `cross_val_score` does not offer early stopping, so every fit runs for the full number of iterations specified by the `n_estimators` parameter.
- That's why many authors write their own cross-validation code: they perform the K-fold (or other) splits manually and then call `fit()` on each fold, which gives more control over the cross-validation procedure, including specifying `early_stopping_rounds` to save time and prevent overfitting.

## Ensemble the models

In [14]:
%%time
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score

# Soft-voting ensemble of the three boosted models (class probabilities are
# averaged).
# NOTE(review): fit() below trains the ensemble's estimators again on X_train
# and no eval_set is passed, so presumably early stopping does not apply and
# each model runs for its full n_estimators - confirm this is intended.
base_models = [
    ('xgb', xgb),
    ('cb', cb),
    ('lgbm', lgbm),
]
ensemble = VotingClassifier(estimators=base_models, voting='soft')
ensemble.fit(X_train, y_train)

# Validation AUC computed from the probability of the second class (column 1).
val_scores = ensemble.predict_proba(X_val)[:, 1]
print(roc_auc_score(y_val, val_scores))

0:	total: 37.2ms	remaining: 3m 6s
1:	total: 87.8ms	remaining: 3m 39s
2:	total: 117ms	remaining: 3m 14s
3:	total: 144ms	remaining: 2m 59s
4:	total: 175ms	remaining: 2m 54s
5:	total: 216ms	remaining: 2m 59s
6:	total: 252ms	remaining: 2m 59s
7:	total: 301ms	remaining: 3m 7s
8:	total: 330ms	remaining: 3m 3s
9:	total: 355ms	remaining: 2m 57s
10:	total: 376ms	remaining: 2m 50s
11:	total: 401ms	remaining: 2m 46s
12:	total: 426ms	remaining: 2m 43s
13:	total: 448ms	remaining: 2m 39s
14:	total: 476ms	remaining: 2m 38s
15:	total: 500ms	remaining: 2m 35s
16:	total: 524ms	remaining: 2m 33s
17:	total: 551ms	remaining: 2m 32s
18:	total: 577ms	remaining: 2m 31s
19:	total: 603ms	remaining: 2m 30s
20:	total: 627ms	remaining: 2m 28s
21:	total: 651ms	remaining: 2m 27s
22:	total: 679ms	remaining: 2m 26s
23:	total: 705ms	remaining: 2m 26s
24:	total: 731ms	remaining: 2m 25s
25:	total: 755ms	remaining: 2m 24s
26:	total: 780ms	remaining: 2m 23s
27:	total: 805ms	remaining: 2m 22s
28:	total: 830ms	remaining: 2m 

# Final Prediction and Submission

In [15]:
# Apply the pipeline that was already fitted on the training data.
# Use transform(), not fit_transform(): re-fitting on the test set would learn
# a new encoding from test data, which may not match what the models were
# trained on. A copy is passed so `test1` itself is left untouched.
X_test = pipeline.transform(test1.copy())


In [16]:
# Fitted models and the label used for each one's submission file.
models = [xgb, cb, lgbm, ada, ensemble]
model_names = ['xgb', 'cb', 'lgbm', 'ada', 'ensemble']

# For every model: predict on the test set, preview the result and write
# "<name>_submission.csv".
for model, model_name in zip(models, model_names):
    y_pred = model.predict_proba(X_test)
    # Second column (index 1) of the predicted class probabilities.
    y_pred_proba = list(y_pred[:, 1])
    output = pd.DataFrame({'id':id_test, 'Machine failure':y_pred_proba})

    # Preview the first rows, then show how often each predicted value occurs.
    print(f"{model_name} model:\n",output.head(3),"\n")
    display(output['Machine failure'].value_counts())

    output.to_csv(f'{model_name}_submission.csv', index=False)
    print(f"\nYour {model_name} submission was successfully saved!\n")
    print("======================================================================")

xgb model:
        id  Machine failure
0  136429         0.052933
1  136430         0.052933
2  136431         0.052933 



0.052933    74435
0.059283     4340
0.053266     1875
0.058285     1598
0.061153     1158
            ...  
0.120270        1
0.069361        1
0.153499        1
0.084241        1
0.465780        1
Name: Machine failure, Length: 924, dtype: int64


Your xgb submission was successfully saved!

cb model:
        id  Machine failure
0  136429         0.000982
1  136430         0.001323
2  136431         0.000775 



0.006538    16
0.001207    13
0.002781     8
0.002531     5
0.197041     5
            ..
0.000799     1
0.014455     1
0.001135     1
0.001039     1
0.000973     1
Name: Machine failure, Length: 90381, dtype: int64


Your cb submission was successfully saved!

lgbm model:
        id  Machine failure
0  136429         0.002664
1  136430         0.002807
2  136431         0.002664 



0.002664    34446
0.002670     5302
0.003608     4008
0.003778     2936
0.002679     2023
            ...  
0.026944        1
0.008574        1
0.014038        1
0.004710        1
0.003829        1
Name: Machine failure, Length: 11405, dtype: int64


Your lgbm submission was successfully saved!

ada model:
        id  Machine failure
0  136429         0.337122
1  136430         0.338579
2  136431         0.337122 



0.337122    31056
0.338579     5679
0.337576     4732
0.344539     2876
0.379042     2497
            ...  
0.375317        1
0.547810        1
0.343653        1
0.454762        1
0.402734        1
Name: Machine failure, Length: 2163, dtype: int64


Your ada submission was successfully saved!

ensemble model:
        id  Machine failure
0  136429         0.000488
1  136430         0.001375
2  136431         0.000463 



0.009922    16
0.001264    13
0.001822     8
0.003434     5
0.081633     5
            ..
0.000241     1
0.000959     1
0.004771     1
0.001701     1
0.000429     1
Name: Machine failure, Length: 90413, dtype: int64


Your ensemble submission was successfully saved!

