# Libs import

In [1]:
import numpy as np
import pandas as pd
import phik
from catboost import Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns

# Loading data

In [2]:
main_df = pd.read_csv('train.csv')
submission = pd.read_csv('test.csv')

In [3]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136429 entries, 0 to 136428
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       136429 non-null  int64  
 1   Product ID               136429 non-null  object 
 2   Type                     136429 non-null  object 
 3   Air temperature [K]      136429 non-null  float64
 4   Process temperature [K]  136429 non-null  float64
 5   Rotational speed [rpm]   136429 non-null  int64  
 6   Torque [Nm]              136429 non-null  float64
 7   Tool wear [min]          136429 non-null  int64  
 8   Machine failure          136429 non-null  int64  
 9   TWF                      136429 non-null  int64  
 10  HDF                      136429 non-null  int64  
 11  PWF                      136429 non-null  int64  
 12  OSF                      136429 non-null  int64  
 13  RNF                      136429 non-null  int64  
dtypes: f

In [4]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90954 entries, 0 to 90953
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       90954 non-null  int64  
 1   Product ID               90954 non-null  object 
 2   Type                     90954 non-null  object 
 3   Air temperature [K]      90954 non-null  float64
 4   Process temperature [K]  90954 non-null  float64
 5   Rotational speed [rpm]   90954 non-null  int64  
 6   Torque [Nm]              90954 non-null  float64
 7   Tool wear [min]          90954 non-null  int64  
 8   TWF                      90954 non-null  int64  
 9   HDF                      90954 non-null  int64  
 10  PWF                      90954 non-null  int64  
 11  OSF                      90954 non-null  int64  
 12  RNF                      90954 non-null  int64  
dtypes: float64(3), int64(8), object(2)
memory usage: 9.0+ MB


# ETL

In [5]:
main_df['Speed_new'] = np.log(main_df['Rotational speed [rpm]'])
submission['Speed_new'] = np.log(submission['Rotational speed [rpm]'])
main_df['Power'] = main_df['Torque [Nm]']*main_df['Rotational speed [rpm]']
submission['Power'] = submission['Torque [Nm]']*submission['Rotational speed [rpm]']
main_df['TF'] = main_df[['TWF', 'HDF', 'PWF', 'OSF', 'RNF']].sum(axis=1)
submission['TF'] = submission[['TWF', 'HDF', 'PWF', 'OSF', 'RNF']].sum(axis=1)
main_df['TemperatureDifference'] = main_df['Process temperature [K]'] - main_df['Air temperature [K]']
submission['TemperatureDifference'] = submission['Process temperature [K]'] - submission['Air temperature [K]']
main_df['TemperatureRatio'] = main_df['Process temperature [K]'] / main_df['Air temperature [K]']
submission['TemperatureRatio'] = submission['Process temperature [K]'] / submission['Air temperature [K]']

In [6]:
main_df.columns

Index(['id', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF',
       'Speed_new', 'Power', 'TF', 'TemperatureDifference',
       'TemperatureRatio'],
      dtype='object')

In [7]:
phik_overview = main_df[['Type', 'Product ID', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF', 'Speed_new', 'Power', 'TF', 'TemperatureDifference',
       'TemperatureRatio']].phik_matrix()

interval columns not set, guessing: ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'Speed_new', 'Power', 'TF', 'TemperatureDifference', 'TemperatureRatio']




In [8]:
phik_overview['Machine failure'].sort_values(ascending=False)

Machine failure            1.000000
TF                         0.950224
HDF                        0.774598
OSF                        0.700630
PWF                        0.566404
TWF                        0.469542
Torque [Nm]                0.423334
Product ID                 0.409150
Speed_new                  0.306703
Rotational speed [rpm]     0.302614
Power                      0.286943
Tool wear [min]            0.166201
TemperatureRatio           0.163337
TemperatureDifference      0.161562
Air temperature [K]        0.143185
Process temperature [K]    0.062449
Type                       0.006897
RNF                        0.000000
Name: Machine failure, dtype: float64

### Let's check class balance

In [9]:
main_df['Machine failure'].mean()

0.01574445315878589

In [10]:
main_df[main_df['Machine failure'] == 1]['Machine failure'].count()

2148

In [11]:
cols = ['Type', 'Air temperature [K]', 'Product ID',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF', 'Speed_new', 'Power', 'TF', 'TemperatureDifference',
       'TemperatureRatio']
for col in cols:
    print(f'{col}: ' + str(main_df[col].nunique()))

Type: 3
Air temperature [K]: 95
Product ID: 9976
Process temperature [K]: 81
Rotational speed [rpm]: 952
Torque [Nm]: 611
Tool wear [min]: 246
Machine failure: 2
TWF: 2
HDF: 2
PWF: 2
OSF: 2
RNF: 2
Speed_new: 952
Power: 48528
TF: 4
TemperatureDifference: 120
TemperatureRatio: 2165


In [12]:
for col in cols:
    if main_df[col].dtype in ['int64', 'float64']:
        print(f'{col}:' + str(main_df.groupby('Machine failure')[col].agg('mean')))

Air temperature [K]:Machine failure
0    299.846820
1    300.860242
Name: Air temperature [K], dtype: float64
Process temperature [K]:Machine failure
0    309.935645
1    310.280214
Name: Process temperature [K], dtype: float64
Rotational speed [rpm]:Machine failure
0    1521.399148
1    1453.563315
Name: Rotational speed [rpm], dtype: float64
Torque [Nm]:Machine failure
0    40.191867
1    50.149395
Name: Torque [Nm], dtype: float64
Tool wear [min]:Machine failure
0    103.952234
1    132.957169
Name: Tool wear [min], dtype: float64
Machine failure:Machine failure
0    0.0
1    1.0
Name: Machine failure, dtype: float64
TWF:Machine failure
0    0.000000
1    0.098696
Name: TWF, dtype: float64
HDF:Machine failure
0    0.000045
1    0.324953
Name: HDF, dtype: float64
PWF:Machine failure
0    0.000022
1    0.150838
Name: PWF, dtype: float64
OSF:Machine failure
0    0.000030
1    0.249534
Name: OSF, dtype: float64
RNF:Machine failure
0    0.002249
1    0.002793
Name: RNF, dtype: float64
Sp

# Data preparation

In [13]:
df, test = train_test_split(main_df, test_size=0.20, stratify=main_df['Machine failure'], random_state=42)
train, val = train_test_split(df, test_size=0.25, stratify=df['Machine failure'], random_state=42)

In [14]:
train['Machine failure'].mean(), val['Machine failure'].mean(), test['Machine failure'].mean()

(0.015734756954200617, 0.015758997287986513, 0.015758997287986513)

In [15]:
len(train), len(val), len(test), sum([len(train), len(val), len(test)])==len(main_df)

(81857, 27286, 27286, True)

In [16]:
df.columns

Index(['id', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF',
       'Speed_new', 'Power', 'TF', 'TemperatureDifference',
       'TemperatureRatio'],
      dtype='object')

In [17]:
X = ['Type', 'Air temperature [K]', 'Product ID',
     'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
     'Tool wear [min]', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'Speed_new', 'Power', 'TF', 
     'TemperatureDifference', 'TemperatureRatio']
y = ['Machine failure']
cat_features = ['Type', 'Product ID']

In [18]:
train_data = Pool(data=train[X],
                  label=train[y],
                  cat_features=cat_features
                 )
valid_data = Pool(data=val[X],
                  label=val[y],
                  cat_features=cat_features
                 )

# First model with validation

In [19]:
params = {'verbose':100,
          'n_estimators': 1549,
          'max_depth': 165,
          'max_bin': 295,
          'random_strength': 0.23,
          'grow_policy': 'Lossguide',
          'bootstrap_type': 'Bayesian',
          'objective':'Logloss',
          'loss_function': 'AUC',
          'eval_metric': "AUC",
          'l2_leaf_reg':  0.06876484483245901,
          'min_child_samples': 185,
          'random_seed':42,
          'learning_rate': 0.00343900835845922}

In [20]:
model_1 = CatBoostClassifier(**params)

In [21]:
model_1.fit(train_data, eval_set=valid_data)

0:	test: 0.9055491	best: 0.9055491 (0)	total: 121ms	remaining: 3m 6s
100:	test: 0.9539225	best: 0.9564919 (41)	total: 5.71s	remaining: 1m 21s
200:	test: 0.9611289	best: 0.9611289 (200)	total: 11.1s	remaining: 1m 14s
300:	test: 0.9654992	best: 0.9654992 (300)	total: 16.3s	remaining: 1m 7s
400:	test: 0.9681042	best: 0.9681042 (400)	total: 21.5s	remaining: 1m 1s
500:	test: 0.9696262	best: 0.9696331 (499)	total: 26.2s	remaining: 54.8s
600:	test: 0.9707507	best: 0.9707507 (600)	total: 30.9s	remaining: 48.8s
700:	test: 0.9713390	best: 0.9713390 (700)	total: 35.5s	remaining: 42.9s
800:	test: 0.9716601	best: 0.9716617 (798)	total: 40s	remaining: 37.3s
900:	test: 0.9718486	best: 0.9718493 (864)	total: 44.6s	remaining: 32.1s
1000:	test: 0.9720532	best: 0.9720562 (998)	total: 49.3s	remaining: 27s
1100:	test: 0.9722349	best: 0.9722349 (1100)	total: 54.5s	remaining: 22.2s
1200:	test: 0.9724074	best: 0.9724074 (1200)	total: 59.6s	remaining: 17.3s
1300:	test: 0.9725783	best: 0.9725785 (1296)	total: 1

<catboost.core.CatBoostClassifier at 0x79878c941990>

In [22]:
test['pred_1'] = model_1.predict(test[X])

In [23]:
test

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Speed_new,Power,TF,TemperatureDifference,TemperatureRatio,pred_1
108143,108143,L51820,L,303.3,311.2,1619,31.1,219,0,0,0,0,0,0,7.389564,50350.9,0,7.9,1.026047,0
124027,124027,L56933,L,298.5,309.8,1663,36.1,203,0,0,0,0,0,0,7.416378,60034.3,0,11.3,1.037856,0
40415,40415,M16054,M,297.0,308.1,1548,38.2,0,0,0,0,0,0,0,7.344719,59133.6,0,11.1,1.037374,0
5055,5055,L48448,L,298.0,309.5,1599,35.7,51,0,0,0,0,0,0,7.377134,57084.3,0,11.5,1.038591,0
75315,75315,L47982,L,296.9,307.8,1567,35.0,121,0,0,0,0,0,0,7.356918,54845.0,0,10.9,1.036713,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21222,21222,M20620,M,301.9,311.3,1775,26.7,181,0,0,0,0,0,0,7.481556,47392.5,0,9.4,1.031136,0
999,999,L48290,L,296.5,307.6,1513,40.9,62,0,0,0,0,0,0,7.321850,61881.7,0,11.1,1.037437,0
111774,111774,L54248,L,300.8,310.7,1461,40.1,173,0,0,0,0,0,0,7.286876,58586.1,0,9.9,1.032912,0
85390,85390,H35963,H,301.4,310.6,1623,31.4,170,0,0,0,0,0,0,7.392032,50962.2,0,9.2,1.030524,0


In [24]:
accuracy_score(test['Machine failure'], test['pred_1'])

0.9961518727552591

# Now let's train model on full data

In [25]:
train_full = pd.concat([train, val])

In [26]:
model_1.best_iteration_

1545

In [27]:
params_2 = {'verbose':100,
            'random_seed':42,
            'verbose':100,
            'n_estimators': 1549,
            'max_depth': 165,
            'max_bin': 295,
            'random_strength': 0.23,
            'grow_policy': 'Lossguide',
            'bootstrap_type': 'Bayesian',
            'objective':'Logloss',
            'loss_function': 'AUC',
            'eval_metric': "AUC",
            'l2_leaf_reg':  0.06876484483245901,
            'min_child_samples': 185,
            'cat_features': cat_features,
            'learning_rate': 0.00343900835845922}

In [28]:
model_2 = CatBoostClassifier(**params_2)

In [29]:
model_2.fit(train_full[X], train_full[y])

0:	total: 80.7ms	remaining: 2m 4s
100:	total: 6.88s	remaining: 1m 38s
200:	total: 13.3s	remaining: 1m 29s
300:	total: 19.5s	remaining: 1m 20s
400:	total: 25.2s	remaining: 1m 12s
500:	total: 30.4s	remaining: 1m 3s
600:	total: 35.6s	remaining: 56.2s
700:	total: 40.9s	remaining: 49.5s
800:	total: 46.1s	remaining: 43s
900:	total: 51.4s	remaining: 36.9s
1000:	total: 56.7s	remaining: 31s
1100:	total: 1m 2s	remaining: 25.3s
1200:	total: 1m 8s	remaining: 19.7s
1300:	total: 1m 13s	remaining: 14s
1400:	total: 1m 19s	remaining: 8.41s
1500:	total: 1m 25s	remaining: 2.74s
1548:	total: 1m 28s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x79878c778610>

In [30]:
test['pred_2'] = model_2.predict(test[X])

In [31]:
accuracy_score(test['Machine failure'], test['pred_2'])

0.9963351169097706

# Prediction to submission

In [32]:
submission['Machine failure'] = model_2.predict(submission[X])

In [33]:
submission['Machine failure'].mean()

0.012401873474503595

In [34]:
sub = submission[['id','Machine failure']]
sub.head(5)

Unnamed: 0,id,Machine failure
0,136429,0
1,136430,0
2,136431,0
3,136432,0
4,136433,0


In [35]:
sub.to_csv('sub_machine_failures.csv', index=False)