In [1]:
# Goal: Predict machine failure (classification) with Decision Trees, Bagging, Random Forests, AdaBoost,
# Gradient Boosting, and XGBoost.

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print one path per line.
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/machine-failure-prediction-using-sensor-data/data.csv


In [3]:
# Location of the sensor CSV, as listed by the directory walk in the first code cell.
DATA_CSV = '/kaggle/input/machine-failure-prediction-using-sensor-data/data.csv'
dataset = pd.read_csv(DATA_CSV)

In [4]:
# Preview the first rows of the loaded frame to check that the columns parsed as expected.
dataset.head()

Unnamed: 0,footfall,tempMode,AQ,USS,CS,VOC,RP,IP,Temperature,fail
0,0,7,7,1,6,6,36,3,1,1
1,190,1,3,3,5,1,20,4,1,0
2,31,7,2,2,6,1,24,6,1,0
3,83,4,3,4,5,1,28,6,1,0
4,640,7,5,6,4,0,68,6,1,0


In [5]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944 entries, 0 to 943
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   footfall     944 non-null    int64
 1   tempMode     944 non-null    int64
 2   AQ           944 non-null    int64
 3   USS          944 non-null    int64
 4   CS           944 non-null    int64
 5   VOC          944 non-null    int64
 6   RP           944 non-null    int64
 7   IP           944 non-null    int64
 8   Temperature  944 non-null    int64
 9   fail         944 non-null    int64
dtypes: int64(10)
memory usage: 73.9 KB


In [6]:
'''
Columns Description
footfall: The number of people or objects passing by the machine. (Continuous Variable)
tempMode: The temperature mode or setting of the machine. (Categorical; documented as 1-7, though the rows displayed in this notebook also include 0)
AQ: Air quality index near the machine. (Categorical)
USS: Ultrasonic sensor data, indicating proximity measurements. (Categorical)
CS: Current sensor readings, indicating the electrical current usage of the machine. (Categorical)
VOC: Volatile organic compounds level detected near the machine. (Categorical)
RP: Rotational position or RPM (revolutions per minute) of the machine parts. (Continuous)
IP: Input pressure to the machine. (Categorical)
Temperature: The operating temperature of the machine. (Continuous)
fail: Binary indicator of machine failure (1 for failure, 0 for no failure). (Categorical)
''';

In [7]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [8]:
# Separate the predictors from the binary `fail` label, then hold out 25% of the
# rows as a test set. The fixed random_state keeps the split identical across runs.
features = dataset.drop(columns='fail')
target = dataset['fail']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=42
)

In [9]:
# Peek at the first rows of the training features produced by the split.
X_train.head()

Unnamed: 0,footfall,tempMode,AQ,USS,CS,VOC,RP,IP,Temperature
584,0,5,4,6,5,2,51,3,20
298,330,2,4,5,5,1,43,5,15
828,3,7,2,4,3,6,33,6,22
328,900,2,6,5,3,1,25,3,15
204,290,0,6,3,6,5,35,4,12


In [10]:
# Peek at the matching training labels; the index values line up with X_train.head().
y_train.head()

584    0
298    0
828    1
328    0
204    1
Name: fail, dtype: int64

# Decision Trees

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Single Gini decision tree: the baseline model, and also the base learner passed to
# the bagging and AdaBoost ensembles later in this notebook.
# random_state is fixed because scikit-learn randomly permutes the candidate features at
# every split even with splitter='best', so an unseeded tree can differ between runs.
dt = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_leaf=2,
                            random_state=42)

In [12]:
# 10-fold cross-validation of the decision tree on the training split.
# Both metrics come from one cross_validate pass, so each fold is fitted once.
# The score arrays are deliberately NOT named `roc_auc_score`, which would shadow
# the sklearn.metrics function of the same name used when scoring the test set.
dt_cv = cross_validate(dt, X_train, y_train, scoring=['roc_auc', 'accuracy'], cv=10)
dt_cv_auc = dt_cv['test_roc_auc']
dt_cv_acc = dt_cv['test_accuracy']

print(f"roc-auc: {np.mean(dt_cv_auc).round(3)} +/- {np.std(dt_cv_auc).round(3)}")
print(f"acc-score: {np.mean(dt_cv_acc).round(3)} +/- {np.std(dt_cv_acc).round(3)}")


roc-auc: 0.884 +/- 0.048
acc-score: 0.874 +/- 0.046


In [13]:
# Fit the tree on the full training split, then score the held-out test rows.
decision_tree = dt.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of the second class (fail == 1).
y_pred_proba = decision_tree.predict_proba(X_test)[:, 1]
y_pred = decision_tree.predict(X_test)

In [14]:
# Importing here guarantees that `roc_auc_score` is the scikit-learn function even if
# the name was rebound earlier in the session.
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Per-class precision/recall/F1 on the test split, followed by the test ROC-AUC.
print(classification_report(y_test, y_pred))
dt_test_auc = roc_auc_score(y_test, y_pred_proba)
# Built-in round() works whether the metric comes back as a NumPy scalar or a plain float.
print(f"ROC-AUC:{round(dt_test_auc, 3)}")

              precision    recall  f1-score   support

           0       0.88      0.85      0.86       132
           1       0.81      0.85      0.83       104

    accuracy                           0.85       236
   macro avg       0.84      0.85      0.85       236
weighted avg       0.85      0.85      0.85       236

ROC-AUC:0.861


# Bagging

In [15]:
from sklearn.ensemble import BaggingClassifier

In [16]:
# Bagging ensemble: 10 copies of the decision tree, each trained on a bootstrap resample.
# random_state fixes the bootstrap draws so the CV and test scores are repeatable.
bc = BaggingClassifier(estimator=dt,
                      n_estimators=10,
                      bootstrap=True,
                      oob_score=False,
                      n_jobs=-1,
                      random_state=42)

In [17]:
# 10-fold cross-validation of the bagging ensemble on the training split.
# Both metrics come from one cross_validate pass, so each fold is fitted once.
# The score arrays are deliberately NOT named `roc_auc_score`, which would shadow
# the sklearn.metrics function of the same name used when scoring the test set.
bc_cv = cross_validate(bc, X_train, y_train, scoring=['roc_auc', 'accuracy'], cv=10)
bc_cv_auc = bc_cv['test_roc_auc']
bc_cv_acc = bc_cv['test_accuracy']

print(f"roc-auc: {np.mean(bc_cv_auc).round(3)} +/- {np.std(bc_cv_auc).round(3)}")
print(f"acc-score: {np.mean(bc_cv_acc).round(3)} +/- {np.std(bc_cv_acc).round(3)}")


roc-auc: 0.947 +/- 0.031
acc-score: 0.888 +/- 0.046


In [18]:
# Fit the bagging ensemble on the full training split, then score the held-out test rows.
bagging_classifier = bc.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of the second class (fail == 1).
y_pred_proba = bagging_classifier.predict_proba(X_test)[:, 1]
y_pred = bagging_classifier.predict(X_test)

In [19]:
# Importing here guarantees that `roc_auc_score` is the scikit-learn function even if
# the name was rebound earlier in the session.
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Per-class precision/recall/F1 on the test split, followed by the test ROC-AUC.
print(classification_report(y_test, y_pred))
bc_test_auc = roc_auc_score(y_test, y_pred_proba)
# Built-in round() works whether the metric comes back as a NumPy scalar or a plain float.
print(f"ROC-AUC:{round(bc_test_auc, 3)}")

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       132
           1       0.85      0.84      0.84       104

    accuracy                           0.86       236
   macro avg       0.86      0.86      0.86       236
weighted avg       0.86      0.86      0.86       236

ROC-AUC:0.934


# Random Forests

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Random forest: 100 bootstrapped Gini trees, each split drawn from a sqrt-sized
# random subset of the features.
# random_state fixes both the bootstrap samples and the feature subsets so the
# CV and test scores are repeatable.
rf = RandomForestClassifier(n_estimators=100,
                           criterion='gini',
                           min_samples_leaf=2,
                           max_features='sqrt',
                           bootstrap=True,
                           oob_score=False,
                           n_jobs=-1,
                           random_state=42)

In [22]:
# 10-fold cross-validation of the random forest on the training split.
# Both metrics come from one cross_validate pass, so each fold is fitted once.
# The score arrays are deliberately NOT named `roc_auc_score`, which would shadow
# the sklearn.metrics function of the same name used when scoring the test set.
rf_cv = cross_validate(rf, X_train, y_train, scoring=['roc_auc', 'accuracy'], cv=10)
rf_cv_auc = rf_cv['test_roc_auc']
rf_cv_acc = rf_cv['test_accuracy']

print(f"roc-auc: {np.mean(rf_cv_auc).round(3)} +/- {np.std(rf_cv_auc).round(3)}")
print(f"acc-score: {np.mean(rf_cv_acc).round(3)} +/- {np.std(rf_cv_acc).round(3)}")


roc-auc: 0.963 +/- 0.023
acc-score: 0.912 +/- 0.037


In [23]:
# Fit the random forest on the full training split, then score the held-out test rows.
random_forest = rf.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of the second class (fail == 1).
y_pred_proba = random_forest.predict_proba(X_test)[:, 1]
y_pred = random_forest.predict(X_test)

In [24]:
# Importing here guarantees that `roc_auc_score` is the scikit-learn function even if
# the name was rebound earlier in the session.
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Per-class precision/recall/F1 on the test split, followed by the test ROC-AUC.
print(classification_report(y_test, y_pred))
rf_test_auc = roc_auc_score(y_test, y_pred_proba)
# Built-in round() works whether the metric comes back as a NumPy scalar or a plain float.
print(f"ROC-AUC:{round(rf_test_auc, 3)}")

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       132
           1       0.86      0.89      0.88       104

    accuracy                           0.89       236
   macro avg       0.89      0.89      0.89       236
weighted avg       0.89      0.89      0.89       236

ROC-AUC:0.954


# AdaBoost

In [25]:
from sklearn.ensemble import AdaBoostClassifier

In [26]:
# AdaBoost over the decision tree defined earlier: up to 100 boosting rounds, learning rate 1.
# random_state makes the boosted trees, and therefore the reported scores, repeatable.
# NOTE(review): the base tree has no max_depth limit; AdaBoost is usually paired with
# shallow learners, so confirm that a full-depth base tree is intended.
abc = AdaBoostClassifier(estimator=dt,
                         n_estimators=100,
                         learning_rate = 1,
                         random_state=42)

In [27]:
# 10-fold cross-validation of the AdaBoost ensemble on the training split.
# Both metrics come from one cross_validate pass, so each fold is fitted once.
# The score arrays are deliberately NOT named `roc_auc_score`, which would shadow
# the sklearn.metrics function of the same name used when scoring the test set.
abc_cv = cross_validate(abc, X_train, y_train, scoring=['roc_auc', 'accuracy'], cv=10)
abc_cv_auc = abc_cv['test_roc_auc']
abc_cv_acc = abc_cv['test_accuracy']

print(f"roc-auc: {np.mean(abc_cv_auc).round(3)} +/- {np.std(abc_cv_auc).round(3)}")
print(f"acc-score: {np.mean(abc_cv_acc).round(3)} +/- {np.std(abc_cv_acc).round(3)}")

roc-auc: 0.961 +/- 0.021
acc-score: 0.905 +/- 0.034


In [28]:
# Fit the AdaBoost ensemble on the full training split, then score the held-out test rows.
adaboost_classifier = abc.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of the second class (fail == 1).
y_pred_proba = adaboost_classifier.predict_proba(X_test)[:, 1]
y_pred = adaboost_classifier.predict(X_test)

In [29]:
# Importing here guarantees that `roc_auc_score` is the scikit-learn function even if
# the name was rebound earlier in the session.
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Per-class precision/recall/F1 on the test split, followed by the test ROC-AUC.
print(classification_report(y_test, y_pred))
abc_test_auc = roc_auc_score(y_test, y_pred_proba)
# Built-in round() works whether the metric comes back as a NumPy scalar or a plain float.
print(f"ROC-AUC:{round(abc_test_auc, 3)}")

              precision    recall  f1-score   support

           0       0.90      0.89      0.90       132
           1       0.87      0.88      0.87       104

    accuracy                           0.89       236
   macro avg       0.88      0.88      0.88       236
weighted avg       0.89      0.89      0.89       236

ROC-AUC:0.955


# Gradient Boosting

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

In [31]:
# Gradient boosting with log-loss: 500 stages at learning rate 0.1, leaves of at least 2 samples.
# random_state seeds the randomness inside each stage's tree so the scores are repeatable.
gbc = GradientBoostingClassifier(loss='log_loss', learning_rate = 0.1,
                                n_estimators=500, min_samples_leaf=2,
                                random_state=42)

In [32]:
# 10-fold cross-validation of the gradient-boosting model on the training split.
# Both metrics come from one cross_validate pass, so each fold is fitted once.
# The score arrays are deliberately NOT named `roc_auc_score`, which would shadow
# the sklearn.metrics function of the same name used when scoring the test set.
gbc_cv = cross_validate(gbc, X_train, y_train, scoring=['roc_auc', 'accuracy'], cv=10)
gbc_cv_auc = gbc_cv['test_roc_auc']
gbc_cv_acc = gbc_cv['test_accuracy']

print(f"roc-auc: {np.mean(gbc_cv_auc).round(3)} +/- {np.std(gbc_cv_auc).round(3)}")
print(f"acc-score: {np.mean(gbc_cv_acc).round(3)} +/- {np.std(gbc_cv_acc).round(3)}")

roc-auc: 0.957 +/- 0.025
acc-score: 0.9 +/- 0.035


In [33]:
# Fit the gradient-boosting model on the full training split, then score the held-out test rows.
gb_classifier = gbc.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of the second class (fail == 1).
y_pred_proba = gb_classifier.predict_proba(X_test)[:, 1]
y_pred = gb_classifier.predict(X_test)

In [34]:
# Importing here guarantees that `roc_auc_score` is the scikit-learn function even if
# the name was rebound earlier in the session.
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Per-class precision/recall/F1 on the test split, followed by the test ROC-AUC.
print(classification_report(y_test, y_pred))
gbc_test_auc = roc_auc_score(y_test, y_pred_proba)
# Built-in round() works whether the metric comes back as a NumPy scalar or a plain float.
print(f"ROC-AUC:{round(gbc_test_auc, 3)}")

              precision    recall  f1-score   support

           0       0.88      0.89      0.89       132
           1       0.86      0.85      0.85       104

    accuracy                           0.87       236
   macro avg       0.87      0.87      0.87       236
weighted avg       0.87      0.87      0.87       236

ROC-AUC:0.942


# XGBoost

In [35]:
from xgboost import XGBClassifier

In [36]:
# XGBoost classifier using the histogram-based tree builder; all other settings are library defaults.
# The seed is set explicitly, matching the 42 used for the train/test split.
xgbc = XGBClassifier(tree_method='hist', random_state=42)

In [37]:
# 10-fold cross-validation of the XGBoost model on the training split.
# Both metrics come from one cross_validate pass, so each fold is fitted once.
# The score arrays are deliberately NOT named `roc_auc_score`, which would shadow
# the sklearn.metrics function of the same name used when scoring the test set.
xgbc_cv = cross_validate(xgbc, X_train, y_train, scoring=['roc_auc', 'accuracy'], cv=10)
xgbc_cv_auc = xgbc_cv['test_roc_auc']
xgbc_cv_acc = xgbc_cv['test_accuracy']

print(f"roc-auc: {np.mean(xgbc_cv_auc).round(3)} +/- {np.std(xgbc_cv_auc).round(3)}")
print(f"acc-score: {np.mean(xgbc_cv_acc).round(3)} +/- {np.std(xgbc_cv_acc).round(3)}")

roc-auc: 0.959 +/- 0.024
acc-score: 0.91 +/- 0.043


In [38]:
# Fit the XGBoost model on the full training split, then score the held-out test rows.
xgb_classifier = xgbc.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of the second class (fail == 1).
y_pred_proba = xgb_classifier.predict_proba(X_test)[:, 1]
y_pred = xgb_classifier.predict(X_test)

In [39]:
# Importing here guarantees that `roc_auc_score` is the scikit-learn function even if
# the name was rebound earlier in the session.
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Per-class precision/recall/F1 on the test split, followed by the test ROC-AUC.
print(classification_report(y_test, y_pred))
xgbc_test_auc = roc_auc_score(y_test, y_pred_proba)
# Built-in round() works whether the metric comes back as a NumPy scalar or a plain float.
print(f"ROC-AUC:{round(xgbc_test_auc, 3)}")

              precision    recall  f1-score   support

           0       0.88      0.87      0.88       132
           1       0.84      0.86      0.85       104

    accuracy                           0.86       236
   macro avg       0.86      0.86      0.86       236
weighted avg       0.86      0.86      0.86       236

ROC-AUC:0.946
