In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="table-of-content"></a>
# Table of Content
### [1. Import Libraries](#import-libraries)
### [2. Loading Data](#loading-data)
### [3. Data Exploration](#data-exploration)
- [Correlation Matrix](#correlation-matrix)
- [Data Distribution](#data-distribution)
- [Divide box plot into "No Failure" & "Failure"](#divide-box-plot)
- [Rename Columns](#rename-columns)
- [Removes useless & correlated features](#removes-features)
- [Pair Plot](#pair-plot)

### [4. Imbalanced Data](#imbalanced-data)
### [5. One Hot Encoding](#encoding)
- [get_dummies approach](#get-dummies)

### [6. Train-Test Split](#train-test-split)
### [7. Classify Imbalanced Data](#classify-imbalanced-data)
### [8. Resampling Data + Modelling](#resampling-data)
- [Random Forest Classifier](#random-forest)
- [Gradient Boosting Classifier](#GBT)
- [XGBoost](#xgboost)

### [9. Conclusion](#conclusion)
### [Go to end](#end)

<a id="import-libraries"></a>
# Import Libraries

In [None]:
import pandas as pd
import imblearn
import seaborn as sns
sns.set_theme(style="darkgrid")
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold, GridSearchCV
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE 

<a id="loading-data"></a>
# Loading Data

In [None]:
df = pd.read_csv('../input/machine-predictive-maintenance-classification/predictive_maintenance.csv')
df.head()

<a id="data-exploration"></a>
# Data Exploration

In [None]:
df.info()

In [None]:
df.nunique()

**Findings:**
1. There is no missing value in the data
2. UDI and ProductID are unique for each instances


In [None]:
plt.figure(figsize=(15,15))
plt.subplot(2,1,1)
pl = sns.countplot(y=df["Type"])
pl.set_title("Type (Frequency)")
pl.set(xlabel=None)
for p in pl.patches:
        percentage = p.get_width()
        x = p.get_x() + p.get_width() + 0.02
        y = p.get_y() + p.get_height()/2
        pl.annotate(percentage, (x, y))
plt.subplot(2,1,2)
pl2 = sns.countplot(y=df["Failure Type"])
pl2.set_title("Failure Type (Frequency)")
pl2.set(xlabel=None)
for p in pl2.patches:
        percentage = p.get_width()
        x = p.get_x() + p.get_width() + 0.02
        y = p.get_y() + p.get_height()/2
        pl2.annotate(percentage, (x, y))
plt.show()

<a id="correlation-matrix"></a>
## Correlation Matrix

In [None]:
plt.figure(figsize=(10,10))
pl = sns.heatmap(df.corr(), annot=True)
pl.set_title("Correlation Matrix")
plt.show()

We need to remove:
1. ("Torque \[Nm\]" or "Rotational speed \[rpm\]")
2. ("Air temperature \[K\]" or "Process temperature \[K\]")
<br></br>
to make this decision, I decided to view compare their data distribution against failure type.

In [None]:
df.describe()

There might be outliers in "Rotational speed \[rpm]", "Torque [Nm]", and "Tool wear [min]"

<a id="data-distribution"></a>
## Data Distribution

In [None]:
plt.figure(figsize=(10,10))
df_drop = df.drop(columns=['Target','Product ID', 'UDI','Type'])
df_normalize=(df_drop-df_drop.mean())/df_drop.std()
pl=sns.boxplot(data=df_normalize.drop(columns='Failure Type'), orient = 'h')
pl.set_title("Box Plot")
plt.show()

Chances are these outliers could be the cause of failure

<a id="divide-box-plot"></a>
## Divide box plot into "No Failure" & "Failure" 

In [None]:
plt.figure(figsize=(15,15))
df_drop = df.drop(columns=['Target','Product ID', 'UDI','Type'])

plt.subplot(2,1,1)
df_no_fail = df_drop[df_drop['Failure Type']=="No Failure"]
# pl = sns.boxplot(data=df_no_fail)
df_normalize=(df_no_fail-df_no_fail.mean())/df_no_fail.std()
pl=sns.boxplot(data=df_normalize.drop(columns='Failure Type'), orient = 'h')
pl.set_xlim([-4, 7])
pl.set_title("Box Plot (No Failure)")

plt.subplot(2,1,2)
df_fail = df_drop[df_drop['Failure Type']!="No Failure"]
# pl = sns.boxplot(data=df_no_fail)
df_normalize=(df_fail-df_fail.mean())/df_fail.std()
pl=sns.boxplot(data=df_normalize.drop(columns='Failure Type'), orient = 'h')
pl.set_xlim([-4, 7])
pl.set_title("Box Plot (Failure)")
plt.show()

1. Data distribution of "Air temperature \[K\]" is different for "Failure" and "No Failure" (at least more obvious than "Process temperature \[K\]") so I decided to remove "Process temperature \[K\]".
2. Rotational speed \[rpm\] hsa obvious difference between "Failure" and "No Failure" so I decided to remove "Torque \[Nm\]".

<a id="rename-columns"></a>
## Rename Columns

In [None]:
df['Air temperature'] = df['Air temperature [K]']
df['Rotational speed'] = df['Rotational speed [rpm]']
df['Tool wear'] = df['Tool wear [min]']
df

<a id="removes-features"></a>
## Removes useless and correlated features

In [None]:
df.drop(columns = ['UDI','Product ID', 'Process temperature [K]', 'Torque [Nm]', 'Target','Air temperature [K]','Rotational speed [rpm]','Tool wear [min]'], inplace = True)

In [None]:
df.info()

In [None]:
df.columns

<a id="pair-plot"></a>
## Pairplot

In [None]:
sns.pairplot(df, hue = "Failure Type")
plt.show()

<a id="imbalanced-data"></a>
# Imbalanced Data

Data distribution of the failures are not obvious due to imbalanced data.

In [None]:
plt.figure(figsize=(12,7))
pl = sns.countplot(y=df["Failure Type"])
pl.set_title("Failure Type (Frequency)")
pl.set(xlabel=None)
for p in pl.patches:
        percentage = p.get_width()
        x = p.get_x() + p.get_width() + 0.02
        y = p.get_y() + p.get_height()/2
        pl.annotate(percentage, (x, y))
plt.show()

<a id="encoding"></a>
# One Hot Encoding

In [None]:
encoder = OneHotEncoder(handle_unknown = 'ignore')
transformed = encoder.fit_transform(df['Type'].to_numpy().reshape(-1,1)).toarray()
enc_df = pd.DataFrame(transformed)
df2 = df.join(enc_df)
df2

<a id="get-dummies"></a>
## get_dummies approach

In [None]:
dum_df = pd.get_dummies(df,columns = ['Type'], prefix = ['Type'])
dum_df

Let's use dum_df since it saved our time of renaming the columns

In [None]:
# Removes "Type_H" to reduce multicollinearity
df = dum_df.drop(columns = ['Type_H'])
df

<a id="train-test-split"></a>
# Train-Test Split

We need to split our data set before over sampling to prevent information leak into validation and testing set. This will over fit the testing set.

In [None]:
# Define X and y
X = df.drop(columns = 'Failure Type')
y = df["Failure Type"]

In [None]:
X

In [None]:
y

In [None]:
# Number of instances for each class
from collections import Counter
print(sorted(Counter(y).items()))

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [None]:
y

In [None]:
# train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=1)
    
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Number of instances for each class
from collections import Counter
print(sorted(Counter(y).items()))

<a id="classify-imbalanced-data"></a>
# Classifying Imbalanced Data

In [None]:
# Classifying imbalanced data
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X,y)

cv_results = cross_validate(
    model, X, y, scoring="balanced_accuracy",
    return_train_score=True, return_estimator=True,
    n_jobs=-1
)
cv_results

We get very low accuracy (30%) due to imbalanced data.

In [None]:
from sklearn.model_selection import cross_val_predict

y_train_predict = cross_val_predict(model, X_train, y_train, cv=3)

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train,y_train_predict)
cm

Visualize the matrix to better evaluate the model 

### Define y_class

In [None]:
from sklearn.metrics import classification_report
cr = classification_report(y_train,y_train_predict, output_dict=True)

y_list = []

for key, values in cr.items():
    y_list.append(key)

y_class_str = []
    
for i in y_list[:6]:
    y_class_str.append(int(i))
    
y_class_str

In [None]:
y_class = le.inverse_transform(y_class_str)
y_class

This list is going to be our x & y axes for our confusion matrix heat map

In [None]:
df_cm = pd.DataFrame(cm, index = [y_class],
                  columns = y_class)
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

From the heat map, we can say that:
1. The model is heavily biased towards majority class
2. Minority classes are heavily underfitted

What about the precision-recall and f1 score?

In [None]:
print(classification_report(y_train,y_train_predict))

Consistent with our heat map, minority classes are heavily underfitted especially "Random Failure" & "Tool Wear Failure" (Recall = 0). Out of 50 failures the model catch none of them which is very bad. When the data is highly imbalanced, the model wasn't exposed to the minority class enough and was taught to predict the mojority class most of the time. Therefore, we shoud resample our data such that the majority class and minority classes have equal chance to be seen by the model. 

<a id="resampling-data"></a>
# Resampling Data + Modelling

Note that we're resampling our data after train-test split so that information from the training set didn't leak into the testing set.

Credit to this [article](https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html) for the right way to oversample without polluting your validation set.

In [None]:
# from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
# kf = KFold(n_splits=5, random_state=42, shuffle=False)
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

<a id="random-forest"></a>
## Random Forest Classifier

In [None]:
# parameters for grid search
rf_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [4, 6, 10, 12],
    'random_state': [13]
}
# Add classifier's name for grid search
pipe_rf_params = {'randomforestclassifier__' + key: rf_params[key] for key in rf_params}
pipe_rf_params

In [None]:
# from imblearn.pipeline import Pipeline, make_pipeline
# from imblearn.over_sampling import SMOTE 
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import GridSearchCV


rf_pipeline = make_pipeline(RandomUnderSampler(sampling_strategy='majority'),SMOTE(random_state=42), 
                              RandomForestClassifier(random_state=13))

# use recall_weighted instead of recall because this is multiclass
grid_rf = GridSearchCV(rf_pipeline, param_grid=pipe_rf_params, cv=skf,scoring='recall_weighted',
                        return_train_score=True)
grid_rf.fit(X_train, y_train)
grid_rf.best_score_

In [None]:
grid_rf.best_params_

It is important for us to test on testing set instead of validation set because we're using grid search which could overfit the validation set.

In [None]:
y_train_predict = grid_rf.predict(X_train)
cm_val = confusion_matrix(y_train,y_train_predict)

y_test_predict = grid_rf.predict(X_test)
cm_test = confusion_matrix(y_test,y_test_predict)

print("Validation Score:")
print(cm_val)
print("Test Score:")
print(cm_test)

In [None]:
df_val = pd.DataFrame(cm_val, index = [y_class],
                  columns = y_class)

df_test = pd.DataFrame(cm_test, index = [y_class],
                  columns = y_class)

plt.figure(figsize = (15,15))
plt.subplot(2,1,1)
pl = sns.heatmap(df_val, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
pl.set_title("Validation")

plt.subplot(2,1,2)
pl2 = sns.heatmap(df_test, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
pl2.set_title("Test")
plt.show()

In [None]:
print("Validation Score:")
print(classification_report(y_train,y_train_predict))
print("")
print("Test Score:")
print(classification_report(y_test,y_test_predict))

Now, we see that this model tried to classify the imbalanced classes more (although it was not accurate). Something worth taking note is that the validation score is so much better than our test score. This might be a sign of overfitting the validation set. Try pruning the tree or use other model to see if it gets better

<a id="GBT"></a>
## Gradient Boosting Classifier

In [None]:
# parameters for grid search
GBT_params = {
    'max_depth': [3, 5, 7],
    'tol': [0.01,0.02]
}
# Add classifier's name for grid search
pipe_GBT_params = {'gradientboostingclassifier__' + key: GBT_params[key] for key in GBT_params}
pipe_GBT_params

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier
# from imblearn.under_sampling import RandomUnderSampler

GBT_pipeline = make_pipeline(RandomUnderSampler(sampling_strategy='majority'),SMOTE(random_state=42), 
                              GradientBoostingClassifier(random_state=13))

# use recall_weighted instead of recall because this is multiclass
grid_GBT = GridSearchCV(GBT_pipeline, param_grid=pipe_GBT_params, cv=skf,scoring='recall_weighted',
                        return_train_score=True)
grid_GBT.fit(X_train, y_train)
grid_GBT.best_score_

In [None]:
grid_GBT.best_params_

In [None]:
y_train_predict = grid_GBT.predict(X_train)
cm_val = confusion_matrix(y_train,y_train_predict)

y_test_predict = grid_GBT.predict(X_test)
cm_test = confusion_matrix(y_test,y_test_predict)

print("Validation Score:")
print(cm_val)
print("Test Score:")
print(cm_test)

In [None]:
df_val = pd.DataFrame(cm_val, index = [y_class],
                  columns = y_class)

df_test = pd.DataFrame(cm_test, index = [y_class],
                  columns = y_class)

plt.figure(figsize = (15,15))
plt.subplot(2,1,1)
pl = sns.heatmap(df_val, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
pl.set_title("Validation")

plt.subplot(2,1,2)
pl2 = sns.heatmap(df_test, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
pl2.set_title("Test")
plt.show()

In [None]:
print("Validation Score:")
print(classification_report(y_train,y_train_predict))
print("")
print("Test Score:")
print(classification_report(y_test,y_test_predict))

Now this model is better at finding actual "Random Failure" and "Tool Wear Failure" but at the cost of more false alarm.

<a id="xgboost"></a>
## XGBoost

In [None]:
# parameters for grid search
XGB_params = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [2,5,6]
}
# Add classifier's name for grid search
pipe_XGB_params = {'xgbclassifier__' + key: XGB_params[key] for key in XGB_params}
pipe_XGB_params

In [None]:
from xgboost import XGBClassifier

XGB_pipeline = make_pipeline( RandomUnderSampler(sampling_strategy='majority'),SMOTE(random_state=42),
                              XGBClassifier(use_label_encoder=False))

# use recall_weighted instead of recall because this is multiclass
grid_XGB = GridSearchCV(XGB_pipeline, param_grid=pipe_XGB_params, cv=skf,scoring='recall_weighted',
                        return_train_score=True)
grid_XGB.fit(X_train, y_train)
grid_XGB.best_score_

In [None]:
grid_XGB.best_params_

In [None]:
y_train_predict = grid_XGB.predict(X_train)
cm_val = confusion_matrix(y_train,y_train_predict)

y_test_predict = grid_XGB.predict(X_test)
cm_test = confusion_matrix(y_test,y_test_predict)

print("Validation Score:")
print(cm_val)
print("Test Score:")
print(cm_test)

In [None]:
df_val = pd.DataFrame(cm_val, index = [y_class],
                  columns = y_class)

df_test = pd.DataFrame(cm_test, index = [y_class],
                  columns = y_class)

plt.figure(figsize = (15,15))
plt.subplot(2,1,1)
pl = sns.heatmap(df_val, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
pl.set_title("Validation")

plt.subplot(2,1,2)
pl2 = sns.heatmap(df_test, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
pl2.set_title("Test")
plt.show()

In [None]:
print("Validation Score:")
print(classification_report(y_train,y_train_predict))
print("")
print("Test Score:")
print(classification_report(y_test,y_test_predict))

There is no significant improvement comparing XGBoost with GBT.

<a id="conclusion"></a>
# Conclusion

1. We can use Over & Down Sampling to help our model see more instances in minority classes.
2. Split training & testing set before oversampling
3. Use make_pipeline to prevent polluting validation set
4. Boosting can help improve our prediction for minority classes at the cost of lower precision and misclassified majority class

Please comment if you have suggestions on how to improve this notebook. 😁

<a id="end"></a>
# [Back to top](#table-of-content)