### Model development

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, RobustScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict, StratifiedShuffleSplit

import os

# Default location of the raw CSV on the original author's machine. Set the
# PREDICTIVE_MAINTENANCE_CSV environment variable to load the file from
# somewhere else without editing the notebook.
filepath = os.environ.get(
    "PREDICTIVE_MAINTENANCE_CSV",
    "C:/Users/WALDMJN/OneDrive - Schaeffler/Uni/Data Exploration Project/Pred Maintenance Project/Predictive-Maintenance/Data/predictive_maintenance.csv",
)
df = pd.read_csv(filepath)
# UDI and Product ID are removed before any further processing.
df = df.drop(columns=["UDI", "Product ID"])
df.head()


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,M,298.1,308.6,1551,42.8,0,0,No Failure
1,L,298.2,308.7,1408,46.3,3,0,No Failure
2,L,298.1,308.5,1498,49.4,5,0,No Failure
3,L,298.2,308.6,1433,39.5,7,0,No Failure
4,L,298.2,308.7,1408,40.0,9,0,No Failure


Let's drop the target anomalies that were identified in the previous notebook.

In [2]:
# Rows marked as failed (Target == 1) whose 'Failure Type' still reads
# 'No Failure' contradict themselves; collect their index labels and remove
# them from df in place.
fail_df = df.loc[df['Target'].eq(1)]
indexPossibleFailure = fail_df.loc[fail_df['Failure Type'].eq('No Failure')].index
df.drop(index=indexPossibleFailure, inplace=True)
len(df)


9991

In [3]:
# The opposite inconsistency: rows with Target == 0 that are nevertheless
# labelled 'Random Failures'. Collect their index labels and remove them
# from df in place.
fail_df = df.loc[df['Target'].eq(0)]
indexPossibleFailure = fail_df.loc[fail_df['Failure Type'].eq('Random Failures')].index
df.drop(index=indexPossibleFailure, inplace=True)
len(df)


9973

Rotational speed and torque are scaled with RobustScaler because both features contain strong outliers.

In [4]:
# Scale the two outlier-heavy features with RobustScaler and swap them into a
# copy of df, leaving df itself untouched.
df_scaled = df.copy()

columns = ['Rotational speed [rpm]', 'Torque [Nm]']
scaler = RobustScaler()
# fit_transform returns a plain array without row labels. df no longer has a
# contiguous 0..N-1 index (rows were dropped above without resetting it), so
# the scaled frame must reuse df.index. Otherwise pd.concat(axis=1) aligns on
# mismatched labels, shifting the scaled values onto the wrong rows and
# introducing NaN rows.
features_scaled = pd.DataFrame(
    scaler.fit_transform(df[columns]),
    columns=columns,
    index=df.index,
)
df_scaled = df_scaled.drop(columns=columns)
df_scaled = pd.concat([df_scaled, features_scaled], axis=1)

# Swapping columns must not add or remove rows.
assert len(df_scaled) == len(df)

df_scaled.head(5)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Tool wear [min],Target,Failure Type,Rotational speed [rpm],Torque [Nm]
0,M,298.1,308.6,0.0,0.0,No Failure,0.253968,0.2
1,L,298.2,308.7,3.0,0.0,No Failure,-0.502646,0.459259
2,L,298.1,308.5,5.0,0.0,No Failure,-0.026455,0.688889
3,L,298.2,308.6,7.0,0.0,No Failure,-0.37037,-0.044444
4,L,298.2,308.7,9.0,0.0,No Failure,-0.502646,-0.007407


Air temperature, process temperature and tool wear are scaled with MinMaxScaler.

In [5]:
# Scale the remaining numeric features into [0, 1] with MinMaxScaler and swap
# them into df_scaled.
columns = ['Air temperature [K]', 'Process temperature [K]', 'Tool wear [min]']
scaler = MinMaxScaler()
# fit_transform returns a plain array without row labels. Reuse df.index so
# that pd.concat(axis=1) puts every scaled value back on the row it was
# computed from; a fresh 0..N-1 index would not match df's labels, because
# rows were dropped from df earlier without resetting the index.
features_scaled = pd.DataFrame(
    scaler.fit_transform(df[columns]),
    columns=columns,
    index=df.index,
)
df_scaled = df_scaled.drop(columns=columns)
df_scaled = pd.concat([df_scaled, features_scaled], axis=1)

df_scaled.head()

Unnamed: 0,Type,Target,Failure Type,Rotational speed [rpm],Torque [Nm],Air temperature [K],Process temperature [K],Tool wear [min]
0,M,0.0,No Failure,0.253968,0.2,0.304348,0.358025,0.0
1,L,0.0,No Failure,-0.502646,0.459259,0.315217,0.37037,0.011858
2,L,0.0,No Failure,-0.026455,0.688889,0.304348,0.345679,0.019763
3,L,0.0,No Failure,-0.37037,-0.044444,0.315217,0.358025,0.027668
4,L,0.0,No Failure,-0.502646,-0.007407,0.315217,0.37037,0.035573


It is important that the target classes are distributed evenly between the training data and the test data, because the data set contains only a small number of failures. A stratified split preserves the class proportions in both subsets.

In [7]:
# StratifiedShuffleSplit is already imported in the first cell of the notebook.
# NOTE(review): the scalers above were fitted on the full data set before this
# split, so test-set statistics influence the scaling of the training data.

# Features are everything except the two label columns; the label is Target.
X = df_scaled.drop(columns=['Target', 'Failure Type'])
y = df_scaled['Target']

# Keep only rows that actually carry a target value, then renumber both
# objects 0..N-1 so that X and y stay row-aligned.
has_target = y.notna()
X = X[has_target].reset_index(drop=True)
y = y[has_target].reset_index(drop=True)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.30, random_state=42)

# n_splits=1 yields exactly one (train, test) pair. split() returns positional
# indices, hence .iloc.
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Every row must end up in exactly one of the two subsets.
assert len(X_train) + len(X_test) == len(X)

print('Checking the stratified split...')
print('Target proportion in original dataset:')
print(df['Target'].value_counts(normalize=True))

print('Target proportion in y_train dataset:')
print(y_train.value_counts(normalize=True))

print('Target proportion in y_test dataset:')
print(y_test.value_counts(normalize=True))


Checking the stratified split...
Target proportion in original dataset:
Target
0    0.966911
1    0.033089
Name: proportion, dtype: float64
Target proportion in y_train dataset:
Target
0.0    0.96691
1.0    0.03309
Name: proportion, dtype: float64
Target proportion in y_test dataset:
Target
0.0    0.966912
1.0    0.033088
Name: proportion, dtype: float64


`y_train` and `y_test` have nearly identical class distributions: in both, about 96.7 % of the samples are non-failures and about 3.3 % are failures, matching the original data set.