# Model Training

In [2]:
# Numerical / tabular stack.
import numpy as np
import pandas as pd
# Plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning from this point on.
# NOTE(review): a blanket 'ignore' also hides library notices (e.g. convergence
# or deprecation warnings) raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Visible confirmation that the setup cell ran to completion.
print('Libraries imported.')

Libraries imported.


In [3]:
# Location of the preprocessed dataset, relative to the notebook's folder.
processed_csv = '../data/processed/data_processed.csv'

# Read the table into memory and preview its first rows.
df = pd.read_csv(processed_csv)
df.head()

Unnamed: 0,Type,Machine failure,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],type_of_failure
0,1.0,0,0.222934,0.535714,0.0,0.304348,0.358025,5
1,0.0,0,0.139697,0.583791,0.011858,0.315217,0.37037,5
2,0.0,0,0.192084,0.626374,0.019763,0.304348,0.345679,5
3,0.0,0,0.154249,0.490385,0.027668,0.315217,0.358025,5
4,0.0,0,0.139697,0.497253,0.035573,0.315217,0.37037,5


In [4]:
# Print a schema summary of the frame: per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57912 entries, 0 to 57911
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     57912 non-null  float64
 1   Machine failure          57912 non-null  int64  
 2   Rotational speed [rpm]   57912 non-null  float64
 3   Torque [Nm]              57912 non-null  float64
 4   Tool wear [min]          57912 non-null  float64
 5   Air temperature [c]      57912 non-null  float64
 6   Process temperature [c]  57912 non-null  float64
 7   type_of_failure          57912 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 3.5 MB


## Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Both label columns are removed from the feature matrix; only the binary
# 'Machine failure' flag is used as the target in this section.
label_cols = ['Machine failure', 'type_of_failure']
X = df.drop(columns=label_cols)
y = df['Machine failure']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify=y` is passed, so class proportions in each part are left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Predicting Machine Failure

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

# Candidate classifiers for the binary 'Machine failure' target.
# The tree-based estimators are randomised, so they are seeded (same seed as the
# train/test split) to make the scores and the best-model choice repeatable.
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# Order matters: the results table and the best-model lookup in the following
# cells match models to rows purely by position in this list.
models = [lr, svc, dt, rf]
scores = []

# Fit each candidate on the training split and collect its test-set metrics
# (as percentages) in the order [accuracy, precision, recall, F1].
for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred) * 100
    rec = recall_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred) * 100
    scores.append([acc, prec, rec, f1])


In [7]:
# Model labels, listed in the same order the models were trained and scored.
model_names = pd.DataFrame(
    {'Model': ['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest']}
)
# One row of metrics per model, taken from the training loop's `scores` list.
metric_table = pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'])

# Place the labels next to their metrics (rows are aligned by position).
scores_df = pd.concat([model_names, metric_table], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,84.045584,86.628343,89.909209,88.238289
1,SVC,96.089096,94.846125,99.533074,97.133093
2,Decision Tree,99.145299,99.109563,99.610895,99.359596
3,Random Forest,99.222999,98.908858,99.935149,99.419355


In [8]:
# Pick the model with the highest F1 score.
# `idxmax` returns an index label; it doubles as a list position here because
# `scores_df` carries the default 0..n-1 index in the same order as `models`.
f1_column = scores_df['F1']
best_model_idx = f1_column.idxmax()
best_model = models[best_model_idx]
best_model

In [15]:
# Per-class precision / recall / F1 of the selected model on the held-out split.
best_pred = best_model.predict(X_test)
report = classification_report(y_test, best_pred, output_dict=True)
print(report)

# Turn the nested dict into a table (one row per class / average) and
# serialise that table to a JSON string.
report = pd.DataFrame(report).T
obj = report.to_json()
obj


{'0': {'precision': 0.9986817822304245, 'recall': 0.9780531887425769, 'f1-score': 0.9882598486824942, 'support': 3873}, '1': {'precision': 0.9890885750962772, 'recall': 0.9993514915693904, 'f1-score': 0.9941935483870967, 'support': 7710}, 'accuracy': 0.9922299922299922, 'macro avg': {'precision': 0.9938851786633509, 'recall': 0.9887023401559836, 'f1-score': 0.9912266985347955, 'support': 11583}, 'weighted avg': {'precision': 0.9922962493801891, 'recall': 0.9922299922299922, 'f1-score': 0.9922095011665213, 'support': 11583}}


'{"precision":{"0":0.9986817822,"1":0.9890885751,"accuracy":0.9922299922,"macro avg":0.9938851787,"weighted avg":0.9922962494},"recall":{"0":0.9780531887,"1":0.9993514916,"accuracy":0.9922299922,"macro avg":0.9887023402,"weighted avg":0.9922299922},"f1-score":{"0":0.9882598487,"1":0.9941935484,"accuracy":0.9922299922,"macro avg":0.9912266985,"weighted avg":0.9922095012},"support":{"0":3873.0,"1":7710.0,"accuracy":0.9922299922,"macro avg":11583.0,"weighted avg":11583.0}}'

In [16]:
from io import StringIO

# Round-trip check: rebuild the report table from its JSON string.
# The string is wrapped in StringIO because passing literal JSON text directly
# to `pd.read_json` is deprecated in recent pandas releases; a file-like
# object works on old and new versions alike.
rep = pd.read_json(StringIO(obj))
rep

Unnamed: 0,precision,recall,f1-score,support
0,0.998682,0.978053,0.98826,3873.0
1,0.989089,0.999351,0.994194,7710.0
accuracy,0.99223,0.99223,0.99223,0.99223
macro avg,0.993885,0.988702,0.991227,11583.0
weighted avg,0.992296,0.99223,0.99221,11583.0


In [None]:
# Letter grade -> numeric code lookup.
# NOTE(review): presumably mirrors the encoding used for the `Type` column
# during preprocessing — confirm against the preprocessing step.
type_codes = {'L': 0, 'M': 1, 'H': 2}

# NOTE(review): `type` shadows the Python builtin; the name is kept because the
# prediction cell that follows reads it.
type = 'M'

# Labels missing from the table pass through unchanged and are then handed to float().
type = float(type_codes.get(type, type))
type

1.0

In [None]:
# Score one hand-entered sample with the selected model.
# The model was fitted on a DataFrame, so the sample is wrapped in a one-row
# DataFrame carrying the training column names: each value is then bound to a
# named feature instead of relying on bare list position, and scikit-learn's
# feature-name consistency check passes.
sample = pd.DataFrame(
    [[type, 0.175738, 0.477421, 0.823187, 0.363062, 0.352309]],
    columns=X_train.columns,
)
best_model.predict(sample)

array([1])

In [None]:
# Show one stored record, selected by integer position, with all of its columns.
row_position = 57907
df.iloc[row_position]

Type                       2.000000
Machine failure            1.000000
Rotational speed [rpm]     0.255781
Torque [Nm]                0.440476
Tool wear [min]            0.890956
Air temperature [c]        0.352730
Process temperature [c]    0.279353
type_of_failure            4.000000
Name: 57907, dtype: float64

Random Forest Classifier is the best-performing model for predicting machine failure (highest accuracy and F1 score).

## Train Test Split (Type of Failure)

In [None]:
# Same feature matrix as before (both label columns removed), but the target is
# now the multi-class 'type_of_failure' column.
# NOTE: this rebinds X, y and the four split variables used in the machine-failure section.
label_cols = ['Machine failure', 'type_of_failure']
X = df.drop(columns=label_cols)
y = df['type_of_failure']

# 80/20 split with a fixed seed so the partition is repeatable.
# NOTE(review): no `stratify=y` is passed, so class proportions in each part are left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Predicting Type of Failure

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Fresh, unfitted candidates for the multi-class 'type_of_failure' target.
# The tree-based estimators are randomised, so they are seeded (same seed as the
# train/test split) to make the reported scores repeatable.
# NOTE: this rebinds lr / svc / dt / rf / models / scores from the
# machine-failure section; `best_model` chosen earlier still refers to the
# estimator object fitted there.
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# Order matters: the results table in the next cell labels rows by position.
models = [lr, svc, dt, rf]
scores = []

# Fit each candidate and collect test-set metrics (as percentages) in the order
# [accuracy, precision, recall, F1]. Precision, recall and F1 are macro-averaged,
# i.e. the unweighted mean of the per-class values.
for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred, average='macro') * 100
    rec = recall_score(y_test, y_pred, average='macro') * 100
    f1 = f1_score(y_test, y_pred, average='macro') * 100
    scores.append([acc, prec, rec, f1])

In [None]:
# Model labels, listed in the same order the models were trained and scored.
model_names = pd.DataFrame(
    {'Model': ['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest']}
)
# One row of metrics per model, taken from the training loop's `scores` list.
metric_table = pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'])

# Place the labels next to their metrics (rows are aligned by position).
scores_df = pd.concat([model_names, metric_table], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,82.802383,82.018207,82.862065,82.264222
1,SVC,93.991194,94.187705,94.055852,93.792458
2,Decision Tree,98.566865,98.569445,98.579732,98.567563
3,Random Forest,99.283433,99.293034,99.294312,99.28472


Random Forest Classifier is the best-performing model for predicting the type of failure (highest score on every metric).