# Model Training

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells will not be displayed either; consider narrowing it.
warnings.filterwarnings('ignore')
# Simple confirmation that the import cell ran.
print('Libraries imported.')

Libraries imported.


In [2]:
# Read the processed dataset from disk and preview its first rows.
processed_csv = '../data/processed/data_processed.csv'
df = pd.read_csv(processed_csv)
df.head()

Unnamed: 0,Type,Machine failure,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],type_of_failure
0,1.0,0,0.222934,0.535714,0.0,0.304348,0.358025,5
1,0.0,0,0.139697,0.583791,0.011858,0.315217,0.37037,5
2,0.0,0,0.192084,0.626374,0.019763,0.304348,0.345679,5
3,0.0,0,0.154249,0.490385,0.027668,0.315217,0.358025,5
4,0.0,0,0.139697,0.497253,0.035573,0.315217,0.37037,5


In [3]:
# Print the column names, non-null counts, dtypes and memory footprint of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57912 entries, 0 to 57911
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     57912 non-null  float64
 1   Machine failure          57912 non-null  int64  
 2   Rotational speed [rpm]   57912 non-null  float64
 3   Torque [Nm]              57912 non-null  float64
 4   Tool wear [min]          57912 non-null  float64
 5   Air temperature [c]      57912 non-null  float64
 6   Process temperature [c]  57912 non-null  float64
 7   type_of_failure          57912 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 3.5 MB


## Train Test Split

In [4]:
from sklearn.model_selection import train_test_split

# Features: every column except the two label columns.
X = df.drop(columns=['Machine failure', 'type_of_failure'])
# Target for this section: the 'Machine failure' column.
y = df['Machine failure']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Predicting Machine Failure

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# Candidate classifiers for the 'Machine failure' target.
# The tree-based estimators are randomised, so they get a fixed seed (the same
# value used for the train/test split) to make the score table and the
# best-model selection repeatable on a fresh kernel.
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# `models` keeps the estimators in the same order as the rows of the score
# table built in the next cell; `scores` collects one
# [accuracy, precision, recall, f1] row per model.
models = [lr, svc, dt, rf]
scores = []

for m in models:
    # Fit on the training split and evaluate on the held-out split.
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    # Metrics are multiplied by 100 so they read as percentages.
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred) * 100
    rec = recall_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred) * 100
    scores.append([acc, prec, rec, f1])


In [6]:
# Assemble the comparison table: one row per model, in the same order as `models`.
model_names = pd.DataFrame(
    data=['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'],
    columns=['Model'],
)
metric_table = pd.DataFrame(
    scores,
    columns=['Accuracy', 'Precision', 'Recall', 'F1'],
)
# Place the model names next to their metrics, column-wise.
scores_df = pd.concat([model_names, metric_table], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,83.872917,86.385152,89.948119,88.130639
1,SVC,95.873263,94.675068,99.390402,96.975449
2,Decision Tree,99.076232,99.032633,99.584955,99.308026
3,Random Forest,99.231633,98.921556,99.935149,99.425769


In [7]:
# Choose the estimator whose row in scores_df has the highest F1 value.
# The row labels of scores_df are used directly as positions in `models`.
f1_column = scores_df.loc[:, 'F1']
best_model_idx = f1_column.idxmax()
best_model = models[best_model_idx]
best_model

In [8]:
# Display the feature matrix (df without the two label columns).
X

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c]
0,1.0,0.222934,0.535714,0.000000,0.304348,0.358025
1,0.0,0.139697,0.583791,0.011858,0.315217,0.370370
2,0.0,0.192084,0.626374,0.019763,0.304348,0.345679
3,0.0,0.154249,0.490385,0.027668,0.315217,0.358025
4,0.0,0.139697,0.497253,0.035573,0.315217,0.370370
...,...,...,...,...,...,...
57907,0.0,0.175738,0.477421,0.823187,0.363062,0.352309
57908,2.0,0.127831,0.643668,0.828357,0.552733,0.526402
57909,1.0,0.119856,0.626268,0.793577,0.286686,0.329629
57910,0.0,0.202221,0.468751,0.820927,0.759436,0.674209


In [9]:
def encode_machine_type(code):
    """Convert a machine type letter into the numeric value used in the `Type` column.

    Parameters
    ----------
    code : str
        One of 'L', 'M' or 'H'.

    Returns
    -------
    float
        0.0 for 'L', 1.0 for 'M', 2.0 for 'H'.

    Raises
    ------
    ValueError
        If `code` is not one of the three known letters.
    """
    encoding = {'L': 0, 'M': 1, 'H': 2}
    if code not in encoding:
        # Fail with a clear message instead of an opaque float-conversion error.
        raise ValueError(
            f"Unknown machine type {code!r}; expected one of {sorted(encoding)}"
        )
    return float(encoding[code])


# NOTE: the name `type` shadows the Python builtin; it is kept because the
# prediction cell below reads this variable.
type = encode_machine_type('M')
type

1.0

In [10]:
# Wrap the sample in a one-row DataFrame that carries the same column names as
# the training features, so the values are matched to features by name rather
# than by position in a bare list.
# NOTE(review): `type` is 1.0 here, while the record at position 57907 (shown
# in the next cell) has Type 0.0 -- confirm which value was intended.
sample = pd.DataFrame(
    [[type, 0.175738, 0.477421, 0.823187, 0.363062, 0.352309]],
    columns=X.columns,
)
best_model.predict(sample)

array([1])

In [11]:
# Show the full record (features and both labels) at position 57907 to compare
# against the prediction made in the previous cell.
df.iloc[57907]

Type                       0.000000
Machine failure            1.000000
Rotational speed [rpm]     0.175738
Torque [Nm]                0.477421
Tool wear [min]            0.823187
Air temperature [c]        0.363062
Process temperature [c]    0.352309
type_of_failure            4.000000
Name: 57907, dtype: float64

Random Forest Classifier is the best performing model.

## Train Test Split

In [12]:
# Same feature columns as before: everything except the two label columns.
X = df.drop(columns=['Machine failure', 'type_of_failure'])
# Target for this section: the 'type_of_failure' column.
y = df['type_of_failure']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Predicting Type of Failure

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Fresh, unfitted classifiers for the 'type_of_failure' target.
# The tree-based estimators are randomised, so they get a fixed seed (the same
# value used for the train/test split) to make the score table repeatable on a
# fresh kernel.
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# `models` keeps the estimators in the same order as the rows of the score
# table built in the next cell; `scores` collects one
# [accuracy, precision, recall, f1] row per model.
models = [lr, svc, dt, rf]
scores = []

for m in models:
    # Fit on the training split and evaluate on the held-out split.
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    # Metrics are multiplied by 100 so they read as percentages; precision,
    # recall and F1 are macro-averaged across the failure-type classes.
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred, average='macro') * 100
    rec = recall_score(y_test, y_pred, average='macro') * 100
    f1 = f1_score(y_test, y_pred, average='macro') * 100
    scores.append([acc, prec, rec, f1])

In [14]:
# Assemble the comparison table: one row per model, in the same order as `models`.
model_names = pd.DataFrame(
    data=['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'],
    columns=['Model'],
)
metric_table = pd.DataFrame(
    scores,
    columns=['Accuracy', 'Precision', 'Recall', 'F1'],
)
# Place the model names next to their metrics, column-wise.
scores_df = pd.concat([model_names, metric_table], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,83.190883,82.371925,83.254114,82.637389
1,SVC,93.904861,94.150683,93.971555,93.701715
2,Decision Tree,98.799965,98.801502,98.812181,98.800583
3,Random Forest,99.179833,99.190525,99.191835,99.180305


Random Forest Classifier is the best performing model.