In [3]:
# Data Handling
import pandas as pd
import numpy as np

In [4]:
# Preprocessing
from sklearn.preprocessing import LabelEncoder

In [5]:
# ML Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [28]:
# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [29]:
# Save / Load Model
import joblib

In [53]:
# Load csv file from ./data folder
df = pd.read_csv("../data/machine_data.csv")
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [54]:
# Extract first letter from Product ID
df['Product Quality'] = df['Product ID'].str[0]

In [55]:
# Drop original Product ID , Type(Redundant with Product Quality) , UDI and Failure Type Columns
df = df.drop(columns = ['Product ID', 'Type', 'UDI', 'Failure Type'], errors='ignore')

In [56]:
# Convert Product Quality L/M/H into numeric values
le = LabelEncoder()
df['Product Quality'] = le.fit_transform(df['Product Quality'])
# L-> 0 M -> 1 H-> 2

In [36]:
# Encode the Target column
df['Target'] = df['Target'].map({'Failure' : 1, 'No Failure' : 0})

In [62]:
# Dataset ready to use
df.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Product Quality
0,298.1,308.6,1551,42.8,0,0,2
1,298.2,308.7,1408,46.3,3,0,1
2,298.1,308.5,1498,49.4,5,0,1
3,298.2,308.6,1433,39.5,7,0,1
4,298.2,308.7,1408,40.0,9,0,1


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Air temperature [K]      10000 non-null  float64
 1   Process temperature [K]  10000 non-null  float64
 2   Rotational speed [rpm]   10000 non-null  int64  
 3   Torque [Nm]              10000 non-null  float64
 4   Tool wear [min]          10000 non-null  int64  
 5   Target                   10000 non-null  int64  
 6   Product Quality          10000 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 547.0 KB


In [64]:
# Features
X = df.drop(columns=['Target'])

# Label
y = df['Target']

In [149]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [150]:
# Initialize the model
model = DecisionTreeClassifier(random_state=42)
# Train the model
model.fit(X_train, y_train)

0,1,2
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.",'gini'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at  each split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note::  The search for a split does not stop until at least one  valid partition of the node samples is found, even if it requires to  effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",42
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [151]:
# Check Predictions
y_pred = model.predict(X_test)
# Accuracy
print("Accuracy Score: \n", accuracy_score(y_test, y_pred)*100)
# Detailed Report
print("Classification Report: \n", classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

Accuracy Score: 
 97.89999999999999
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1939
           1       0.63      0.74      0.68        61

    accuracy                           0.98      2000
   macro avg       0.81      0.86      0.84      2000
weighted avg       0.98      0.98      0.98      2000

Confusion Matrix: 
 [[1913   26]
 [  16   45]]


In [116]:
# Save the model
joblib.dump(model, '../models/predictive_model.pkl')

['../models/predictive_model.pkl']