In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/machine-predictive-maintenance-classification/predictive_maintenance.csv


In [2]:
df = pd.read_csv("/kaggle/input/machine-predictive-maintenance-classification/predictive_maintenance.csv")

In [3]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [4]:
## Looking at type of failure
df["Failure Type"].value_counts()

No Failure                  9652
Heat Dissipation Failure     112
Power Failure                 95
Overstrain Failure            78
Tool Wear Failure             45
Random Failures               18
Name: Failure Type, dtype: int64

In [5]:
## As there is a failure of "Heat Dissipation Failure" having more than other failure, will consider taking as Binary classification where my model can identify the Heat failure or Not a failure
df.loc[df["Failure Type"] == "Heat Dissipation Failure", "Output"] = 1
df.loc[df["Failure Type"] != "Heat Dissipation Failure", "Output"] = 0

In [6]:
df["Output"].value_counts()

0.0    9888
1.0     112
Name: Output, dtype: int64

In [7]:
df["Product ID"].value_counts()

M14860    1
L53850    1
L53843    1
L53844    1
L53845    1
         ..
M18193    1
M18194    1
L50515    1
L50516    1
M24859    1
Name: Product ID, Length: 10000, dtype: int64

#### AS the product id column is having all unique value, and only shows indivisual id's droping the column

In [8]:
df.drop(["Product ID", "UDI", "Failure Type"], inplace=True,axis=1)

In [9]:
df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Output
0,M,298.1,308.6,1551,42.8,0,0,0.0
1,L,298.2,308.7,1408,46.3,3,0,0.0
2,L,298.1,308.5,1498,49.4,5,0,0.0
3,L,298.2,308.6,1433,39.5,7,0,0.0
4,L,298.2,308.7,1408,40.0,9,0,0.0


In [10]:
df["Target"].value_counts()

0    9661
1     339
Name: Target, dtype: int64

In [11]:
### For this iterations we are not taking Target as final lable, because 339 failures are mixed up, and a Heat failure could be much different from a power failure.

In [12]:
df.drop(["Target"], inplace=True,axis=1)

In [13]:
df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Output
0,M,298.1,308.6,1551,42.8,0,0.0
1,L,298.2,308.7,1408,46.3,3,0.0
2,L,298.1,308.5,1498,49.4,5,0.0
3,L,298.2,308.6,1433,39.5,7,0.0
4,L,298.2,308.7,1408,40.0,9,0.0


In [14]:
df["Type"].value_counts()

L    6000
M    2997
H    1003
Name: Type, dtype: int64

In [15]:
### Encoding Type
df = pd.get_dummies(df, prefix=['Type'])

In [16]:
### Creating past columns as refrence as the failure can be related or occured from past

# As we are working on Heat Failure temp is somehting can get more info

df['Air_mean_avg'] = df["Air temperature [K]"].rolling(window=5).mean()
df['process_mean_avg'] = df["Process temperature [K]"].rolling(window=5).mean()

In [17]:
df.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Output,Type_H,Type_L,Type_M,Air_mean_avg,process_mean_avg
0,298.1,308.6,1551,42.8,0,0.0,0,0,1,,
1,298.2,308.7,1408,46.3,3,0.0,0,1,0,,
2,298.1,308.5,1498,49.4,5,0.0,0,1,0,,
3,298.2,308.6,1433,39.5,7,0.0,0,1,0,,
4,298.2,308.7,1408,40.0,9,0.0,0,1,0,298.16,308.62


In [18]:
df.columns = [x.replace("[", "").replace("]", "").replace(" ", "_").lower() for x in df.columns]

In [19]:
df.isnull().sum()

air_temperature_k        0
process_temperature_k    0
rotational_speed_rpm     0
torque_nm                0
tool_wear_min            0
output                   0
type_h                   0
type_l                   0
type_m                   0
air_mean_avg             4
process_mean_avg         4
dtype: int64

In [20]:
df.fillna(0, inplace=True)

In [21]:
df.isnull().sum()

air_temperature_k        0
process_temperature_k    0
rotational_speed_rpm     0
torque_nm                0
tool_wear_min            0
output                   0
type_h                   0
type_l                   0
type_m                   0
air_mean_avg             0
process_mean_avg         0
dtype: int64

In [22]:
### Train and test data split

from sklearn.model_selection import train_test_split

y = df["output"]
X = df.drop("output", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y.values)

In [23]:
X_train.head()

Unnamed: 0,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,type_h,type_l,type_m,air_mean_avg,process_mean_avg
8368,298.9,309.8,1534,42.1,23,0,1,0,298.88,309.8
1345,298.5,310.0,1804,26.1,22,0,1,0,298.54,310.12
4699,303.5,311.6,1428,46.2,139,0,1,0,303.42,311.6
518,297.6,309.5,1924,20.6,37,0,1,0,297.5,309.32
9197,297.9,308.8,1469,49.0,51,0,0,1,297.92,308.92


In [24]:
#### Using Boosting model

from xgboost import XGBClassifier
import time

In [25]:
xgb = XGBClassifier(n_estimators=100)
training_start = time.perf_counter()
xgb.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [26]:
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = xgb.predict(X_test)
prediction_end = time.perf_counter()
acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100

In [27]:
acc_xgb

99.7878787878788