## Data Pre-Analysis

In [1]:
import os

import pandas as pd

In [2]:
# Location of the raw dataset. Built with os.path.join so the separator
# matches the host OS; a literal backslash is only a path separator on Windows
# and would be treated as part of the file name elsewhere.
data_path = os.path.join('Data', 'predictive_maintenance.csv')
# Load the raw CSV into the working frame and preview the first rows.
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [3]:
# Count missing values per column.
df.isna().sum()

UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Target                     0
Failure Type               0
dtype: int64

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339
std,2886.89568,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0
75%,7500.25,301.5,311.1,1612.0,46.8,162.0,0.0
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0


## Data Cleaning

In [5]:
# Identifier columns that are split off from the working frame.
split_cols = ['UDI', 'Product ID']
# Keep a copy of the identifiers on disk before they are dropped from df.
product_id = df[split_cols]
# os.path.join keeps the output inside the Data directory on every OS
# (a hard-coded backslash is only a separator on Windows).
product_id.to_csv(os.path.join("Data", "product_id.csv"), index=False)

In [6]:
# Remove the identifier columns and the "Failure Type" column from the
# working frame, then preview what is left.
df = df.drop(columns=[*split_cols, "Failure Type"])
df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,M,298.1,308.6,1551,42.8,0,0
1,L,298.2,308.7,1408,46.3,3,0
2,L,298.1,308.5,1498,49.4,5,0
3,L,298.2,308.6,1433,39.5,7,0
4,L,298.2,308.7,1408,40.0,9,0


## Balancing Data

In [7]:
import numpy as np

In [8]:
# Split the frame by class label.
Target_0 = df[df['Target'] == 0]
Target_1 = df[df['Target'] == 1]
# Truncate both classes to the size of the smaller one so they end up with the
# same number of rows. Deriving the size from the data replaces the previous
# hard-coded 339, which only balanced this exact dataset.
n_per_class = min(len(Target_0), len(Target_1))
# NOTE(review): this keeps the FIRST n_per_class rows of each class, not a
# random sample; if row order carries information the subset may be biased.
# Consider .sample(n_per_class, random_state=...) -- confirm with downstream users.
Target_0 = Target_0[:n_per_class]
Target_1 = Target_1[:n_per_class]
print(Target_0.shape, Target_1.shape)

(339, 7) (339, 7)


In [9]:
# Stack the two class subsets vertically into a single balanced frame.
balanced_df = pd.concat([Target_0, Target_1], axis=0)
# Total number of missing cells across the whole balanced frame.
print(balanced_df.isna().sum().sum())
balanced_df.head()

0


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,M,298.1,308.6,1551,42.8,0,0
1,L,298.2,308.7,1408,46.3,3,0
2,L,298.1,308.5,1498,49.4,5,0
3,L,298.2,308.6,1433,39.5,7,0
4,L,298.2,308.7,1408,40.0,9,0


## Label Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder
import joblib

In [11]:
# Encode the categorical Type column of the full frame as integer codes,
# stored in a new Type_en column that replaces the original Type column.
encoder = LabelEncoder()
df = df.assign(Type_en=encoder.fit_transform(df['Type'])).drop(columns=['Type'])
df.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Type_en
0,298.1,308.6,1551,42.8,0,0,2
1,298.2,308.7,1408,46.3,3,0,1
2,298.1,308.5,1498,49.4,5,0,1
3,298.2,308.6,1433,39.5,7,0,1
4,298.2,308.7,1408,40.0,9,0,1


In [12]:
# Fit a fresh encoder on the balanced subset and overwrite its Type column
# with the resulting integer codes.
encoder = LabelEncoder().fit(balanced_df['Type'])
balanced_df['Type'] = encoder.transform(balanced_df['Type'])
balanced_df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,2,298.1,308.6,1551,42.8,0,0
1,1,298.2,308.7,1408,46.3,3,0
2,1,298.1,308.5,1498,49.4,5,0
3,1,298.2,308.6,1433,39.5,7,0
4,1,298.2,308.7,1408,40.0,9,0


In [14]:
# Map each original Type label to its integer code. The mapping is read from
# the fitted encoder (a label's code is its position in encoder.classes_)
# rather than zipping a hard-coded ['H', 'L', 'M'] list with the codes seen in
# the data, which would silently mislabel if a class were missing from the subset.
Type_labels = {str(label): code for code, label in enumerate(encoder.classes_)}
# Persist the mapping; os.path.join keeps the path valid on non-Windows hosts.
joblib.dump(Type_labels, os.path.join('params', 'Type_labels_dict.pkl'))
Type_labels

{'H': 0, 'L': 1, 'M': 2}

In [15]:
# Write the balanced, encoded frame and the fitted encoder to disk.
# Paths are built with os.path.join so the separator matches the host OS
# (a hard-coded backslash is only a separator on Windows).
balanced_df.to_csv(os.path.join("Data", "balanced_data.csv"), index=False)
joblib.dump(encoder, os.path.join("params", "encoder.pkl"))

['params\\encoder.pkl']

## Saving Processed Data

In [16]:
# Save the full processed frame; os.path.join keeps the file inside the Data
# directory on every OS instead of relying on a Windows-only backslash.
df.to_csv(os.path.join('Data', 'Processed_data.csv'), index=False)

In [20]:
# Re-read the raw CSV into a separate frame and preview its first five rows.
df_new = pd.read_csv(filepath_or_buffer=data_path)
df_new.head(n=5)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [23]:
# Show the Target column of the freshly loaded raw data.
df_new.loc[:, 'Target']

0       0
1       0
2       0
3       0
4       0
       ..
9995    0
9996    0
9997    0
9998    0
9999    0
Name: Target, Length: 10000, dtype: int64