## AI4I 2020 Predictive Maintenance

The AI4I 2020 Predictive Maintenance Dataset is a synthetic dataset that reflects real predictive maintenance data encountered in industry.

Import required liberaries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from pandas_profiling import ProfileReport
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge,Lasso ,RidgeCV,LassoCV , ElasticNet , ElasticNetCV, LinearRegression

Load the Dataset

In [2]:
df = pd.read_csv('ai4i2020.csv')

In [3]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [4]:
df.tail()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0
9999,10000,M24859,M,299.0,308.7,1500,40.2,30,0,0,0,0,0,0


In [5]:
df.shape

(10000, 14)

In [6]:
df.dtypes

UDI                          int64
Product ID                  object
Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Machine failure              int64
TWF                          int64
HDF                          int64
PWF                          int64
OSF                          int64
RNF                          int64
dtype: object

In [7]:
df.describe()

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,2886.89568,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7500.25,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


## Pandas profiling for better understanding of data

In [8]:
pf = ProfileReport(df)

In [None]:
pf.to_widgets()

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=27.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render widgets'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

In [None]:
pf.to_file('test.html')

## Analysis of data

1) Air temperature and Process temperature are highly correlated( ie data is linearly ditsributed)

2) Other features like Rotational speed, torque,Tool wear are uniformly distributed.

3) Product ID and UDI are not adding any inputs to our model building.

4) We dont find any multicolinearity in the features.

5)Torque and rotational speed is inversly correlated

##  NAN Values

In [None]:
df.isna().sum()

## Fequency Distribution

In [None]:
sns.countplot(df['Air temperature [K]'])

#### The Data is normally distributed.

## Feature Engineering

Let's drop not required columns

In [None]:
df.drop(columns=['UDI','Product ID'],inplace=True)

#### Rename columns 

In [None]:
df.rename(columns={"Air temperature [K]":"Air"}, inplace = True)
df.rename(columns={"Process temperature [K]":"Process"}, inplace = True)
df.rename(columns={"Rotational speed [rpm]":"Speed"}, inplace = True)
df.rename(columns={"Torque [Nm]":"Torque"}, inplace = True)
df.rename(columns={"Tool wear [min]":"Tool"}, inplace = True)
df.rename(columns={"Machine failure":"failure"}, inplace = True)
df.head()

Let's map Categorical column to numeric

In [None]:
df['Type'] = df['Type'].map({'L': 0, 'M': 1,'H':2})
df.head()

In [None]:
df.info()

## Check Multicolinerity

In [None]:
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
#plot heat map
g=sns.heatmap(df[top_corr_features].corr(),annot=True,cmap="RdYlGn")

In [None]:
import statsmodels.formula.api as smf
lm = smf.ols(formula='Air ~ Process', data=df).fit()
lm.summary()

0,1,2,3
Dep. Variable:,Air,R-squared:,0.768
Model:,OLS,Adj. R-squared:,0.768
Method:,Least Squares,F-statistic:,33020.0
Date:,"Tue, 31 Aug 2021",Prob (F-statistic):,0.0
Time:,17:27:36,Log-Likelihood:,-13826.0
No. Observations:,10000,AIC:,27660.0
Df Residuals:,9998,BIC:,27670.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-66.1432,2.015,-32.824,0.000,-70.093,-62.193
Process,1.1811,0.007,181.703,0.000,1.168,1.194

0,1,2,3
Omnibus:,670.862,Durbin-Watson:,0.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,243.88
Skew:,-0.082,Prob(JB):,1.1e-53
Kurtosis:,2.253,Cond. No.,64800.0


#### observations:
Adj r sq and r sq are almost same and p value is less then 0.05 then we can conside process temp feature

In [None]:
df.columns

Index(['Type', 'Air', 'Process', 'Speed', 'Torque', 'Tool', 'failure', 'TWF',
       'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [20]:
lm = smf.ols(formula='Air ~ Type+Process+Speed+Torque+Tool+failure+TWF+HDF+PWF+OSF+RNF', data=df).fit()
lm.summary()

0,1,2,3
Dep. Variable:,Air,R-squared:,0.776
Model:,OLS,Adj. R-squared:,0.775
Method:,Least Squares,F-statistic:,3140.0
Date:,"Tue, 31 Aug 2021",Prob (F-statistic):,0.0
Time:,17:27:42,Log-Likelihood:,-13648.0
No. Observations:,10000,AIC:,27320.0
Df Residuals:,9988,BIC:,27410.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-64.1776,2.000,-32.092,0.000,-68.098,-60.258
Type,-0.0107,0.014,-0.754,0.451,-0.038,0.017
Process,1.1737,0.006,183.261,0.000,1.161,1.186
Speed,0.0002,0.000,1.578,0.115,-4.59e-05,0.000
Torque,0.0003,0.002,0.123,0.902,-0.004,0.005
Tool,8.454e-05,0.000,0.557,0.578,-0.000,0.000
failure,-0.0918,0.180,-0.510,0.610,-0.445,0.261
TWF,0.2048,0.219,0.937,0.349,-0.224,0.633
HDF,1.7801,0.191,9.338,0.000,1.406,2.154

0,1,2,3
Omnibus:,647.442,Durbin-Watson:,0.074
Prob(Omnibus):,0.0,Jarque-Bera (JB):,241.259
Skew:,-0.091,Prob(JB):,4.09e-53
Kurtosis:,2.261,Cond. No.,334000.0


#### observations:
Adj r sq and r sq are mostly same and p value is less then 0.05 for only process and HDF features

Also check it for VIF

In [21]:
y = df['Air']
x = df.drop(columns=['Air'])

In [22]:
scaler = StandardScaler()

In [23]:
arr = scaler.fit_transform(x)

In [24]:
arr

array([[ 0.74441274, -0.94735989,  0.06818514, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.74530658, -0.879959  , -0.72947151, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.74530658, -1.01476077, -0.22744984, ..., -0.09793424,
        -0.09948362, -0.04363046],
       ...,
       [ 0.74441274, -0.94735989,  0.59251888, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [ 2.23413206, -0.879959  , -0.72947151, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [ 0.74441274, -0.879959  , -0.2162938 , ..., -0.09793424,
        -0.09948362, -0.04363046]])

In [25]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame()

In [26]:
vif_df['vif'] = [variance_inflation_factor(arr,i) for i in range(arr.shape[1])]

In [27]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame()

In [28]:
vif_df['vif'] = [variance_inflation_factor(arr,i) for i in range(arr.shape[1])]

In [29]:
vif_df['feature']  = x.columns

In [30]:
vif_df

Unnamed: 0,vif,feature
0,1.003726,Type
1,1.00492,Process
2,5.171728,Speed
3,5.236158,Torque
4,1.039958,Tool
5,11.831609,failure
6,2.433879,TWF
7,4.597163,HDF
8,3.624287,PWF
9,3.348866,OSF


Failure is exceeding 10 hence it is multicolinear with other feature

In [31]:
# Let's create a function to create adjusted R-Squared
def adj_r2(x,y):
    r2 = lr.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

## Model Building

In [32]:
x_train, x_test, y_train,y_test = train_test_split(arr, y,test_size= 0.30,random_state = 100)

In [33]:
lr = LinearRegression()
lr.fit(x_train,y_train)
y_pred = lr.predict(x_test)

In [34]:
from sklearn.metrics import r2_score
from sklearn import metrics

print('R square:',r2_score(y_test,y_pred))
print('Adj R square:',adj_r2(x_test,y_test))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

R square: 0.7898277846405926
Adj R square: 0.7890540582788277
MAE: 0.7855836967661405
MSE: 0.8708250067970978
RMSE: 0.9331800505781818


### LASSO 

In [35]:
lassocv = LassoCV(alphas=None,cv= 50 , max_iter=200000, normalize=True)
lassocv.fit(x_train,y_train)
lassocv.alpha_
lasso = Lasso(alpha=lassocv.alpha_)
lasso.fit(x_train,y_train)
lasso.score(x_test,y_test)

0.7898478185053899

### Ridge 

In [36]:
ridgecv = RidgeCV(alphas=np.random.uniform(0,10,50),cv = 10 , normalize=True)
ridgecv.fit(x_train,y_train)
ridgecv.alpha_
ridge_lr = Ridge(alpha=ridgecv.alpha_)
ridge_lr.fit(x_train,y_train)
ridge_lr.score(x_test,y_test)

0.78982694363047

In [37]:
elastic= ElasticNetCV(alphas=None, cv = 10 )
elastic.fit(x_train,y_train)
elastic.alpha_
elastic.l1_ratio_
elastic_lr = ElasticNet(alpha=elastic.alpha_ , l1_ratio=elastic.l1_ratio_)
elastic_lr.fit(x_train,y_train)
elastic_lr.score(x_test,y_test)

0.7895815224616453

In [45]:
import pickle
pickle.dump(lr,open('Linear_Regression_model.pickle','wb'))


### Test cases

In [48]:
model = pickle.load(open('Linear_Regression_model.pickle','rb'))

In [50]:
test1 = scaler.transform([[1,308.6,1551,42.8,0,0,0,0,0,0,0]])
model.predict(test1)

array([298.34302764])

In [53]:
test2 = scaler.transform([[1,307.6,1551,42.8,1,0,0,0,0,0,0]])
model.predict(test2)

array([297.17528784])

In [54]:
test2 = scaler.transform([[1,307.6,1551,42.8,1,1,1,1,1,0,0]])
model.predict(test2)

array([299.45556617])

In [56]:
test2 = scaler.transform([[3,300.6,1400,42.8,1,1,1,1,1,0,0]])
model.predict(test2)

array([291.2614413])