In [None]:
# ! pip install pmdarima

# Import Essentials

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data set

In [3]:
# Read the raw predictive-maintenance CSV; the path is relative to the notebook's folder.
dataset_path = '../Dataset/predictive_maintenance_dataset.csv'
dataset_initial = pd.read_csv(dataset_path)

# Preview the first rows to confirm that the columns were parsed as expected.
dataset_initial.head()

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,1/1/2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [4]:
# Names of the nine numeric sensor columns: 'metric1' through 'metric9'.
numeric_cols = [f'metric{metric_number}' for metric_number in range(1, 10)]

# Descriptive Analysis


1.   Dataset Information
2.   Categorical feature and descriptive information



## Dataset Information

In [5]:
# Summarize the raw frame: column names, non-null counts, dtypes and memory usage.
dataset_initial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 12 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   date     124494 non-null  object
 1   device   124494 non-null  object
 2   failure  124494 non-null  int64 
 3   metric1  124494 non-null  int64 
 4   metric2  124494 non-null  int64 
 5   metric3  124494 non-null  int64 
 6   metric4  124494 non-null  int64 
 7   metric5  124494 non-null  int64 
 8   metric6  124494 non-null  int64 
 9   metric7  124494 non-null  int64 
 10  metric8  124494 non-null  int64 
 11  metric9  124494 non-null  int64 
dtypes: int64(10), object(2)
memory usage: 11.4+ MB


## Categorical feature and descriptive information

In [6]:
# Number of rows recorded per device, most frequent first.
dataset_initial.loc[:, 'device'].value_counts()

device
S1F0E9EP    304
S1F0EGMT    304
Z1F0QL3N    304
W1F0SJJ2    304
Z1F0QLC1    304
           ... 
S1F0A1PX      5
Z1F19VF5      5
S1F04KSC      4
W1F0WJFT      3
W1F1DA5ÿ      1
Name: count, Length: 1169, dtype: int64

# Filter the data for a single machine: device `Z1F0QLC1`, which has 304 data points.

In [None]:
# Work on a deep copy so that no later manipulation can touch the original dataset,
# and keep only the rows reported by device 'Z1F0QLC1'.
is_target_device = dataset_initial['device'] == 'Z1F0QLC1'
Z1F0QLC1_Dataset = dataset_initial.copy(deep=True).loc[is_target_device]

Z1F0QLC1_Dataset.head()

# Visualize the numerical parameters observed for the device

NOTE: The dataset does not document what each parameter collected from the machines represents; the columns are only labelled metric1, metric2, metric3, ..., metricN.

In [None]:
# One histogram (with a KDE overlay) per numeric metric of the selected device.
for metric_name in numeric_cols:
    fig, ax = plt.subplots(figsize=(20, 8))
    sns.histplot(data=Z1F0QLC1_Dataset[metric_name], kde=True, ax=ax)
    ax.set_xlabel(metric_name)
    ax.set_ylabel('Count')
    plt.show()

# Prepare the data for forecasting

The data for forecasting consists of the `date` column and the specific feature to be forecast, here `metric5`.

In [None]:
# Keep only the timestamp and the forecast target.
# reset_index() returns a NEW frame, so its result must be assigned; calling it on its
# own line without assignment has no effect. The fresh frame is also independent of
# Z1F0QLC1_Dataset, so later column assignments do not act on a slice of it.
new_data = Z1F0QLC1_Dataset[['date', 'metric5']].reset_index(drop=True)
new_data.head()

# Date time pre-processing

In [None]:
# Parse the 'date' strings (month/day/year) and use them as the index.
# assign() + set_index() build a new frame instead of writing into `new_data`, which was
# created by selecting columns from another frame; this avoids SettingWithCopyWarning
# and the in-place mutation of a frame an earlier cell already displayed.
new_data = (
    new_data
    .assign(date=pd.to_datetime(new_data['date'], format='%m/%d/%Y'))
    .set_index('date')
)
new_data.head()

In [None]:
# Trailing 7-observation rolling mean of metric5; the first six points are NaN because
# a full window is required by default.
weekly_rolling_mean = new_data['metric5'].rolling(window=7).mean()
weekly_rolling_mean.plot(figsize=(20, 10))

# Special train/test split for a time-series dataset

In [None]:
# Chronological split for time-series data: no shuffling, the earliest rows are used
# for training and the most recent rows are held out for testing.
TRAIN_FRACTION = 0.95  # share of the observations used for training

# Position of the first test row. len() is used (rather than a per-column count()) so
# the cut point is a plain int computed from the number of rows that iloc slices over.
val = int(len(new_data) * TRAIN_FRACTION)

trainSet = new_data.iloc[:val]
testSet = new_data.iloc[val:]

# The two parts must cover the data exactly once.
assert len(trainSet) + len(testSet) == len(new_data)

In [None]:
# Non-null observation counts of the training split, then of the test split.
for subset in (trainSet, testSet):
    print(subset.count())

# Moving average for `weekly` window

In [None]:
# Centered weekly moving average of the training series, drawn over the raw points.
moving_average = trainSet.rolling(
    window=7,         # 7-observation (weekly) window
    center=True,      # puts the average at the center of the window
    min_periods=1,    # accept partial windows so the edges of the series are not NaN
).mean()              # compute the mean (could also do median, std, min, max, ...)

# Raw observations as grey dots, smoothed curve on the same axes.
ax = trainSet.plot(style=".", color="0.5")
moving_average.plot(
    ax=ax, linewidth=3, title="metric5 - 7-day moving average", legend=False,
);

# Moving average for `monthly` window

In [None]:
# Centered monthly moving average of the training series, drawn over the raw points.
moving_average = trainSet.rolling(
    window=30,        # 30-observation (monthly) window
    center=True,      # puts the average at the center of the window
    min_periods=1,    # accept partial windows so the edges of the series are not NaN
).mean()              # compute the mean (could also do median, std, min, max, ...)

# Raw observations as grey dots, smoothed curve on the same axes.
ax = trainSet.plot(style=".", color="0.5")
moving_average.plot(
    ax=ax, linewidth=3, title="metric5 - 30-day moving average", legend=False,
);

# Use a deterministic process to create the feature matrix X, which captures the `trend` component of the data

In [None]:
from statsmodels.tsa.deterministic import DeterministicProcess

# Deterministic design matrix built on the training dates:
#   constant=True -> include a constant term (the `const` feature)
#   order=1       -> include a first-order (linear) time trend
#   drop=True     -> let the process drop terms as needed to avoid collinearity
dp = DeterministicProcess(trainSet.index, constant=True, order=1, drop=True)

# Features for the dates contained in the training index.
X_train = dp.in_sample()
X_train.head()

In [None]:
# Target variable
# Regression target: the metric5 column of the training split.
y_train = trainSet.loc[:, "metric5"]

In [None]:
from sklearn.linear_model import LinearRegression

# X_train already carries the `const` feature produced by DeterministicProcess, which
# plays the role of the intercept. LinearRegression behaves badly with duplicated
# features, so its own intercept is switched off here.
model = LinearRegression(fit_intercept=False).fit(X_train, y_train)

# In-sample fitted values, re-labelled with the training dates.
train_fitted_values = model.predict(X_train)
y_pred = pd.Series(train_fitted_values, index=X_train.index)

# Plotting the `True train set` vs `Predicted train set`

In [None]:
# Line plot of the observed training values.
trainSet.plot(kind='line')

In [None]:
# Line plot of the fitted values on the training dates.
y_pred.plot(kind='line')

# Creating the dummy features for the `testSet` made during the split

In [None]:
# Out-of-sample features for exactly the held-out rows.
# Both the horizon and the index labels are taken from testSet, not from a hard-coded
# length (16) and start date ('2015-09-01'). The features then stay aligned with the
# test split if the split ratio or the selected device changes.
X_test_Dummy = dp.out_of_sample(steps=len(testSet), forecast_index=testSet.index)
X_test_Dummy

In [None]:
# Trend-model predictions for the held-out period, labelled with the test dates.
test_predictions = model.predict(X_test_Dummy)
Forecast_output_test = pd.Series(test_predictions, index=testSet.index)

# Plotting the `True test set` vs `Predicted test set`

In [None]:
# Line plot of the trend-model forecast over the test dates.
Forecast_output_test.plot(kind='line')

In [None]:
# Line plot of the observed test values.
testSet.plot(kind='line')

# Trial of an AutoML technique for time series models

In [None]:
from pmdarima.arima import auto_arima

# Stepwise search for a non-seasonal ARIMA order on the training series, with the
# differencing order fixed at d=2 and p, q searched from 0 up to 5.
# The keyword is `suppress_warnings`; the misspelling `supress_warnings` is not a
# parameter of auto_arima, so the intended warning suppression never took effect.
# NOTE(review): with seasonal=False and m=1 the seasonal arguments (start_P, D, start_Q,
# max_P, max_D, max_Q) presumably have no effect - confirm against the pmdarima docs.
# NOTE(review): n_fits appears to apply only to random (non-stepwise) search - confirm.
arima_model = auto_arima(trainSet, start_p=0, d=2, start_q=0,
                         max_p=5, max_d=5, max_q=5, start_P=0,
                         D=1, start_Q=0, max_P=5, max_D=5,
                         max_Q=5, m=1, seasonal=False,
                         error_action='warn', trace=True,
                         suppress_warnings=True, stepwise=True, n_fits=100)

# AutoML Forecast

In [None]:
# Forecast one value per held-out observation; the horizon follows the size of testSet
# instead of a hard-coded 16, so it stays correct if the split changes.
forecastValue_16 = arima_model.predict(n_periods=len(testSet), alpha=0.05)

# Round to the nearest integer before casting: a bare astype(int) truncates toward
# zero and would bias every forecast downward.
forecastValue_16 = forecastValue_16.round().astype(int)
print(forecastValue_16.shape)

# Plotting the `AutoML Forecast` for the testSet vs the actual `testSet`

In [None]:
# Line plot of the ARIMA forecast.
forecastValue_16.plot.line()

In [None]:
# Line plot of the observed test values, for comparison with the ARIMA forecast.
testSet.plot.line()

In [None]:
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, root_mean_squared_error

# Extract both series once as 1-D arrays. Selecting the metric5 column removes the need
# for a hard-coded reshape(16,), so the cell keeps working if the test size changes.
y_true = testSet["metric5"].to_numpy()
y_forecast = forecastValue_16.to_numpy()

print("Mean absolute error : ", mean_absolute_error(y_true, y_forecast))
print("Root mean square Error : ", root_mean_squared_error(y_true, y_forecast))
print("Mean squared Error : ", mean_squared_error(y_true, y_forecast))
print("Explained varience score : ", explained_variance_score(y_true, y_forecast))

# Mean Values

In [None]:
# Mean of the observed metric5 values in the test split; no hard-coded length needed.
testSet["metric5"].to_numpy().mean()

In [None]:
# Mean of the ARIMA forecast values.
forecastValue_16.to_numpy().mean()