In [1]:
import numpy as np 
import pandas as pd 
pd.set_option("display.max_columns",None)
from statsmodels.tsa.arima_model import ARIMA
import os
!jupyter notebook --clear-cache

ModuleNotFoundError: No module named 'statsmodels'




## Data Collection

In [None]:
fail_df = pd.read_csv("PdM_failures.csv")
print(fail_df.shape)
fail_df.head()

In [None]:
error_df = pd.read_csv("PdM_errors.csv")
print(error_df.shape)
error_df.head()

In [None]:
tele_df = pd.read_csv("PdM_telemetry.csv")
print(tele_df.shape)
tele_df.head()

## Pre-Processing

In [None]:
fail_df.info()

In [None]:
error_df.info()

In [None]:
tele_df.info()

In [None]:
# Parse the string `datetime` column of each raw frame into a real datetime64
# column called `datetime_new`, then show the resulting schema.
# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which emitted a stray "None").
tele_df['datetime_new'] = pd.to_datetime(
    tele_df['datetime'],
    format="%Y-%m-%d %H:%M:%S"
)
print("---------------------------------------------Telemetry--------------------------------------------------")
tele_df.info()

fail_df['datetime_new'] = pd.to_datetime(
    fail_df['datetime'],
    format="%Y-%m-%d %H:%M:%S"
)
print("---------------------------------------------Machine Failure--------------------------------------------------")
fail_df.info()

error_df['datetime_new'] = pd.to_datetime(
    error_df['datetime'],
    format="%Y-%m-%d %H:%M:%S"
)
print("---------------------------------------------Error occured--------------------------------------------------")
error_df.info()

In [None]:
# Remove the original string `datetime` column now that `datetime_new` holds the
# parsed values. errors='ignore' keeps the cell re-runnable: a second execution
# would otherwise raise KeyError because the column is already gone.
tele_df.drop(columns='datetime', inplace=True, errors='ignore')
fail_df.drop(columns='datetime', inplace=True, errors='ignore')
error_df.drop(columns='datetime', inplace=True, errors='ignore')

## Machine-wise data separation

In [None]:
machine_id = tele_df['machineID'].unique()
dfs = []

def sort_by_date(df, col):
    """Return a copy of ``df`` with its rows ordered ascending by column ``col``.

    The input frame is left untouched; the original index labels travel with
    their rows.
    """
    ordered = df.sort_values(col)
    return ordered

# One chronologically sorted telemetry frame per machine, in the order the
# machine ids first appear in `tele_df`.
dfs.extend(
    sort_by_date(tele_df[tele_df['machineID'] == ID], 'datetime_new')
    for ID in machine_id
)

print("No. of Machines in Telemetry = {}".format(len(dfs)))

In [None]:
# Split the failure log and the error log per machine, each sorted by time.
# The lists are index-aligned with `machine_id` (and therefore with `dfs`);
# a machine without failures/errors gets an empty frame.
# `sort_by_date` is the helper defined in the telemetry separation cell; the
# duplicate definition that used to live here was removed.
machine_id = tele_df['machineID'].unique()
fail_dfs, error_dfs = [], []

for ID in machine_id:
    fail_dfs.append(sort_by_date(fail_df[fail_df['machineID'] == ID], 'datetime_new'))
    error_dfs.append(sort_by_date(error_df[error_df['machineID'] == ID], 'datetime_new'))

print("No. of Machines fail dfs = {}".format(len(fail_dfs)))
print("No. of Machines error dfs = {}".format(len(error_dfs)))

## Data Visualization
> The first machine is used for all data visualizations below.

In [None]:
import plotly.graph_objects as go
import plotly.express as ex
import matplotlib.pyplot as plt

In [None]:
# Voltage signal of the first machine; its error and failure timestamps are
# added as marker-only traces (no explicit y is supplied for them).
voltage_line = go.Scatter(
    x=dfs[0]['datetime_new'], y=dfs[0]['volt'], mode='lines', name='Voltage',
)
error_markers = go.Scatter(
    x=error_dfs[0]['datetime_new'], mode='markers', name='error',
    marker={'size': 10, 'color': 'yellow'},
)
failure_markers = go.Scatter(
    x=fail_dfs[0]['datetime_new'], mode='markers', name='failures',
    marker={'size': 10, 'color': 'red'},
)

fig = go.Figure(data=[voltage_line, error_markers, failure_markers])
fig.update_layout(
    title="Errors and Failures with voltage variations",
    xaxis_title="Date",
    yaxis_title="Voltage level",
)
fig.show()

In [None]:
# Rotation signal of the first machine; its error and failure timestamps are
# added as marker-only traces (no explicit y is supplied for them).
rotation_line = go.Scatter(
    x=dfs[0]['datetime_new'], y=dfs[0]['rotate'], mode='lines', name='rotation',
)
error_markers = go.Scatter(
    x=error_dfs[0]['datetime_new'], mode='markers', name='error',
    marker={'size': 10, 'color': 'yellow'},
)
failure_markers = go.Scatter(
    x=fail_dfs[0]['datetime_new'], mode='markers', name='failures',
    marker={'size': 10, 'color': 'red'},
)

fig = go.Figure(data=[rotation_line, error_markers, failure_markers])
fig.update_layout(
    title="Errors and Failures with rotation variations",
    xaxis_title="Date",
    yaxis_title="Rotation level",
)
fig.show()

In [None]:
# Pressure signal of the first machine; its error and failure timestamps are
# added as marker-only traces (no explicit y is supplied for them).
pressure_line = go.Scatter(
    x=dfs[0]['datetime_new'], y=dfs[0]['pressure'], mode='lines', name='pressure',
)
error_markers = go.Scatter(
    x=error_dfs[0]['datetime_new'], mode='markers', name='error',
    marker={'size': 10, 'color': 'yellow'},
)
failure_markers = go.Scatter(
    x=fail_dfs[0]['datetime_new'], mode='markers', name='failures',
    marker={'size': 10, 'color': 'red'},
)

fig = go.Figure(data=[pressure_line, error_markers, failure_markers])
fig.update_layout(
    title="Errors and Failures with pressure variations",
    xaxis_title="Date",
    yaxis_title="pressure level",
)
fig.show()

In [None]:
# Vibration signal of the first machine; its error and failure timestamps are
# added as marker-only traces (no explicit y is supplied for them).
vibration_line = go.Scatter(
    x=dfs[0]['datetime_new'], y=dfs[0]['vibration'], mode='lines', name='vibration',
)
error_markers = go.Scatter(
    x=error_dfs[0]['datetime_new'], mode='markers', name='error',
    marker={'size': 10, 'color': 'yellow'},
)
failure_markers = go.Scatter(
    x=fail_dfs[0]['datetime_new'], mode='markers', name='failures',
    marker={'size': 10, 'color': 'red'},
)

fig = go.Figure(data=[vibration_line, error_markers, failure_markers])
fig.update_layout(
    title="Errors and Failures with vibration variations",
    xaxis_title="Date",
    yaxis_title="vibration level",
)
fig.show()

In [None]:
# Error and failure timestamps of the first machine on two rows:
# errors at y=0, failures at y=1.
error_times = error_dfs[0]['datetime_new']
failure_times = fail_dfs[0]['datetime_new']

trace_error = go.Scatter(x=error_times, y=[0] * len(error_times), mode='markers', name='error', marker=dict(size=10, color='yellow'))
# The y list must have one entry per failure; it was previously sized from the
# error log, so x and y had different lengths.
trace_dots = go.Scatter(x=failure_times, y=[1] * len(failure_times), mode='markers', name='failures', marker=dict(size=10, color='red'))

fig = go.Figure(data=[trace_error, trace_dots])
fig.update_layout(
    title="Error and Failure",
    xaxis_title="Date",  # the x axis carries timestamps; the label used to read "Data"
)

fig.show()

### Most Impactful Errors for Failure

In [None]:
def nearby(lis, ind, no_of_neighbors=1):
    """Collect the elements surrounding ``lis[ind]`` together with ``lis[ind]``.

    Parameters
    ----------
    lis : sequence
        Indexable sequence to pick neighbours from.
    ind : int
        Position of the element of interest (expected to satisfy
        ``0 <= ind < len(lis)``).
    no_of_neighbors : int, default 1
        Number of elements to take on each side when that many exist.

    Returns
    -------
    list
        Neighbouring elements plus ``lis[ind]`` itself. The list is NOT sorted
        and may contain ``lis[ind]`` more than once (the interior branch adds it
        once per neighbour pair, the end branch adds it via the tail and again
        explicitly); callers that need distinct values should de-duplicate.

    Notes
    -----
    When fewer than ``no_of_neighbors`` elements precede ``ind`` in the
    end-of-list branch, only the elements that actually exist are taken.
    Previously the index went negative there, which wrapped around to the end
    of the list or raised IndexError on short lists.
    """
    neighbors = []
    size = len(lis)

    if ind - 1 >= no_of_neighbors and size - ind - 1 >= no_of_neighbors:
        # Enough room on both sides: take pairs (before, after) moving outwards.
        for i in range(no_of_neighbors):
            neighbors.append(lis[ind - i - 1])
            neighbors.append(lis[ind + i + 1])
            neighbors.append(lis[ind])

    elif size - ind <= no_of_neighbors:
        # Too close to the end: take what precedes (never past position 0) ...
        for i in range(min(no_of_neighbors, ind)):
            neighbors.append(lis[ind - i - 1])
        # ... then everything from `ind` to the end, walking backwards.
        for i in range(size - ind):
            neighbors.append(lis[-(i + 1)])
        neighbors.append(lis[ind])

    else:
        # Too close to the start: everything before `ind`, then the following ones.
        for i in range(ind):
            neighbors.append(lis[i])
        for i in range(no_of_neighbors):
            neighbors.append(lis[ind + i + 1])
        neighbors.append(lis[ind])
    return neighbors
    
        
def failure_nearest_errors(error_dates, failure_date):
    """Return the entries of ``error_dates`` adjacent to ``failure_date``.

    The failure date is inserted into a sorted copy of the error dates, its
    position is looked up (first occurrence), and the neighbourhood of that
    position is obtained from ``nearby`` with its default neighbour count.
    """
    timeline = sorted([*error_dates, failure_date])
    ind = timeline.index(failure_date)
    print("Error dates", len(timeline), "Index of failure dates", ind)
    return nearby(timeline, ind)

# For every failure of the first machine, gather the error timestamps adjacent
# to it (per failure in `near_dates_of_failure`, pooled in `near_dates_lis`).
near_dates_of_failure = {}
near_dates_lis = []
# Iterate over the failure timestamps directly. The previous version looked the
# value up again with `series[i]`, which is a lookup by index LABEL and only
# worked while the filtered frame's labels happened to be 0..n-1.
for date in fail_dfs[0]['datetime_new']:
    res = failure_nearest_errors(error_dfs[0]['datetime_new'], date)
    near_dates_of_failure[date] = res
    near_dates_lis += res

# Sorted, de-duplicated pool of timestamps (failure dates included).
near_dates_lis = np.unique(near_dates_lis)

In [None]:
# All errors (y=0), the errors adjacent to a failure highlighted as stars
# (y=0), and the failures themselves (y=1) for the first machine.
error_times = error_dfs[0]['datetime_new']
failure_times = fail_dfs[0]['datetime_new']

trace_error = go.Scatter(x=error_times, y=[0] * len(error_times), mode='markers', name='Casual Error', marker=dict(size=10, color='green', opacity=0.5))
trace_bad_error = go.Scatter(x=near_dates_lis, y=[0] * len(near_dates_lis), mode='markers', name='Alert Errors', marker=dict(size=20, color='red', opacity=0.5, symbol='star'))
# One y value per failure; this list was previously sized from the error log.
trace_fails = go.Scatter(x=failure_times, y=[1] * len(failure_times), mode='markers', name='failures', marker=dict(size=10, color='red'))

fig = go.Figure(data=[trace_error, trace_fails, trace_bad_error])
fig.update_layout(
    title="Most Impactfull Errors for Failure",
    xaxis_title="Date",  # the x axis carries timestamps; the label used to read "Data"
)

fig.show()

In [None]:
# Keep only the first machine's error rows whose timestamp is one of the
# "alert" timestamps, then count how often each errorID occurs among them.
alert_errors_df = pd.merge(error_dfs[0], pd.Series(near_dates_lis, name='datetime_new'), on='datetime_new')
alert_errors_lis = list(alert_errors_df['errorID'])
alert_errors_dic = {}
# Single pass tally (same keys, same first-seen order, same counts as calling
# list.count() for every element, without the quadratic rescans).
for error in alert_errors_lis:
    alert_errors_dic[error] = alert_errors_dic.get(error, 0) + 1
alert_errors_dic

In [None]:
# Bar chart of how many alert errors were recorded per error id.
impact_fig, impact_ax = plt.subplots()
error_ids = list(alert_errors_dic.keys())
error_counts = list(alert_errors_dic.values())
impact_ax.bar(x=error_ids, height=error_counts)
impact_ax.set_title("Error Impact")
impact_ax.set_xlabel("Error ID")
impact_ax.set_ylabel("Impact of Error")
plt.show()

## Model Building (Time Series Forecasting)

In [None]:
!pip install pmdarima

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split

In [None]:
train_dfs = []
test_dfs = []

def splitter(df, train_set=0.7):
    """Cut ``df`` into a leading and a trailing part without shuffling.

    ``train_set`` is the fraction of rows that goes into the leading part; the
    cut position is ``round(train_set * len(df))``. Returns ``(train, test)``.
    """
    cut = round(train_set * len(df))
    return df[:cut], df[cut:]

# Chronological 80/20 split of every machine's telemetry frame.
for machine_df in dfs:
    leading, trailing = splitter(machine_df, train_set=0.8)
    train_dfs.append(leading)
    test_dfs.append(trailing)

train_dfs[0].shape, test_dfs[0].shape

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Partial autocorrelation of the first machine's training voltage.
# The PACF is the plot conventionally read to choose the AR order p; the title
# previously attributed it to the MA part.
plot_pacf(train_dfs[0]['volt'])
plt.title("Partial Autocorrelation (AR order p)")
plt.show()

In [None]:
# Autocorrelation of the first machine's training voltage.
# The ACF is the plot conventionally read to choose the MA order q; the title
# previously attributed it to the AR part.
plot_acf(train_dfs[0]['volt'])
plt.title("Autocorrelation (MA order q)")
plt.show()

In [None]:
# Normalizing data

from sklearn.preprocessing import MinMaxScaler
# Min-max scale the sensor columns of every training frame. A fresh scaler is
# fitted per machine; the scaled frames get a new default index.
train_dfs_norm = []

# First-order difference of the first machine's voltage (leading NaN removed).
first_volt = train_dfs[0]['volt']
shifted = (first_volt - first_volt.shift()).dropna()

for machine_train in train_dfs:
    sensor_values = machine_train.drop(['machineID', 'datetime_new'], axis=1)
    scaled_values = MinMaxScaler().fit_transform(sensor_values)
    train_dfs_norm.append(pd.DataFrame(scaled_values, columns=sensor_values.columns))

train_dfs_norm[0].head()

In [None]:
from pmdarima import auto_arima

def find_best_arima_parameters(train_data, max_p=5, max_d=2, max_q=5, seasonal=False):
    """Search ARIMA orders with ``auto_arima`` and return the selected order.

    Parameters
    ----------
    train_data
        Training series handed to ``auto_arima`` unchanged.
    max_p, max_d, max_q : int
        Upper bounds forwarded to ``auto_arima`` for the respective order.
    seasonal : bool
        Forwarded to ``auto_arima``'s ``seasonal`` argument.

    Returns
    -------
    The ``order`` attribute of the model returned by ``auto_arima``.

    The search always runs stepwise, with warnings suppressed and fitting
    errors ignored.
    """
    search_options = {
        "max_p": max_p,
        "max_d": max_d,
        "max_q": max_q,
        "seasonal": seasonal,
        "stepwise": True,
        "suppress_warnings": True,
        "error_action": "ignore",
    }
    selected_model = auto_arima(train_data, **search_options)
    return selected_model.order

# best_p, best_d, best_q = find_best_arima_parameters(train_dfs_norm[0]['volt'])
# print("Best ARIMA Parameters (p, d, q):", best_p, best_d, best_q)

In [None]:
# model = ARIMA(train_dfs_norm[0]['volt'], order=(best_p, best_d, best_q))
# model_fit = model.fit()
# print(model_fit)
# result = model_fit.forecast(steps=5)
# result

## Forecasting

In [None]:
# import datetime
# PERIODS = 2
# START = train_dfs[0]['datetime_new'].iloc[-1]
# PAST_DATA = 40

# future_dates = pd.Series(pd.date_range(start= START, periods=PERIODS))
# initial = train_dfs[0]['datetime_new'].iloc[-PAST_DATA:-1]
# y = list(train_dfs_norm[0]['volt'].iloc[-PAST_DATA:-1])

# plt.figure(figsize=(16,3))
# plt.plot(initial,y, label="Train")
# plt.plot(future_dates,list(result[:PERIODS]), label='Forcasted')
# plt.show()

## Predictive Model Building
> **Approach:**
*     Collect the nearest data (volt, rotate, pressure, vibration) where an error occurred
*     Collect the nearest data (volt, rotate, pressure, vibration) where no error occurred, to make a balanced dataset
*     Generate a dataframe from these samples
*     Implement a machine learning model
*     Predict errors that will occur in the system in the future

In [None]:
error_df.head()

In [None]:
tele_df.head()

In [None]:
import pandas
NEARBY_NUM = 10
class NearDates:
    """Cut windows of rows that immediately precede given timestamps.

    For every timestamp in ``error_dates`` the first row of ``datetimes`` whose
    ``date_col`` value is strictly later is located, and the up-to-``nearby``
    rows positioned before that row are returned as one frame.

    NOTE(review): matching uses only the timestamp column. If ``datetimes``
    holds rows of several machines, the machine an error belongs to is not
    taken into account - confirm this is intended.
    """

    def __init__(self, datetimes: pd.DataFrame, error_dates: pd.Series, date_col: str):
        """Store the frame to slice, the timestamps to slice around and the
        name of the datetime column of ``datetimes``."""
        self.datetimes = datetimes
        self.error_dates = error_dates
        self.date_col = date_col

    def find_index(self, error_date: pd.Timestamp) -> "int | None":
        """Return the position of the first row strictly later than ``error_date``.

        Returns ``None`` when no row is later than ``error_date``. The scan is a
        single vectorised comparison; like the row-by-row loop it replaces, it
        does not require the column to be sorted.
        """
        is_later = (self.datetimes[self.date_col] > error_date).to_numpy()
        if not is_later.any():
            return None
        # argmax on a boolean array gives the position of the first True.
        return int(is_later.argmax())

    def slicer(self, error_date: pd.Timestamp, nearby: int) -> tuple:
        """Return ``(start, end)`` positions of the window preceding ``error_date``.

        ``end`` is the position found by :meth:`find_index`; when no later row
        exists the window is anchored at the end of the frame (this case used
        to raise ``TypeError`` from comparing ``None`` with an int). ``start``
        is clamped at 0, so the window can be shorter than ``nearby``.
        """
        ind = self.find_index(error_date)
        if ind is None:
            ind = len(self.datetimes)
        start = ind - nearby if ind > nearby else 0
        return (start, ind)

    def iterator(self, nearby: int) -> list:
        """Return one sliced frame per entry of ``error_dates``, in order."""
        slices = []
        for error_date in self.error_dates:
            start, end = self.slicer(error_date, nearby)
            slices.append(self.datetimes.iloc[start:end])
        return slices
    
near_finder = NearDates(tele_df, error_df['datetime_new'], 'datetime_new')
slices_dfs_error = near_finder.iterator(nearby=NEARBY_NUM)

In [None]:
# Between each pair of consecutive error windows, record the (last index label
# of the earlier window, first index label of the later window) whenever the
# gap is wide enough to host an error-free window.
slices_index_non_error = []
for df1, df2 in zip(slices_dfs_error[:-1], slices_dfs_error[1:]):
    # An empty window has no first/last label; indexing it would raise IndexError.
    if df1.empty or df2.empty:
        continue
    end_ind = df1.index[-1]
    start_ind = df2.index[0]

    if start_ind - end_ind > NEARBY_NUM + 1:
        slices_index_non_error.append((end_ind, start_ind))
print("length of slices_index_non_error ----> ", len(slices_index_non_error))
print("length of slices_index_error ----> ", len(slices_dfs_error))

In [None]:
# Take a window of NEARBY_NUM telemetry rows centred on the midpoint of every
# error-free gap. The width follows NEARBY_NUM (it was hard-coded as 5 + 5) so
# these windows keep the same length as the error windows.
slices_dfs_non_error = []
half_window = NEARBY_NUM // 2
for val in slices_index_non_error:
    mid = int(sum(val) / len(val))
    start = mid - half_window
    end = start + NEARBY_NUM
    slices_dfs_non_error.append(tele_df.iloc[start:end])

In [None]:
# Keep only the error windows that contain exactly NEARBY_NUM readings.
# The previous loop deleted items from the list it was iterating (skipping the
# element after each deletion), only looked at as many windows as there are
# non-error windows, and located the item to delete by comparing str() of
# every frame.
slices_dfs_error = [window for window in slices_dfs_error if len(window) == NEARBY_NUM]

# Sanity check: nothing should be reported here.
for window in slices_dfs_error:
    if len(window) != NEARBY_NUM:
        print("Warning...")

In [None]:
# Overlay rotate / volt / vibration of the first error window and of the first
# error-free window.
window_sets = (("error", slices_dfs_error[0]), ("no_error", slices_dfs_non_error[0]))
column_suffixes = (("rotate", "rotate"), ("volt", "volt"), ("vibration", "vibrate"))
for prefix, window in window_sets:
    for column, suffix in column_suffixes:
        plt.plot(window[column], label=f"{prefix}_{suffix}")
plt.title("parameters during error occurance and non error occurance")
plt.legend()
plt.show()

In [None]:
# Flatten each (error window, non-error window) pair into two rows:
# all volt readings, then rotate, pressure, vibration, then the label
# (1 = error window, 0 = non-error window).
final_dfs = []
# Defined before the loop so the name also exists when there are no pairs
# (the column-naming cell reads `cols`).
cols = ['volt', 'rotate', 'pressure', 'vibration']
for df1, df2 in zip(slices_dfs_error, slices_dfs_non_error):
    if df1.shape != df2.shape:
        # Rows of different length would shift values (and the label) into the
        # wrong columns of the final frame, so the pair is skipped.
        print("Skipping window pair with mismatched shapes:", df1.shape, df2.shape)
        continue
    error = [value for col in cols for value in df1[col]]
    non_error = [value for col in cols for value in df2[col]]
    final_dfs.append(error + [1])
    final_dfs.append(non_error + [0])

In [None]:
# Column labels for the flattened rows: "<sensor> <step>" for steps 1..10 of
# every sensor in `cols`, followed by the label column.
column_names = [col + " " + str(step) for col in cols for step in range(1, 11)]
if 'Error' not in column_names:
    column_names.append('Error')
print(column_names)

In [None]:
final_df = pd.DataFrame(final_dfs, columns=column_names)
final_df.head()

In [None]:
# Summarise every row of `final_df`: for each of the four 10-reading blocks
# (volt, rotate, pressure, vibrate) compute the mean, max, min and the largest
# and smallest absolute change between consecutive readings.
params = []
for _, record in final_df.iterrows():
    values = list(record)
    temp = []
    for offset in range(0, 40, 10):
        reading = values[offset:offset + 10]
        changes = [abs(current - following) for current, following in zip(reading, reading[1:])]
        temp += [np.mean(reading), max(reading), min(reading), max(changes), min(changes)]
    params.append(temp)

In [None]:
# Name the summary features "<sensor>_<statistic>" in the order they were
# computed, then assemble them into a frame.
sensor_prefixes = ('volt', 'rotate', 'pressure', 'vibrate')
statistic_suffixes = ('mean', 'max', 'min', 'max_variation', 'min_variation')
column_names = [f"{sensor}_{stat}" for sensor in sensor_prefixes for stat in statistic_suffixes]
print("lenght of column_names ", len(column_names))
extend_df = pd.DataFrame(params, columns=column_names)
extend_df.head()

In [None]:
# Attach the label to the summary features and split into train / test sets.
extend_df['Error'] = final_df['Error']
X, y = extend_df.drop('Error', axis=1), extend_df['Error']
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the split (and every score derived from it) is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.shape, y_test.shape

In [None]:
# Random forest on the current train split; seeded so the reported accuracy is
# reproducible.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Combine the raw window readings with the summary features.
# Both source frames can carry an 'Error' column; concatenating them as-is gave
# two 'Error' columns, so `temp_df['Error']` was a two-column frame instead of
# a single target vector. The label is now attached exactly once.
raw_features = final_df.drop(columns='Error')
summary_features = extend_df.drop(columns='Error', errors='ignore')
temp_df = pd.concat([raw_features, summary_features, final_df['Error']], axis='columns')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    temp_df.drop('Error', axis=1), temp_df['Error'], random_state=42
)
X_train.shape, y_test.shape

In [None]:
# Random forest on the current train split; seeded so the reported accuracy is
# reproducible.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(SVC(), X,y)

In [None]:
cross_val_score(GaussianNB(), X,y)

In [None]:
cross_val_score(KNeighborsClassifier(), X,y)

In [None]:
cross_val_score(RandomForestClassifier(), X,y)