In [1]:
# Import Dependencies
%matplotlib inline
import time

# Data Manipulation
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import math, time, random, datetime

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#Metrics 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix


In [2]:
df = pd.read_csv('../input/pump-sensor-data/sensor.csv')

# Dealing with missing values

In [3]:
df = df.drop(['sensor_15' ], axis = 1) 

It is cleaned using fillna() method with ffill parameter which propagates last valid observation to fill gaps

In [4]:
df = df.iloc[1:]
df = df.fillna(method='ffill')
df.isnull().sum().sum()

0

# Machine Learning models

In [5]:
X = df.drop(['Unnamed: 0','timestamp','machine_status'], axis=1)
Y = df['machine_status']

In [6]:
le = LabelEncoder()
df['machine_status'] = le.fit_transform(df['machine_status'])
df['machine_status'].value_counts()

# 1 - normal 
# 2 - recovering 
# 0 - broken

1    205835
2     14477
0         7
Name: machine_status, dtype: int64

In [7]:
# apply the logitic regression 
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.30, random_state = 42)

In [8]:
# Function that runs the requested algorithm and returns the accuracy metrics
def fit_ml_algo(algo, X_train, y_train, cv):
    
    # One Pass
    model = algo.fit(X_train, y_train)
    acc = round(model.score(X_train, y_train) * 100, 2)
    
    # Cross Validation 
    train_pred = model_selection.cross_val_predict(algo, 
                                                  X_train, 
                                                  y_train, 
                                                  cv=cv, 
                                                  n_jobs = -1)
    # Cross-validation accuracy metric
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)
    
    return train_pred, acc, acc_cv

In [9]:
start_time = time.time()
train_pred_log, acc_log, acc_cv_log = fit_ml_algo(LogisticRegression(), 
                                                               X_train, 
                                                               y_train, 
                                                                    10)
log_time = (time.time() - start_time)
print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 99.51
Accuracy CV 10-Fold: 99.51
Running Time: 0:01:12.342401


In [10]:
# k-Nearest Neighbours
start_time = time.time()
train_pred_knn, acc_knn, acc_cv_knn = fit_ml_algo(KNeighborsClassifier(), 
                                                  X_train, 
                                                  y_train, 
                                                  10)
knn_time = (time.time() - start_time)
print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)
print("Running Time: %s" % datetime.timedelta(seconds=knn_time))



Accuracy: 99.98
Accuracy CV 10-Fold: 99.96
Running Time: 0:04:56.134672


In [11]:
# Gaussian Naive Bayes
start_time = time.time()
train_pred_gaussian, acc_gaussian, acc_cv_gaussian = fit_ml_algo(GaussianNB(), 
                                                                      X_train, 
                                                                      y_train, 
                                                                           10)
gaussian_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gaussian)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussian)
print("Running Time: %s" % datetime.timedelta(seconds=gaussian_time))



Accuracy: 97.74
Accuracy CV 10-Fold: 97.74
Running Time: 0:00:06.125240


In [12]:
# Linear SVC
start_time = time.time()
train_pred_svc, acc_linear_svc, acc_cv_linear_svc = fit_ml_algo(LinearSVC(),
                                                                X_train, 
                                                                y_train, 
                                                                10)
linear_svc_time = (time.time() - start_time)
print("Accuracy: %s" % acc_linear_svc)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svc)
print("Running Time: %s" % datetime.timedelta(seconds=linear_svc_time))



Accuracy: 99.69
Accuracy CV 10-Fold: 99.58
Running Time: 0:03:05.949613


In [13]:
# Stochastic Gradient Descent
start_time = time.time()
train_pred_sgd, acc_sgd, acc_cv_sgd = fit_ml_algo(SGDClassifier(), 
                                                  X_train, 
                                                  y_train,
                                                  10)
sgd_time = (time.time() - start_time)
print("Accuracy: %s" % acc_sgd)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgd)
print("Running Time: %s" % datetime.timedelta(seconds=sgd_time))



Accuracy: 99.48
Accuracy CV 10-Fold: 99.45
Running Time: 0:01:24.684150


In [14]:
# Decision Tree Classifier
start_time = time.time()
train_pred_dt, acc_dt, acc_cv_dt = fit_ml_algo(DecisionTreeClassifier(), 
                                                                X_train, 
                                                                y_train,
                                                                10)
dt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_dt)
print("Accuracy CV 10-Fold: %s" % acc_cv_dt)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))



Accuracy: 100.0
Accuracy CV 10-Fold: 99.96
Running Time: 0:00:50.690494


In [15]:
# Stochastic Random forest
start_time = time.time()
train_pred_rdf, acc_rdf, acc_cv_rdf = fit_ml_algo(RandomForestClassifier(), 
                                                  X_train, 
                                                  y_train,
                                                  10)
rdf_time = (time.time() - start_time)
print("Accuracy: %s" % acc_rdf)
print("Accuracy CV 10-Fold: %s" % acc_cv_rdf)
print("Running Time: %s" % datetime.timedelta(seconds=rdf_time))



Accuracy: 100.0
Accuracy CV 10-Fold: 99.99
Running Time: 0:08:34.586277


In [16]:
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree','Random Forest'],
    'Score': [
        acc_knn, 
        acc_log,  
        acc_gaussian, 
        acc_sgd, 
        acc_linear_svc, 
        acc_dt,
        acc_rdf
    ]})
print("---Reuglar Accuracy Scores---")
models.sort_values(by='Score', ascending=False)

---Reuglar Accuracy Scores---


Unnamed: 0,Model,Score
5,Decision Tree,100.0
6,Random Forest,100.0
0,KNN,99.98
4,Linear SVC,99.69
1,Logistic Regression,99.51
3,Stochastic Gradient Decent,99.48
2,Naive Bayes,97.74


In [17]:
cv_models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'Random Forest'],
    'Score': [
        acc_cv_knn, 
        acc_cv_log,      
        acc_cv_gaussian, 
        acc_cv_sgd, 
        acc_cv_linear_svc, 
        acc_cv_dt,
        acc_cv_rdf
    ]})
print('---Cross-validation Accuracy Scores---')
cv_models.sort_values(by='Score', ascending=False)

---Cross-validation Accuracy Scores---


Unnamed: 0,Model,Score
6,Random Forest,99.99
0,KNN,99.96
5,Decision Tree,99.96
4,Linear SVC,99.58
1,Logistic Regression,99.51
3,Stochastic Gradient Decent,99.45
2,Naive Bayes,97.74


In [18]:
FOREST = RandomForestClassifier(n_estimators = 100, random_state = 0)

FOREST.fit(X_train,y_train)

y_pred=FOREST.predict(X_test)

In [19]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[    0     0     0]
 [    0 61796     2]
 [    1     5  4292]]


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      BROKEN       0.00      0.00      0.00         0
      NORMAL       1.00      1.00      1.00     61798
  RECOVERING       1.00      1.00      1.00      4298

    accuracy                           1.00     66096
   macro avg       0.67      0.67      0.67     66096
weighted avg       1.00      1.00      1.00     66096

0.9998789639312515
