## Load Libraries and Packages

In [28]:
#Standard Libraries
import numpy as np
import pandas as pd
import os
import time

#Data Preprocessing
from sklearn.model_selection import train_test_split

#Vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

#Models
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

#Model Evaluation
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (e.g. convergence
# or data-shape warnings) that could point at real problems -- confirm it is intended.
warnings.filterwarnings("ignore")

## Load Data

In [3]:
# Read energy_nw_clean_dataframe.csv from the project's datasets directory.
# Data source located: https://github.ibm.com/Jewel-Matsch-Rowekamp/Energy-NW
# The project root comes from the DSX_PROJECT_DIR environment variable; a
# missing variable raises KeyError here.
data = pd.read_csv(
    filepath_or_buffer=os.environ['DSX_PROJECT_DIR'] + '/datasets/energy_nw_clean_dataframe.csv'
)

In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,AR_NUMBER,AR_PRIORITY,AR_SEVERITY,PRIORITY_SEVERITY,CONCAT_TEXT_FOR_WKS
0,383472,CAQ,C,CAQ:C,rcic hpcs low cst level swap fill vent suction...
1,383473,CAQ,D,CAQ:D,rcic p not start run sop rcic fill direct star...
2,383474,CAQ,D,CAQ:D,hp drop hpcs suction switchover alarm hp drop ...
3,383476,NCAQ,4,NCAQ:4,bre bre need window evaluate bre
4,383477,CAQ,D,CAQ:D,receive rod drive control sys inop alarm recei...


## Select the Independent and Dependent Variables

In [5]:
# Extract raw arrays by column position: position 3 is the target, positions
# 0 and 4 are the inputs (the following cells label them PRIORITY_SEVERITY and
# AR_NUMBER / CONCAT_TEXT_FOR_WKS respectively).
Y_array = data.iloc[:, 3].values
X_array = data.iloc[:, [0, 4]].values

In [6]:
# Wrap the two selected input columns in a labelled DataFrame, then preview it.
X = pd.DataFrame(
    data=X_array,
    columns=['AR_NUMBER', 'CONCAT_TEXT_FOR_WKS'],
)
X.head(n=5)

Unnamed: 0,AR_NUMBER,CONCAT_TEXT_FOR_WKS
0,383472,rcic hpcs low cst level swap fill vent suction...
1,383473,rcic p not start run sop rcic fill direct star...
2,383474,hp drop hpcs suction switchover alarm hp drop ...
3,383476,bre bre need window evaluate bre
4,383477,receive rod drive control sys inop alarm recei...


In [7]:
# Wrap the target values in a single-column DataFrame, then preview it.
Y = pd.DataFrame(
    data=Y_array,
    columns=['PRIORITY_SEVERITY'],
)
Y.head(n=5)

Unnamed: 0,PRIORITY_SEVERITY
0,CAQ:C
1,CAQ:D
2,CAQ:D
3,NCAQ:4
4,CAQ:D


## Vectorize the Text Column

In [8]:
# TF-IDF vectorizer over unigrams and bigrams: terms must appear in at least
# 5 documents, English stop words are removed, term frequency is scaled
# sublinearly, and each row is L2-normalised.
tfidf = TfidfVectorizer(
    encoding='latin-1',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
)

In [9]:
# Fit the vectorizer on the text column and convert the result to a dense
# array (one value per document/term pair). `labels` is a second name for Y.
features = tfidf.fit_transform(X['CONCAT_TEXT_FOR_WKS']).toarray()
labels = Y
# Display (n_documents, n_terms).
features.shape

(11337, 17880)

## Divide the Data into Train and Test

In [10]:
# Hold out 20% of the rows as the test set; random_state=0 keeps the split
# identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    Y,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Show the (inputs, targets) shapes for the train split, then the test split.
print("\n".join(
    "{} {}".format(inputs.shape, targets.shape)
    for inputs, targets in [(X_train, Y_train), (X_test, Y_test)]
))

(9069, 17880) (9069, 1)
(2268, 17880) (2268, 1)


### Logistic Regression

In [12]:
# Train a default logistic-regression classifier on the training split.
# The target is flattened to 1-D, matching the cross-validation call below:
# Y_train is a single-column DataFrame, and scikit-learn expects a 1-D y
# (it emits a DataConversionWarning for a column vector).
classifier_Obj_LR = LogisticRegression()
classifier_Obj_LR.fit(X_train, Y_train.values.ravel())

# 10-fold cross-validated accuracy on the training split. cross_val_score works
# on clones of the estimator, so the model fitted above is the one that the
# later prediction cell uses.
modelAccuracies_LR = cross_val_score(estimator=classifier_Obj_LR, X=X_train, y=Y_train.values.ravel(), cv=10)
print("Mean score accuracy: ", modelAccuracies_LR.mean())
# NOTE(review): the value printed under this label is the standard deviation of
# the fold scores, not a confidence interval.
print("95% confidence interval: ", modelAccuracies_LR.std())

Mean score accuracy:  0.6877266704055051
95% confidence interval:  0.01278131435769145


In [16]:
# Report mean cross-validated accuracy with a +/- two-standard-deviation band.
print(
    "The overall accuracy of the model: %0.2f (+/- %0.2f)"
    % (modelAccuracies_LR.mean(), 2 * modelAccuracies_LR.std())
)

The overall accuracy of the model: 0.69 (+/- 0.03)


In [17]:
# Predict labels for the held-out test rows with the fitted logistic regression.
y_pred_LR = classifier_Obj_LR.predict(X=X_test)

In [18]:
# Per-class precision / recall / F1 on the test set for logistic regression.
print(metrics.classification_report(y_true=Y_test, y_pred=y_pred_LR))

              precision    recall  f1-score   support

       CAQ:A       0.00      0.00      0.00         1
       CAQ:B       0.00      0.00      0.00        13
       CAQ:C       0.66      0.17      0.27       197
       CAQ:D       0.82      0.48      0.61       431
      NCAQ:2       0.00      0.00      0.00         4
      NCAQ:3       0.69      0.75      0.72       744
      NCAQ:4       0.66      0.86      0.74       878

    accuracy                           0.69      2268
   macro avg       0.40      0.32      0.33      2268
weighted avg       0.69      0.69      0.66      2268



In [19]:
# Confusion matrix on the test set (first argument is the true labels, second
# the logistic-regression predictions), printed under a heading line.
cm_LR = confusion_matrix(y_true=Y_test, y_pred=y_pred_LR)
print("This is the confusion matrix: ", cm_LR, sep="\n")

This is the confusion matrix: 
[[  0   0   0   0   0   0   1]
 [  0   0   3   2   0   6   2]
 [  0   0  33  28   0  99  37]
 [  0   0   8 208   0  36 179]
 [  0   0   0   0   0   4   0]
 [  0   0   5   4   0 561 174]
 [  0   0   1  13   0 112 752]]


### Naive Bayes

In [13]:
# Train a default Bernoulli naive-Bayes classifier on the training split.
# The target is flattened to 1-D, matching the cross-validation call below:
# Y_train is a single-column DataFrame, and scikit-learn expects a 1-D y
# (it emits a DataConversionWarning for a column vector).
classifierObj_NB = BernoulliNB()
classifierObj_NB.fit(X_train, Y_train.values.ravel())

# 10-fold cross-validated accuracy on the training split. cross_val_score works
# on clones of the estimator, so the model fitted above is the one that the
# later prediction cell uses.
modelAccuracies_NB = cross_val_score(estimator=classifierObj_NB, X=X_train, y=Y_train.values.ravel(), cv=10)
print("Mean score accuracy: ", modelAccuracies_NB.mean())
# NOTE(review): the value printed under this label is the standard deviation of
# the fold scores, not a confidence interval.
print("95% confidence interval: ", modelAccuracies_NB.std())

Mean score accuracy:  0.6543116383706611
95% confidence interval:  0.013619686904286329


In [20]:
# Report mean cross-validated accuracy with a +/- two-standard-deviation band.
print(
    "The overall accuracy of the model: %0.2f (+/- %0.2f)"
    % (modelAccuracies_NB.mean(), 2 * modelAccuracies_NB.std())
)

The overall accuracy of the model: 0.65 (+/- 0.03)


In [21]:
# Predict labels for the held-out test rows with the fitted naive-Bayes model.
y_pred_NB = classifierObj_NB.predict(X=X_test)

In [22]:
# Per-class precision / recall / F1 on the test set for naive Bayes.
print(metrics.classification_report(y_true=Y_test, y_pred=y_pred_NB))

              precision    recall  f1-score   support

       CAQ:A       0.00      0.00      0.00         1
       CAQ:B       0.00      0.00      0.00        13
       CAQ:C       0.46      0.35      0.40       197
       CAQ:D       0.71      0.56      0.63       431
      NCAQ:2       0.00      0.00      0.00         4
      NCAQ:3       0.73      0.65      0.69       744
      NCAQ:4       0.65      0.82      0.72       878

    accuracy                           0.67      2268
   macro avg       0.36      0.34      0.35      2268
weighted avg       0.66      0.67      0.66      2268



In [23]:
# Confusion matrix on the test set (first argument is the true labels, second
# the naive-Bayes predictions), printed under a heading line.
cm_NB = confusion_matrix(y_true=Y_test, y_pred=y_pred_NB)
print("This is the confusion matrix: ", cm_NB, sep="\n")

This is the confusion matrix: 
[[  0   0   0   0   0   0   1]
 [  0   0   6   1   0   4   2]
 [  0   0  69  36   0  49  43]
 [  0   0  20 243   0  19 149]
 [  0   0   0   0   0   4   0]
 [  0   0  49  15   0 480 200]
 [  0   0   7  49   0 100 722]]


### Random Forest

In [15]:
# Train a 40-tree random forest (entropy splits, no bootstrap sampling) on the
# training split. random_state is fixed so repeated runs build the same forest;
# it matches the seed used for the train/test split.
# The target is flattened to 1-D, matching the cross-validation call below:
# Y_train is a single-column DataFrame, and scikit-learn expects a 1-D y
# (it emits a DataConversionWarning for a column vector).
classifier_Obj_Randfor = RandomForestClassifier(bootstrap=False, criterion='entropy', n_estimators=40, random_state=0)
classifier_Obj_Randfor.fit(X_train, Y_train.values.ravel())

# 10-fold cross-validated accuracy on the training split. cross_val_score works
# on clones of the estimator, so the model fitted above is the one that the
# later prediction cell uses.
modelAccuracies_RF = cross_val_score(estimator=classifier_Obj_Randfor, X=X_train, y=Y_train.values.ravel(), cv=10)
print("Mean score accuracy: ", modelAccuracies_RF.mean())
# NOTE(review): the value printed under this label is the standard deviation of
# the fold scores, not a confidence interval.
print("95% confidence interval: ", modelAccuracies_RF.std())

Mean score accuracy:  0.6562975877090914
95% confidence interval:  0.010887297058673492


In [24]:
# Report mean cross-validated accuracy with a +/- two-standard-deviation band.
print(
    "The overall accuracy of the model: %0.2f (+/- %0.2f)"
    % (modelAccuracies_RF.mean(), 2 * modelAccuracies_RF.std())
)

The overall accuracy of the model: 0.66 (+/- 0.02)


In [25]:
# Predict labels for the held-out test rows with the fitted random forest.
y_pred_RF = classifier_Obj_Randfor.predict(X=X_test)

In [26]:
# Per-class precision / recall / F1 on the test set for the random forest.
print(metrics.classification_report(y_true=Y_test, y_pred=y_pred_RF))

              precision    recall  f1-score   support

       CAQ:A       0.00      0.00      0.00         1
       CAQ:B       0.00      0.00      0.00        13
       CAQ:C       0.78      0.11      0.19       197
       CAQ:D       0.85      0.37      0.52       431
      NCAQ:2       0.00      0.00      0.00         4
      NCAQ:3       0.64      0.78      0.71       744
      NCAQ:4       0.64      0.83      0.72       878

    accuracy                           0.66      2268
   macro avg       0.42      0.30      0.30      2268
weighted avg       0.69      0.66      0.63      2268



In [27]:
# Confusion matrix on the test set (first argument is the true labels, second
# the random-forest predictions), printed under a heading line.
cm_RF = confusion_matrix(y_true=Y_test, y_pred=y_pred_RF)
print("This is the confusion matrix: ", cm_RF, sep="\n")

This is the confusion matrix: 
[[  0   0   0   0   0   0   1]
 [  0   0   0   1   0  10   2]
 [  0   0  21   9   0 127  40]
 [  0   0   5 161   0  49 216]
 [  0   0   0   0   0   4   0]
 [  0   0   1   3   0 584 156]
 [  0   0   0  16   0 133 729]]


### SVM using SGD

In [29]:
# Linear SVM trained with stochastic gradient descent (hinge loss, L2 penalty).
# The estimator is bound to `classifier_Obj_SVM_linear` because that is the name
# this cell fits and cross-validates and the prediction cell uses; previously
# only `classifier_Obj_SVM` was assigned, so a fresh kernel raised NameError.
# random_state is fixed because SGD shuffles the data, and it matches the seed
# used for the train/test split.
# NOTE(review): max_iter=5 is a very small cap on training epochs -- confirm it
# is intentional.
classifier_Obj_SVM_linear = SGDClassifier(loss="hinge", penalty="l2", max_iter=5, random_state=0)
# Keep the originally assigned name bound to the same estimator so any cell
# that refers to it still resolves.
classifier_Obj_SVM = classifier_Obj_SVM_linear
# The target is flattened to 1-D (Y_train is a single-column DataFrame), as in
# the other model sections.
classifier_Obj_SVM_linear.fit(X_train, Y_train.values.ravel())

# 10-fold cross-validated accuracy on the training split. cross_val_score works
# on clones of the estimator, so the model fitted above is the one that the
# later prediction cell uses.
modelAccuracies_SVM = cross_val_score(estimator=classifier_Obj_SVM_linear, X=X_train, y=Y_train.values.ravel(), cv=10)
print("Mean score accuracy: ", modelAccuracies_SVM.mean())
# NOTE(review): the value printed under this label is the standard deviation of
# the fold scores, not a confidence interval.
print("95% confidence interval: ", modelAccuracies_SVM.std())


Mean score accuracy:  0.7116576359390284
95% confidence interval:  0.012293575384724599


In [30]:
# Report mean cross-validated accuracy with a +/- two-standard-deviation band.
print(
    "The overall accuracy of the model: %0.2f (+/- %0.2f)"
    % (modelAccuracies_SVM.mean(), 2 * modelAccuracies_SVM.std())
)

The overall accuracy of the model: 0.71 (+/- 0.02)


In [31]:
# Predict labels for the held-out test rows with the fitted SGD-trained SVM.
y_pred_SVM = classifier_Obj_SVM_linear.predict(X=X_test)

In [32]:
# Per-class precision / recall / F1 on the test set for the SGD-trained SVM.
print(metrics.classification_report(y_true=Y_test, y_pred=y_pred_SVM))

              precision    recall  f1-score   support

       CAQ:A       0.00      0.00      0.00         1
       CAQ:B       0.00      0.00      0.00        13
       CAQ:C       0.57      0.29      0.39       197
       CAQ:D       0.74      0.61      0.67       431
      NCAQ:2       0.00      0.00      0.00         4
      NCAQ:3       0.71      0.75      0.73       744
      NCAQ:4       0.70      0.81      0.75       878

    accuracy                           0.70      2268
   macro avg       0.39      0.35      0.36      2268
weighted avg       0.69      0.70      0.69      2268



In [33]:
# Confusion matrix on the test set for the SGD-trained SVM (first argument is
# the true labels, second the predictions). It is stored under its own name:
# this cell previously assigned to `cm_RF`, silently replacing the Random
# Forest matrix computed earlier in the notebook.
cm_SVM = confusion_matrix(Y_test, y_pred_SVM)
print("This is the confusion matrix: ")
print(cm_SVM)

This is the confusion matrix: 
[[  0   0   0   1   0   0   0]
 [  0   0   4   3   0   5   1]
 [  0   0  58  39   0  78  22]
 [  0   0  19 265   0  21 126]
 [  0   0   0   0   0   4   0]
 [  0   0  16  10   0 556 162]
 [  0   0   5  42   0 117 714]]
