# Energy NW Model Selection - CAQ vs. Non-CAQ

The purpose of this notebook is to perform featurization on the text column and create a classification model. We will perform multiple tests on the models.

#### Load Data & Libraries

In [75]:
#Standard Libraries
import numpy as np
import pandas as pd
import os
import time

#Data Preprocessing
from sklearn.model_selection import train_test_split

#Vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

#Feature Selectors
from sklearn.feature_selection import chi2

#Models
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

#Model Evaluation
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

In [3]:
#Loading data from file system
#Data source located: https://github.ibm.com/Jewel-Matsch-Rowekamp/Energy-NW
data = pd.read_csv(os.environ['DSX_PROJECT_DIR']+'/datasets/energy_nw_clean_dataframe.csv')
data.head(15)

Unnamed: 0,AR_NUMBER,AR_PRIORITY,AR_SEVERITY,PRIORITY_SEVERITY,CONCAT_TEXT_FOR_WKS
0,383472,CAQ,C,CAQ:C,rcic hpcs low cst level swap fill vent suction...
1,383473,CAQ,D,CAQ:D,rcic p not start run sop rcic fill direct star...
2,383474,CAQ,D,CAQ:D,hp drop hpcs suction switchover alarm hp drop ...
3,383476,NCAQ,4,NCAQ:4,bre bre need window evaluate bre
4,383477,CAQ,D,CAQ:D,receive rod drive control sys inop alarm recei...
5,383478,CAQ,D,CAQ:D,hpcs suction switchover alarm locked fill vent...
6,383479,NCAQ,3,NCAQ:3,cw cta approx overflow operator round note due...
7,383490,CAQ,D,CAQ:D,security cctv need maintenance security cctv n...
8,383491,CAQ,C,CAQ:C,sw va change lo without screen anonymous cr de...
9,383497,NCAQ,3,NCAQ:3,ep copiers unable scan email new canon copiers...


### Determine Dependent and Independent Variables

Split data into X & Y (X=independent, Y=dependent)

In [4]:
X_array = data.iloc[:,[0,4]].values
Y_array = data.iloc[:,1].values

#### Verify Variables <br>
* X should contain the independent columns: AR_NUMBER, AR_PRIORITY, CONCAT_TEXT_FOR_WKS. <br>
* Y should contain the dependent column: AR_SEVERITY

In [5]:
X = pd.DataFrame(X_array, columns=['AR_NUMBER','CONCAT_TEXT_FOR_WKS'])
X.head(5)

Unnamed: 0,AR_NUMBER,CONCAT_TEXT_FOR_WKS
0,383472,rcic hpcs low cst level swap fill vent suction...
1,383473,rcic p not start run sop rcic fill direct star...
2,383474,hp drop hpcs suction switchover alarm hp drop ...
3,383476,bre bre need window evaluate bre
4,383477,receive rod drive control sys inop alarm recei...


In [6]:
Y = pd.DataFrame(Y_array, columns=['AR_PRIORITY'])
Y.head(5)

Unnamed: 0,AR_PRIORITY
0,CAQ
1,CAQ
2,CAQ
3,NCAQ
4,CAQ


### One Hot Encode Categorical Variables

#### Categorical Dependent Column

In [7]:
Y_df = pd.get_dummies(data=Y, drop_first=True)

In [8]:
Y_df.head(5)

Unnamed: 0,AR_PRIORITY_NCAQ
0,0
1,0
2,0
3,1
4,0


### Vectorize the Text Column

In [66]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

In [67]:
features = tfidf.fit_transform(X.CONCAT_TEXT_FOR_WKS).toarray()
labels = Y
features.shape

(11337, 17880)

In [68]:
features

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.37294593, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [12]:
labels

Unnamed: 0,AR_PRIORITY
0,CAQ
1,CAQ
2,CAQ
3,NCAQ
4,CAQ
5,CAQ
6,NCAQ
7,CAQ
8,CAQ
9,NCAQ


#### Divide Data into Train and Test Sets

In [60]:
X_train, X_test, Y_train, Y_test=train_test_split(features_count,Y,test_size=0.2,random_state=0)

In [87]:
X_train, X_test, Y_train_SVM, Y_test_SVM=train_test_split(features_count,Y_df,test_size=0.2,random_state=0)

In [61]:
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(9069, 18866) (9069, 1)
(2268, 18866) (2268, 1)


## Classifiers

### Logistic Regression

In [62]:
classifierObj_LR = LogisticRegression()
classifierObj_LR.fit(X_train, Y_train)

modelAccuracies_LR = cross_val_score(estimator=classifierObj_LR, X=X_train, y=Y_train.values.ravel(),cv=10)
print("Mean score accuracy: ", modelAccuracies_LR.mean())
print("95% confidence interval: ", modelAccuracies_LR.std())

Mean score accuracy:  0.8241290515345197
95% confidence interval:  0.013566655746959975


In [54]:
print("The overall accuracy of the model: %0.2f (+/- %0.2f)" % (modelAccuracies_LR.mean(), modelAccuracies_LR.std() * 2))

The overall accuracy of the model: 0.83 (+/- 0.02)


In [55]:
y_pred_LR = classifierObj_LR.predict(X_test)

In [56]:
print(metrics.classification_report(Y_test, y_pred_LR))

              precision    recall  f1-score   support

         CAQ       0.91      0.47      0.62       642
        NCAQ       0.82      0.98      0.90      1626

    accuracy                           0.84      2268
   macro avg       0.87      0.72      0.76      2268
weighted avg       0.85      0.84      0.82      2268



In [52]:
cm_LR = confusion_matrix(Y_test, y_pred_LR)
print("This is the confusion matrix: ")
print(cm_LR)

This is the confusion matrix: 
[[ 300  342]
 [  30 1596]]


### Support Vector Machine

In [81]:
classifier_Obj_SVM = SVC()

grid_params = {
    'kernel':['linear','poly','rbf','sigmoid']
}
gd_sr_SVM=GridSearchCV(estimator=classifier_Obj_SVM, param_grid=grid_params, scoring='accuracy', cv=5, n_jobs=-1)
gd_sr_SVM.fit(X_train, Y_train)
print(gd_sr_SVM.best_params_)
print(gd_sr_SVM.best_score_)

{'kernel': 'linear'}
0.7900540302128128


In [89]:
classifier_Obj_SVM_linear = SVC(kernel='linear')
classifier_Obj_SVM_linear.fit(X_train, Y_train)

modelAccuracies_SVM = cross_val_score(estimator=classifier_Obj_SVM_linear, X=X_train, y=Y_train,cv=10)
print("Mean score accuracy: ", modelAccuracies_SVM.mean())
print("95% confidence interval: ", modelAccuracies_SVM.std())

Mean score accuracy:  0.7931488542278783
95% confidence interval:  0.012643571716514392


In [90]:
print("The overall accuracy of the model: %0.2f (+/- %0.2f)" % (modelAccuracies_SVM.mean(), modelAccuracies_SVM.std() * 2))

The overall accuracy of the model: 0.79 (+/- 0.03)


In [91]:
y_pred_SVM = classifier_Obj_SVM_linear.predict(X_test)

In [92]:
print(metrics.classification_report(Y_test, y_pred_SVM))

              precision    recall  f1-score   support

         CAQ       0.65      0.65      0.65       642
        NCAQ       0.86      0.86      0.86      1626

    accuracy                           0.80      2268
   macro avg       0.75      0.76      0.75      2268
weighted avg       0.80      0.80      0.80      2268



In [93]:
cm_SVM = confusion_matrix(Y_test, y_pred_SVM)
print("This is the confusion matrix: ")
print(cm_SVM)

This is the confusion matrix: 
[[ 418  224]
 [ 228 1398]]


### Naive Bayes

In [76]:
classifierObj_NB = BernoulliNB()
classifierObj_NB.fit(X_train, Y_train)

modelAccuracies_NB = cross_val_score(estimator=classifierObj_NB, X=X_train, y=Y_train.values.ravel(),cv=10)
print("Mean score accuracy: ", modelAccuracies_NB.mean())
print("95% confidence interval: ", modelAccuracies_NB.std())

Mean score accuracy:  0.8043891424939671
95% confidence interval:  0.007879743300861256


In [77]:
print("The overall accuracy of the model: %0.2f (+/- %0.2f)" % (modelAccuracies_NB.mean(), modelAccuracies_NB.std() * 2))

The overall accuracy of the model: 0.80 (+/- 0.02)


In [78]:
y_pred_NB = classifierObj_NB.predict(X_test)

In [79]:
print(metrics.classification_report(Y_test, y_pred_NB))

              precision    recall  f1-score   support

         CAQ       0.74      0.52      0.61       642
        NCAQ       0.83      0.93      0.88      1626

    accuracy                           0.81      2268
   macro avg       0.78      0.72      0.74      2268
weighted avg       0.80      0.81      0.80      2268



In [80]:
cm_NB = confusion_matrix(Y_test, y_pred_NB)
print("This is the confusion matrix: ")
print(cm_NB)

This is the confusion matrix: 
[[ 335  307]
 [ 120 1506]]


### Random Forest

In [82]:
classifier_Obj_RF = RandomForestClassifier()

grid_param = {
    'n_estimators':[10,15,20,25,30,40,50],
    'criterion':['gini','entropy'],
    'bootstrap':[True, False]
}

gd_sr_RF = GridSearchCV(estimator=classifier_Obj_RF, param_grid=grid_param,scoring='accuracy',cv=5,n_jobs=-1)
gd_sr_RF.fit(X_train, Y_train)
print(gd_sr_RF.best_params_)
print(gd_sr_RF.best_score_)

{'bootstrap': False, 'criterion': 'entropy', 'n_estimators': 40}
0.8212592347557613


In [103]:
classifier_Obj_Randfor = RandomForestClassifier(bootstrap=False, criterion='entropy', n_estimators=40)
classifier_Obj_Randfor.fit(X_train, Y_train)

modelAccuracies_RF = cross_val_score(estimator=classifier_Obj_Randfor, X=X_train, y=Y_train.values.ravel(),cv=10)
print("Mean score accuracy: ", modelAccuracies_RF.mean())
print("95% confidence interval: ", modelAccuracies_RF.std())

Mean score accuracy:  0.8215918810363932
95% confidence interval:  0.010162333405805364


In [104]:
print("The overall accuracy of the model: %0.2f (+/- %0.2f)" % (modelAccuracies_RF.mean(), modelAccuracies_RF.std() * 2))

The overall accuracy of the model: 0.82 (+/- 0.02)


In [105]:
y_pred_RF = classifier_Obj_Randfor.predict(X_test)

In [106]:
print(metrics.classification_report(Y_test, y_pred_RF))

              precision    recall  f1-score   support

         CAQ       0.87      0.45      0.59       642
        NCAQ       0.82      0.97      0.89      1626

    accuracy                           0.82      2268
   macro avg       0.85      0.71      0.74      2268
weighted avg       0.83      0.82      0.80      2268



In [107]:
cm_RF = confusion_matrix(Y_test, y_pred_RF)
print("This is the confusion matrix: ")
print(cm_RF)

This is the confusion matrix: 
[[ 286  356]
 [  41 1585]]
