## Dataset Location##

In [1]:
# dataset location
FILE = '/content/drive/MyDrive/Deakin Energy T3/Dataset/cleaned_incidents1.csv'

##Required Libraries

In [3]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer 
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# set seeds for reproducability
from numpy.random import seed
tf.random.set_seed(2000)
seed(500)

# global configurations
pd.set_option("display.max_colwidth", -1)



##EDA##

In [4]:
# read file
df = pd.read_csv(FILE)

df.head()

Unnamed: 0,ActionTaken,Address,AssetLabel,CauseCommunity,CauseEnvironment,CausePre,CauseTechnical,CauseWorkP,ContactType,CorrectProtection,EventDescription,FailedAssets,FailedExplosion,FailedOilFilled,FailedOtherAssets,FailedOtherAssetsOther,FeederNumber,IncidentCause,IncidentConsequence,IncidentDatetime,IncidentFireFFactorReportable,IncidentFireSeverity,IncidentID,IncidentLocationType,IncidentLocationTypeOther,IncidentNumber,IncidentType,Lat,Long,MadeSafe,NetworkType,Status,SubmissionID,SubmittedDateTimeString,Voltage,WeatherStation,Postcode,Locality,Category
0,Crew isolated supply and undertook repairs,"PARA PARK, 1490 HENDY MAIN ROAD, PARAPARAP VIC 3240",,,,,Earth fault,,,,A nearby customer reported sparking of electrical lines at the above location. On attendance the crew found that a High Voltage ABC conductor had faulted midspan resulting in a ground fire (Approx 30 sqm). There were no reported injuries.,Conductor (ABC),0,0,0,,,HVABC cable faulted midspan,Grassfire,5/10/2015 20:36,1.0,Small: 10 - 1000 sq.m,56,Roadway,,20151009PWA_1,Infrastructure (network-based),-38.288982,144.191502,1,Powercor,Report,1030,01:04.8,22kV AC,Avalon Airport,3240.0,Paraparap,OH Cable
1,Crew undertook repairs,"1154-1198 CHRISTIES ROAD, RAVENHALL VIC 3023",,Working too close to underground cables,,,,,,,"A contractor reported that he had contacted an earthing cable while excavating a trench at the above location. On arrival the crew found that the contractor (Dennis James, Ph: 0418 390 584) was digging a trench and contacted an earthing cable, causing damage to the cable. There were no reported injuries or third party property damages.",No failed asset,0,0,0,,,Contractor contacted earthing conductor,No Go Zone (contact),7/10/2015 11:00,,,64,,,20151012PWA_5,Infrastructure (network-based),-37.767377,144.738509,1,Powercor,Report,96,39:29.0,Earthing cable,Laverton Raaf,3023.0,Ravenhall,Dug up
2,Crew isolated supply and undertook repairs,"326 DUNBARS ROAD, PETERBOROUGH VIC 3270",,,,,Corrosion,,,1.0,A field crew attending an outage found that a 22kV conductor had broken due to rust at a connecting sleeve and had fallen to the ground where one end remained alive due to a high impedance backfeed through downline transformer windings. Protection operated. There were no reported injuries or third party property damages.,Conductor (bare),0,0,0,,,Conductor broke due to rust,Serious risk to public safety (e.g. live conductor on ground or live asset accessible to unauthorised persons),10/10/2015 8:36,,,67,Agricultural,,20151013PWA_1,Infrastructure (network-based),-38.586647,142.901137,1,Powercor,Report,86,45:20.1,22kV AC,Warrnambool Airport Ndb,3270.0,Peterborough,Connection
3,Crew undertook repairs,"41-49 BANK STREET, SOUTH MELBOURNE VIC 3205",,Vandalism,,,,,,1.0,"Interfere and vandalism in substation. Unknown third parties used an unknown object to interfere with two HV fuses which blew in an Indoor Substation. ‘Little Bank – Kingsway’ Indoor substation affected. Also, Secure Parking at the rear of 19-29 Bank Street had theft and vandalism of their Ticket Machine in conjunction with this substation interfere and vandalism.",Indoor,0,2,0,,,Vandalism,Loss of supply;Damage to network assets,4/10/2015 10:30,,,73,,,20151015PWA_4,Infrastructure (network-based),-37.832352,144.968452,1,CitiPower,Report,92,03:46.9,22kV AC,Essendon Airport,3205.0,South Melbourne,Other
4,Crew isolated supply and undertook repairs,"199 WILSONS ROAD, WHITTINGTON VIC 3219",,Vehicle,,,,,,,"A nearby customer reported that a high load had pulled down wires at the above location. On arrival the crew found that a LV service cable had been contacted by an unknown vehicle, breaking the cable which fell to the ground. There were no reported injuries or third party property damages. Following repairs the cable was measured at 5.3m at the kerb.",Service conductor,0,0,0,,,Unknown high load contacted LV service cable,No Go Zone (contact);Damage to network assets,5/10/2015 14:41,,,77,,,20151016PWA_1,Infrastructure (network-based),-38.177436,144.39048,1,Powercor,Report,98,13:07.0,Low voltage AC (<1kV),Avalon Airport,3219.0,Whittington,Vehicle


In [5]:
# missing value counts
df.isna().sum()

ActionTaken                      0   
Address                          0   
AssetLabel                       1334
CauseCommunity                   4033
CauseEnvironment                 4996
CausePre                         6426
CauseTechnical                   3348
CauseWorkP                       6342
ContactType                      5650
CorrectProtection                2257
EventDescription                 0   
FailedAssets                     0   
FailedExplosion                  0   
FailedOilFilled                  0   
FailedOtherAssets                0   
FailedOtherAssetsOther           6494
FeederNumber                     1326
IncidentCause                    1   
IncidentConsequence              0   
IncidentDatetime                 0   
IncidentFireFFactorReportable    3840
IncidentFireSeverity             3737
IncidentID                       0   
IncidentLocationType             651 
IncidentLocationTypeOther        6442
IncidentNumber                   0   
IncidentType

In [6]:
# class distribution check
df['Category'].value_counts()

Other           1321
Connection      904 
Vehicle         814 
Trees           569 
Crossarm        475 
Fuse            430 
Dug up          385 
AF Other        378 
Pole            328 
Animal          281 
OH Cable        216 
Conductor       187 
Lightning       146 
Installation    29  
UG Cable        26  
Name: Category, dtype: int64

In [7]:
'''
Features to consider:
EventDescription, FailedAssets, IncidentCause, IncidentConsequence, IncidentType, Status, WeatherStation

Data Dictionary:
EventDescription --> Text
FailedAssets --> Text
IncidentCause --> Text
IncidentConsequence --> Text
IncidentType --> Categorical 
Status --> Categorical
WeatherStation --> Categorical
'''

X = df[['EventDescription', 'FailedAssets', 'IncidentCause', 'IncidentConsequence', 'IncidentType', 'Status', 'WeatherStation', 'Category']]

X.head()

Unnamed: 0,EventDescription,FailedAssets,IncidentCause,IncidentConsequence,IncidentType,Status,WeatherStation,Category
0,A nearby customer reported sparking of electrical lines at the above location. On attendance the crew found that a High Voltage ABC conductor had faulted midspan resulting in a ground fire (Approx 30 sqm). There were no reported injuries.,Conductor (ABC),HVABC cable faulted midspan,Grassfire,Infrastructure (network-based),Report,Avalon Airport,OH Cable
1,"A contractor reported that he had contacted an earthing cable while excavating a trench at the above location. On arrival the crew found that the contractor (Dennis James, Ph: 0418 390 584) was digging a trench and contacted an earthing cable, causing damage to the cable. There were no reported injuries or third party property damages.",No failed asset,Contractor contacted earthing conductor,No Go Zone (contact),Infrastructure (network-based),Report,Laverton Raaf,Dug up
2,A field crew attending an outage found that a 22kV conductor had broken due to rust at a connecting sleeve and had fallen to the ground where one end remained alive due to a high impedance backfeed through downline transformer windings. Protection operated. There were no reported injuries or third party property damages.,Conductor (bare),Conductor broke due to rust,Serious risk to public safety (e.g. live conductor on ground or live asset accessible to unauthorised persons),Infrastructure (network-based),Report,Warrnambool Airport Ndb,Connection
3,"Interfere and vandalism in substation. Unknown third parties used an unknown object to interfere with two HV fuses which blew in an Indoor Substation. ‘Little Bank – Kingsway’ Indoor substation affected. Also, Secure Parking at the rear of 19-29 Bank Street had theft and vandalism of their Ticket Machine in conjunction with this substation interfere and vandalism.",Indoor,Vandalism,Loss of supply;Damage to network assets,Infrastructure (network-based),Report,Essendon Airport,Other
4,"A nearby customer reported that a high load had pulled down wires at the above location. On arrival the crew found that a LV service cable had been contacted by an unknown vehicle, breaking the cable which fell to the ground. There were no reported injuries or third party property damages. Following repairs the cable was measured at 5.3m at the kerb.",Service conductor,Unknown high load contacted LV service cable,No Go Zone (contact);Damage to network assets,Infrastructure (network-based),Report,Avalon Airport,Vehicle


In [8]:
# check missing values
X.isna().sum()

EventDescription       0 
FailedAssets           0 
IncidentCause          1 
IncidentConsequence    0 
IncidentType           0 
Status                 0 
WeatherStation         0 
Category               15
dtype: int64

In [9]:
# drop null value row
X.dropna(axis=0, inplace=True)

In [10]:
# check missing values
X.isna().sum()

EventDescription       0
FailedAssets           0
IncidentCause          0
IncidentConsequence    0
IncidentType           0
Status                 0
WeatherStation         0
Category               0
dtype: int64

##Preprocess##

In [11]:
# English punctuations
PUNCT_TO_REMOVE = string.punctuation

# English Stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
nltk.download('punkt')

# initiate lemmatizer
lemmatizer = WordNetLemmatizer() 

# clean text
def preprocess_text(text_data):
  '''
  data preprocessing:
    - lower text
    - remove punctuation
    - remove stopwords
    - remove numbers
    - tokenize
  '''
  lower_text = text_data.lower()
  no_punc_text = lower_text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
  no_stopwords_text = " ".join(x for x in no_punc_text.split() if x not in stop)
  remove_number = re.sub('[0-9]+', '', no_stopwords_text)
  tokenize = nltk.word_tokenize(remove_number)
  
  return remove_number

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
all_features = X['IncidentType'] + ' ' + X['Status'] + ' ' + X['WeatherStation'] + X['EventDescription'] + X['FailedAssets'] + X['IncidentCause'] + X['IncidentConsequence']
all_features

0       Infrastructure (network-based) Report Avalon AirportA nearby customer reported sparking of electrical lines at the above location. On attendance the crew found that a High Voltage ABC conductor had faulted midspan resulting in a ground fire (Approx 30 sqm). There were no reported injuries.Conductor (ABC)HVABC cable faulted midspanGrassfire                                                                                                                                                                                                                                                                                                                                                                                                                                                   
1       Infrastructure (network-based) Report Laverton RaafA contractor reported that he had contacted an earthing cable while excavating a trench at the above location. On arrival the crew found that the contractor

In [13]:
# category field encoder
encoder = LabelEncoder()
X['Category'] = encoder.fit_transform((X['Category']))
X['Category'].unique()

array([ 9,  5,  3, 10, 14,  6, 12, 11,  4,  2,  1,  0,  8, 13,  7])

In [14]:
# preprocess free text data
clean_text = all_features.apply(lambda text: preprocess_text(text))
clean_text

0       infrastructure networkbased report avalon airporta nearby customer reported sparking electrical lines location attendance crew found high voltage abc conductor faulted midspan resulting ground fire approx  sqm reported injuriesconductor abchvabc cable faulted midspangrassfire                                                                                                                                                                                                                                                           
1       infrastructure networkbased report laverton raafa contractor reported contacted earthing cable excavating trench location arrival crew found contractor dennis james ph    digging trench contacted earthing cable causing damage cable reported injuries third party property damagesno failed assetcontractor contacted earthing conductorno go zone contact                                                                                                          

##Train Test Split##

In [15]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(clean_text, X['Category'], test_size=0.1, random_state=100)
x_train.shape, x_test.shape

((5839,), (649,))

## String to Number##

In [16]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(clean_text)
xtrain_tfidf =  tfidf_vect.transform(x_train)
xtest_tfidf =  tfidf_vect.transform(x_test)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(clean_text)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(x_train)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(x_test)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(clean_text)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(x_train) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(x_test) 

##Stacking Classifier##

In [None]:
# estimators to use
estimators = [
              ('RF', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('LR', LogisticRegression(random_state=42)),
              ('SVM', SVC(random_state=100))
              ]

# build stacking classifier
stacking_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
    )

# train
stacking_clf.fit(xtrain_tfidf, y_train)

# predict
y_pred = stacking_clf.predict(xtest_tfidf)

# print accuraccy
print(f'Stacking accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%')

# classification report
print(classification_report(y_test, y_pred))

## Stacking with N-grams

In [None]:
# estimators to use
estimators = [
              ('RF', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('LR', LogisticRegression(random_state=42)),
              ('SVM', SVC(random_state=100))
              ]

# build stacking classifier
stacking_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
    )

# train
stacking_clf.fit(xtrain_tfidf_ngram, y_train)

# predict
y_pred = stacking_clf.predict(xtest_tfidf_ngram)

# print accuraccy
print(f'Stacking accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%')

# classification report
print(classification_report(y_test, y_pred))

In [19]:
%%timeit
# grid search parameters
param_grid = {
    'RF__n_estimators':[50,100,200,250,300],
    'RF__criterion': ['gini', 'entropy']
    #'LR__penalty': ['l1', 'l2'],
    #'LR__C': [0.5,0.8, 1],
    #'LR__solver': ['lbfgs', 'newton-cg'],
    #'LR__max_iter': [100,150,200,300],
    #'SVM__C' : [0.5, 0.8, 1],
    #'SVM__kernel': ['linear', 'poly', 'rbf']

}

# estimators to use
estimators = [
              ('RF', RandomForestClassifier(random_state=42)),
              ('LR', LogisticRegression(random_state=42)),
              ('SVM', SVC(random_state=100))
              ]

# build stacking classifier
stacking_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
    )

# grid search
grid_search_stacking = GridSearchCV(stacking_clf, param_grid, 'accuracy', -1, cv=5)

# fit
grid_search_stacking.fit(xtrain_tfidf_ngram, y_train)

# print
print(f'best estimator is {grid_search_stacking.best_estimator_}')

print(f'best score is {grid_search_stacking.best_score_}')

print(f'best params is {grid_search_stacking.best_params_}')

best estimator is StackingClassifier(cv=None,
                   estimators=[('RF',
                                RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                   

KeyboardInterrupt: ignored

## using best parameters

In [20]:
# estimators to use
estimators = [
              ('RF', RandomForestClassifier(n_estimators=250, criterion='gini', random_state=42)),
              ('LR', LogisticRegression(random_state=42)),
              ('SVM', SVC(C = 3.0, kernel = 'linear', gamma = 0.001,random_state=100))
              ]

# build stacking classifier
stacking_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
    )

# train
stacking_clf.fit(xtrain_tfidf, y_train)

# predict
y_pred = stacking_clf.predict(xtest_tfidf)

# print accuraccy
print(f'Stacking accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%')

# classification report
print(classification_report(y_test, y_pred))

Stacking accuracy: 82.13%
              precision    recall  f1-score   support

           0       0.60      0.50      0.55        36
           1       0.91      0.78      0.84        37
           2       0.83      0.53      0.65        19
           3       0.75      0.87      0.80        99
           4       0.89      0.87      0.88        46
           5       0.83      0.89      0.86        38
           6       0.76      0.89      0.82        36
           7       0.00      0.00      0.00         1
           8       0.85      0.79      0.81        14
           9       0.55      0.46      0.50        24
          10       0.91      0.85      0.88       125
          11       0.63      0.79      0.70        24
          12       0.91      0.89      0.90        65
          13       0.00      0.00      0.00         3
          14       0.89      0.96      0.92        82

    accuracy                           0.82       649
   macro avg       0.69      0.67      0.67       649
