In [23]:
# configuration
config = {
    'FILE_PATH': '/Users/pradeep/Desktop/ProjectANotebooks/notebooks/cleaned_incidents1.csv'
    }
config

{'FILE_PATH': '/Users/pradeep/Desktop/ProjectANotebooks/notebooks/cleaned_incidents1.csv'}

In [24]:
# import libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import string
    import re
    from time import time
    from imblearn.over_sampling import SMOTE
    import texthero as hero
    from texthero import preprocessing
    from gensim.models import Word2Vec
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report, ConfusionMatrixDisplay
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import StackingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    import warnings
except(ImportError):
    print(f'Import Error: {ImportError}')

# ignore warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# set seeds for reproducability
from numpy.random import seed
seed(500)

# global configurations
pd.set_option("display.max_colwidth", -1) # show larger text in pandas dataframe

In [25]:
'''
- select features
- drop missing values
- combine selected features into one
'''
def data(df):
    new_df = df[['EventDescription', 'FailedAssets', 'IncidentCause', 'IncidentConsequence', 'IncidentType', 'Status', 'WeatherStation', 'Category']]
    new_df.dropna(axis=0, inplace=True)
    features = new_df['EventDescription'] +' ' + new_df['IncidentCause'] + ' '+ new_df['IncidentConsequence']
    target = new_df['Category']
    return features, target

In [26]:
# read csv
dataset = pd.read_csv(config['FILE_PATH'])

# get features and target
features, target = data(dataset)

# Text Cleaning and Pre-processing
def preprocess_text():
    # cleaning steps
    cleaning_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_whitespace,
        preprocessing.remove_punctuation,
        preprocessing.remove_urls,
        preprocessing.remove_brackets,
        preprocessing.remove_stopwords,
        preprocessing.remove_digits,
        preprocessing.remove_angle_brackets,
        preprocessing.remove_curly_brackets,
        preprocessing.stem
        #preprocessing.tokenize
    ]

    # apply pipeline to text
    clean_text = features.pipe(hero.clean, cleaning_pipeline)

    return clean_text

In [27]:
# check processed text
clean_text = preprocess_text()
clean_text

0       nearbi custom report spark electr line locat attend crew found high voltag abc conductor fault midspan result ground fire approx sqm report injuri hvabc cabl fault midspan grassfir                                                                                                                                                                                                               
1       contractor report contact earth cabl excav trench locat arriv crew found contractor denni jame ph dig trench contact earth cabl caus damag cabl report injuri third parti properti damag contractor contact earth conductor go zone contact                                                                                                                                                        
2       field crew attend outag found 22kv conductor broken due rust connect sleev fallen ground one end remain aliv due high imped backfe downlin transform wind protect oper report injuri third parti propert

In [28]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4866,), (1622,), (4866,), (1622,))

In [29]:
# feature extraction methods
# tfidf
def tfidf():
    vectorizer = TfidfVectorizer(analyzer='word', max_features=1000)
    vectorizer.fit(clean_text)
    train_tfidf = vectorizer.transform(x_train)
    test_tfidf = vectorizer.transform(x_test)
    return train_tfidf, test_tfidf

# bow
def bow():
    count_vectorizer = CountVectorizer(analyzer='word', max_features=1000)
    count_vectorizer.fit(clean_text)
    train_bow = count_vectorizer.transform(x_train)
    test_bow = count_vectorizer.transform(x_test)
    return train_bow, test_bow

# bigrams
def bigrams():
    bigram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2,2), max_features=1000)
    bigram_vectorizer.fit(clean_text)
    train_bigram = bigram_vectorizer.transform(x_train)
    test_bigram = bigram_vectorizer.transform(x_test)
    return train_bigram, test_bigram

In [30]:
# get features
train_tfidf, test_tfidf = tfidf()
train_tfidf.shape, test_tfidf.shape

train_bigram, test_bigram = bigrams()

train_bow, test_bow = bow()

In [21]:
oversample = SMOTE(random_state=0,n_jobs=-1,k_neighbors=5)
train_tfidf, y_train = oversample.fit_resample(train_tfidf, y_train)
print(f'Shape: {train_tfidf.shape}')
print(y_train.value_counts())

Shape: (15090, 1000)
AF Other        1006
Other           1006
UG Cable        1006
OH Cable        1006
Pole            1006
Conductor       1006
Installation    1006
Lightning       1006
Fuse            1006
Trees           1006
Connection      1006
Vehicle         1006
Dug up          1006
Animal          1006
Crossarm        1006
Name: Category, dtype: int64


In [10]:
# random forest classifier with TFIDF
rf = RandomForestClassifier(random_state=0)

# fit
rf.fit(train_tfidf, y_train)

# predict
y_pred = rf.predict(test_tfidf)

# accuracy
accuracy_score(y_test, y_pred)

0.811960542540074

In [None]:
# grid search
param_grid = {
    'n_estimators': [380, 400, 420, 450, 500],
    #'max_depth' : [20,30,35,40,45],
    'criterion' :['gini']
}
CV_rfc = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                      param_grid=param_grid, cv= 5, n_jobs=-1)
CV_rfc.fit(train_tfidf, y_train)
print(CV_rfc.best_params_)
print(CV_rfc.best_score_)

In [10]:
# random forest classifier
rf = RandomForestClassifier(random_state=0, criterion='gini',  n_estimators=390)

# fit
rf.fit(train_tfidf, y_train)

# predict
y_pred = rf.predict(test_tfidf)

# accuracy
print(accuracy_score(y_test, y_pred))

# classification report
print(classification_report(y_test, y_pred))

0.8175092478421702
              precision    recall  f1-score   support

    AF Other       0.59      0.57      0.58        91
      Animal       0.90      0.85      0.88        75
   Conductor       0.48      0.56      0.52        39
  Connection       0.77      0.81      0.79       227
    Crossarm       0.83      0.95      0.88       116
      Dug up       0.91      0.94      0.93        99
        Fuse       0.83      0.86      0.85       105
Installation       0.25      0.14      0.18         7
   Lightning       0.88      0.88      0.88        33
    OH Cable       0.58      0.61      0.60        59
       Other       0.89      0.82      0.85       315
        Pole       0.77      0.57      0.66        84
       Trees       0.91      0.92      0.92       138
    UG Cable       0.00      0.00      0.00         6
     Vehicle       0.88      0.92      0.90       228

    accuracy                           0.82      1622
   macro avg       0.70      0.69      0.69      1622
weighte

In [31]:
# random forest classifier
rf2 = RandomForestClassifier(random_state=0, criterion='gini',  n_estimators=390)

# fit
rf2.fit(train_bow, y_train)

# predict
y_pred = rf2.predict(test_bow)

# accuracy
print(accuracy_score(y_test, y_pred))

# classification report
print(classification_report(y_test, y_pred))

0.8076448828606658
              precision    recall  f1-score   support

    AF Other       0.67      0.40      0.50        91
      Animal       0.90      0.76      0.83        75
   Conductor       0.54      0.36      0.43        39
  Connection       0.70      0.90      0.79       227
    Crossarm       0.78      0.93      0.85       116
      Dug up       0.89      0.89      0.89        99
        Fuse       0.86      0.87      0.86       105
Installation       0.00      0.00      0.00         7
   Lightning       0.93      0.82      0.87        33
    OH Cable       0.75      0.46      0.57        59
       Other       0.85      0.86      0.85       315
        Pole       0.75      0.57      0.65        84
       Trees       0.86      0.93      0.90       138
    UG Cable       0.00      0.00      0.00         6
     Vehicle       0.85      0.93      0.88       228

    accuracy                           0.81      1622
   macro avg       0.69      0.64      0.66      1622
weighte

In [32]:
# random forest classifier
rf = RandomForestClassifier(random_state=0, criterion='gini',  n_estimators=390)

# fit
rf.fit(train_bigram, y_train)

# predict
y_pred = rf.predict(test_bigram)

# accuracy
print(accuracy_score(y_test, y_pred))

# classification report
print(classification_report(y_test, y_pred))

0.717632552404439
              precision    recall  f1-score   support

    AF Other       0.45      0.40      0.42        91
      Animal       0.76      0.59      0.66        75
   Conductor       0.33      0.28      0.31        39
  Connection       0.67      0.77      0.72       227
    Crossarm       0.80      0.86      0.83       116
      Dug up       0.79      0.77      0.78        99
        Fuse       0.79      0.80      0.80       105
Installation       0.10      0.14      0.12         7
   Lightning       0.95      0.64      0.76        33
    OH Cable       0.49      0.39      0.43        59
       Other       0.75      0.82      0.78       315
        Pole       0.60      0.62      0.61        84
       Trees       0.77      0.77      0.77       138
    UG Cable       1.00      0.17      0.29         6
     Vehicle       0.82      0.77      0.80       228

    accuracy                           0.72      1622
   macro avg       0.67      0.59      0.60      1622
weighted

In [36]:
# use hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer

hashing_vectorizer = HashingVectorizer(analyzer='word', n_features=1000)
hashing_vectorizer.fit(clean_text)
train_hash = hashing_vectorizer.transform(x_train)
test_hash = hashing_vectorizer.transform(x_test)

# random forest classifier
rf = RandomForestClassifier(random_state=0, criterion='gini',  n_estimators=390)

# fit
rf.fit(train_hash, y_train)

# predict
y_pred = rf.predict(test_hash)

# accuracy
print(accuracy_score(y_test, y_pred))

# classification report
print(classification_report(y_test, y_pred))

0.7990135635018496
              precision    recall  f1-score   support

    AF Other       0.64      0.40      0.49        91
      Animal       0.90      0.76      0.83        75
   Conductor       0.45      0.26      0.33        39
  Connection       0.69      0.90      0.78       227
    Crossarm       0.80      0.91      0.85       116
      Dug up       0.93      0.87      0.90        99
        Fuse       0.84      0.86      0.85       105
Installation       0.00      0.00      0.00         7
   Lightning       1.00      0.79      0.88        33
    OH Cable       0.83      0.42      0.56        59
       Other       0.81      0.87      0.84       315
        Pole       0.75      0.54      0.63        84
       Trees       0.85      0.91      0.88       138
    UG Cable       0.00      0.00      0.00         6
     Vehicle       0.85      0.92      0.88       228

    accuracy                           0.80      1622
   macro avg       0.69      0.63      0.65      1622
weighte