In [84]:
# configuration
config = {
    'FILE_PATH': '/Users/pradeep/Desktop/ProjectANotebooks/notebooks/cleaned_incidents1.csv'
    }
config

{'FILE_PATH': '/Users/pradeep/Desktop/ProjectANotebooks/notebooks/cleaned_incidents1.csv'}

In [85]:
# import libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import string
    import re
    from time import time
    from imblearn.over_sampling import SMOTE
    import texthero as hero
    from texthero import preprocessing
    from autocorrect import spell
    from gensim.models import Word2Vec
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report, ConfusionMatrixDisplay
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import StackingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.neural_network import MLPClassifier
    import warnings
except(ImportError):
    print(f'Import Error: {ImportError}')

# ignore warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# set seeds for reproducability
from numpy.random import seed
seed(500)

# global configurations
pd.set_option("display.max_colwidth", -1) # show larger text in pandas dataframe

In [86]:
'''
- select features
- drop missing values
- combine selected features into one
'''
def data(df):
    new_df = df[['EventDescription', 'FailedAssets', 'IncidentCause', 'IncidentConsequence', 'IncidentType', 'Status', 'WeatherStation', 'Category']]
    new_df.dropna(axis=0, inplace=True)
    features = new_df['EventDescription'] + ' ' +  new_df['IncidentCause'] + ' ' +  new_df['IncidentConsequence']
    target = new_df['Category']
    return features, target

In [87]:
# read csv
dataset = pd.read_csv(config['FILE_PATH'])

# get features and target
features, target = data(dataset)

# Text Cleaning and Pre-processing
def preprocess_text():
    # cleaning steps
    cleaning_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_whitespace,
        preprocessing.remove_punctuation,
        preprocessing.remove_urls,
        preprocessing.remove_brackets,
        preprocessing.remove_stopwords,
        preprocessing.remove_digits,
        preprocessing.remove_angle_brackets,
        preprocessing.remove_curly_brackets,
        preprocessing.stem
        #preprocessing.tokenize
    ]

    # apply pipeline to text
    clean_text = features.pipe(hero.clean, cleaning_pipeline)

    return clean_text

In [88]:
# check processed text
clean_text = preprocess_text()
clean_text

0       nearbi custom report spark electr line locat attend crew found high voltag abc conductor fault midspan result ground fire approx sqm report injuri hvabc cabl fault midspan grassfir                                                                                                                                                                                                               
1       contractor report contact earth cabl excav trench locat arriv crew found contractor denni jame ph dig trench contact earth cabl caus damag cabl report injuri third parti properti damag contractor contact earth conductor go zone contact                                                                                                                                                        
2       field crew attend outag found 22kv conductor broken due rust connect sleev fallen ground one end remain aliv due high imped backfe downlin transform wind protect oper report injuri third parti propert

In [89]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4866,), (1622,), (4866,), (1622,))

In [90]:
# feature extraction methods
# tfidf
def tfidf():
    vectorizer = TfidfVectorizer(analyzer='word', max_features=10000)
    vectorizer.fit(clean_text)
    train_tfidf = vectorizer.transform(x_train)
    test_tfidf = vectorizer.transform(x_test)
    return train_tfidf, test_tfidf

# bow
def bow():
    count_vectorizer = CountVectorizer(analyzer='word', max_features=1000)
    count_vectorizer.fit(clean_text)
    train_bow = count_vectorizer.transform(x_train)
    test_bow = count_vectorizer.transform(x_test)
    return train_bow, test_bow

# bigrams
def bigrams():
    bigram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2,2), max_features=1000)
    bigram_vectorizer.fit(clean_text)
    train_bigram = bigram_vectorizer.transform(x_train)
    test_bigram = bigram_vectorizer.transform(x_test)
    return train_bigram, test_bigram

In [91]:
# get features
train_tfidf, test_tfidf = tfidf()
train_tfidf.shape, test_tfidf.shape

train_bigram, test_bigram = bigrams()

train_bow, test_bow = bow()

In [9]:
# classifier
mlp_clf = MLPClassifier(random_state=0)

# fit
mlp_clf.fit(train_tfidf, y_train)

# predict
y_pred = mlp_clf.predict(test_tfidf)

# print accuraccy
print(f'MLP accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%')

# classification report
print(classification_report(y_test, y_pred))

MLP accuracy: 78.24%
              precision    recall  f1-score   support

    AF Other       0.43      0.52      0.47        91
      Animal       0.82      0.81      0.82        75
   Conductor       0.46      0.44      0.45        39
  Connection       0.72      0.79      0.75       227
    Crossarm       0.79      0.84      0.82       116
      Dug up       0.94      0.85      0.89        99
        Fuse       0.88      0.83      0.85       105
Installation       0.25      0.14      0.18         7
   Lightning       0.91      0.64      0.75        33
    OH Cable       0.53      0.53      0.53        59
       Other       0.83      0.87      0.85       315
        Pole       0.70      0.61      0.65        84
       Trees       0.91      0.80      0.85       138
    UG Cable       0.00      0.00      0.00         6
     Vehicle       0.92      0.91      0.91       228

    accuracy                           0.78      1622
   macro avg       0.67      0.64      0.65      1622
weigh

In [20]:
%%timeit
# grid search parameters
param_grid = {
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'alpha' : [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'max_iter':[200, 300, 500],
    'early_stopping': [True],
    'hidden_layer_sizes': [(100,), (200,), (250,)]
}

# build stacking classifier
mlp_clf = MLPClassifier(random_state=0)

# grid search
grid_search_mlp = GridSearchCV(mlp_clf, param_grid, 'accuracy', -1, cv=5)

# fit
grid_search_mlp.fit(train_tfidf, y_train)

# print
print(f'best score is {grid_search_mlp.best_score_}')

print(f'best params is {grid_search_mlp.best_params_}')

best score is 0.8123730877427715
best params is {'alpha': 0.01, 'early_stopping': True, 'hidden_layer_sizes': (250,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 200}


KeyboardInterrupt: 

In [74]:
# classifier
mlp_clf = MLPClassifier(random_state=0, alpha=0.01, early_stopping=True,
                        hidden_layer_sizes=(500,),
                        learning_rate='constant',
                        learning_rate_init= 0.001, max_iter= 100
                        )
# fit
mlp_clf.fit(train_tfidf, y_train)

# predict
y_pred = mlp_clf.predict(test_tfidf)

# print accuraccy
print(f'MLP accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%')

# classification report
print(classification_report(y_test, y_pred))

MLP accuracy: 83.35%
              precision    recall  f1-score   support

    AF Other       0.61      0.55      0.58        91
      Animal       0.88      0.91      0.89        75
   Conductor       0.51      0.51      0.51        39
  Connection       0.75      0.89      0.82       227
    Crossarm       0.88      0.90      0.89       116
      Dug up       0.94      0.90      0.92        99
        Fuse       0.90      0.87      0.88       105
Installation       0.00      0.00      0.00         7
   Lightning       0.92      0.67      0.77        33
    OH Cable       0.63      0.54      0.58        59
       Other       0.86      0.90      0.88       315
        Pole       0.73      0.73      0.73        84
       Trees       0.90      0.86      0.88       138
    UG Cable       0.00      0.00      0.00         6
     Vehicle       0.95      0.92      0.94       228

    accuracy                           0.83      1622
   macro avg       0.70      0.68      0.68      1622
weigh

In [92]:
# classifier
mlp_clf = MLPClassifier(random_state=0, alpha=0.01, early_stopping=True,
                        hidden_layer_sizes=(500,),
                        learning_rate='constant',
                        learning_rate_init= 0.001, max_iter= 100
                        )
# fit
mlp_clf.fit(train_bow, y_train)

# predict
y_pred = mlp_clf.predict(test_bow)

# print accuraccy
print(f'MLP accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%')

# classification report
print(classification_report(y_test, y_pred))

MLP accuracy: 82.55%
              precision    recall  f1-score   support

    AF Other       0.57      0.57      0.57        91
      Animal       0.84      0.83      0.83        75
   Conductor       0.48      0.51      0.49        39
  Connection       0.76      0.88      0.82       227
    Crossarm       0.86      0.87      0.86       116
      Dug up       0.92      0.93      0.92        99
        Fuse       0.88      0.88      0.88       105
Installation       0.00      0.00      0.00         7
   Lightning       0.89      0.76      0.82        33
    OH Cable       0.66      0.56      0.61        59
       Other       0.87      0.87      0.87       315
        Pole       0.73      0.68      0.70        84
       Trees       0.92      0.83      0.87       138
    UG Cable       0.67      0.33      0.44         6
     Vehicle       0.93      0.94      0.94       228

    accuracy                           0.83      1622
   macro avg       0.73      0.70      0.71      1622
weigh

In [10]:
oversample = SMOTE(random_state=0,n_jobs=-1,k_neighbors=5)
train_tfidf, y_train = oversample.fit_resample(train_tfidf, y_train)
print(f'Shape: {train_tfidf.shape}')
print(y_train.value_counts())


Shape: (15090, 1000)
Animal          1006
Crossarm        1006
Conductor       1006
Vehicle         1006
Trees           1006
Other           1006
Dug up          1006
OH Cable        1006
Installation    1006
Fuse            1006
AF Other        1006
Connection      1006
UG Cable        1006
Pole            1006
Lightning       1006
Name: Category, dtype: int64


In [11]:
# classifier
mlp_clf = MLPClassifier(random_state=0)

# fit
mlp_clf.fit(train_tfidf, y_train)

# predict
y_pred = mlp_clf.predict(test_tfidf)

# print accuraccy
print(f'MLP accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%')

# classification report
print(classification_report(y_test, y_pred))

MLP accuracy: 77.99%
              precision    recall  f1-score   support

    AF Other       0.44      0.52      0.47        91
      Animal       0.81      0.80      0.81        75
   Conductor       0.41      0.49      0.45        39
  Connection       0.72      0.78      0.75       227
    Crossarm       0.80      0.85      0.83       116
      Dug up       0.95      0.85      0.90        99
        Fuse       0.86      0.83      0.84       105
Installation       0.29      0.29      0.29         7
   Lightning       0.88      0.64      0.74        33
    OH Cable       0.47      0.51      0.49        59
       Other       0.84      0.87      0.86       315
        Pole       0.71      0.58      0.64        84
       Trees       0.92      0.80      0.85       138
    UG Cable       0.00      0.00      0.00         6
     Vehicle       0.93      0.90      0.92       228

    accuracy                           0.78      1622
   macro avg       0.67      0.65      0.65      1622
weigh