In [1]:
# configuration
config = {
    'FILE_PATH': 'cleaned_incidents1.csv'
    }
config

{'FILE_PATH': 'cleaned_incidents1.csv'}

In [19]:
# import libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import string
    import re
    from time import time
    from imblearn.over_sampling import SMOTE
    import texthero as hero
    from texthero import preprocessing
    from gensim.models import Word2Vec
    import nltk
    from nltk.corpus import stopwords
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report, ConfusionMatrixDisplay
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import SGDClassifier
    from sklearn.svm import SVC
    from sklearn.calibration import CalibratedClassifierCV
    import warnings
except(ImportError):
    print(f'Import Error: {ImportError}')

# ignore warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# set seeds for reproducability
from numpy.random import seed
seed(500)

# global configurations
pd.set_option("display.max_colwidth", -1) # show larger text in pandas dataframe

In [4]:
'''
- select features
- drop missing values
- combine selected features into one
'''
def data(df):
    new_df = df[['EventDescription', 'FailedAssets', 'IncidentCause', 'IncidentConsequence', 'IncidentType', 'Status', 'WeatherStation', 'Category']]
    new_df.dropna(axis=0, inplace=True)
    features = new_df['EventDescription'] +' ' + new_df['IncidentCause'] + ' '+ new_df['IncidentConsequence']
    target = new_df['Category']
    return features, target

In [5]:
# read csv
dataset = pd.read_csv(config['FILE_PATH'])

# get features and target
features, target = data(dataset)

# Text Cleaning and Pre-processing
def preprocess_text():
    # cleaning steps
    cleaning_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_whitespace,
        preprocessing.remove_punctuation,
        preprocessing.remove_urls,
        preprocessing.remove_brackets,
        preprocessing.remove_stopwords,
        preprocessing.remove_digits,
        preprocessing.remove_angle_brackets,
        preprocessing.remove_curly_brackets,
        preprocessing.stem
        #preprocessing.tokenize
    ]

    # apply pipeline to text
    clean_text = features.pipe(hero.clean, cleaning_pipeline)

    return clean_text

In [6]:
# check processed text
clean_text = preprocess_text()
clean_text

0       nearbi custom report spark electr line locat attend crew found high voltag abc conductor fault midspan result ground fire approx sqm report injuri hvabc cabl fault midspan grassfir                                                                                                                                                                                                               
1       contractor report contact earth cabl excav trench locat arriv crew found contractor denni jame ph dig trench contact earth cabl caus damag cabl report injuri third parti properti damag contractor contact earth conductor go zone contact                                                                                                                                                        
2       field crew attend outag found 22kv conductor broken due rust connect sleev fallen ground one end remain aliv due high imped backfe downlin transform wind protect oper report injuri third parti propert

In [7]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4866,), (1622,), (4866,), (1622,))

In [8]:
# feature extraction methods
# tfidf
def tfidf():
    vectorizer = TfidfVectorizer(analyzer='word', max_features=1000)
    vectorizer.fit(clean_text)
    train_tfidf = vectorizer.transform(x_train)
    test_tfidf = vectorizer.transform(x_test)
    return train_tfidf, test_tfidf

# bow
def bow():
    count_vectorizer = CountVectorizer(analyzer='word', max_features=1000)
    count_vectorizer.fit(clean_text)
    train_bow = count_vectorizer.transform(x_train)
    test_bow = count_vectorizer.transform(x_test)
    return train_bow, test_bow

# bigrams
def bigrams():
    bigram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2,2), max_features=1000)
    bigram_vectorizer.fit(clean_text)
    train_bigram = bigram_vectorizer.transform(x_train)
    test_bigram = bigram_vectorizer.transform(x_test)
    return train_bigram, test_bigram

In [9]:
# get features
train_tfidf, test_tfidf = tfidf()
train_tfidf.shape, test_tfidf.shape

train_bigram, test_bigram = bigrams()

train_bow, test_bow = bow()

In [10]:
# SGD classifier with TFIDF
rf = SGDClassifier(random_state=0, alpha=0.0001, loss='hinge', max_iter=100, penalty='l2', n_jobs=-1)

# fit
rf.fit(train_tfidf, y_train)

# predict
y_pred = rf.predict(test_tfidf)

# accuracy
print(accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

0.8329223181257707
Classification Report:
               precision    recall  f1-score   support

    AF Other       0.53      0.53      0.53        91
      Animal       0.90      0.88      0.89        75
   Conductor       0.62      0.41      0.49        39
  Connection       0.75      0.90      0.82       227
    Crossarm       0.84      0.94      0.89       116
      Dug up       0.92      0.93      0.92        99
        Fuse       0.93      0.87      0.90       105
Installation       0.00      0.00      0.00         7
   Lightning       0.90      0.82      0.86        33
    OH Cable       0.66      0.53      0.58        59
       Other       0.87      0.88      0.87       315
        Pole       0.82      0.63      0.71        84
       Trees       0.90      0.88      0.89       138
    UG Cable       0.00      0.00      0.00         6
     Vehicle       0.92      0.95      0.93       228

    accuracy                           0.83      1622
   macro avg       0.70      0.68    

In [39]:
# grid search
param_grid = {
    'loss': ['hinge', 'log', 'modified_huber'],
    'penalty' : ['l2','l1', 'elasticnet'],
    'alpha' : [0.0001, 0.00001, 0.0005],
    'max_iter': [500, 800, 1000, 1200]

}
CV_rfc = GridSearchCV(estimator=SGDClassifier(random_state=0),
                      param_grid=param_grid, cv= 5, n_jobs=-1)
CV_rfc.fit(train_tfidf, y_train)
print(CV_rfc.best_params_)
print(CV_rfc.best_score_)

{'alpha': 0.0001, 'loss': 'log', 'max_iter': 500, 'penalty': 'l1'}
0.82881390985774


In [11]:
# SGD classifier with TFIDF
rf = SGDClassifier(random_state=0, n_jobs=-1, loss='log', max_iter=500, penalty='l1')

# fit
rf.fit(train_tfidf, y_train)

# predict
y_pred = rf.predict(test_tfidf)

# accuracy
accuracy_score(y_test, y_pred)

0.8286066584463625

In [12]:
# get features
train_tfidf, test_tfidf = tfidf()
train_tfidf.shape, test_tfidf.shape

train_bigram, test_bigram = bigrams()

train_bow, test_bow = bow()

In [13]:
oversample = SMOTE(random_state=0,n_jobs=-1,k_neighbors=5)
train_tfidf, y_train = oversample.fit_resample(train_tfidf, y_train)
print(f'Shape: {train_tfidf.shape}')
print(y_train.value_counts())

Shape: (15090, 1000)
Crossarm        1006
Lightning       1006
Connection      1006
Conductor       1006
UG Cable        1006
Installation    1006
Fuse            1006
Pole            1006
OH Cable        1006
Vehicle         1006
Animal          1006
AF Other        1006
Dug up          1006
Other           1006
Trees           1006
Name: Category, dtype: int64


In [21]:
# SGD classifier with TFIDF
sgd = SGDClassifier(random_state=0, max_iter=5000, n_jobs=-1)

# fit
sgd.fit(train_tfidf, y_train)

# predict
y_pred = rf.predict(test_tfidf)

# accuracy
accuracy_score(y_test, y_pred)

0.8242909987669543

In [26]:
# grid search
param_grid = {
    'loss': ['hinge', 'log'],
    'penalty' : ['l2','l1', 'elasticnet'],
    'alpha' : [0.0001, 0.001, 0.01],
    'max_iter': [1000, 2000, 5000, 8000]

}
CV_sgd = GridSearchCV(estimator=SGDClassifier(random_state=0),
                      param_grid=param_grid, cv= 5, n_jobs=-1)
CV_sgd.fit(train_tfidf, y_train)
print(CV_sgd.best_params_)
print(CV_sgd.best_score_)

{'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
0.9380384360503644


In [14]:
# SGD classifier with TFIDF
sgd = SGDClassifier(random_state=0, alpha=0.0001, loss='hinge', max_iter=1000, penalty='l2', n_jobs=-1)

# fit
sgd.fit(train_tfidf, y_train)

# predict
y_pred = sgd.predict(test_tfidf)

# accuracy
accuracy_score(y_test, y_pred)

0.8242909987669543

In [20]:
calibrator = CalibratedClassifierCV(sgd, cv='prefit')
sgd=calibrator.fit(train_tfidf, y_train)

y_train_pred = sgd.predict_proba(train_tfidf)
y_test_pred = sgd.predict_proba(test_tfidf)

train proba: [[3.93549788e-02 1.03363528e-04 7.17322352e-04 ... 1.02406432e-04
  7.41410394e-06 6.73492518e-05]
 [9.79981425e-02 2.51615235e-02 6.99433632e-01 ... 2.17490662e-02
  3.80493714e-04 3.29265132e-02]
 [1.91265185e-04 2.19319510e-05 3.19567842e-03 ... 2.90667684e-04
  3.83388901e-08 5.99863496e-01]
 ...
 [5.11730606e-07 2.38084437e-03 7.91652150e-03 ... 1.87054299e-04
  6.09606234e-08 9.86163738e-01]
 [7.77361729e-04 8.72246761e-05 2.39120853e-02 ... 1.02288938e-03
  1.46396934e-06 9.59954806e-01]
 [2.04419016e-04 2.09317432e-04 8.16809235e-06 ... 3.87167390e-05
  5.09346080e-07 9.66785706e-01]]
test proba: [[6.18494229e-02 1.38052444e-04 6.39777105e-04 ... 6.65910326e-05
  2.09972166e-07 1.44493871e-04]
 [1.83140188e-03 5.12124968e-06 6.20007466e-04 ... 1.12436293e-04
  1.84410891e-08 1.31017611e-03]
 [2.39895099e-03 1.48623617e-05 2.78145513e-03 ... 7.70815145e-05
  2.94837582e-07 9.83379985e-01]
 ...
 [4.83044319e-03 1.03379257e-04 8.05992153e-06 ... 1.21942615e-04
  1.426

In [21]:
#saving the model into a package
import pickle
pickle.dump(sgd, open("sgd.pickle", "wb"))