In [1]:
# configuration
config = {
    'FILE_PATH': 'cleaned_incidents1.csv'
    }
config

{'FILE_PATH': 'cleaned_incidents1.csv'}

In [33]:
# import libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import string
    import re
    from time import time
    import texthero as hero
    from texthero import preprocessing
    from gensim.models import Word2Vec
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report, ConfusionMatrixDisplay
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import StackingClassifier
    from sklearn.svm import SVC
    import warnings
except(ImportError):
    print(f'Import Error: {ImportError}')

# ignore warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# set seeds for reproducability
from numpy.random import seed
seed(500)

# global configurations
pd.set_option("display.max_colwidth", -1) # show larger text in pandas dataframe

In [115]:
'''
- select features
- drop missing values
- combine selected features into one
'''
def data(df):
    new_df = df[['EventDescription', 'FailedAssets', 'IncidentCause', 'IncidentConsequence', 'IncidentType', 'Status', 'WeatherStation', 'Category']]
    new_df.dropna(axis=0, inplace=True)
    features = new_df['IncidentType'] + ' ' + new_df['Status'] + ' ' + new_df['WeatherStation'] + new_df['EventDescription'] + new_df['FailedAssets'] + new_df['IncidentCause'] + new_df['IncidentConsequence']
    target = new_df['Category']
    return features, target

In [116]:
# read csv
dataset = pd.read_csv(config['FILE_PATH'])

# get features and target
features, target = data(dataset)

# Text Cleaning and Pre-processing
def preprocess_text():
    # cleaning steps
    cleaning_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_whitespace,
        preprocessing.remove_punctuation,
        preprocessing.remove_urls,
        preprocessing.remove_brackets,
        preprocessing.remove_stopwords,
        preprocessing.remove_digits,
        preprocessing.remove_angle_brackets,
        preprocessing.remove_curly_brackets,
        preprocessing.stem
        #preprocessing.tokenize
    ]

    # apply pipeline to text
    clean_text = features.pipe(hero.clean, cleaning_pipeline)

    return clean_text

In [117]:
# check processed text
clean_text = preprocess_text()
clean_text

0       infrastructure  network based  report avalon airporta nearby customer reported sparking  electrical lines    location   attendance  crew found   high voltage abc conductor  faulted midspan resulting   ground fire  approx   sqm     reported injuries conductor  abc hvabc cable faulted midspangrassfire                                                                                                                                                                                                                                                                                                           
1       infrastructure  network based  report laverton raafa contractor reported    contacted  earthing cable  excavating  trench    location   arrival  crew found   contractor  dennis james  ph          digging  trench  contacted  earthing cable  causing damage   cable     reported injuries  third party property damages  failed assetcontractor contacted earthing conductorno go zone  conta

In [118]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4866,), (1622,), (4866,), (1622,))

In [128]:
# feature extraction methods
# tfidf
def tfidf():
    vectorizer = TfidfVectorizer(analyzer='word', max_features=1000)
    vectorizer.fit(clean_text)
    train_tfidf = vectorizer.transform(x_train)
    test_tfidf = vectorizer.transform(x_test)
    return train_tfidf, test_tfidf

# bow
def bow():
    count_vectorizer = CountVectorizer(analyzer='word', max_features=1000)
    count_vectorizer.fit(clean_text)
    train_bow = count_vectorizer.transform(x_train)
    test_bow = count_vectorizer.transform(x_test)
    return train_bow, test_bow

# bigrams
def bigrams():
    bigram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2,2), max_features=1000)
    bigram_vectorizer.fit(clean_text)
    train_bigram = bigram_vectorizer.transform(x_train)
    test_bigram = bigram_vectorizer.transform(x_test)
    return train_bigram, test_bigram

In [129]:
# get features
train_tfidf, test_tfidf = tfidf()
train_tfidf.shape, test_tfidf.shape

train_bigram, test_bigram = bigrams()

train_bow, test_bow = bow()

In [131]:
# random forest classifier with TFIDF
rf = RandomForestClassifier(random_state=0)

# fit
rf.fit(train_tfidf, y_train)

# predict
y_pred = rf.predict(test_tfidf)

# accuracy
accuracy_score(y_test, y_pred)

0.7879161528976573

In [140]:
# grid search
param_grid = {
    'n_estimators': [375,380,385],
    'max_depth' : [29,30,31],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                      param_grid=param_grid, cv= 5, n_jobs=-1)
CV_rfc.fit(train_tfidf, y_train)
CV_rfc.best_params_

{'criterion': 'gini', 'max_depth': 30, 'n_estimators': 375}

In [141]:
# random forest classifier
rf = RandomForestClassifier(random_state=0, criterion='gini', max_depth=30, n_estimators=375)

# fit
rf.fit(train_tfidf, y_train)

# predict
y_pred = rf.predict(test_tfidf)

# accuracy
accuracy_score(y_test, y_pred)

0.7842170160295932

##Experiements##

In [148]:
# random forest with bow
count_vectorizer = CountVectorizer(analyzer='word', max_features=1000)
count_vectorizer.fit(clean_text)
train_bow = count_vectorizer.transform(x_train)
test_bow = count_vectorizer.transform(x_test)

# random forest classifier
rf = RandomForestClassifier(random_state=0)

# fit
rf.fit(train_bow, y_train)

# predict
y_pred = rf.predict(test_bow)

# accuracy
accuracy_score(y_test, y_pred)

0.3686806411837238

In [146]:
# RF with bigram
bigram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2,2), max_features=1000)
bigram_vectorizer.fit(clean_text)
train_bigram = bigram_vectorizer.transform(x_train)
test_bigram = bigram_vectorizer.transform(x_test)


# random forest classifier
rf = RandomForestClassifier(random_state=0, criterion='gini', max_depth=30, n_estimators=375)

# fit
rf.fit(train_bigram, y_train)

# predict
y_pred = rf.predict(test_bigram)

# accuracy
accuracy_score(y_test, y_pred)

0.15782983970406905

In [None]:
# RF with TFIDF using max_feature 5000
vectorizer = TfidfVectorizer(analyzer='word', max_features=3000)
vectorizer.fit(clean_text)
train_tfidf = vectorizer.transform(x_train)
test_tfidf = vectorizer.transform(x_test)

# grid search
param_grid = {
    'n_estimators': [100,150,200,300,500],
    'max_depth' : [20,25,30,35],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                      param_grid=param_grid, cv= 5, n_jobs=-1)
CV_rfc.fit(train_tfidf, y_train)
print(CV_rfc.best_params_)
print(CV_rfc.best_score_)