In [1]:
# configuration
config = {
    'FILE_PATH': 'cleaned_incidents1.csv'
    }


In [2]:
# import libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import string
    import re
    from time import time
    import texthero as hero
    from texthero import preprocessing
    from gensim.models import Word2Vec
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.tree import DecisionTreeClassifier 
    from imblearn.over_sampling import SMOTE
    
    from sklearn.svm import SVC

    from sklearn.feature_selection import chi2, SelectKBest
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    import warnings
except(ImportError):
    print(f'Import Error: {ImportError}')

# ignore warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# set seeds for reproducability
from numpy.random import seed
seed(500)

# global configurations
pd.set_option("display.max_colwidth", -1) # show larger text in pandas dataframe

In [3]:
# read csv
df = pd.read_csv(config['FILE_PATH'])

new_df = df[['EventDescription', 'FailedAssets', 'IncidentCause', 'IncidentConsequence', \
             'IncidentType', 'Status', 'WeatherStation', 'CauseEnvironment', 'CauseTechnical', \
             'CauseCommunity', 'ActionTaken', 'Category', 'IncidentLocationType', 'NetworkType', \
             'WeatherStation', 'Locality']]

new_df = new_df.dropna(axis=0, subset=['Category'])
new_df = new_df.replace(np.nan, '', regex=True)

In [7]:
df.columns

Index(['ActionTaken', 'Address', 'AssetLabel', 'CauseCommunity',
       'CauseEnvironment', 'CausePre', 'CauseTechnical', 'CauseWorkP',
       'ContactType', 'CorrectProtection', 'EventDescription', 'FailedAssets',
       'FailedExplosion', 'FailedOilFilled', 'FailedOtherAssets',
       'FailedOtherAssetsOther', 'FeederNumber', 'IncidentCause',
       'IncidentConsequence', 'IncidentDatetime',
       'IncidentFireFFactorReportable', 'IncidentFireSeverity', 'IncidentID',
       'IncidentLocationType', 'IncidentLocationTypeOther', 'IncidentNumber',
       'IncidentType', 'Lat', 'Long', 'MadeSafe', 'NetworkType', 'Status',
       'SubmissionID', 'SubmittedDateTimeString', 'Voltage', 'WeatherStation',
       'Postcode', 'Locality', 'Category'],
      dtype='object')

In [4]:
# Text Cleaning and Pre-processing
def preprocess_text():
    # cleaning steps
    cleaning_pipeline = [
        preprocessing.fillna,
        preprocessing.lowercase,
        preprocessing.remove_whitespace,
        preprocessing.remove_punctuation,
        preprocessing.remove_urls,
        preprocessing.remove_brackets,
        preprocessing.remove_stopwords,
        preprocessing.remove_digits,
        preprocessing.remove_angle_brackets,
        preprocessing.remove_curly_brackets,
        preprocessing.stem
    ]

    # apply pipeline to text
    clean_text = features.pipe(hero.clean, cleaning_pipeline)
    
    return clean_text

In [5]:
# Lemmatize with POS Tag
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def get_lematizer(sentence):
    clean_text =  (" ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence) if w not in string.punctuation]))
    return clean_text


In [7]:
# EDA Experiments

# Get features
features1 = new_df['CauseCommunity'] + ' ' + new_df['CauseEnvironment'] + ' ' + new_df['CauseTechnical'] + \
    ' ' + new_df['EventDescription'] + ' ' + new_df['FailedAssets'] + ' ' + new_df['IncidentCause'] + \
    ' ' + new_df['IncidentConsequence'] + ' ' + new_df['IncidentType'] + ' ' + new_df['ActionTaken']
        
features2 = new_df['EventDescription'] + ' ' + new_df['FailedAssets'] + ' ' + new_df['IncidentCause'] 

features3 = new_df['CauseCommunity'] + ' ' + new_df['CauseEnvironment'] + ' ' + new_df['CauseTechnical'] + \
    ' ' + new_df['EventDescription'] + ' ' + new_df['FailedAssets'] + ' ' + new_df['IncidentCause'] + \
    ' ' + new_df['IncidentConsequence'] + ' ' + new_df['IncidentType'] + ' ' + new_df['ActionTaken'] + \
    ' ' + new_df['IncidentLocationType'] + ' ' + new_df['NetworkType'] + ' ' + new_df['Locality']

# change here to experiment with different features
features = features1

target = new_df['Category']
    
# clean the data
clean_text = preprocess_text() # feature extraction/vectorize

# lemmatization - the process will take a while and do not increase the accuracy
# uncomment it if you want to experiment with this
#clean_text = clean_text.apply(lambda sentence: get_lematizer(sentence))

# preparation for feature selection
new_df['category_id'] = new_df['Category'].factorize()[0]
category_id_df = new_df[['Category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
#print(category_to_id)

# vectorize
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))

tfidf_features = tfidf.fit_transform(clean_text).toarray()
labels = new_df.category_id
print('After Tfidf:', tfidf_features.shape)
clean_text = tfidf_features

# feature selection - set num_of_selection to > 0  if you want to do feature selection 
# based on to find the terms that are the most correlated with each of Category

num_of_selection = 5000 
if num_of_selection > 0:
    ch2 = SelectKBest(chi2, k=num_of_selection)
    clean_text = ch2.fit_transform(tfidf_features, new_df['category_id'])
    print('finish feature selection: ', clean_text.shape)

# # split data
x_train, x_test, y_train, y_test = train_test_split(clean_text, target, random_state=0, test_size=0.25, shuffle=True)

# classification - DT
dt = DecisionTreeClassifier(random_state=1)
dt.fit(x_train ,y_train)
y_pred = dt.predict(x_test)
print('DT:', accuracy_score(y_test, y_pred))

# classification - SVM
svc = SVC(C=1.0, kernel = 'linear')
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
print('SVM:', accuracy_score(y_test, y_pred))

After Tfidf: (6489, 12364)
finish feature selection:  (6489, 5000)
DT: 0.7350585335797906


Results:
CauseCommunity, CauseEnvironment, CauseTechnical, EventDescription, FailedAssets, IncidentCause, IncidentConsequence
IncidentType, ActionTaken. Use all features. SVM: 84.47%, DT: 73.75%

Initial features after vectorisation: 12364

As above, but with feature selections 1000. SVM: 83.05%, DT: 73.14%, the SVM classification is much faster

As above, but with feature selections 3000. SVM: 83.06%, DT: 73.57%

As above, but with feature selections 5000. SVM: 84.47%, DT: 73.50%

EventDescription, FailedAssets, IncidentCause. No feature selections. SVM: 83.73%, DT: 71%

CauseCommunity, CauseEnvironment, CauseTechnical, EventDescription, FailedAssets, IncidentCause, IncidentConsequence
IncidentType, ActionTaken, IncidentLocationType, NetworkType, Locality. DT: 74%. SVM: %84.17
    


In [8]:
# show which terms related to each category
N = 5
for Category, category_id in sorted(category_to_id.items()):
  features_chi2 = chi2(tfidf_features, labels == category_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(Category))
  print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

# 'AF Other':
  . Most correlated unigrams:
       . compon
       . capacitor
       . hps
       . light
       . lantern
  . Most correlated bigrams:
       . report street
       . light lantern
       . public light
       . light fire
       . street light
# 'Animal':
  . Most correlated unigrams:
       . magpi
       . nest
       . flashov
       . possum
       . bird
  . Most correlated bigrams:
       . caus flashov
       . found bird
       . bird contact
       . bird unknown
       . bird flashov
# 'Conductor':
  . Most correlated unigrams:
       . span
       . splice
       . rusti
       . clash
       . spreader
  . Most correlated bigrams:
       . clash grassfir
       . bare conductor
       . conductor broken
       . wind conductor
       . conductor clash
# 'Connection':
  . Most correlated unigrams:
       . shower
       . corrod
       . box
       . overh
       . connect
  . Most correlated bigrams:
       . conductor termin
       . servic neutral
     

##Experiements##