In [None]:
!pip install texthero
import pandas as pd
import nltk
import texthero as hero
from texthero import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV 
from imblearn.over_sampling import SMOTE

import nltk 
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
import pickle
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [217]:
# Creating a list for columns to keep
cols = ['EventDescription', 'IncidentCause','IncidentConsequence', 'Category']

In [218]:
# Importing file
df = pd.read_csv('/content/drive/MyDrive/ESV Data/cleaned_incidents1.csv', usecols=cols)
print(df.shape)

#dropping nulls
df = df.dropna()
df.isnull().sum()

(6504, 4)


EventDescription       0
IncidentCause          0
IncidentConsequence    0
Category               0
dtype: int64

In [219]:
df['Category'].unique()

array(['OH Cable', 'Dug up', 'Connection', 'Other', 'Vehicle', 'Fuse',
       'Trees', 'Pole', 'Crossarm', 'Conductor', 'Animal', 'AF Other',
       'Lightning', 'UG Cable', 'Installation'], dtype=object)

In [220]:
df['Description'] =  df['EventDescription'] + ' ' + df['IncidentCause']+ ' ' + df['IncidentConsequence']

In [221]:
print(df['Description'])

0       A nearby customer reported sparking of electri...
1       A contractor reported that he had contacted an...
2       A field crew attending an outage found that a ...
3       Interfere and vandalism in substation. Unknown...
4       A nearby customer reported that a high load ha...
                              ...                        
6499    A report came in to UE of a FMB sparking at 31...
6500    Report received from a resident to advise that...
6501    Report received of pole fire at incident locat...
6502    A customer called to report a tractor had hit ...
6503    concrete electrical cover outside of front of ...
Name: Description, Length: 6488, dtype: object


In [222]:
df['clean_description'] = hero.clean(df['Description'])
df['clean_description']

0       nearby customer reported sparking electrical l...
1       contractor reported contacted earthing cable e...
2       field crew attending outage found 22kv conduct...
3       interfere vandalism substation unknown third p...
4       nearby customer reported high load pulled wire...
                              ...                        
6499    report came ue fmb sparking st kilda st bright...
6500    report received resident advise crane made con...
6501    report received pole fire incident location ar...
6502    customer called report tractor hit wire brough...
6503    concrete electrical cover outside front house ...
Name: clean_description, Length: 6488, dtype: object


Stopwords, Splitting, Label Encoding

In [223]:
# Label encoding for Category
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'].astype(str))
#store the 'Category' variable in Y
X = df['clean_description']
Y = df['Category']

In [224]:
print(Y)

0        9
1        5
2        3
3       10
4       14
        ..
6499     3
6500    14
6501    11
6502    14
6503     0
Name: Category, Length: 6488, dtype: int64


In [225]:
# Splitting of data in test and train
x_train, x_test, y_train, y_test = train_test_split(X,Y,
                                                    test_size=0.25, random_state=4)

In [226]:
print(x_train.shape,y_train.shape,x_test.shape,y_test.shape)

(4866,) (4866,) (1622,) (1622,)


Naive Bayes

TF-IDF

In [227]:
vectorizer = TfidfVectorizer(analyzer='word',max_features=5000)
vectorizer.fit(X)
tfidf_x_train=vectorizer.transform(x_train)
tfidf_x_test=vectorizer.transform(x_test)

In [228]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.1)
clf.fit(tfidf_x_train,y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [229]:
# Model evaluation
# xtest = x_test_tfidf.todense()
prediction = clf.predict(tfidf_x_test)
acc = accuracy_score(y_test, prediction).round(4)
print("Accuracy using TF-IDF is: {}%".format(acc * 100.0))

Accuracy using TF-IDF is: 77.99000000000001%


CountVect

In [230]:
count_vectorizer = CountVectorizer(ngram_range=(1,2))
count_vectorizer.fit(X)
train_bow = count_vectorizer.transform(x_train)
test_bow = count_vectorizer.transform(x_test)

In [231]:
clf.fit(train_bow,y_train)
prediction = clf.predict(test_bow)
acc = accuracy_score(y_test, prediction).round(4)
print("Accuracy using TF-IDF is: {}%".format(acc * 100.0))

Accuracy using TF-IDF is: 75.89%


SMOTE

In [232]:
oversample = SMOTE(random_state=0,n_jobs=-1,k_neighbors=10)
train_tfidf, y_train = oversample.fit_resample(tfidf_x_train, y_train)


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.



In [233]:
clf.fit(train_tfidf,y_train)
prediction = clf.predict(tfidf_x_test)
acc = accuracy_score(y_test, prediction).round(4)
print("Accuracy using TF-IDF + SMOTE is: {}%".format(acc * 100.0))

Accuracy using TF-IDF + SMOTE is: 75.03%


Findings: 

1. MultinomialNB with Tfidf and max_features=1000 gives the accuracy upto 76.759%. 

2. When using max_features=5000 with Tfidf, the accuracy is 77.99%

3. using countvectorizer with ngram_range=(1,2), the accuracy is only 75.89%

4. Tf-IDF (max_features = 5000) with SMOTE (n_neighbors = 10) gives 75.03 accuracy
