In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV 

import nltk 
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
import pickle
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

Collecting texthero
  Downloading https://files.pythonhosted.org/packages/1f/5a/a9d33b799fe53011de79d140ad6d86c440a2da1ae8a7b24e851ee2f8bde8/texthero-1.0.9-py3-none-any.whl
Collecting unidecode>=1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 7.1MB/s 
[?25hCollecting nltk>=3.3
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 12.6MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp36-none-any.whl size=1434675 sha256=4f02aacfa9c9bb5ed1668143e36b11bc42d340439a2816991c5a624d5e2155a7
  Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f860

In [None]:
# Creating a list for columns to keep
cols = ['EventDescription','IncidentCause','IncidentConsequence', 'Category']

In [None]:
# Importing file
df = pd.read_csv('cleaned_incidents1.csv', usecols=cols)
print(df.shape)

#dropping nulls
df = df.dropna()
df.isnull().sum()

(6504, 4)


EventDescription       0
IncidentCause          0
IncidentConsequence    0
Category               0
dtype: int64

In [None]:
df['Description'] = df['EventDescription'] + ' ' + df['IncidentCause'] + ' ' + df['IncidentConsequence']

In [None]:
print(df['Description'])

0       A nearby customer reported sparking of electri...
1       A contractor reported that he had contacted an...
2       A field crew attending an outage found that a ...
3       Interfere and vandalism in substation. Unknown...
4       A nearby customer reported that a high load ha...
                              ...                        
6499    A report came in to UE of a FMB sparking at 31...
6500    Report received from a resident to advise that...
6501    Report received of pole fire at incident locat...
6502    A customer called to report a tractor had hit ...
6503    concrete electrical cover outside of front of ...
Name: Description, Length: 6488, dtype: object


Stopwords, Splitting, Label Encoding

In [None]:
# Creating stopwords list

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
numerical_symbols = re.compile('0-90-9a-z')
 
STOPWORDS = set(stopwords.words('english'))
 
def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = numerical_symbols.sub('', text)
    text = text.replace('x', '')
    #text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

df['Description'] = df['Description'].apply(clean_text)


In [None]:
print(df['Description'])

0       nearby customer reported sparking electrical l...
1       contractor reported contacted earthing cable e...
2       field crew attending outage found 22kv conduct...
3       interfere vandalism substation unknown third p...
4       nearby customer reported high load pulled wire...
                              ...                        
6499    report came ue fmb sparking 310 st kilda st br...
6500    report received resident advise crane made con...
6501    report received pole fire incident locationon ...
6502    customer called report tractor hit wire brough...
6503    concrete electrical cover outside front house ...
Name: Description, Length: 6488, dtype: object


In [None]:
# Label encoding for Category
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'].astype(str))
#store the 'Category' variable in Y
X = df[['Description']]
Y = df[['Category']]

In [None]:
# Splitting of data in test and train
x_train, x_test, y_train, y_test = train_test_split(df['Description'],Y, 
                                                    test_size=0.25, random_state=4)

XGBoost with three features (Event Description, Incident Cause and Incident Consequence)

In [None]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
vectorizer = TfidfVectorizer(stop_words=stopwords, analyzer='word', max_features=1000)
tfidf = vectorizer.fit(df['Description'])
# tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

In [None]:
x_train_tfidf =  tfidf.transform(x_train)
x_test_tfidf =  tfidf.transform(x_test)

In [None]:
from xgboost import XGBClassifier
clf = XGBClassifier(learning_rate=0.1, verbosity=2)
clf.fit(x_train_tfidf, y_train)

In [None]:
# Model evaluation
prediction = clf.predict(x_test_tfidf)

acc = accuracy_score(y_test, prediction).round(4)
print("Accuracy using TF-IDF is: {}%".format(acc * 100.0))

Accuracy using TF-IDF is: 80.64%


In [None]:
# grid search
param_grid = {
    'learning_rate': [0.10, 0.20, 0.50],
    'max_depth' : [3, 4],
    'verbosity' : [1, 2],
    'min_child_weight' :[1, 2],
    # 'gamma' : [0.1, 0.2],
    # 'colsample_bytree' : [0.3, 0.4, 0.5, 0.7]
}
cv_xgb = GridSearchCV(estimator=XGBClassifier(random_state=4),
                      param_grid=param_grid, cv= 5, n_jobs=-1)
cv_xgb.fit(x_train_tfidf, y_train)
cv_xgb.best_params_
print(cv_xgb.best_score_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.8288107443056993


In [None]:
print(cv_xgb.best_params_)

{'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 2, 'verbosity': 1}


In [None]:
print(cv_xgb.best_params_)
#Best Parameters : {learning_rate : 0.1, max_depth: 4, min_child_weight: 2, verbosity: 1}

Optimized Model with Best Parameters

In [None]:
# clf = XGBClassifier(learning_rate=0.5, max_depth=3, gamma= 0.1, min_child_weight=5, verbosity=3)
clf = XGBClassifier(learning_rate=0.1, max_depth=4, verbosity=1, min_child_weight=2)
clf.fit(x_train_tfidf, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
prediction = clf.predict(x_test_tfidf)

acc = accuracy_score(y_test, prediction).round(4)
print("Accuracy using TF-IDF is: {}%".format(acc * 100.0))

Accuracy using TF-IDF is: 81.26%
