# Importing Libraries

In [276]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV 

import nltk 
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
import pickle
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [255]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Dataset

In [256]:
# Creating a list for columns to keep
cols = ['EventDescription','FailedAssets','IncidentCause','IncidentConsequence','IncidentType','WeatherStation','Status','Category']

In [257]:
# Importing file
df = pd.read_csv('/content/drive/MyDrive/ESV Data/cleaned_incidents1.csv', usecols=cols)
print(df.shape)

#dropping nulls
df = df.dropna()
df.isnull().sum()

(6504, 8)


EventDescription       0
FailedAssets           0
IncidentCause          0
IncidentConsequence    0
IncidentType           0
Status                 0
WeatherStation         0
Category               0
dtype: int64

In [258]:
df['Description'] = df['WeatherStation'] + ' ' + df['IncidentType'] + ' ' + df['Status'] + ' ' + df['EventDescription'] + ' ' + df['FailedAssets'] + ' ' + df['IncidentCause']+ ' ' + df['IncidentConsequence']

In [259]:
df['Description']

0       Avalon Airport Infrastructure (network-based) ...
1       Laverton Raaf Infrastructure (network-based) R...
2       Warrnambool Airport Ndb Infrastructure (networ...
3       Essendon Airport Infrastructure (network-based...
4       Avalon Airport Infrastructure (network-based) ...
                              ...                        
6499    Moorabbin Airport Infrastructure (network-base...
6500    Avalon Airport Infrastructure (network-based) ...
6501    Mildura Airport Infrastructure (network-based)...
6502    Swan Hill Aerodrome Infrastructure (network-ba...
6503    Moorabbin Airport Infrastructure (network-base...
Name: Description, Length: 6488, dtype: object

# Stopwords, Splitting, Label Encoding

In [260]:
# Creating stopwords list

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
numerical_symbols = re.compile('0-90-9a-z')
 
STOPWORDS = set(stopwords.words('english'))
 
def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = numerical_symbols.sub('', text)
    text = text.replace('x', '')
    #text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

df['Description'] = df['Description'].apply(clean_text)


'''
def getLemmText(text):
    tokens=word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens=[lemmatizer.lemmatize(word) for word in tokens]
    
    #ps = PorterStemmer()
    #tokens=[ps.stem(word) for word in tokens]
    return ' '.join(tokens)
df['Description'] = list(map(getLemmText,df['Description']))


def getStemmText(text):
    tokens=word_tokenize(text)
    ps = PorterStemmer()
    tokens=[ps.stem(word) for word in tokens]
    return ' '.join(tokens)
df['Description'] = list(map(getStemmText,df['Description']))

'''

"\ndef getLemmText(text):\n    tokens=word_tokenize(text)\n    lemmatizer = WordNetLemmatizer()\n    tokens=[lemmatizer.lemmatize(word) for word in tokens]\n    \n    #ps = PorterStemmer()\n    #tokens=[ps.stem(word) for word in tokens]\n    return ' '.join(tokens)\ndf['Description'] = list(map(getLemmText,df['Description']))\n\n\ndef getStemmText(text):\n    tokens=word_tokenize(text)\n    ps = PorterStemmer()\n    tokens=[ps.stem(word) for word in tokens]\n    return ' '.join(tokens)\ndf['Description'] = list(map(getStemmText,df['Description']))\n\n"

In [261]:
df['Description']

0       avalon airport infrastructure networkbased rep...
1       laverton raaf infrastructure networkbased repo...
2       warrnambool airport ndb infrastructure network...
3       essendon airport infrastructure networkbased r...
4       avalon airport infrastructure networkbased rep...
                              ...                        
6499    moorabbin airport infrastructure networkbased ...
6500    avalon airport infrastructure networkbased rep...
6501    mildura airport infrastructure networkbased re...
6502    swan hill aerodrome infrastructure networkbase...
6503    moorabbin airport infrastructure networkbased ...
Name: Description, Length: 6488, dtype: object

In [262]:
# Label encoding for Category
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'].astype(str))
#store the 'Category' variable in Y
Y = df[['Category']]

In [263]:
# Splitting of data in test and train
x_train, x_test, y_train, y_test = train_test_split(X,Y, 
                                                    test_size=0.3, random_state=1)

# SVC using countvect

In [265]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(df['Description']) #learns the vocubularly by 'fit'
x_train = vect.transform(x_train)
x_test = vect.transform(x_test)

In [266]:
x_train.shape
y_train.shape

(4541, 1)

In [279]:
  # defining parameter range 
  #perfrom gridsearch to find the best params for SVC
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf', 'linear']}  
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
grid.fit(x_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.206, total=  21.4s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.4s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.206, total=  21.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   43.0s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.206, total=  21.4s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.206, total=  21.5s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.206, total=  21.4s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.831, total=   6.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.817, total=   6.1s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.806, total=   5.8s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.824, total=   6.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 39.2min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [280]:
grid.best_params_

{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}

In [274]:
#svc_model = SVC(C = 3.0, kernel = 'linear')
#svc_model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=5.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [283]:
# Model evaluation
accuracy = accuracy_score(y_test,grid.predict(x_test))
print("Accuracy score of SVM model is  ",accuracy)

Accuracy score of SVM model is   0.8315356959424756


In [285]:
#Classification
print(classification_report(y_test,grid.predict(x_test)))

              precision    recall  f1-score   support

           0       0.63      0.62      0.63       116
           1       0.88      0.73      0.80        82
           2       0.62      0.53      0.57        60
           3       0.77      0.87      0.82       278
           4       0.80      0.92      0.86       132
           5       0.88      0.91      0.90       125
           6       0.84      0.86      0.85       132
           7       1.00      0.10      0.18        10
           8       0.90      0.86      0.88        42
           9       0.67      0.58      0.62        59
          10       0.91      0.89      0.90       386
          11       0.68      0.57      0.62        89
          12       0.91      0.92      0.92       171
          13       0.00      0.00      0.00         6
          14       0.91      0.93      0.92       259

    accuracy                           0.83      1947
   macro avg       0.76      0.69      0.70      1947
weighted avg       0.83   