<a href="https://colab.research.google.com/github/xcsengody/DP/blob/master/xcsengody_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive inside the Colab runtime (force_remount=True is passed so
# the mount is requested again even when re-running this cell).
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
# Base folder for every data/model path below. It ends with a slash, so later
# cells build paths by plain string concatenation.
root='/content/drive/My Drive/DP/'

Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import pylab as pl
from pylab import rcParams
from collections import Counter
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import (SMOTE,SVMSMOTE,ADASYN)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import class_weight
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib



In [0]:
def predict(clf, X_train, y_train, X_test, y_test, bin):
  """Print evaluation metrics for an already fitted classifier.

  Parameters
  ----------
  clf : fitted estimator exposing ``predict`` (and ``predict_proba`` when
      ``bin`` is truthy).
  X_train, y_train : training features / labels. Only used for the
      cross-validation runs, i.e. when ``bin`` is truthy.
  X_test, y_test : features / labels the reported metrics are computed on.
      ``y_test`` should be a pandas Series or array (it is fed to
      ``pd.crosstab``).
  bin : truthy for a binary problem. Enables the ROC-AUC cross-validation and
      the test-set ROC-AUC. (The name shadows the builtin ``bin``; it is kept
      so existing keyword callers continue to work.)

  Returns
  -------
  None. Everything is reported through ``print``.
  """
  prediction = clf.predict(X_test)

  accuracy = accuracy_score(y_test, prediction)
  print("Accuracy: %.2f%%" % (accuracy * 100.0))

  # average=None yields one F1 value per class instead of a single aggregate.
  f1_per_class = metrics.f1_score(y_test, prediction, average=None)
  print("F1 score: {}".format(f1_per_class))

  print("\nClassification Report")
  # The report is returned as a string; it has to be printed explicitly.
  print(classification_report(y_test, prediction))

  if bin:
    # Cross-validate the estimator that was passed in rather than a notebook
    # global; cross_val_score works on clones, so `clf` itself stays fitted.
    # The scores are ROC-AUC values (scoring='roc_auc'), labelled as such.
    cv = cross_val_score(clf, X_train, y_train, cv=10, scoring='roc_auc')
    print("Standard Cross-validation ROC-AUC: %f (+/- %f)" % (cv.mean(), (cv.std()*2)))

    # NOTE(review): for classifiers an integer `cv` is documented to use
    # stratified folds already, so this run is expected to match the one above.
    skfold = StratifiedKFold(n_splits=10)
    skfold_cv = cross_val_score(clf, X_train, y_train, cv=skfold, scoring='roc_auc')
    print("Stratified K-fold Cross-validation ROC-AUC: %f (+/- %f)" % (skfold_cv.mean(), (skfold_cv.std()*2)))

  print("Confusion-matrix")
  # Inside a function a bare expression is not displayed, so print the table.
  print(pd.crosstab(y_test, prediction, rownames=['Actual Species'], colnames=['Predicted Species']))

  if bin:
    # ROC-AUC on the evaluation data, using the probability of the class in
    # column 1 of predict_proba.
    positive_proba = [row[1] for row in clf.predict_proba(X_test)]
    print("ROC-AUC: {}".format(roc_auc_score(y_test, positive_proba)))

In [0]:
# pandas: show up to 500 rows and never truncate columns when displaying frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

# matplotlib / seaborn: default figure size (width, height) and grid theme.
rcParams['figure.figsize'] = (15, 8)
sns.set(style="whitegrid")

In [7]:
# Read the feature-description CSV from Drive and keep its "Name" column.
path = root + "Dataset/NUSW-NB15_features.csv"
df = pd.read_csv(
    path,
    delimiter=',',
    encoding='unicode_escape',
    low_memory=False,
    skipinitialspace=True,
    skip_blank_lines=True,
    verbose=True,
)
features = df["Name"]

Tokenization took: 0.06 ms
Type conversion took: 1.34 ms
Parser memory cleanup took: 0.01 ms


In [9]:
# Read the prepared dataset from Drive and discard the "Unnamed: 0" column
# (presumably an index written out when the CSV was saved — TODO confirm).
path = root + "Dataset/dataset.csv"
dataset = pd.read_csv(
    path,
    delimiter=',',
    encoding='utf-8',
    low_memory=False,
    skipinitialspace=True,
    skip_blank_lines=True,
    verbose=True,
)
dataset = dataset.drop(columns=["Unnamed: 0"])

Tokenization took: 15280.69 ms
Type conversion took: 13923.18 ms
Parser memory cleanup took: 60.11 ms


In [12]:
# dataset.shape is (rows, columns); unpack it straight into the message.
print("Number of rows: {}\nNumber of features: {}".format(*dataset.shape))

Number of rows: 699984
Number of features: 58


In [13]:
# Label column on one side, every remaining column as the feature matrix.
Y = dataset["attack_cat"]
X = dataset.drop(columns="attack_cat")

print(X.shape, Y.shape)

(699984, 57) (699984,)


In [0]:
# Per-class sample counts before oversampling.
print("Original dataset categories shape {}".format(sorted(Counter(Y).items())))

# Oversample with SMOTE followed by ADASYN, chained in one imblearn pipeline
# and both seeded for repeatability.
# NOTE(review): this runs on the full X/Y, before the train/test split done in
# a later cell, so synthetic rows may be derived from rows that end up in the
# test set — confirm this is intended.
pipe = make_pipeline(
    SMOTE(sampling_strategy='auto', random_state=41, n_jobs=5),
    ADASYN(sampling_strategy='auto', random_state=41, n_jobs=5),
)
X_resampled, Y_resampled = pipe.fit_resample(X, Y)

# Rebuild a labelled frame: original feature column names plus `attack_cat`.
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_dataset = X_resampled_df.assign(attack_cat=pd.Series(Y_resampled))

# Per-class sample counts after oversampling.
print("Resampled dataset categories shape {}".format(sorted(Counter(Y_resampled).items())))

In [0]:
# Binary target: every label > 0 becomes 1, anything else is kept as-is.
# (For the multi-class variant use `resampled_dataset.attack_cat` unchanged.)
y = resampled_dataset.attack_cat.apply(lambda x: 1 if x>0 else x)
X = resampled_dataset.drop('attack_cat', axis=1)

# Variant without resampling:
#y = dataset.attack_cat.apply(lambda x: 1 if x>0 else x)
#X = dataset.drop('attack_cat', axis=1)

# 70/30 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 21)

# 'balanced' class weights computed on the training labels. `classes` and `y`
# are passed by keyword: current scikit-learn rejects them positionally, while
# the keyword form also works on older releases.
class_labels = np.unique(y_train)
class_weights_arr = class_weight.compute_class_weight('balanced', classes=class_labels, y=y_train)

# Key each weight by its actual class label (not by its position), so the
# mapping stays correct even when labels are not exactly 0..k-1.
class_weights = dict(zip(class_labels.tolist(), class_weights_arr))

In [0]:
# Random forest configuration: entropy criterion, unlimited depth, no
# bootstrap sampling, fixed seed, and the class weights computed above.
# Both the number of trees and the number of parallel jobs are set to the
# number of distinct labels in `Y`.
randomForest = RandomForestClassifier(
    class_weight=class_weights,
    n_estimators=len(Counter(Y)),
    criterion='entropy',
    random_state=0,
    n_jobs=len(Counter(Y)),
    max_depth=None,
    bootstrap=False,
)

In [0]:
# Load a previously saved model from Drive (the binary model, judging by the
# file name).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
# NOTE(review): `root` already ends with '/', so this path contains '//'; the
# training cells in this notebook dump their models to the working directory,
# not to this Model/ folder — confirm the file is copied there.
clf = joblib.load(root+'/Model/randomforestmodel_bin.sav')

In [0]:
# Evaluate on the held-out split. The previous call passed `X` together with
# `Y`: `Y` is the multi-class label vector of the un-resampled dataset, so it
# neither lines up row-for-row with the resampled `X` nor matches the binary
# label space the model was trained on. `bin` stays False, so the costly
# cross-validation and ROC-AUC steps are still skipped.
predict(clf, X_train, y_train, X_test, y_test, False)

In [0]:
# --------------------------------------

In [0]:
# Train the forest on the training split; `clf` refers to that same (now
# fitted) estimator object. Then persist it to the working directory.
randomForest.fit(X_train, y_train)
clf = randomForest
joblib.dump(clf, 'randomforestmodel.sav')

In [0]:
# Same training step, stored under the binary-model file name. `clf_bin` and
# `clf` both reference the fitted `randomForest` object (fit returns the
# estimator itself), not independent copies.
clf = clf_bin = randomForest.fit(X_train, y_train)
joblib.dump(clf_bin, 'randomforestmodel_bin.sav')

In [0]:
# Importance-based feature selection driven by the forest configuration.
selected_features = SelectFromModel(randomForest)
selected_features.fit(X_train, y_train)

# Names of the columns the selector keeps.
features = X_train.columns[selected_features.get_support()]
print("Selected features: {}".format(features))

# Reduce both splits to the selected columns.
X_train_important_features = selected_features.transform(X_train)
X_test_important_features = selected_features.transform(X_test)

# Fit a separate forest with the same hyper-parameters on the reduced matrix.
# Calling randomForest.fit(...) here would refit the very object that earlier
# names (`clf`, `clf_bin`) point to, silently replacing the full-feature model
# in memory.
clf_important_features = RandomForestClassifier(**randomForest.get_params())
clf_important_features.fit(X_train_important_features, y_train)

# From here on `clf` expects the reduced feature set (evaluate it with
# X_test_important_features, not X_test).
clf = clf_important_features
joblib.dump(clf_important_features, 'randomforestmodel_imp_feat.sav')