<a href="https://colab.research.google.com/github/vataliya/cav_security_attacls/blob/main/cav_sec_supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd drive/My\ Drive/Colab\ Notebooks/kdd99/NSL-KDD

/content/drive/My Drive/Colab Notebooks/kdd99/NSL-KDD


In [3]:
import time
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

**Data Preprocessing**

In [4]:
columns = ['duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'labels'
]

attacks_list = [
    'ipsweep',
    'imap',
    'mailbomb', 
    'neptune', 
    'pod', 
    'smurf', 
    'teardrop', 
    'udpstorm',
    'buffer_overflow', 
    'httptunnel',
    'ftp_write', 
    'guess_passwd', 
    'worm', 
    'xsnoop',
    'normal']

In [5]:
nsl_kdd_20p_train = pd.read_csv('KDDTrain+_20Percent.txt', names = columns, index_col=False)
nsl_kdd_20p_test = pd.read_csv('KDDTest-21.txt', names = columns, index_col=False)
nsl_kdd_20p = pd.concat([nsl_kdd_20p_train,nsl_kdd_20p_test], sort = False)
cav_20p = nsl_kdd_20p[nsl_kdd_20p['labels'].isin(attacks_list)]

In [6]:
nsl_kdd_train = pd.read_csv('KDDTrain+.txt', names = columns, index_col=False)
nsl_kdd_test = pd.read_csv('KDDTest+.txt', names = columns, index_col=False)
nsl_kdd = pd.concat([nsl_kdd_train,nsl_kdd_test], sort = False)
cav = nsl_kdd[nsl_kdd['labels'].isin(attacks_list)]

In [7]:
nsl_kdd_list = nsl_kdd['labels'].value_counts()
nsl_kdd_20p_list = nsl_kdd_20p['labels'].value_counts()
cav_list = cav['labels'].value_counts()
cav_20p_list = cav_20p['labels'].value_counts()

In [8]:
dataset_values = pd.DataFrame(index = nsl_kdd['labels'].unique(), columns = ['NSL_KDD', 'NSL_KDD_20p', 'CAV', 'CAV_20p'])
dataset_values['NSL_KDD'] = nsl_kdd_list
dataset_values['NSL_KDD_20p'] = nsl_kdd_20p_list
dataset_values['CAV'] = cav_list
dataset_values['CAV_20p'] = cav_20p_list
dataset_values

Unnamed: 0,NSL_KDD,NSL_KDD_20p,CAV,CAV_20p
normal,77054,15601,77054.0,15601.0
neptune,45871,9861,45871.0,9861.0
warezclient,890,181,,
ipsweep,3740,851,3740.0,851.0
portsweep,3088,743,,
teardrop,904,200,904.0,200.0
nmap,1566,374,,
satan,4368,1418,,
smurf,3311,1156,3311.0,1156.0
pod,242,79,242.0,79.0


In [9]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
for i in columns:
    cav_20p[i] = le.fit_transform(cav_20p[i])
    cav[i] = le.fit_transform(cav[i])

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
x_train = cav_20p.drop(columns= ['labels'])
y_train = cav_20p['labels']
x_test = cav.drop(columns= ['labels'])
y_test = cav['labels']

In [12]:
#scaling the data
sc=StandardScaler()
x_sc = sc.fit_transform(x_train)
xt_sc=sc.transform(x_test)

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from  sklearn.metrics import accuracy_score

**Dimensionality Reduction using PCA**

In [14]:
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding

In [15]:
pca=PCA(n_components=2)
x_reduced=pca.fit_transform(x_sc)
xt_reduced=pca.transform(xt_sc)
x_reduced.shape

(29459, 2)

**Logistic Regression**

In [16]:
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
log.fit(x_reduced,y_train)
log.score(x_reduced,y_train)

0.8596354255066363

In [17]:
c_range = [0.01, 0.1, 1, 10, 100]
train_score_l1 = []
train_score_l2 = []
test_score_l1 = []
test_score_l2 = []
for c in c_range:
    log_l1 = LogisticRegression(penalty = 'l1', C = c, solver='liblinear')
    log_l2 = LogisticRegression(penalty = 'l2', C = c, solver='liblinear')
    log_l1.fit(x_reduced, y_train)
    log_l2.fit(x_reduced, y_train)
    train_score_l1.append(log_l1.score(x_reduced, y_train))
    train_score_l2.append(log_l2.score(x_reduced, y_train))
    test_score_l1.append(log_l1.score(xt_reduced, y_test))
    test_score_l2.append(log_l2.score(xt_reduced, y_test))

In [18]:
print('Train_score_l1:',train_score_l1)
print('Test_score_l1:',test_score_l1)
print('Train_score_l2:',train_score_l2)
print('Test_score_l2:',test_score_l2)

Train_score_l1: [0.8574289690756645, 0.8602803896941512, 0.8612987542007536, 0.861332699684307, 0.861332699684307]
Test_score_l1: [0.9213620940013392, 0.9134019998043833, 0.9117618291664472, 0.9116489733886076, 0.9116339259515623]
Train_score_l2: [0.8524729284768662, 0.8582776061644998, 0.8604501171119182, 0.8612987542007536, 0.8612987542007536]
Test_score_l2: [0.9171262404730914, 0.9214147600309978, 0.9120928727814435, 0.9116564971071303, 0.9116489733886076]


In [19]:
from sklearn.model_selection import GridSearchCV
logit = LogisticRegression()
param = { 'C':[0.01, 0.1, 1, 10, 100], 'penalty': ['l1','l2']}
logistic = GridSearchCV(logit,param,cv=5, return_train_score=True)
logistic.fit(x_reduced,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [20]:
logistic.best_params_

{'C': 1, 'penalty': 'l2'}

In [21]:
logistic.best_score_

0.8590241560791583

In [22]:
log=LogisticRegression(C=100, penalty='l2')
log.fit(x_reduced,y_train)
print(log.score(x_reduced,y_train))
print(log.score(xt_reduced,y_test))

0.8596693709901898
0.9065554159487785


In [23]:
from sklearn.model_selection import cross_val_score
print('Cross-validation scores:',cross_val_score(log,x_sc,y_train) )

Cross-validation scores: [0.99473863 0.99236253 0.99270197 0.99304141 0.9619759 ]


In [24]:
pred_log=log.predict(xt_reduced)

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_log, 
                            target_names=["0", "1", "2","3", "4", "5", 
                                          "6", "7","8", "9","10", "11", "12", "13", "14"]))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00      1284
           3       0.00      0.00      0.00       133
           4       0.00      0.00      0.00        12
           5       0.01      0.00      0.00      3740
           6       0.00      0.00      0.00       293
           7       0.99      1.00      1.00     45871
           8       0.88      0.97      0.92     77054
           9       0.00      0.00      0.00       242
          10       0.00      0.00      0.00      3311
          11       0.00      0.00      0.00       904
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         4

    accuracy                           0.91    132913
   macro avg       0.13      0.13      0.13    132913
weighted avg       0.86   

**Decision Tree Classifier**

In [26]:
from sklearn.tree import DecisionTreeClassifier
tree= DecisionTreeClassifier(max_depth=1).fit(x_reduced, y_train)
pred_tree= tree.predict(xt_reduced)
print("Test score: {:.3f}".format(tree.score(xt_reduced, y_test)))
print("Train score: {:.3f}".format(tree.score(x_reduced, y_train)))


Test score: 0.923
Train score: 0.861


In [27]:
print(classification_report(y_test, pred_tree, target_names=["0", "1", "2","3", "4", "5", 
                                          "6", "7","8", "9","10", "11", "12", "13", "14"]))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00      1284
           3       0.00      0.00      0.00       133
           4       0.00      0.00      0.00        12
           5       0.00      0.00      0.00      3740
           6       0.00      0.00      0.00       293
           7       0.98      1.00      0.99     45871
           8       0.89      1.00      0.94     77054
           9       0.00      0.00      0.00       242
          10       0.00      0.00      0.00      3311
          11       0.00      0.00      0.00       904
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         4

    accuracy                           0.92    132913
   macro avg       0.12      0.13      0.13    132913
weighted avg       0.86   

In [28]:
tree = DecisionTreeClassifier()
param = { 'max_depth':[1,2,3,4,5,6,7,8,9,10]}
dtree = GridSearchCV(tree,param,cv=5, return_train_score=True)
dtree.fit(x_reduced,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
      

In [29]:
dtree.best_params_

{'max_depth': 10}

In [30]:
dtree.best_score_

0.9035600637192316

In [31]:
tree= DecisionTreeClassifier(max_depth=10).fit(x_reduced, y_train)
pred_tree=tree.predict(xt_reduced)
print(tree.score(x_reduced,y_train))
print(tree.score(xt_reduced,y_test))

0.9264061916562002
0.9033503118581327


In [32]:
from sklearn.multiclass import OneVsOneClassifier

In [33]:
clf = OneVsOneClassifier(tree).fit(x_reduced, y_train)
y_pred = clf.predict(xt_reduced)

In [34]:
from sklearn.model_selection import cross_val_score
print('Cross-validation scores:',cross_val_score(tree,x_reduced,y_train) )
print('Cross-validation scores:',cross_val_score(clf,x_reduced,y_train) )

Cross-validation scores: [0.9059742  0.90546504 0.91479973 0.91242363 0.87947717]
Cross-validation scores: [0.91955193 0.92175832 0.92464358 0.93075356 0.89713122]


**LinearSVC**

In [35]:
from sklearn.svm import LinearSVC, SVC

In [36]:
svc=LinearSVC(C=10)
svc.fit(x_reduced,y_train)
pred_linearsvc=svc.predict(xt_reduced)
print(svc.score(x_reduced,y_train))
print(svc.score(xt_reduced,y_test))

0.8608914083981126
0.9245521506549397


In [37]:
linsvc = LinearSVC()
param = { 'C':[0.01,0.1,1,10,100]}
linear_svc= GridSearchCV(linsvc,param,cv=5, return_train_score=True)
linear_svc.fit(x_reduced,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 10, 100]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=True, scoring=None, verbose=0)

In [38]:
linear_svc.best_params_

{'C': 10}

In [39]:
linear_svc.best_score_

0.8609250732041686

In [40]:
svc=LinearSVC(C=0.1)
svc.fit(x_reduced,y_train)
print(svc.score(x_reduced,y_train))
print(svc.score(xt_reduced,y_test))

0.8605858990461319
0.9244844371882359


In [41]:
print('Cross-validation scores:',cross_val_score(svc,x_reduced,y_train) )

Cross-validation scores: [0.86354379 0.86269518 0.86303462 0.8628649  0.84943134]


In [42]:
from sklearn.metrics import classification_report

print(classification_report(y_test, pred_linearsvc, target_names=["0", "1", "2","3", "4", "5", 
                                          "6", "7","8", "9","10", "11", "12", "13", "14"]))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00      1284
           3       0.00      0.00      0.00       133
           4       0.00      0.00      0.00        12
           5       1.00      0.05      0.09      3740
           6       0.00      0.00      0.00       293
           7       0.99      1.00      0.99     45871
           8       0.89      1.00      0.94     77054
           9       0.00      0.00      0.00       242
          10       0.00      0.00      0.00      3311
          11       0.00      0.00      0.00       904
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         4

    accuracy                           0.92    132913
   macro avg       0.19      0.14      0.13    132913
weighted avg       0.89   

**K Nearest Neighbors Classifier**

In [43]:
from sklearn.neighbors import KNeighborsClassifier
train_score_array = []
test_score_array = []

#Performing kNN model  5 fold cross validation:
k_acc_scores = []
k_cv_scores = []
k_values = [i for i in range(100,1000,50)] #evaluating values of k as all odd numbers between 1 and 20

for k in k_values:
    print(k)
    knn = KNeighborsClassifier(n_neighbors = 100)
    knn.fit(x_reduced,y_train)
    train_score_array.append(knn.score(x_reduced, y_train))
    # test_score_array.append(knn.score(xt_reduced, y_test))
    pred = knn.predict(xt_reduced)
    k_acc_scores.append("k({}) = {}".format(k, accuracy_score(y_test, pred)))
    cv_scores = cross_val_score(knn, x_reduced, y_train, cv=5, scoring='accuracy', n_jobs = -1)
    print(cv_scores)
    k_cv_scores.append(cv_scores.mean())


100
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
150
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
200
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
250
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
300
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
350
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
400
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
450
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
500
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
550
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
600
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
650
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
700
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
750
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
800
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
850
[0.89307536 0.89256619 0.90257977 0.90580448 0.89356646]
900
[0.89307536 0.892566

In [None]:
knn=KNeighborsClassifier()
param = { 'n_neighbors':range(100,1000,50)}
knnc= GridSearchCV(knn,param,cv=5, return_train_score=True)
knnc.fit(x_reduced,y_train)

In [None]:
knnc.best_params_

In [None]:
knnc.best_score_

In [None]:
print('Cross-validation scores:',cross_val_score(knn,x_reduced,y_train) )

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, pred_knn, target_names=["0", "1", "2","3", "4", "5", 
                                          "6", "7","8", "9","10", "11", "12", "13", "14"]))