In [1]:
import numpy as np
import pandas as pd
import os
import math
import matplotlib.pyplot as plt
import scipy as sp
from scipy.spatial import distance
from sklearn.decomposition import PCA,qPCA
from tabulate import tabulate
from Models.PCAmodel import Model
from sklearn.QuantumUtility.Utility import *
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import warnings
warnings.filterwarnings("ignore")

# Load Train/Test 


To repeat the experiments from the thesis, load the data below.
Set the `dataset` parameter to 'CICIDS' to repeat the principal components classifier experiments on the CICIDS dataset; leave it as 'KDDCUP' to run them on the KDDCUP dataset.

In [2]:
# Load the pre-split training/test sets for the selected dataset.
# After this cell runs the following names are defined:
#   trains_   - list of training DataFrames, one per split
#   tests     - list of test DataFrames, one per split
#   labels    - list of label Series, one per split
#   neg_class - label string of the normal (negative) class
cwd = os.getcwd()

dataset='KDDCUP'
N_SPLITS = 5  # number of train/test splits consumed below
trains_ = []
tests = []
te_labels = []

if dataset == 'KDDCUP':

    #Always execute this cell
    df = pd.read_csv('kddcup.data_10_percent_corrected', header=None)
    df1 = df.drop(columns=[1, 2, 3, 6, 11, 20, 21])
    # Collapse every non-normal label (column 41) into a single 'attack' class.
    df1.loc[df1[41] != 'normal.', 41] = 'attack'

    # os.listdir returns entries in arbitrary order; sort them so that
    # trains_[i] and tests[i] are paired the same way on every machine and run.
    train_dir = os.path.join(cwd, 'KDDCUP/Trains/')
    for filename in sorted(os.listdir(train_dir)):
        train_frame = pd.read_csv(os.path.join(train_dir, filename))
        trains_.append(train_frame.drop(columns=['Unnamed: 0', '41']))

    test_dir = os.path.join(cwd, 'KDDCUP/Tests/')
    for filename in sorted(os.listdir(test_dir)):
        tests.append(pd.read_csv(os.path.join(test_dir, filename)).set_index('Unnamed: 0'))

    # Look the test rows up in the raw frame by their original row index.
    test = []
    labels = []
    for split in range(N_SPLITS):
        row_ids = tests[split].index
        test.append(df1.loc[row_ids])
        labels.append(df1.loc[row_ids, 41])
    neg_class = 'normal.'

else:

    # Sorted for the same reason as above: a deterministic split order.
    data_dir = os.path.join(cwd, 'CICIDS2017/FinalTrainTest/')
    for filename in sorted(os.listdir(data_dir)):
        frame = pd.read_csv(os.path.join(data_dir, filename))
        if filename.startswith('train'):
            trains_.append(frame)
        elif filename.startswith('test'):
            # NOTE(review): files named 'test*' are used only as the source of
            # the 'Label' column; every other file is treated as a test set.
            # Confirm this matches the naming used in FinalTrainTest/.
            te_labels.append(frame)
        else:
            tests.append(frame)

    labels = []
    for split in range(N_SPLITS):
        # NOTE(review): the last 50000 labels of each file are dropped here;
        # confirm the corresponding test sets are truncated the same way.
        labels.append(te_labels[split]['Label'][:-50000])
    neg_class = 'BENIGN'
    

# Fit PCA 

To fit PCA classically, set all the quantum flags (`theta_estimate`, `estimate_all`, and `estimate_least_k`) to False.

In [3]:
# Upper bound on fit attempts per model. The original code retried forever
# behind a bare `except`, which also swallowed KeyboardInterrupt and hid the
# real error whenever the failure repeated on every attempt.
MAX_FIT_ATTEMPTS = 20


def _fit_qpca(train, name, p, max_attempts=MAX_FIT_ATTEMPTS, **fit_options):
    """Fit one qPCA model on `train`, retrying a bounded number of times.

    Parameters
    ----------
    train : training set handed to ``qPCA.fit``; must expose ``.shape``.
    name : value forwarded as ``qPCA(name=...)``.
    p : value forwarded as ``fit(p=...)``.
    max_attempts : maximum number of fit attempts before giving up.
    **fit_options : per-model keyword arguments forwarded to ``fit``
        (``eps_theta``, ``eps``, ``eta`` and, where given, ``spectral_norm_est``).

    Returns
    -------
    Whatever ``qPCA(...).fit(...)`` returns.

    Raises
    ------
    RuntimeError
        If every attempt raised; the last error is chained as the cause.
    """
    last_error = None
    for _ in range(max_attempts):
        try:
            return qPCA(svd_solver='full', name=name).fit(
                train,
                theta_estimate=False,
                p=p,
                theta_minor=np.sqrt(0.20 * train.shape[0]),
                estimate_all=False,
                delta=0.1,
                true_tomography=True,
                estimate_least_k=False,
                **fit_options)
        except Exception as error:  # KeyboardInterrupt/SystemExit still propagate
            last_error = error
    raise RuntimeError(
        'qPCA fit for %r failed %d times in a row' % (name, max_attempts)
    ) from last_error


# One model per training split. The per-model arguments below are exactly the
# ones used originally; qpca40 deliberately does not pass spectral_norm_est.
qpca30 = _fit_qpca(trains_[0], 'qpca30', 0.30, eps_theta=8, eps=8, eta=0.2,
                   spectral_norm_est=False)
qpca40 = _fit_qpca(trains_[1], 'qpca40', 0.40, eps_theta=1, eps=1, eta=0.16)
qpca50 = _fit_qpca(trains_[2], 'PCA50', 0.50, eps_theta=1, eps=1, eta=0.1,
                   spectral_norm_est=False)
qpca60 = _fit_qpca(trains_[3], 'qpca60', 0.60, eps_theta=1, eps=1, eta=0.1,
                   spectral_norm_est=False)
qpca70 = _fit_qpca(trains_[4], 'PCA70', 0.70, eps_theta=1, eps=1, eta=0.1,
                   spectral_norm_est=False)

PCAs = [qpca30, qpca40, qpca50, qpca60, qpca70]

In [4]:
# Display the `components_retained_` attribute of the model fitted with p=0.70
# (presumably the number of principal components it kept - confirm in qPCA).
qpca70.components_retained_

10

# Principal components classifier  model

In [5]:
# Compute quantiles
# For each value a in `alpha`, solve x^2 - 2x + a = 0 (equivalently
# 1 - (1 - x)^2 = a). The roots are 1 +/- sqrt(1 - a); the one lying in [0, 1]
# is the smaller root, and the quantile stored is 1 minus that root rounded to
# four decimals.
alpha = [0.01,0.02,0.04,0.06,0.08,0.10]
quantils = []
for far in alpha:
    # Select the smaller root explicitly: np.roots does not document the order
    # in which roots are returned, so indexing by position is fragile.
    smaller_root = np.roots([1, -2, far]).min()
    quantils.append(1 - np.round(smaller_root, decimals=4))
quantils

[0.995, 0.9899, 0.9798, 0.9695, 0.9592, 0.9487]

* Fit()
    * Set `quantum` to False to run the classical experiments, or to True to run the quantum ones.
    * `PCAs` is the list of previously fitted PCA models.
    * `trains_` is the list of normalized training sets, without labels.
    * `minor_sv_variance` is the value used to extract the minor components.
    * `only_dot_product` -> set to True to compute the base model, or to False to run the Ensemble.
    * `experiment` -> set to 0 to use only the major components in the summations, otherwise set it to 1.
* Predict()
    * tests -> list of normalized test sets
    * labels -> list of labels corresponding to the test sets
    * name_of_negative_class -> label of normal class

In [6]:
# Options shared by fit() and predict(): base (dot-product only) model, using
# experiment 0.
base_model_options = dict(only_dot_product=True, experiment=0)

# Build the classifier on the fitted PCA models and fit it on the training sets.
qmodel = Model(PCAs, quantils, quantum=False).fit(
    trains_,
    minor_sv_variance=0.20,
    **base_model_options)

# Score the test sets; predict() returns the four metric containers in this order.
dot_metrics = qmodel.predict(
    tests,
    labels,
    name_negative_class=neg_class,
    **base_model_options)
recall_dot, precision_dot, accuracy_dot, f1_score_dot = dot_metrics


qpca30
(22.197239064164407,)
detection_rate: 0.6713212683369462
precision: 0.9849123585533615
accuracy: 0.8980841518127805
F1_score: 0.7984291624198094
TP: 26634 TN: 91870 FP: 408 FN: 13040 TOT_SAMPLES: 131952 LEN_TEST: 131952
(19.99135862820585,)
detection_rate: 0.6881836971316228
precision: 0.9715332882610398
accuracy: 0.9001834000242512
F1_score: 0.8056715404930876
TP: 27303 TN: 91478 FP: 800 FN: 12371 TOT_SAMPLES: 131952 LEN_TEST: 131952
(18.197348601053637,)
detection_rate: 0.7117507687654383
precision: 0.9428065840873426
accuracy: 0.9003501273190251
F1_score: 0.8111454219030519
TP: 28238 TN: 90565 FP: 1713 FN: 11436 TOT_SAMPLES: 131952 LEN_TEST: 131952
(17.001445027751092,)
detection_rate: 0.7136159701567777
precision: 0.9121721760422707
accuracy: 0.8932339032375409
F1_score: 0.8007693177961307
TP: 28312 TN: 89552 FP: 2726 FN: 11362 TOT_SAMPLES: 131952 LEN_TEST: 131952
(16.0683386812283,)
detection_rate: 0.7138932298230579
precision: 0.8889830508474577
accuracy: 0.887171092518491

In [12]:
# Print results
# Row labels for the six entries stored per model in each metric container.
FAR_LABELS = ['1%', '2%', '4%', '6%', '8%', '10%']


def _rows_by_far(metric, keys, far_labels=FAR_LABELS):
    """Build one table row per label: [label, metric[key][level] for each key].

    `metric` is indexed first by model key and then by position (0..5), exactly
    as the containers returned by `qmodel.predict` are indexed in this notebook.
    """
    return [[label] + [metric[key][level] for key in keys]
            for level, label in enumerate(far_labels)]


components = list(recall_dot)
# One column per fitted model. The former trailing 'PCA90' header had no
# matching model/column and has been removed.
headers = ['FAR', 'PCA30', 'PCA40', 'PCA50', 'PCA60', 'PCA70']

# The historical variable names are kept for compatibility with other cells.
# NOTE: the `ten_*` names hold the 8% rows and the `thirty_*`/`thir_*` names
# hold the 10% rows.
(one_perc, two_perc, four_perc,
 six_perc, ten_perc, thirty_perc) = _rows_by_far(recall_dot, components)

(one_perc_prec, two_perc_prec, four_perc_prec,
 six_perc_prec, ten_perc_prec, thir_perc_prec) = _rows_by_far(precision_dot, components)

(one_perc_f1, two_perc_f1, four_perc_f1,
 six_perc_f1, ten_perc_f1, thir_perc_f1) = _rows_by_far(f1_score_dot, components)

(one_perc_acc, two_perc_acc, four_perc_acc,
 six_perc_acc, ten_perc_acc, thir_perc_acc) = _rows_by_far(accuracy_dot, components)

result_tables = [
    ('Detection Rate', [one_perc, two_perc, four_perc, six_perc, ten_perc, thirty_perc]),
    ('Precision', [one_perc_prec, two_perc_prec, four_perc_prec,
                   six_perc_prec, ten_perc_prec, thir_perc_prec]),
    ('F1_Score', [one_perc_f1, two_perc_f1, four_perc_f1,
                  six_perc_f1, ten_perc_f1, thir_perc_f1]),
    ('Accuracy', [one_perc_acc, two_perc_acc, four_perc_acc,
                  six_perc_acc, ten_perc_acc, thir_perc_acc]),
]

for position, (title, rows) in enumerate(result_tables):
    if position:
        # Same blank separator the tables were originally printed with.
        print("\n \n ")
    print(title)
    print(tabulate(rows, headers=headers))


Detection Rate
FAR       PCA30     PCA40     PCA50     PCA60     PCA70
-----  --------  --------  --------  --------  --------
1%     0.663457  0.954756  0.929979  0.948178  0.928442
2%     0.680824  0.982331  0.968493  0.979634  0.929022
4%     0.706584  0.982734  0.981751  0.982255  0.975929
6%     0.710264  0.983818  0.983012  0.983037  0.983289
8%     0.710717  0.984297  0.984347  0.983238  0.984247
10%    0.710969  0.984473  0.984549  0.984524  0.991506

 
 
Precision
FAR       PCA30     PCA40     PCA50     PCA60     PCA70
-----  --------  --------  --------  --------  --------
1%     0.984663  0.984842  0.993511  0.988465  0.989762
2%     0.969944  0.974569  0.981481  0.979412  0.983116
4%     0.942191  0.956809  0.959053  0.957306  0.96292
6%     0.914042  0.935997  0.933305  0.938584  0.941862
8%     0.888122  0.920168  0.91158   0.924976  0.921945
10%    0.864529  0.896422  0.89452   0.907971  0.902018

 
 
F1_Score
FAR       PCA30     PCA40     PCA50     PCA60     PCA70
-----