In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label", "difficulty_level"]

df_train_multiple_labels = pd.read_csv("KDDTrain+.txt", header=None, names = col_names)
df_test_multiple_labels = pd.read_csv("KDDTest+.txt", header=None, names = col_names)

from sklearn.preprocessing import LabelEncoder,OneHotEncoder

pd.set_option('display.max_columns', None)

# Combine training dataset with testing dataset to apply One-Hot-Encoder
df_multiple_labels = pd.concat([df_train_multiple_labels, df_test_multiple_labels], ignore_index=True)
# Drop "difficulty_level"
df_multiple_labels = df_multiple_labels.drop("difficulty_level", axis=1)
# One-Hot-Encoder for "protocol_type","service","flag"
categorical_col = ["protocol_type","service","flag"]
for col in categorical_col:
    df_dummies = pd.get_dummies(df_multiple_labels[col], prefix=col+"_")
    df_multiple_labels = df_multiple_labels.drop(col, axis=1)
    df_multiple_labels = pd.concat([df_multiple_labels,df_dummies], axis=1)
# LabelEncoder for "labels"
# Five Labels: normal: 0, DoS: 1, Probe: 2, R2L:3, U2R: 4
df_multiple_labels = df_multiple_labels.replace({'label':{'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4}})
# print(df_multiple_labels.head(1))
df_multiple_labels_normal = df_multiple_labels.loc[df_multiple_labels["label"]==0]
df_multiple_labels_DoS = df_multiple_labels.loc[df_multiple_labels["label"]==1]
df_multiple_labels_Probe = df_multiple_labels.loc[df_multiple_labels["label"]==2]
df_multiple_labels_R2L = df_multiple_labels.loc[df_multiple_labels["label"]==3]
df_multiple_labels_U2R = df_multiple_labels.loc[df_multiple_labels["label"]==4]

## DoS Dataset
df_DoS_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_DoS], ignore_index=True)
df_DoS_normal['label'] = np.where(df_DoS_normal['label'].eq(1), 1, 0)
## Probe Dataset
df_Probe_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_Probe], ignore_index=True)
df_Probe_normal['label'] = np.where(df_Probe_normal['label'].eq(2), 1, 0)
## R2L Dataset
df_R2L_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_R2L], ignore_index=True)
df_R2L_normal['label'] = np.where(df_R2L_normal['label'].eq(3), 1, 0)
## U2R Dataset
df_U2R_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_U2R], ignore_index=True)
df_U2R_normal['label'] = np.where(df_U2R_normal['label'].eq(4), 1, 0)

print(df_DoS_normal.shape)
print(df_Probe_normal)
print(df_R2L_normal.shape)
print(df_U2R_normal.shape)

(130441, 123)
       duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0             0        491          0     0               0       0    0   
1             0        146          0     0               0       0    0   
2             0        232       8153     0               0       0    0   
3             0        199        420     0               0       0    0   
4             0        287       2251     0               0       0    0   
...         ...        ...        ...   ...             ...     ...  ...   
91126         0          0          0     0               0       0    0   
91127         2         24        109     0               0       0    0   
91128         0          0          0     0               0       0    0   
91129         0          1          1     0               0       0    0   
91130         0          0          0     0               0       0    0   

       num_failed_logins  logged_in  num_compromised  root_shell  \
0    

In [2]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve, recall_score, precision_score, f1_score 

from sklearn.preprocessing import StandardScaler,Normalizer

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif

from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.tree import DecisionTreeClassifier

def printResults(results):
    #print(results.keys())
    print("f1_scores(mean): ", sum(results['test_f1'])/len(results['test_f1']))
    print("accuracy(mean): ", sum(results['test_accuracy'])/len(results['test_accuracy']))
    print("precision(mean): ", sum(results['test_precision'])/len(results['test_precision']))
    print("recall(mean): ", sum(results['test_recall'])/len(results['test_recall']))
    print("roc_auc(mean): ", sum(results['test_roc_auc'])/len(results['test_roc_auc']))

def generate_preprocess_pipeline(preprocess):
    if preprocess == "standardize":
        return StandardScaler()
    elif preprocess == "normalize":
        return Normalizer()
    
def generate_feature_selection_pipeline(feature_selection, percentage):
    if feature_selection == "PCA":
        return PCA(n_components = percentage)
    elif feature_selection == "ANOVA":
        return SelectPercentile(f_classif, percentile=(percentage*100))
                                
def DT(X,y):
    preprocess_methods = ["standardize"]
    
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
  
    DT_depth_selections= [2,5,10,20,None]
    
    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(feature_selection_method, percentage_selection)
                for DT_depth in DT_depth_selections:
                    complete_pipline = Pipeline([
                         ('preprocess',preprocess_pipeline ),
                         ('feature_selection',feature_selection_pipeline),
                        ('estimator',DecisionTreeClassifier(max_depth=DT_depth,random_state=0))
                    ])
                    results = cross_validate(complete_pipline, X, y, cv=5,
                         scoring=('f1', 'accuracy', 'precision','recall','roc_auc'), return_estimator=True)
                    print("preprocess: ",preprocess_method)
                    print("feature_selection: ", feature_selection_method)
                    print("percentage_selection: ", percentage_selection)
                    print("depth: ", DT_depth)
                    printResults(results)
                    print("-------------------------------------------------------------------------------------")
                
from sklearn.linear_model import LogisticRegression
def LG(X,y):
    preprocess_methods = ["standardize"]
    
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
  
    LG_solvers= ['newton-cg','lbfgs','sag']
    
    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(feature_selection_method, percentage_selection)
                for LG_solver in LG_solvers:
                    complete_pipline = Pipeline([
                         ('preprocess',preprocess_pipeline ),
                         ('feature_selection',feature_selection_pipeline),
                        ('estimator',LogisticRegression(solver = LG_solver,max_iter=100))
                    ])
                    results = cross_validate(complete_pipline, X, y, cv=5,
                         scoring=('f1', 'accuracy', 'precision','recall','roc_auc'), return_estimator=True)
                    print("preprocess: ",preprocess_method)
                    print("feature_selection: ", feature_selection_method)
                    print("percentage_selection: ", percentage_selection)
                    print("solver: ", LG_solver)
                    printResults(results)
                    print("-------------------------------------------------------------------------------------")
                    
from sklearn.svm import SVC
def SVM(X,y):
    preprocess_methods = ["standardize"]
    
    feature_selection_methods = ["ANOVA"]
    percentage_selections = [0.1,0.2,0.3,0.4]
  
    C_selections = [10,100,1000,10000]
    kernel_selections = ['poly','rbf','sigmoid']
    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(feature_selection_method, percentage_selection)
                for  C_selection in  C_selections:
                    for kernel_selection in kernel_selections:
                        complete_pipline = Pipeline([
                         ('preprocess',preprocess_pipeline ),
                         ('feature_selection',feature_selection_pipeline),
                        ('estimator',SVC(C=C_selection, kernel=kernel_selection,verbose=True))
                    ])
                        results = cross_validate(complete_pipline, X, y, cv=5,
                         scoring=('f1', 'accuracy', 'precision','recall','roc_auc'), return_estimator=True)
                        print("preprocess: ",preprocess_method)
                        print("feature_selection: ", feature_selection_method)
                        print("percentage_selection: ", percentage_selection)
                        print("C_selection: ", C_selection)
                        print("kernel_selection: ", kernel_selection)
                        printResults(results)
                        print("-------------------------------------------------------------------------------------")
                    
## Split dataset to training set and testing set
from sklearn.model_selection import train_test_split
def splitDataset(df):
    X = df.drop(columns=['label'])
    y = df['label']
    print(X.shape)
    print(y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state=6)
    print(X_train.shape)
    print(sum(y_train))
    print(X_test.shape)
    print(sum(y_test))
    return X_train, X_test, y_train, y_test

## U2R

In [3]:
X_train, X_test, y_train, y_test = splitDataset(df_U2R_normal)

(77173, 122)
(77173,)
(54021, 122)
76
(23152, 122)
43


In [None]:
DT(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  2
f1_scores(mean):  0.0
accuracy(mean):  0.9985561177489064
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.8019822658880958
-------------------------------------------------------------------------------------


In [4]:
# standardize ANOVA 0.7 None
final_pipline = Pipeline([
                         ('preprocess',StandardScaler()),
                         ('feature_selection',SelectPercentile(f_classif, percentile=60)),
                        ('estimator',DecisionTreeClassifier(max_depth=10,random_state=0))
                    ])
final_pipline.fit(X_train,y_train)
y_test_pred = final_pipline.predict(X_test)

print(accuracy_score(y_test,y_test_pred))
print(recall_score(y_test,y_test_pred))
print(precision_score(y_test,y_test_pred))
print(roc_auc_score(y_test,y_test_pred))
print(f1_score(y_test,y_test_pred))
cnf_matrix = confusion_matrix(y_test, y_test_pred)
print("tn, fp, fn, tp:",  cnf_matrix.ravel())

0.9990497581202488
0.6976744186046512
0.7692307692307693
0.848642479976089
0.7317073170731708
tn, fp, fn, tp: [23100     9    13    30]


In [6]:
LG(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  newton-cg
f1_scores(mean):  0.0
accuracy(mean):  0.998593141073601
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.7215311891741589
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  lbfgs
f1_scores(mean):  0.0
accuracy(mean):  0.998593141073601
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.7215311891741589
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  sag
f1_scores(mean):  0.0
accuracy(mean):  0.998593141073601
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.7215311891741589
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selectio

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  newton-cg
f1_scores(mean):  0.570120082815735
accuracy(mean):  0.9990744305886332
precision(mean):  0.8283333333333334
recall(mean):  0.445
roc_auc(mean):  0.9827436895603547
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  lbfgs
f1_scores(mean):  0.5817142857142857
accuracy(mean):  0.9990929422509804
precision(mean):  0.8311111111111111
recall(mean):  0.4583333333333333
roc_auc(mean):  0.9812882720054376
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  sag
f1_scores(mean):  0.5248864468864469
accuracy(mean):  0.998981875703396
precision(mean):  0.7644444444444445
recall(mean):  0.4066666666666666
roc_auc(mean):  0.9711535669045632
-------------------------

In [36]:
# standardize ANOVA 0.9 lbfgs
final_pipline = Pipeline([
                         ('preprocess',StandardScaler()),
                         ('feature_selection',SelectPercentile(f_classif, percentile=90)),
                        ('estimator',LogisticRegression(solver ="lbfgs",max_iter=1000))
                    ])
final_pipline.fit(X_train,y_train)
y_test_pred = final_pipline.predict(X_test)

print(accuracy_score(y_test,y_test_pred))
print(recall_score(y_test,y_test_pred))
print(precision_score(y_test,y_test_pred))
print(roc_auc_score(y_test,y_test_pred))
print(f1_score(y_test,y_test_pred))
cnf_matrix = confusion_matrix(y_test, y_test_pred)
print("tn, fp, fn, tp:",  cnf_matrix.ravel())

0.998790601243953
0.5348837209302325
0.7419354838709677
0.7672687677306839
0.6216216216216216
tn, fp, fn, tp: [23101     8    20    23]


In [7]:
SVM(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
C_selection:  0.2
kernel_selection:  poly
f1_scores(mean):  0.0
accuracy(mean):  0.998593141073601
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.5608306299626162
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
C_selection:  0.2
kernel_selection:  rbf
f1_scores(mean):  0.0
accuracy(mean):  0.998593141073601
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.9087664905613743
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
C_selection:  0.2
kernel_selection:  sigmoid
f1_scores(mean):  0.0
accuracy(mean):  0.9985746294112537
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.06230481663422621
-----------------------------------------------------------------------

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.3
C_selection:  2
kernel_selection:  poly
f1_scores(mean):  0.4758834498834498
accuracy(mean):  0.9988152707422705
precision(mean):  0.6469480519480519
recall(mean):  0.3808333333333333
roc_auc(mean):  0.6757068928229369
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.3
C_selection:  2
kernel_selection:  rbf
f1_scores(mean):  0.52
accuracy(mean):  0.9989448506654519
precision(mean):  0.7320634920634921
recall(mean):  0.40666666666666673
roc_auc(mean):  0.8678244044860508
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.3
C_selection:  2
kernel_selection:  sigmoid
f1_scores(mean):  0.0
accuracy(mean):  0.9973898950137754
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.2371654771835511

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  0.2
kernel_selection:  sigmoid
f1_scores(mean):  0.0
accuracy(mean):  0.9979452397444447
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.4595582692248277
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  0.5
kernel_selection:  poly
f1_scores(mean):  0.533385794282346
accuracy(mean):  0.9988708074425621
precision(mean):  0.6412898212898213
recall(mean):  0.45916666666666667
roc_auc(mean):  0.8229831309667255
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  0.5
kernel_selection:  rbf
f1_scores(mean):  0.37797676008202324
accuracy(mean):  0.9988523009199636
precision(mean):  0.8261904761904763
recall(mean):  0.2508333333333333
roc_auc(mean):

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
C_selection:  2
kernel_selection:  poly
f1_scores(mean):  0.5453734671125975
accuracy(mean):  0.9989078324805062
precision(mean):  0.6831623931623931
recall(mean):  0.4608333333333333
roc_auc(mean):  0.7831837210739333
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
C_selection:  2
kernel_selection:  rbf
f1_scores(mean):  0.5848201524723264
accuracy(mean):  0.9990559223527852
precision(mean):  0.7795238095238096
recall(mean):  0.4733333333333333
roc_auc(mean):  0.9751923255167301
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
C_selection:  2
kernel_selection:  sigmoid
f1_scores(mean):  0.0854865424430642
accuracy(mean):  0.9975379848860543
precision(mean):  0.10156288156288155
recall(me

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.1
C_selection:  0.2
kernel_selection:  rbf
f1_scores(mean):  0.1581699346405229
accuracy(mean):  0.9987042110476846
precision(mean):  0.5333333333333333
recall(mean):  0.09333333333333334
roc_auc(mean):  0.8491213658973645
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.1
C_selection:  0.2
kernel_selection:  sigmoid
f1_scores(mean):  0.0
accuracy(mean):  0.9984080415826245
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.516062849507214
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.1
C_selection:  0.5
kernel_selection:  poly
f1_scores(mean):  0.6136799292661361
accuracy(mean):  0.9990559189262859
precision(mean):  0.7536363636363637
recall(mean):  0.5266666666666667
roc_auc(m

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.3
C_selection:  1
kernel_selection:  sigmoid
f1_scores(mean):  0.01666666666666667
accuracy(mean):  0.9981118378525717
precision(mean):  0.02222222222222222
recall(mean):  0.013333333333333332
roc_auc(mean):  0.4896703432508419
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.3
C_selection:  2
kernel_selection:  poly
f1_scores(mean):  0.5782710622710623
accuracy(mean):  0.9990003925054923
precision(mean):  0.7220979020979021
recall(mean):  0.5016666666666667
roc_auc(mean):  0.8444636512497296
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.3
C_selection:  2
kernel_selection:  rbf
f1_scores(mean):  0.47598343685300215
accuracy(mean):  0.9989263441428535
precision(mean):  0.7916666666666667

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
C_selection:  0.2
kernel_selection:  poly
f1_scores(mean):  0.6292263099219622
accuracy(mean):  0.9991484789512718
precision(mean):  0.8214646464646463
recall(mean):  0.5133333333333334
roc_auc(mean):  0.9100712917477678
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
C_selection:  0.2
kernel_selection:  rbf
f1_scores(mean):  0.29372549019607846
accuracy(mean):  0.9987597426082273
precision(mean):  0.79
recall(mean):  0.18416666666666667
roc_auc(mean):  0.9842252911916459
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
C_selection:  0.2
kernel_selection:  sigmoid
f1_scores(mean):  0.0
accuracy(mean):  0.998593141073601
precision(mean):  0.0
recall(mean):  0.0
roc_auc(mean):  0.4391

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
C_selection:  1
kernel_selection:  rbf
f1_scores(mean):  0.4603478260869565
accuracy(mean):  0.9988893208181588
precision(mean):  0.74
recall(mean):  0.3416666666666667
roc_auc(mean):  0.98316286649983
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
C_selection:  1
kernel_selection:  sigmoid
f1_scores(mean):  0.038095238095238085
accuracy(mean):  0.9981488731700138
precision(mean):  0.06666666666666667
recall(mean):  0.026666666666666665
roc_auc(mean):  0.4380263076590355
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
C_selection:  2
kernel_selection:  poly
f1_scores(mean):  0.6269161234678475
accuracy(mean):  0.9990929473907293
precision(mean):  0.7469197469197468
recall(mean):  

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
C_selection:  5
kernel_selection:  sigmoid
f1_scores(mean):  0.028571428571428574
accuracy(mean):  0.9976490634263863
precision(mean):  0.03076923076923077
recall(mean):  0.026666666666666665
roc_auc(mean):  0.4365457873760311
-------------------------------------------------------------------------------------


In [4]:
SVM(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  10
kernel_selection:  poly
f1_scores(mean):  0.5580515297906602
accuracy(mean):  0.9989818739901464
precision(mean):  0.7191919191919192
recall(mean):  0.45916666666666667
roc_auc(mean):  0.7874500262613154
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  10
kernel_selection:  rbf
f1_scores(mean):  0.6030019706541446
accuracy(mean):  0.9990929456774797
precision(mean):  0.8080952380952381
recall(mean):  0.48666666666666664
roc_auc(mean):  0.941381345197269
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  10
kernel_selection:  sigmoid
f1_scores(mean):  0.07381447381447383
accuracy(mean):  0.9975935267260947
precision(mean):  0.08710178710178709
reca

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
C_selection:  100
kernel_selection:  rbf
f1_scores(mean):  0.5674059274059274
accuracy(mean):  0.9990559240660348
precision(mean):  0.7991053391053391
recall(mean):  0.44749999999999995
roc_auc(mean):  0.9519224827756666
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
C_selection:  100
kernel_selection:  sigmoid
f1_scores(mean):  0.06109890109890111
accuracy(mean):  0.997278842172188
precision(mean):  0.05870813397129186
recall(mean):  0.06583333333333333
roc_auc(mean):  0.45403798622053326
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
C_selection:  1000
kernel_selection:  poly
f1_scores(mean):  0.48798193278727203
accuracy(mean):  0.9986116510226987
precision(mean):  0.512652625152625

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
C_selection:  1000
kernel_selection:  sigmoid
f1_scores(mean):  0.026666666666666665
accuracy(mean):  0.997445440280315
precision(mean):  0.026666666666666665
recall(mean):  0.026666666666666665
roc_auc(mean):  0.4344627243797695
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
C_selection:  10
kernel_selection:  poly
f1_scores(mean):  0.6297118813864956
accuracy(mean):  0.9990929439642301
precision(mean):  0.7289743589743589
recall(mean):  0.5641666666666667
roc_auc(mean):  0.842405382024902
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
C_selection:  10
kernel_selection:  rbf
f1_scores(mean):  0.532
accuracy(mean):  0.9990189007413403
precision(mean):  0.8155555555555555
recall(m

In [None]:
SVM(X_train,y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.1
C_selection:  10
kernel_selection:  poly
f1_scores(mean):  0.6215340022296545
accuracy(mean):  0.9990929439642301
precision(mean):  0.7741064491064491
recall(mean):  0.5266666666666667
roc_auc(mean):  0.8027055720332438
-------------------------------------------------------------------------------------
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.1
C_selection:  10
kernel_selection:  rbf
f1_scores(mean):  0.5918219461697724
accuracy(mean):  0.9991299672889247
precision(mean):  0.888888888888889
recall(mean):  0.44749999999999995
roc_auc(mean):  0.8765295285321469
-------------------------------------------------------------------------------------
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.1
C_selection:  10
kernel_selection: 

In [11]:
# standardize ANOVA 0.5 0.5 ploy
final_pipline = Pipeline([
                         ('preprocess',StandardScaler()),
                         ('feature_selection',SelectPercentile(f_classif, percentile=50)),
                        ('estimator',SVC(C=0.5, kernel="poly",verbose = True,degree=5))
                    ])
final_pipline.fit(X_train,y_train)
y_test_pred = final_pipline.predict(X_test)

print(accuracy_score(y_test,y_test_pred))
print(recall_score(y_test,y_test_pred))
print(precision_score(y_test,y_test_pred))
print(roc_auc_score(y_test,y_test_pred))
print(f1_score(y_test,y_test_pred))
cnf_matrix = confusion_matrix(y_test, y_test_pred)
print("tn, fp, fn, tp:",  cnf_matrix.ravel())

[LibSVM]0.9989633724948168
0.4883720930232558
0.9130434782608695
0.7441427733280197
0.6363636363636364
tn, fp, fn, tp: [23107     2    22    21]
