In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label", "difficulty_level"]

df_train_multiple_labels = pd.read_csv("KDDTrain+.txt", header=None, names = col_names)
df_test_multiple_labels = pd.read_csv("KDDTest+.txt", header=None, names = col_names)

from sklearn.preprocessing import LabelEncoder,OneHotEncoder

pd.set_option('display.max_columns', None)

# Combine training dataset with testing dataset to apply One-Hot-Encoder
df_multiple_labels = pd.concat([df_train_multiple_labels, df_test_multiple_labels], ignore_index=True)
# Drop "difficulty_level"
df_multiple_labels = df_multiple_labels.drop("difficulty_level", axis=1)
# One-Hot-Encoder for "protocol_type","service","flag"
categorical_col = ["protocol_type","service","flag"]
for col in categorical_col:
    df_dummies = pd.get_dummies(df_multiple_labels[col], prefix=col+"_")
    df_multiple_labels = df_multiple_labels.drop(col, axis=1)
    df_multiple_labels = pd.concat([df_multiple_labels,df_dummies], axis=1)
# LabelEncoder for "labels"
# Five Labels: normal: 0, DoS: 1, Probe: 2, R2L:3, U2R: 4
df_multiple_labels = df_multiple_labels.replace({'label':{'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4}})
# print(df_multiple_labels.head(1))
df_multiple_labels_normal = df_multiple_labels.loc[df_multiple_labels["label"]==0]
df_multiple_labels_DoS = df_multiple_labels.loc[df_multiple_labels["label"]==1]
df_multiple_labels_Probe = df_multiple_labels.loc[df_multiple_labels["label"]==2]
df_multiple_labels_R2L = df_multiple_labels.loc[df_multiple_labels["label"]==3]
df_multiple_labels_U2R = df_multiple_labels.loc[df_multiple_labels["label"]==4]

## DoS Dataset
df_DoS_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_DoS], ignore_index=True)
df_DoS_normal['label'] = np.where(df_DoS_normal['label'].eq(1), 1, 0)
## Probe Dataset
df_Probe_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_Probe], ignore_index=True)
df_Probe_normal['label'] = np.where(df_Probe_normal['label'].eq(2), 1, 0)
## R2L Dataset
df_R2L_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_R2L], ignore_index=True)
df_R2L_normal['label'] = np.where(df_R2L_normal['label'].eq(3), 1, 0)
## U2R Dataset
df_U2R_normal = pd.concat([df_multiple_labels_normal, df_multiple_labels_U2R], ignore_index=True)
df_U2R_normal['label'] = np.where(df_U2R_normal['label'].eq(4), 1, 0)

print(df_DoS_normal.shape)
print(df_Probe_normal)
print(df_R2L_normal.shape)
print(df_U2R_normal.shape)

(130441, 123)
       duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0             0        491          0     0               0       0    0   
1             0        146          0     0               0       0    0   
2             0        232       8153     0               0       0    0   
3             0        199        420     0               0       0    0   
4             0        287       2251     0               0       0    0   
...         ...        ...        ...   ...             ...     ...  ...   
91126         0          0          0     0               0       0    0   
91127         2         24        109     0               0       0    0   
91128         0          0          0     0               0       0    0   
91129         0          1          1     0               0       0    0   
91130         0          0          0     0               0       0    0   

       num_failed_logins  logged_in  num_compromised  root_shell  \
0    

In [5]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve, recall_score, precision_score,f1_score 

from sklearn.preprocessing import StandardScaler,Normalizer

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif

from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.tree import DecisionTreeClassifier

def printResults(results):
    #print(results.keys())
    print("f1_scores(mean): ", sum(results['test_f1'])/len(results['test_f1']))
    print("accuracy(mean): ", sum(results['test_accuracy'])/len(results['test_accuracy']))
    print("precision(mean): ", sum(results['test_precision'])/len(results['test_precision']))
    print("recall(mean): ", sum(results['test_recall'])/len(results['test_recall']))
    print("roc_auc(mean): ", sum(results['test_roc_auc'])/len(results['test_roc_auc']))

def generate_preprocess_pipeline(preprocess):
    if preprocess == "standardize":
        return StandardScaler()
    elif preprocess == "normalize":
        return Normalizer()
    
def generate_feature_selection_pipeline(feature_selection, percentage):
    if feature_selection == "PCA":
        return PCA(n_components = percentage)
    elif feature_selection == "ANOVA":
        return SelectPercentile(f_classif, percentile=(percentage*100))
                                
def DT(X,y):
    preprocess_methods = ["standardize"]
    
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
  
    DT_depth_selections= [2,5,10,20,None]
    
    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(feature_selection_method, percentage_selection)
                for DT_depth in DT_depth_selections:
                    complete_pipline = Pipeline([
                         ('preprocess',preprocess_pipeline ),
                         ('feature_selection',feature_selection_pipeline),
                        ('estimator',DecisionTreeClassifier(max_depth=DT_depth,random_state=0))
                    ])
                    results = cross_validate(complete_pipline, X, y, cv=5,
                         scoring=('f1', 'accuracy', 'precision','recall','roc_auc'), return_estimator=True)
                    print("preprocess: ",preprocess_method)
                    print("feature_selection: ", feature_selection_method)
                    print("percentage_selection: ", percentage_selection)
                    print("depth: ", DT_depth)
                    printResults(results)
                    print("-------------------------------------------------------------------------------------")
                
from sklearn.linear_model import LogisticRegression
def LG(X,y):
    preprocess_methods = ["standardize"]
    
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
  
    LG_solvers= ['newton-cg','lbfgs','sag']
    
    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(feature_selection_method, percentage_selection)
                for LG_solver in LG_solvers:
                    complete_pipline = Pipeline([
                         ('preprocess',preprocess_pipeline ),
                         ('feature_selection',feature_selection_pipeline),
                        ('estimator',LogisticRegression(solver = LG_solver,max_iter=100))
                    ])
                    results = cross_validate(complete_pipline, X, y, cv=5,
                         scoring=('f1', 'accuracy', 'precision','recall','roc_auc'), return_estimator=True)
                    print("preprocess: ",preprocess_method)
                    print("feature_selection: ", feature_selection_method)
                    print("percentage_selection: ", percentage_selection)
                    print("solver: ", LG_solver)
                    printResults(results)
                    print("-------------------------------------------------------------------------------------")
                    
from sklearn.svm import SVC
def SVM(X,y):
    preprocess_methods = ["standardize"]
    
    feature_selection_methods = ["PCA","ANOVA"]
    percentage_selections = [0.1,0.3,0.5,0.7,0.8,0.9]
  
    C_selections = [0.2,0.5,1,2,5]
    kernel_selections = ['poly','rbf','sigmoid']
    for preprocess_method in preprocess_methods:
        preprocess_pipeline = generate_preprocess_pipeline(preprocess_method)
        for feature_selection_method in feature_selection_methods:
            for percentage_selection in percentage_selections:
                feature_selection_pipeline = generate_feature_selection_pipeline(feature_selection_method, percentage_selection)
                for  C_selection in  C_selections:
                    for kernel_selection in kernel_selections:
                        complete_pipline = Pipeline([
                         ('preprocess',preprocess_pipeline ),
                         ('feature_selection',feature_selection_pipeline),
                        ('estimator',SVC(C=C_selection, kernel=kernel_selection))
                    ])
                        results = cross_validate(complete_pipline, X, y, cv=5,
                         scoring=('f1', 'accuracy', 'precision','recall','roc_auc'), return_estimator=True)
                        print("preprocess: ",preprocess_method)
                        print("feature_selection: ", feature_selection_method)
                        print("percentage_selection: ", percentage_selection)
                        print("C_selection: ", C_selection)
                        print("kernel_selection: ", kernel_selection)
                        printResults(results)
                        print("-------------------------------------------------------------------------------------")

## Split dataset to training set and testing set
from sklearn.model_selection import train_test_split
def splitDataset(df):
    X = df.drop(columns=['label'])
    y = df['label']
    print(X.shape)
    print(y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state=6)
    print(X_train.shape)
    print(y_train)
    print(X_test.shape)
    print(y_test)
    return X_train, X_test, y_train, y_test


## Probe

In [3]:
X_train, X_test, y_train, y_test = splitDataset(df_Probe_normal)

(91131, 122)
(91131,)
(63791, 122)
61710    0
56046    0
13213    0
77902    1
35507    0
        ..
41295    0
64877    0
4714     0
41187    0
31626    0
Name: label, Length: 63791, dtype: int32
(27340, 122)
54949    0
79817    1
55369    0
20365    0
83916    1
        ..
47423    0
47852    0
74592    0
52473    0
41763    0
Name: label, Length: 27340, dtype: int32


In [4]:
DT(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  2
f1_scores(mean):  0.6605732928590933
accuracy(mean):  0.9206471495193966
precision(mean):  0.9714935057092792
recall(mean):  0.5006110963590474
roc_auc(mean):  0.9238214744656922
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  5
f1_scores(mean):  0.810527260984425
accuracy(mean):  0.9445220075311532
precision(mean):  0.8572935874962326
recall(mean):  0.7687397297708388
roc_auc(mean):  0.973806171174618
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
depth:  10
f1_scores(mean):  0.8747957092951376
accuracy(mean):  0.9629414858970847
precision(mean):  0.9140911569199901
recall(mean):  0.8390193687434422
roc_auc(mean):  0.982775725436103
-----------------------------------

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
depth:  None
f1_scores(mean):  0.9824301444043648
accuracy(mean):  0.9945760506362105
precision(mean):  0.9824313581647026
recall(mean):  0.9824295875408939
roc_auc(mean):  0.9896113087393967
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
depth:  2
f1_scores(mean):  0.811114833479726
accuracy(mean):  0.9438795721240318
precision(mean):  0.8382499274232327
recall(mean):  0.820262649751349
roc_auc(mean):  0.9482395311027492
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
depth:  5
f1_scores(mean):  0.9592468985744373
accuracy(mean):  0.9875060850787916
precision(mean):  0.964617223407984
recall(mean):  0.9542981698561201
roc_auc(mean):  0.9890121154238146
---------------------------------

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
depth:  20
f1_scores(mean):  0.9861405076186177
accuracy(mean):  0.9957204072885867
precision(mean):  0.9858965462621864
recall(mean):  0.9863907314646049
roc_auc(mean):  0.9927281734847868
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.7
depth:  None
f1_scores(mean):  0.9863935409630056
accuracy(mean):  0.9957987870228383
precision(mean):  0.9862014186581869
recall(mean):  0.986593828710495
roc_auc(mean):  0.9924591897425117
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
depth:  2
f1_scores(mean):  0.9020492238008675
accuracy(mean):  0.9697452235364391
precision(mean):  0.8852702363790581
recall(mean):  0.9287215804358417
roc_auc(mean):  0.9609288451774182
------------------------

In [6]:
# standardize ANOVA 0.9 None
final_pipline = Pipeline([
                         ('preprocess',StandardScaler()),
                         ('feature_selection',SelectPercentile(f_classif, percentile=90)),
                        ('estimator',DecisionTreeClassifier(max_depth=None,random_state=0))
                    ])
final_pipline.fit(X_train,y_train)
y_test_pred = final_pipline.predict(X_test)

print(accuracy_score(y_test,y_test_pred))
print(recall_score(y_test,y_test_pred))
print(precision_score(y_test,y_test_pred))
print(roc_auc_score(y_test,y_test_pred))
print(f1_score(y_test,y_test_pred))
cnf_matrix = confusion_matrix(y_test, y_test_pred)
print("tn, fp, fn, tp:",  cnf_matrix.ravel())

0.9978419897585955
0.9929094776648546
0.9931442080378251
0.9958272776701096
0.993026828980026
tn, fp, fn, tp: [23080    29    30  4201]


In [5]:
LG(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  newton-cg
f1_scores(mean):  0.6547753344048839
accuracy(mean):  0.9055039632122316
precision(mean):  0.7505924687562711
recall(mean):  0.5807453601895368
roc_auc(mean):  0.946222986226528
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  lbfgs
f1_scores(mean):  0.6547753344048839
accuracy(mean):  0.9055039632122316
precision(mean):  0.7505924687562711
recall(mean):  0.5807453601895368
roc_auc(mean):  0.9462229862265282
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
solver:  sag
f1_scores(mean):  0.6548130310742742
accuracy(mean):  0.9055196396505447
precision(mean):  0.7506886834918522
recall(mean):  0.5807453601895368
roc_auc(mean):  0.9462231181603293
-----------------

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.3
solver:  sag
f1_scores(mean):  0.9596730176015498
accuracy(mean):  0.9876158275189277
precision(mean):  0.9645204388343546
recall(mean):  0.9549065850634066
roc_auc(mean):  0.9971548149683193
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  newton-cg
f1_scores(mean):  0.9622746069771196
accuracy(mean):  0.9884153111290143
precision(mean):  0.9671995528399423
recall(mean):  0.9574457904628337
roc_auc(mean):  0.997342467242567
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
solver:  lbfgs
f1_scores(mean):  0.9622255427020125
accuracy(mean):  0.9883996346907011
precision(mean):  0.967100998374811
recall(mean):  0.9574457904628337
roc_auc(mean):  0.9973419120854707
------------

In [8]:
# standardize ANOVA 0.9 lbfs
final_pipline = Pipeline([
                         ('preprocess',StandardScaler()),
                         ('feature_selection',SelectPercentile(f_classif, percentile=90)),
                        ('estimator',LogisticRegression(solver ="lbfgs",max_iter=1000))
                    ])
final_pipline.fit(X_train,y_train)
y_test_pred = final_pipline.predict(X_test)

print(accuracy_score(y_test,y_test_pred))
print(recall_score(y_test,y_test_pred))
print(precision_score(y_test,y_test_pred))
print(roc_auc_score(y_test,y_test_pred))
print(f1_score(y_test,y_test_pred))
cnf_matrix = confusion_matrix(y_test, y_test_pred)
print("tn, fp, fn, tp:",  cnf_matrix.ravel())

0.9891002194586687
0.9598203734341764
0.9694437813320601
0.9771407029661686
0.964608076009501
tn, fp, fn, tp: [22981   128   170  4061]


In [4]:
SVM(X_train,y_train)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
C_selection:  0.2
kernel_selection:  poly
f1_scores(mean):  0.6704274715299671
accuracy(mean):  0.922089397816757
precision(mean):  0.9650249877626891
recall(mean):  0.5138157687815971
roc_auc(mean):  0.9237596597426165
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
C_selection:  0.2
kernel_selection:  rbf
f1_scores(mean):  0.7046201068250203
accuracy(mean):  0.9278738720880039
precision(mean):  0.9573611580380579
recall(mean):  0.5574863686635232
roc_auc(mean):  0.948851816932925
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.1
C_selection:  0.2
kernel_selection:  sigmoid
f1_scores(mean):  0.3714028639786504
accuracy(mean):  0.8059118774312133
precision(mean):  0.3691337538640672
recall

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.3
C_selection:  1
kernel_selection:  rbf
f1_scores(mean):  0.9672127342820669
accuracy(mean):  0.9898888803579118
precision(mean):  0.9678854583666541
recall(mean):  0.9665866617855954
roc_auc(mean):  0.998165814397146
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.3
C_selection:  1
kernel_selection:  sigmoid
f1_scores(mean):  0.4327102391664458
accuracy(mean):  0.8245365108474896
precision(mean):  0.4318785207353189
recall(mean):  0.43357951806297096
roc_auc(mean):  0.7793727636357178
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.3
C_selection:  2
kernel_selection:  poly
f1_scores(mean):  0.9579612251500216
accuracy(mean):  0.9871768786455568
precision(mean):  0.969444483743402
recall(mean

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.5
C_selection:  5
kernel_selection:  sigmoid
f1_scores(mean):  0.5452231642112
accuracy(mean):  0.8593218983919838
precision(mean):  0.5441579267092991
recall(mean):  0.5463134421090352
roc_auc(mean):  0.8681841318071177
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  0.2
kernel_selection:  poly
f1_scores(mean):  0.9472878427711366
accuracy(mean):  0.9841513887126278
precision(mean):  0.9729225448924202
recall(mean):  0.9230154192006559
roc_auc(mean):  0.9967324794055881
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.7
C_selection:  0.2
kernel_selection:  rbf
f1_scores(mean):  0.9552000450769956
accuracy(mean):  0.9862833560641076
precision(mean):  0.9627641951608394
recall(me

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
C_selection:  1
kernel_selection:  poly
f1_scores(mean):  0.9530976424598524
accuracy(mean):  0.9858130641433682
precision(mean):  0.9727272915352115
recall(mean):  0.9342888889461785
roc_auc(mean):  0.9979434863098671
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
C_selection:  1
kernel_selection:  rbf
f1_scores(mean):  0.9646594775956527
accuracy(mean):  0.9891207496244515
precision(mean):  0.9670427354924243
recall(mean):  0.962320897773355
roc_auc(mean):  0.9979727205907558
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.8
C_selection:  1
kernel_selection:  sigmoid
f1_scores(mean):  0.744623716742266
accuracy(mean):  0.9214152986827159
precision(mean):  0.7468552493038217
recall(mean)

preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
C_selection:  5
kernel_selection:  rbf
f1_scores(mean):  0.9691091762695212
accuracy(mean):  0.9904845751845558
precision(mean):  0.9708604934728555
recall(mean):  0.967399153890377
roc_auc(mean):  0.9985035011383527
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  PCA
percentage_selection:  0.9
C_selection:  5
kernel_selection:  sigmoid
f1_scores(mean):  0.625799477664584
accuracy(mean):  0.8843254009867421
precision(mean):  0.6249927118298256
recall(mean):  0.6266498235338096
roc_auc(mean):  0.9204106154378089
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.1
C_selection:  0.2
kernel_selection:  poly
f1_scores(mean):  0.9639535474012746
accuracy(mean):  0.9887601817139895
precision(mean):  0.9543262975485426
recall(m

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.3
C_selection:  0.5
kernel_selection:  sigmoid
f1_scores(mean):  0.46927939925541934
accuracy(mean):  0.835556764390504
precision(mean):  0.4675979370080167
recall(mean):  0.47105315125562974
roc_auc(mean):  0.8037252013501052
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.3
C_selection:  1
kernel_selection:  poly
f1_scores(mean):  0.9773723850777145
accuracy(mean):  0.992977069900807
precision(mean):  0.9721793202227158
recall(mean):  0.9826329941504486
roc_auc(mean):  0.9983274117456519
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.3
C_selection:  1
kernel_selection:  rbf
f1_scores(mean):  0.9767481711144358
accuracy(mean):  0.9927732872606511
precision(mean):  0.9701673012848451
re

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
C_selection:  5
kernel_selection:  poly
f1_scores(mean):  0.978349888635589
accuracy(mean):  0.9932749222287584
precision(mean):  0.9722316845850525
recall(mean):  0.9845629593728168
roc_auc(mean):  0.9980312556546822
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
C_selection:  5
kernel_selection:  rbf
f1_scores(mean):  0.9779262662157839
accuracy(mean):  0.9931338355125966
precision(mean):  0.9705146733572171
recall(mean):  0.9854770774414593
roc_auc(mean):  0.9986196254833601
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.5
C_selection:  5
kernel_selection:  sigmoid
f1_scores(mean):  0.5883496564588185
accuracy(mean):  0.8727720441052089
precision(mean):  0.5877053930481158
recal

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
C_selection:  0.5
kernel_selection:  rbf
f1_scores(mean):  0.9674250160266957
accuracy(mean):  0.9899045518815959
precision(mean):  0.9633807185927447
recall(mean):  0.9715632919387562
roc_auc(mean):  0.9983227108554708
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
C_selection:  0.5
kernel_selection:  sigmoid
f1_scores(mean):  0.8475477414680352
accuracy(mean):  0.9529714357699561
precision(mean):  0.8479952981169466
recall(mean):  0.847146919382407
roc_auc(mean):  0.9350346752503041
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.8
C_selection:  1
kernel_selection:  poly
f1_scores(mean):  0.9712333717858606
accuracy(mean):  0.9911272882682237
precision(mean):  0.9719364480186012
r

preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
C_selection:  2
kernel_selection:  sigmoid
f1_scores(mean):  0.842602341156718
accuracy(mean):  0.9514351595591484
precision(mean):  0.8428933896656023
recall(mean):  0.8423733349145255
roc_auc(mean):  0.9373138421238428
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
C_selection:  5
kernel_selection:  poly
f1_scores(mean):  0.9768864531382295
accuracy(mean):  0.9928359819559883
precision(mean):  0.9728097941340066
recall(mean):  0.9810078552590534
roc_auc(mean):  0.9976348445653811
-------------------------------------------------------------------------------------
preprocess:  standardize
feature_selection:  ANOVA
percentage_selection:  0.9
C_selection:  5
kernel_selection:  rbf
f1_scores(mean):  0.9777311631586656
accuracy(mean):  0.9930711273020292
precision(mean):  0.9700297525588484
recal

In [14]:
# standardize ANOVA 0.3 5/10/100/1000 poly/rbf
final_pipline = Pipeline([
                         ('preprocess',StandardScaler()),
                         ('feature_selection',PCA(n_components=0.3)),
                        ('estimator',SVC(C=10000, kernel="rbf",verbose = True))
                    ])
final_pipline.fit(X_train,y_train)
y_test_pred = final_pipline.predict(X_test)

print(accuracy_score(y_test,y_test_pred))
print(recall_score(y_test,y_test_pred))
print(precision_score(y_test,y_test_pred))
print(roc_auc_score(y_test,y_test_pred))
print(f1_score(y_test,y_test_pred))
cnf_matrix = confusion_matrix(y_test, y_test_pred)
print("tn, fp, fn, tp:",  cnf_matrix.ravel())

[LibSVM]0.99575713240673
0.9907823209643111
0.9819629889903959
0.9937251429132429
0.9863529411764707
tn, fp, fn, tp: [23032    77    39  4192]
