In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report, accuracy_score,f1_score
from sklearn.svm import SVC
from sklearn.ensemble import IsolationForest
from sklearn.tree import DecisionTreeClassifier

#데이터 받기
folder_path = "C:/Users/milab_8/Desktop/내꺼/BGP_RIPE_datasets_for_anomaly_detection_csv_revised_19022021"
BCNET = pd.read_csv(folder_path +'/BCNET_regular.csv')
RED1=pd.read_csv(folder_path+'/Code_Red_I.csv')
Nimda=pd.read_csv(folder_path+'/Nimda.csv')
RIPE=pd.read_csv(folder_path+'/RIPE_regular.csv')
Slammer=pd.read_csv(folder_path+'/Slammer.csv')



#열이름 붙여주는 함수
def plus_col(file):
    col_name=file.columns
    new_col_name=['hour_minute','hour','minute','second','Number of announcements','Number of withdrawals','Number of announced NLRI prefixes','Number of withdrawn NLRI prefixes','Average AS-path length','Maximum AS-path length',
                  'Average unique AS-path length',"Number of duplicate announcements",'Number of implicit withdrawals','Number of duplicate withdrawals','Maximum edit distance','Arrival rate','Average edit distance',
                  'Maximum AS-path length 11','Maximum AS-path length 12','Maximum AS-path length 13','Maximum AS-path length 14','Maximum AS-path length 15',
                  'Maximum AS-path length 16','Maximum AS-path length 17','Maximum AS-path length 18','Maximum AS-path length 19','Maximum AS-path length 20', 
                  'Maximum edit distance 7','Maximum edit distance 8','Maximum edit distance 9','Maximum edit distance 10','Maximum edit distance 11',
                  'Maximum edit distance 12','Maximum edit distance 13','Maximum edit distance 14','Maximum edit distance 15','Maximum edit distance 16',
                  'Number of Interior Gateway Protocol (IGP) packets', 'Number of Exterior Gateway Protocol (EGP) packets','Number of incomplete packets','Packet size (B)','regular or anomalous']
    for i in range(len(new_col_name)):
        file.rename(columns={col_name[i]:new_col_name[i]},inplace=True)
    return file.iloc[:,4:42]

#fisher_markov selector
def fisher_markov_selector(dataset,w,N):
    X=dataset.iloc[:,:-1]
    y=dataset["regular or anomalous"]
    std=StandardScaler()
    X_std=std.fit_transform(X)

    std_dataset=pd.DataFrame(X_std,columns=X.columns)
    std_dataset["regular or anomalous"]=y.values

    std_dataset.loc[std_dataset["regular or anomalous"]==1,std_dataset.columns != "regular or anomalous"]*=w
    
    gamma=-0.5
    n, p = X.shape

    theta = np.zeros(p)

    for j in range(p):
        # Calculate the means and other statistics
        theta_j = 0
        for k in np.unique(y):
            X_k = std_dataset[y == k]
            n_k = len(X_k)
            theta_j += (1/n) * np.sum(X_k.iloc[:, j]) * np.sum(X_k.iloc[:, j]) / n_k

        theta_j -= gamma * np.sum(std_dataset.iloc[:, j])**2 / n
        theta_j += (gamma - 1) / n**2 * np.sum(std_dataset.iloc[:, j])**2

        # Set theta value
        theta[j] = theta_j
    idx=[]
    theta_score=[]
    # 피쳐 선택
    if N==37:
        idx=[i for i in range(37)]
        return idx,theta
    for i in range(N):
        m_idx = np.argmax(theta)
        idx.append(m_idx)
        theta_score.append(theta[m_idx])
        theta[m_idx] = -1000
    return idx,theta_score

data_list=[BCNET,RED1,Nimda,RIPE,Slammer]
data_name=['BCNET','RED1','Nimda','RIPE','Slammer']

#열이름을 붙여 데이터 생성
BCNET=plus_col(BCNET) ; RED1=plus_col(RED1) ; Nimda=plus_col(Nimda) ;RIPE=plus_col(RIPE) ;Slammer=plus_col(Slammer)

#데이터셋 생성
dataset1=pd.concat([Slammer,Nimda],ignore_index=True)#2.88
dataset2=pd.concat([Slammer,RED1],ignore_index=True)#8.79
dataset3=pd.concat([Nimda,RED1],ignore_index=True)#2.49


#선택된 Feature와 Score
idx1,score1=fisher_markov_selector(dataset1,2.88,10)
idx2,score2=fisher_markov_selector(dataset2,8.79,10)
idx3,score3=fisher_markov_selector(dataset3,2.49,10)
score_table=pd.DataFrame({'dataset1':idx1,'theta1':score1,
              'dataset2':idx2,'theta2':score2,
              'dataset3':idx3,'theta3':score3})


#SVM 모델 생성
def svm_model(train_set,test_set,w,n,best_params):
    select_features, _ = fisher_markov_selector(train_set,w,n)
    
    X_tr=train_set.iloc[:,select_features]
    X_train=StandardScaler().fit_transform(X_tr)
    y_train=train_set["regular or anomalous"]

    X_te=test_set.iloc[:,select_features]
    X_test=StandardScaler().fit_transform(X_te)
    y_test=test_set["regular or anomalous"]

    
    svm=SVC(**best_params)
    
    svm.fit(X_train,y_train)
    y_pred=svm.predict(X_test)
    ac=accuracy_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)
    report=classification_report(y_test,y_pred)
    return ac,f1,report

#grid_search 로 찾아낸 C,Gamma
d1={'C': 0.03125, 'gamma': 0.0625, 'kernel': 'rbf'}
d2={'C': 16, 'gamma': 0.0625, 'kernel': 'rbf'}
d3={'C': 32, 'gamma': 0.0625, 'kernel': 'rbf'}

dataset1_accuracy,dataset1_f1,_=svm_model(dataset1,RED1,2.88,10,d1);dataset1_all_accuracy,dataset1_all_f1,_=svm_model(dataset1,RED1,2.88,37,d1)
dataset2_accurcay,dataset2_f1,_=svm_model(dataset2,Nimda,8.79,10,d2);dataset2_all_accuracy,dataset2_all_f1,_=svm_model(dataset2,Nimda,8.79,37,d2)
dataset3_accuracy,dataset3_f1,_=svm_model(dataset3,Slammer,2.49,10,d3);dataset3_all_accuracy,dataset3_all_f1,_=svm_model(dataset3,Slammer,2.49,37,d3)
df_SVM=pd.DataFrame({'Dataset':['dataset1','dataset1_all','dataset2','dataset2_all','dataset3','dataset3_all'],
                 'Accuracy':[dataset1_accuracy,dataset1_all_accuracy,dataset2_accurcay,dataset2_all_accuracy,dataset3_accuracy,dataset3_all_accuracy],
                 'F1 Score':[dataset1_f1,dataset1_all_f1,dataset2_f1,dataset2_all_f1,dataset3_f1,dataset3_all_f1]
                 })
df_SVM

#의사결정트리 모델 생성
def decision_tree_model(train_set,test_set,w,n):
    select_features, _ = fisher_markov_selector(train_set, w, n)
    
    X_tr = train_set.iloc[:, select_features]
    X_train = StandardScaler().fit_transform(X_tr)
    y_train = train_set["regular or anomalous"]

    X_te = test_set.iloc[:, select_features]
    X_test = StandardScaler().fit_transform(X_te)
    y_test = test_set["regular or anomalous"]
    param_grid = {'max_depth': [3, 5, 7, 10], 'min_samples_split': [2, 5, 10]}
    grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    dt = DecisionTreeClassifier(**best_params)
    
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)

    ac = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return ac, f1, report

dataset1_accuracy,dataset1_f1,_=decision_tree_model(dataset1,RED1,2.88,10);dataset1_all_accuracy,dataset1_all_f1,_=decision_tree_model(dataset1,RED1,2.88,37)
dataset2_accurcay,dataset2_f1,_=decision_tree_model(dataset2,Nimda,8.79,10);dataset2_all_accuracy,dataset2_all_f1,_=decision_tree_model(dataset2,Nimda,8.79,37)
dataset3_accuracy,dataset3_f1,_=decision_tree_model(dataset3,Slammer,2.49,10);dataset3_all_accuracy,dataset3_all_f1,_=decision_tree_model(dataset3,Slammer,2.49,37)
df_Trees=pd.DataFrame({'Dataset':['dataset1','dataset1_all','dataset2','dataset2_all','dataset3','dataset3_all'],
                 'Accuracy':[dataset1_accuracy,dataset1_all_accuracy,dataset2_accurcay,dataset2_all_accuracy,dataset3_accuracy,dataset3_all_accuracy],
                 'F1 Score':[dataset1_f1,dataset1_all_f1,dataset2_f1,dataset2_all_f1,dataset3_f1,dataset3_all_f1]
                })
df_Trees
#iForest 모델생성
def iForest(train_set,test_set,w,n):
    select_features, _ = fisher_markov_selector(train_set,w,n)
    
    X_tr=train_set.iloc[:,select_features]
    X_train=StandardScaler().fit_transform(X_tr)
    y_train=train_set["regular or anomalous"]

    X_te=test_set.iloc[:,select_features]
    X_test=StandardScaler().fit_transform(X_te)
    y_test=test_set["regular or anomalous"]

    iforest=IsolationForest(n_estimators=100,contamination=0.01,random_state=42)
    iforest.fit(X_train,y_train)
    y_pred=iforest.predict(X_test)

    ac=accuracy_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)
    report=classification_report(y_test,y_pred)
    return ac,f1,report



dataset1_accuracy,dataset1_f1,_=iForest(dataset1,RED1,2.88,10);dataset1_all_accuracy,dataset1_all_f1,_=iForest(dataset1,RED1,2.88,37)
dataset2_accurcay,dataset2_f1,_=iForest(dataset2,Nimda,8.79,10);dataset2_all_accuracy,dataset2_all_f1,_=iForest(dataset2,Nimda,8.79,37)
dataset3_accuracy,dataset3_f1,_=iForest(dataset3,Slammer,2.49,10);dataset3_all_accuracy,dataset3_all_f1,_=iForest(dataset3,Slammer,2.49,37)
df_iForest=pd.DataFrame({'Dataset':['dataset1','dataset1_all','dataset2','dataset2_all','dataset3','dataset3_all'],
                 'Accuracy':[dataset1_accuracy,dataset1_all_accuracy,dataset2_accurcay,dataset2_all_accuracy,dataset3_accuracy,dataset3_all_accuracy],
                 'F1 Score':[dataset1_f1,dataset1_all_f1,dataset2_f1,dataset2_all_f1,dataset3_f1,dataset3_all_f1]
                })
df_iForest


Unnamed: 0,Dataset,Accuracy,F1 Score
0,dataset1,0.085024,0.150851
1,dataset1_all,0.083356,0.146663
2,dataset2,0.347268,0.514941
3,dataset2_all,0.346969,0.514539
4,dataset3,0.081689,0.139547
5,dataset3_all,0.071687,0.115201
