In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib
from sklearn.feature_selection import mutual_info_classif
import random
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold,KFold
import scipy.sparse as ss
import warnings
from six.moves import urllib
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('seaborn')
from scipy.stats import norm, skew
from sklearn.model_selection import train_test_split
from IPython.display import display, HTML
from sklearn.metrics import confusion_matrix
import abnormal_features_tools
from sklearn.preprocessing import StandardScaler

In [2]:
# Names of the columns in the raw data files; the last column holds the raw attack label.
columns_name = [
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "attack_type",
]
df_train = pd.read_csv('../data/train.data', names=columns_name)
# df_train.drop_duplicates(subset=None, inplace=True)
df_test = pd.read_csv('../data/test.data', names=columns_name)
# Drop the last character of every raw label (presumably a trailing '.' -- verify against the data files).
df_train['attack_type'] = df_train['attack_type'].map(lambda raw_label: raw_label[:-1])
df_test['attack_type'] = df_test['attack_type'].map(lambda raw_label: raw_label[:-1])

# Map every concrete attack name to its attack class.
mp_attack_type = {'normal':'normal'}
# Attack names that are added on top of the mapping file (they win on conflict because
# they are applied last via update()).
additional_attack = {'apache2':'dos', 'httptunnel':'r2l', 'mailbomb':'dos', 'mscan':'probe', 'named':'r2l', 'processtable':'dos', 'ps':'u2r', 
                    'saint':'probe', 'sendmail':'r2l', 'snmpgetattack':'r2l', 'snmpguess':'r2l', 'sqlattack':'u2r', 'udpstorm':'dos',
                    'worm':'dos', 'xlock':'r2l', 'xsnoop':'r2l', 'xterm':'u2r'}
with open('../data/training_attack_types') as f:
    for line in f:
        # split() without arguments tolerates repeated/trailing whitespace.
        item = line.split()
        if len(item) < 2:
            # Blank or incomplete line: nothing to map, and item[1] would raise IndexError.
            continue
        mp_attack_type[item[0]] = item[1]

mp_attack_type.update(additional_attack)

# Map each record's attack name to its attack class; an unknown name raises KeyError
# on purpose so that unmapped attacks are noticed immediately.
df_train['attack_class'] = df_train.attack_type.apply(lambda x:mp_attack_type[x])
df_test['attack_class'] = df_test.attack_type.apply(lambda x:mp_attack_type[x])

# df_train = df_train[df_train.attack_class != 'dos']
# df_test = df_test[df_test.attack_class != 'dos']

# Integer label of every attack class, built once instead of once per row.
attack_class_labels = {'normal':0, 'probe':1, 'dos':2, 'u2r':3, 'r2l':4}
df_train['label_attack_class'] = df_train.attack_class.apply(lambda x : attack_class_labels[x])
df_test['label_attack_class'] = df_test.attack_class.apply(lambda x : attack_class_labels[x])
symbolic_columns = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
symbolic_features = ['feature_%s' % name for name in symbolic_columns]
# Keep the column order of columns_name. A set difference yields an arbitrary order for
# strings that can change between interpreter runs (hash randomization), which makes the
# feature-column order -- and therefore tie-breaking in the tree models -- non-reproducible.
continuous_features = [name for name in columns_name
                       if name not in symbolic_columns and name != 'attack_type']

# Symbolic columns that are already 0/1 valued are used as they are.
binary_cat_features = ['land', 'logged_in', 'is_host_login', 'is_guest_login']
# Import only the name that is used; a wildcard import hides where names come from
# and can silently shadow existing notebook globals.
from category_encoders import BinaryEncoder
# Multi-valued symbolic columns are binary-encoded; the encoder is fitted on the
# training set only and then applied unchanged to the test set.
binary_encoded_columns = ['protocol_type', 'flag', 'service']
binary_encoder = BinaryEncoder()
binary_encoder.fit(df_train[binary_encoded_columns])
df_binary_train = binary_encoder.transform(df_train[binary_encoded_columns])
df_binary_test = binary_encoder.transform(df_test[binary_encoded_columns])
used_features = continuous_features + binary_cat_features + list(df_binary_train.columns)

# True for every record whose class is not 'normal'. Compare by value: `is not` tests
# object identity, which only works when the strings happen to be the same interned
# object (and raises a SyntaxWarning on recent Python versions).
df_train['is_attack'] = df_train.attack_class != 'normal'
df_test['is_attack'] = df_test.attack_class != 'normal'

# Append the binary-encoded columns next to the original ones.
df_train_used = pd.concat((df_train, df_binary_train), axis=1)
df_test_used = pd.concat((df_test, df_binary_test), axis=1)

# Attack-only subsets (rows whose is_attack flag is set).
df_train_attack_class = df_train_used.loc[df_train['is_attack'] == True]
df_test_attack_class = df_test_used.loc[df_test['is_attack'] == True]

# Integer-encode the symbolic columns. The code table is learned on the training set and
# reused for the test set: factorizing both frames independently numbers categories in
# order of first appearance, so the same category would get different codes in each frame.
for col_name, feature_name in zip(symbolic_columns, symbolic_features):
    train_codes, train_categories = pd.factorize(df_train[col_name])
    df_train[feature_name] = train_codes
    # Categories never seen in training get -1 (the sentinel pd.factorize uses for missing values).
    df_test[feature_name] = pd.Index(train_categories).get_indexer(df_test[col_name])

# 工具函数

In [3]:
def plot_distribution( df , var , target , **kwargs ):
    """Plot shaded KDE curves of column ``var``, one hue per value of ``target``.

    Optional keyword arguments ``row`` and ``col`` are forwarded to
    ``sns.FacetGrid`` to split the plot into facets. The x axis is limited to
    ``[0, df[var].max()]``.
    """
    grid = sns.FacetGrid(
        df,
        hue=target,
        aspect=4,
        row=kwargs.get('row', None),
        col=kwargs.get('col', None),
    )
    grid.map(sns.kdeplot, var, shade=True, cut=50)
    upper_limit = df[var].max()
    grid.set(xlim=(0, upper_limit))
    grid.add_legend()

def evaluate(ground_truth, predict):
    """Print and display classification metrics for ``predict`` against ``ground_truth``.

    Prints the overall accuracy, then renders two HTML tables: the confusion
    matrix and the output of ``precision_recall_fscore_support`` (its four rows
    are labelled 'precision', 'recall', 'f1' and 'number'). Returns nothing.
    """
    accuracy = accuracy_score(predict, ground_truth)
    print('All accuracy: %f' % accuracy)

    print('----------confusion_matrix-------------')
    matrix = confusion_matrix(ground_truth, predict)
    display(HTML(pd.DataFrame(matrix).to_html()))

    print('-------recall/fscore/f1 report---------')
    report_rows = ['precision', 'recall', 'f1', 'number']
    report = precision_recall_fscore_support(ground_truth, predict)
    display(HTML(pd.DataFrame(report, index=report_rows).to_html()))
    
from sklearn.metrics import make_scorer


def score(y_true, y_pred, show):
    """Return the average misclassification cost of ``y_pred`` against ``y_true``.

    The cost of predicting class ``p`` for a sample of true class ``t`` is
    ``cost_matrix[t][p]`` (0 on the diagonal); the result is the total cost
    divided by the number of samples, so lower is better.

    Parameters
    ----------
    y_true, y_pred : array-like of class indices in 0..4
        Any array-like (list, numpy array, pandas Series). Values are read
        positionally, so a pandas Series with a non-default index works too.
    show : bool
        When true, also print the 5x5 count matrix (rows = true class,
        columns = predicted class), with the per-row ratio of correct
        predictions in the last column and the per-column ratio in the last
        line, followed by the score.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of samples.
    ZeroDivisionError
        If the inputs are empty.
    """
    cost_matrix = np.array([[0, 1, 2, 2, 2],
                            [1, 0, 2, 2, 2],
                            [2, 1, 0, 2, 2],
                            [3, 2, 2, 0, 2],
                            [4, 2, 2, 2, 0]])
    n_classes = cost_matrix.shape[0]

    # Convert to plain integer arrays so that indexing is positional; indexing a
    # pandas Series with y_true[i] is label-based and fails on a shuffled/sliced index.
    true_idx = np.asarray(y_true).ravel().astype(int)
    pred_idx = np.asarray(y_pred).ravel().astype(int)
    if true_idx.shape != pred_idx.shape:
        raise ValueError("y_true and y_pred must have the same number of samples "
                         "(got %d and %d)" % (true_idx.size, pred_idx.size))

    size = int(true_idx.size)
    # Plain Python ints keep the division semantics (ZeroDivisionError on empty input).
    cost = int(cost_matrix[true_idx, pred_idx].sum())

    # print count & percentage matrix
    if show:
        count = np.zeros((n_classes, n_classes), dtype=int)
        # add.at accumulates correctly when the same (true, pred) pair occurs repeatedly.
        np.add.at(count, (true_idx, pred_idx), 1)
        hits = np.diag(count)
        row_totals = count.sum(axis=1)
        col_totals = count.sum(axis=0)
        # Ratio of correct predictions per true class / per predicted class; 0 when the class is absent.
        row_ratio = [hits[i] / row_totals[i] if row_totals[i] else 0 for i in range(n_classes)]
        col_ratio = [hits[i] / col_totals[i] if col_totals[i] else 0 for i in range(n_classes)]
        print("-----------------------------------------------")
        for i in range(n_classes):
            print("%7d %7d %7d %7d %7d %1.5f" % (tuple(count[i]) + (row_ratio[i],)))
        print("%1.5f %1.5f %1.5f %1.5f %1.5f" % tuple(col_ratio))
        print("score: ", cost / size)
        print("-----------------------------------------------")
    return cost / size


def scorer(show):
    """Wrap ``score`` with ``make_scorer``, forwarding ``show`` and marking lower values as better."""
    scorer_options = {'show': show, 'greater_is_better': False}
    return make_scorer(score, **scorer_options)
    
    
def print_mean(df, feature_name):
    """Display the mean of ``feature_name`` over all rows and per attack class.

    The result is rendered as a one-column HTML table whose rows are, in order,
    'All', 'normal', 'u2r', 'dos', 'r2l' and 'probe'. Classes are selected via
    the ``attack_class`` column of ``df``.
    """
    class_order = ['normal', 'u2r', 'dos', 'r2l', 'probe']
    means = [df[feature_name].mean()]
    for class_name in class_order:
        class_rows = df[df['attack_class'] == class_name]
        means.append(class_rows[feature_name].mean())

    summary = pd.DataFrame(means, index=['All'] + class_order)
    display(HTML(summary.to_html()))

def train_eval_DT(train_X, train_y, test_X, test_y):
    """Fit a class-balanced decision tree, report its test metrics and return it.

    Parameters
    ----------
    train_X, train_y : training features and labels passed to ``fit``.
    test_X, test_y : held-out features and labels used for the report.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.

    Prints the training time and the metrics produced by ``evaluate``.
    """
    # Imported here because, at notebook level, both names are only imported by
    # cells that come after this definition.
    import time
    from sklearn.tree import DecisionTreeClassifier

    start = time.time()
    clf = DecisionTreeClassifier(random_state=0, class_weight = 'balanced')
    clf.fit(train_X, train_y)
    print('cost time : %f' % (time.time() - start))

    dt_score = clf.predict_proba(test_X)
    # The columns of predict_proba follow clf.classes_, so translate the winning column
    # back to its class label; the raw column index only equals the label when the
    # labels happen to be exactly 0..k-1.
    dt_predict = clf.classes_[np.argmax(dt_score, axis = 1)]

    evaluate(test_y, dt_predict)

    return clf

# 重要特征寻找

In [4]:
from sklearn.tree import DecisionTreeClassifier
def feature_select(origin_features, n, df_features, target):
    """Greedily pick up to ``n`` features that best separate ``target``.

    In each round a depth-2 decision tree is fitted on the not-yet-selected
    features and the feature with the highest importance is moved to the result.

    Parameters
    ----------
    origin_features : iterable of column names to choose from (duplicates are ignored).
    n : maximum number of features to select.
    df_features : DataFrame containing the candidate columns.
    target : labels passed to the tree's ``fit``.

    Returns
    -------
    list of selected column names, in selection order. It is shorter than ``n``
    when fewer than ``n`` distinct candidates exist.
    """
    # De-duplicate while keeping the caller's order. Building the candidate list from a
    # set gives an arbitrary column order, which changes tie-breaking between runs.
    seen = set()
    candidates = []
    for name in origin_features:
        if name not in seen:
            seen.add(name)
            candidates.append(name)

    found_features = []
    for _ in range(n):
        left_features = [name for name in candidates if name not in found_features]
        if not left_features:
            # Every candidate is already selected; fitting on zero columns would fail.
            break
        clf = DecisionTreeClassifier(random_state=0, max_depth  = 2)
        clf.fit(df_features[left_features], target)
        max_idx = np.argmax(clf.feature_importances_)
        found_features.append(left_features[max_idx])
    
    return found_features

In [5]:
# One-vs-rest boolean targets on the training frame, one per attack class.
target_dos = df_train_used['attack_class'].eq('dos')
target_normal = df_train_used['attack_class'].eq('normal')
target_probe = df_train_used['attack_class'].eq('probe')
target_u2r = df_train_used['attack_class'].eq('u2r')
target_r2l = df_train_used['attack_class'].eq('r2l')

In [6]:
# Select four features per class from the training frame.
dos_features = feature_select(
    origin_features=used_features, n=4, df_features=df_train_used, target=target_dos)
normal_features = feature_select(
    origin_features=used_features, n=4, df_features=df_train_used, target=target_normal)
probe_features = feature_select(
    origin_features=used_features, n=4, df_features=df_train_used, target=target_probe)
u2r_features = feature_select(
    origin_features=used_features, n=4, df_features=df_train_used, target=target_u2r)
r2l_features = feature_select(
    origin_features=used_features, n=4, df_features=df_train_used, target=target_r2l)

In [7]:
# Inspect the features selected for the r2l class.
r2l_features

['dst_bytes', 'duration', 'src_bytes', 'hot']

# Abnormal feature

In [8]:
# Tag each frame with its origin, then stack train on top of test with a fresh 0..n-1 index.
# The first df_train_used.shape[0] rows of X are therefore the training rows.
df_train_used['type'] = 'train'
df_test_used['type'] = 'test'
X = pd.concat([df_train_used, df_test_used], axis=0, ignore_index=True)

## IFO

In [9]:
import time
# Parameter list handed to get_TOS_iforest.
# NOTE(review): presumably the number of trees -- confirm in abnormal_features_tools.
ifo_para = [ 150, ]
start = time.time()
# Run the iForest helper on the probe-selected columns of X (train and test rows together).
feature_names, probe_abnormal_features_ifo = abnormal_features_tools.get_TOS_iforest(X[probe_features], ifo_para)
# Elapsed wall-clock time in seconds.
print(time.time() - start)

525.739902973175


In [10]:
import time
# Parameter list handed to get_TOS_iforest.
# NOTE(review): presumably the number of trees -- confirm in abnormal_features_tools.
ifo_para = [ 15, ]
start = time.time()
# Run the iForest helper on the u2r-selected columns of X (train and test rows together).
feature_names, u2r_abnormal_features_ifo = abnormal_features_tools.get_TOS_iforest(X[u2r_features], ifo_para)
# Elapsed wall-clock time in seconds.
print(time.time() - start)

48.9950156211853


In [11]:
import time
# Parameter list handed to get_TOS_iforest.
# NOTE(review): presumably the number of trees -- confirm in abnormal_features_tools.
ifo_para = [ 64, ]
start = time.time()
# Run the iForest helper on the r2l-selected columns of X (train and test rows together).
feature_names, r2l_abnormal_features_ifo = abnormal_features_tools.get_TOS_iforest(X[r2l_features], ifo_para)
# Elapsed wall-clock time in seconds.
print(time.time() - start)

224.01590585708618


## HBOS

In [12]:
start = time.time()
# Candidate parameter grid; it is overridden by the single value on the next line.
hobs_para = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250]
hobs_para = [50]

# Run the HBOS helper on the probe-selected columns of X, passed as a plain array.
feature_list, probe_abnormal_features_hobs = abnormal_features_tools.get_TOS_hbos(X[probe_features].values, hobs_para)
# Elapsed wall-clock time in seconds.
print(time.time() - start)

27.195741653442383


In [13]:
start = time.time()
hobs_para = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250]
hobs_para = [50]

feature_list, u2r_abnormal_features_ifo = abnormal_features_tools.get_TOS_hbos(X[u2r_features].values, hobs_para)
print(time.time() - start)

43.46952724456787


In [14]:
start = time.time()
hobs_para = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250]
hobs_para = [50]

feature_list, r2l_abnormal_features_ifo = abnormal_features_tools.get_TOS_hbos(X[r2l_features].values, hobs_para)
print(time.time() - start)

26.332566022872925


In [15]:
X['probe_abnormal_features_ifo'] = probe_abnormal_features_ifo
X['u2r_abnormal_features_ifo'] = u2r_abnormal_features_ifo
X['r2l_abnormal_features_ifo'] = r2l_abnormal_features_ifo
X['probe_abnormal_features_hobs'] = probe_abnormal_features_hobs
X['u2r_abnormal_features_hobs'] = u2r_abnormal_features_ifo
X['r2l_abnormal_features_hobs'] = r2l_abnormal_features_ifo
abnormal_features = ['probe_abnormal_features_ifo', 'u2r_abnormal_features_ifo', 'r2l_abnormal_features_ifo', 'probe_abnormal_features_hobs',
                    'u2r_abnormal_features_hobs', 'r2l_abnormal_features_hobs']

In [45]:
# Count the test rows (everything after the training rows in X) whose probe iForest value is positive vs. not.
(X.iloc[df_train_used.shape[0]:]['probe_abnormal_features_ifo'] > 0).value_counts()

False    252816
True      58213
Name: probe_abnormal_features_ifo, dtype: int64

In [48]:
# Treat "r2l HBOS value > 0" as a prediction of class r2l (label 4) on the test rows.
# The label column is read directly from X: the `target_all` alias is only defined in a
# later cell, so using it here fails on a fresh top-to-bottom run.
evaluate(X['label_attack_class'].iloc[df_train_used.shape[0]:] == 4, X.iloc[df_train_used.shape[0]:]['r2l_abnormal_features_hobs'] > 0)

All accuracy: 0.944735
----------confusion_matrix-------------


Unnamed: 0,0,1
0,293829,855
1,16334,11


-------recall/fscore/f1 report---------


Unnamed: 0,0,1
precision,0.947337,0.012702
recall,0.997099,0.000673
f1,0.971581,0.001278
number,294684.0,16345.0


# Modeling

In [16]:
# Class distribution of the training set.
df_train_used['attack_class'].value_counts()

dos       3883370
normal     972781
probe       41102
r2l          1126
u2r            52
Name: attack_class, dtype: int64

In [17]:
# Class distribution of the test set.
df_test_used['attack_class'].value_counts()

dos       229855
normal     60593
r2l        16345
probe       4166
u2r           70
Name: attack_class, dtype: int64

In [18]:
# One-vs-rest boolean targets over the combined frame X (train rows first, then test rows),
# plus the integer multi-class target.
target_dos = X['attack_class'].eq('dos')
target_normal = X['attack_class'].eq('normal')
target_probe = X['attack_class'].eq('probe')
target_r2l = X['attack_class'].eq('r2l')
target_u2r = X['attack_class'].eq('u2r')
target_all = X['label_attack_class']

In [19]:
# Multi-class tree on original + outlier-score features; the first len(df_train_used)
# rows of X are the training rows, the remainder the test rows.
train_X = X[used_features + abnormal_features].iloc[:len(df_train_used)]
train_y = target_all.iloc[:len(df_train_used)]
test_X = X[used_features + abnormal_features].iloc[len(df_train_used):]
test_y = target_all.iloc[len(df_train_used):]
clf_DT_all = train_eval_DT(train_X, train_y, test_X, test_y)

cost time : 30.424705
All accuracy: 0.923830
----------confusion_matrix-------------


Unnamed: 0,0,1,2,3,4
0,60001,438,150,3,1
1,490,3227,449,0,0
2,5618,189,224048,0,0
3,59,0,0,9,2
4,15879,401,6,6,53


-------recall/fscore/f1 report---------


Unnamed: 0,0,1,2,3,4
precision,0.7313,0.758402,0.997307,0.5,0.946429
recall,0.99023,0.774604,0.974736,0.128571,0.003243
f1,0.841293,0.766417,0.985892,0.204545,0.006463
number,60593.0,4166.0,229855.0,70.0,16345.0


In [20]:
# One-vs-rest tree for the dos class (train rows first, test rows after).
train_X = X[used_features + abnormal_features].iloc[:len(df_train_used)]
train_y = target_dos.iloc[:len(df_train_used)]
test_X = X[used_features + abnormal_features].iloc[len(df_train_used):]
test_y = target_dos.iloc[len(df_train_used):]
clf_DT_dos = train_eval_DT(train_X, train_y, test_X, test_y)

cost time : 36.121006
All accuracy: 0.977584
----------confusion_matrix-------------


Unnamed: 0,0,1
0,80941,233
1,6739,223116


-------recall/fscore/f1 report---------


Unnamed: 0,0,1
precision,0.923141,0.998957
recall,0.99713,0.970682
f1,0.95871,0.984616
number,81174.0,229855.0


In [21]:
# One-vs-rest tree for the normal class (train rows first, test rows after).
train_X = X[used_features + abnormal_features].iloc[:len(df_train_used)]
train_y = target_normal.iloc[:len(df_train_used)]
test_X = X[used_features + abnormal_features].iloc[len(df_train_used):]
test_y = target_normal.iloc[len(df_train_used):]
clf_DT_normal = train_eval_DT(train_X, train_y, test_X, test_y)

cost time : 40.022022
All accuracy: 0.926589
----------confusion_matrix-------------


Unnamed: 0,0,1
0,228261,22175
1,658,59935


-------recall/fscore/f1 report---------


Unnamed: 0,0,1
precision,0.997126,0.729935
recall,0.911454,0.989141
f1,0.952367,0.839996
number,250436.0,60593.0


In [22]:
# One-vs-rest tree for the probe class (train rows first, test rows after).
train_X = X[used_features + abnormal_features].iloc[:len(df_train_used)]
train_y = target_probe.iloc[:len(df_train_used)]
test_X = X[used_features + abnormal_features].iloc[len(df_train_used):]
test_y = target_probe.iloc[len(df_train_used):]
clf_DT_probe = train_eval_DT(train_X, train_y, test_X, test_y)

cost time : 38.401389
All accuracy: 0.994711
----------confusion_matrix-------------


Unnamed: 0,0,1
0,306236,627
1,1018,3148


-------recall/fscore/f1 report---------


Unnamed: 0,0,1
precision,0.996687,0.833907
recall,0.997957,0.755641
f1,0.997321,0.792847
number,306863.0,4166.0


In [23]:
# One-vs-rest tree for the r2l class (train rows first, test rows after).
train_X = X[used_features + abnormal_features].iloc[:len(df_train_used)]
train_y = target_r2l.iloc[:len(df_train_used)]
test_X = X[used_features + abnormal_features].iloc[len(df_train_used):]
test_y = target_r2l.iloc[len(df_train_used):]
clf_DT_r2l = train_eval_DT(train_X, train_y, test_X, test_y)

cost time : 18.068490
All accuracy: 0.948757
----------confusion_matrix-------------


Unnamed: 0,0,1
0,294677,7
1,15931,414


-------recall/fscore/f1 report---------


Unnamed: 0,0,1
precision,0.94871,0.983373
recall,0.999976,0.025329
f1,0.973669,0.049386
number,294684.0,16345.0


In [24]:
# One-vs-rest tree for the u2r class (train rows first, test rows after).
train_X = X[used_features + abnormal_features].iloc[:len(df_train_used)]
train_y = target_u2r.iloc[:len(df_train_used)]
test_X = X[used_features + abnormal_features].iloc[len(df_train_used):]
test_y = target_u2r.iloc[len(df_train_used):]
clf_DT_u2r = train_eval_DT(train_X, train_y, test_X, test_y)

cost time : 7.728306
All accuracy: 0.999791
----------confusion_matrix-------------


Unnamed: 0,0,1
0,310949,10
1,55,15


-------recall/fscore/f1 report---------


Unnamed: 0,0,1
precision,0.999823,0.6
recall,0.999968,0.214286
f1,0.999895,0.315789
number,310959.0,70.0


## Merging

In [25]:
test_X = X[used_features + abnormal_features].iloc[df_train_used.shape[0]:]
# -1 marks rows that no one-vs-rest model has claimed yet. The length comes from test_X
# itself rather than from a `test_y` left over from whichever cell ran last.
predict = np.full(test_X.shape[0], -1.0)

# Each one-vs-rest model claims the rows it scores above 0.5 with its class label.
# Later assignments overwrite earlier ones, so on conflicts the priority is
# u2r > r2l > probe > normal > dos.
dos_score = clf_DT_dos.predict_proba(test_X)[:, 1]
predict[dos_score > 0.5] = 2

normal_score = clf_DT_normal.predict_proba(test_X)[:, 1]
predict[normal_score > 0.5] = 0

probe_score = clf_DT_probe.predict_proba(test_X)[:, 1]
predict[probe_score > 0.5] = 1

r2l_score = clf_DT_r2l.predict_proba(test_X)[:, 1]
predict[r2l_score > 0.5] = 4

u2r_score = clf_DT_u2r.predict_proba(test_X)[:, 1]
predict[u2r_score > 0.5] = 3

# Rows claimed by no binary model fall back to the multi-class tree. Skip the call when
# nothing is left: predicting on an empty selection raises in scikit-learn.
unclaimed_rows = np.flatnonzero(predict == -1)
if unclaimed_rows.size:
    predict[unclaimed_rows] = clf_DT_all.predict(test_X.iloc[unclaimed_rows])

## 最终效果

In [26]:
# Report the merged prediction against the multi-class labels of the test rows.
evaluate(target_all.iloc[df_train_used.shape[0]:], predict)

All accuracy: 0.924634
----------confusion_matrix-------------


Unnamed: 0,0,1,2,3,4
0,60037,492,57,3,4
1,631,3353,182,0,0
2,5866,221,223768,0,0
3,52,1,0,16,1
4,15596,328,0,7,414


-------recall/fscore/f1 report---------


Unnamed: 0,0,1,2,3,4
precision,0.730537,0.762912,0.998933,0.615385,0.988067
recall,0.990824,0.804849,0.973518,0.228571,0.025329
f1,0.841002,0.78332,0.986062,0.333333,0.049392
number,60593.0,4166.0,229855.0,70.0,16345.0


In [39]:
# Hand-computed mean of the five per-class f1 values copied from the report above;
# the values are hard-coded and go stale when the notebook is re-run.
(0.841002 + 0.783320 + 0.986062 + 0.333333 + 0.049392) / 5.

0.5986218

In [36]:
# f1_score is already imported in the first cell; this re-import is redundant but harmless.
from sklearn.metrics import f1_score
# Macro-averaged F1 of the merged prediction on the test rows.
f1_score(target_all.iloc[df_train_used.shape[0]:], predict, average='macro')

0.5986216021592836

## baseline 模型对比

In [28]:
# Baseline: multi-class tree on the original features only (no outlier-score columns).
train_X = X[used_features].iloc[:len(df_train_used)]
train_y = target_all.iloc[:len(df_train_used)]
test_X = X[used_features].iloc[len(df_train_used):]
test_y = target_all.iloc[len(df_train_used):]
clf_DT_all_origin_features = train_eval_DT(train_X, train_y, test_X, test_y)

cost time : 27.936640
All accuracy: 0.923496
----------confusion_matrix-------------


Unnamed: 0,0,1,2,3,4
0,60241,247,102,3,0
1,819,3173,167,1,6
2,5864,205,223786,0,0
3,53,0,0,10,7
4,15787,527,3,4,24


-------recall/fscore/f1 report---------


Unnamed: 0,0,1,2,3,4
precision,0.727865,0.76421,0.998786,0.555556,0.648649
recall,0.994191,0.761642,0.973596,0.142857,0.001468
f1,0.840433,0.762924,0.98603,0.227273,0.00293
number,60593.0,4166.0,229855.0,70.0,16345.0


In [38]:
# Hand-computed mean of the five per-class f1 values copied from the baseline report above;
# the values are hard-coded and go stale when the notebook is re-run.
(0.840433 + 0.762924 + 0.986030 + 0.227273 + 0.002930) / 5.

0.5639179999999999

In [30]:
# Baseline predictions; test_X here is the original-features-only frame built in the baseline training cell.
predict_origin_feature = clf_DT_all_origin_features.predict(test_X)

In [37]:
# f1_score is already imported in the first cell; this re-import is redundant but harmless.
from sklearn.metrics import f1_score
# Macro-averaged F1 of the baseline prediction on the test rows.
f1_score(target_all.iloc[df_train_used.shape[0]:], predict_origin_feature, average='macro')

0.5639180477291231