In [None]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor
from functools import lru_cache
from Common_functions import *


In [34]:
def get_result_file(idx):
    return {1 : 'Results/FirstDataset_firstmlwr_lof.pkl',
            2 : 'Results/SecondDataset_firstmlwr_lof.pkl'
           }[idx]

In [35]:
# normal dataset: 1 --- Correspoding anomalous part is 1
#                 2 --- Corresponding anomalous parts are 2 and 3
normal_dataset = 1

# anomalous dataset: 1 --- Emotet malware;  2 --- DarkVNC; 3 --- Simba
anomaly_dataset = 1

# Set to True to run training.
should_train = False


global_features = [
    'clientDestinationPortTotalBytesUDPEstablished',
'clientDestinationPortNumberOfFlowsTCPEstablished',
'clientDestinationPortNumberOfFlowsUDPNotEstablished',
'clientDestinationPortTotalPacketsTCPEstablished',
'clientDestinationPortNumberOfFlowsUDPEstablished',
'clientDestinationPortTotalPacketsTCPNotEstablished',
'clientDestinationPortTotalBytesUDPNotEstablished',
'clientDestinationPortTotalBytesTCPEstablished',
'clientDestinationPortTotalPacketsUDPNotEstablished',
'clientDestinationPortNumberOfFlowsTCPNotEstablished',
'clientDestinationPortTotalBytesTCPNotEstablished',
'clientDestinationPortTotalPacketsUDPEstablished']

feature_abbrv = {k:''.join([c for c in k if c.isupper()]) for k in global_features }
scalers = {}

result_file = get_result_file(normal_dataset)
experiment_results = pd.read_pickle(result_file)


### Parameter search using grid search

In [36]:
if should_train:
    test_datasets = {}
    experiment_results = pd.DataFrame(columns=['parameters','evaluation']+global_features)
    experiment_results.set_index(['parameters','evaluation'], inplace=True)
    should_scale = True

    for i, feature_name in enumerate(global_features):

        print('feature: ',i,' name:', feature_name)

        normal_features = generate_normal_features(feature_name, dataset=normal_dataset)    

        anomaly_features = generate_anomaly_features(feature_name, dataset=anomaly_dataset)

        labels_anomaly = np.array([-1]*anomaly_features.shape[0])
        anomaly_count = labels_anomaly.shape[0]

        X_train, X_val, labels_train, labels_val = train_test_split(normal_features, [1]*len(normal_features), test_size=0.2, random_state=42)
        X_train, X_test, labels_train, labels_test = train_test_split(X_train, labels_train, test_size=0.25, random_state=42)
        test_datasets[feature_name] = X_test

        if should_scale:
                scaler = StandardScaler(with_std=True, with_mean=True).fit(X_train)
                X_train = scaler.transform(X_train)
                X_val = scaler.transform(X_val)

        np.random.seed(42)
        idx= np.random.choice(range(anomaly_count),int(anomaly_count/2), replace=False)
        anomaly_validation = anomaly_features[idx,:]

        if should_scale:
                anomaly_validation = scaler.transform(anomaly_validation)
        idx = [x for x in range(anomaly_count) if x in set(idx)]
        anomaly_test = anomaly_features[idx,:]

        labels_val = np.append(np.array(labels_test), np.array([-1]*anomaly_validation.shape[0]))
        labels_test = np.append(np.array(labels_test), np.array([-1]*anomaly_test.shape[0]))


        benign_validation_range = range(0, labels_val.shape[0]-anomaly_validation.shape[0])
        anomaly_vallidation_range = range(labels_val.shape[0]-anomaly_validation.shape[0], labels_val.shape[0])

        for k in range(1,11):
            for cntmnt in np.linspace(0.01, 0.1, 50):
                model = LocalOutlierFactor(n_neighbors=k, contamination=cntmnt, n_jobs=4)
                predicted = []
                kernel_string = 'k='+str(k)+' contam='+str(cntmnt)
                for x in X_val:
                    label = model.fit_predict(np.append(X_train,x.reshape(1,-1),axis=0))[-1]
                    predicted.append(label)

                for an in anomaly_validation:
                    label = model.fit_predict(np.append(X_train,an.reshape(1,-1),axis=0))[-1]
                    predicted.append(label)

                predicted=np.array(predicted)
                true_positive, false_positive, true_negative, false_negative = \
                                                    get_evaluation_matrix(labels=labels_val, predicted=predicted, 
                                                              benign_range=benign_validation_range, anomaly_range=anomaly_vallidation_range)


                precision, recall, accuracy = compute_precision_recall_accuracy(true_positive=true_positive,
                                                                                true_negative=true_negative,
                                                                                false_positive=false_positive,
                                                                                false_negative=false_negative)
                experiment_results.loc[(kernel_string,'tp'), feature_name]=true_positive
                experiment_results.loc[(kernel_string,'fp'), feature_name]=false_positive
                experiment_results.loc[(kernel_string,'tn'), feature_name]=true_negative
                experiment_results.loc[(kernel_string,'fn'), feature_name]=false_negative
                experiment_results.loc[(kernel_string,'precision'), feature_name]=precision
                experiment_results.loc[(kernel_string,'recall'), feature_name]=recall
                experiment_results.loc[(kernel_string,'accuracy'), feature_name]=accuracy
                experiment_results.loc[(kernel_string,'FPR'), feature_name]=false_positive/(false_positive+true_negative)
                experiment_results.loc[(kernel_string,'TPR'), feature_name]=true_positive/(true_positive+false_negative)  

### Select parameters of models based on the results 

In [28]:

model_params = {k : {} for k in global_features}
min_tpr = 0.167 # 0.167*6 = 1.002 it means we will detect an attack during first 30 minutes
for feature in global_features:
    tmp = experiment_results.unstack(1)[feature]
    tpr_max_val_fpr_less_001 = tmp[tmp['FPR'] < 0.01]['TPR'].max()
    tpr_max_val_fpr_min = tmp[tmp['FPR'] == tmp['FPR'].min()]['TPR'].max()
    print('==================================')
    print(feature)
    if tpr_max_val_fpr_min> min_tpr:
        print(tmp[tmp['FPR'] == tmp['FPR'].min()][['FPR','TPR', 'precision', 'recall']])
        params = tmp[tmp['FPR'] == tmp['FPR'].min()].TPR.astype(float).idxmax().split(' ')
    elif tpr_max_val_fpr_less_001 > tpr_max_val_fpr_min:
        print(tmp[tmp['FPR'] < 0.01][['FPR','TPR','precision', 'recall']])
        params = tmp[tmp['FPR'] < 0.01].TPR.astype(float).idxmax().split(' ')
    else:
        print(tmp[tmp['FPR'] == tmp['FPR'].min()][['FPR','TPR','precision', 'recall']])
        params = tmp[tmp['FPR'] == tmp['FPR'].min()].TPR.astype(float).idxmax().split(' ')
    for p in params:
        p = p.split('=')
#         print(p)
        p_name = p[0]
        p_value = float(p[1])
        if p_name == 'contam':
            p_name = 'contamination'
        else:
            p_value = int(p_value)
        model_params[feature][p_name] = p_value
for feature in global_features:
    print(model_params[feature])

clientDestinationPortTotalBytesUDPEstablished
evaluation                 FPR       TPR precision    recall
parameters                                                  
k=1 contam=0.01              0  0.388889         1  0.388889
k=2 contam=0.01              0  0.388889         1  0.388889
k=2 contam=0.0118367346939   0  0.388889         1  0.388889
k=2 contam=0.0136734693878   0  0.388889         1  0.388889
k=2 contam=0.0155102040816   0  0.388889         1  0.388889
k=2 contam=0.0173469387755   0  0.388889         1  0.388889
k=3 contam=0.01              0  0.388889         1  0.388889
k=3 contam=0.0118367346939   0  0.388889         1  0.388889
k=3 contam=0.0136734693878   0  0.388889         1  0.388889
k=3 contam=0.0155102040816   0  0.388889         1  0.388889
k=3 contam=0.0173469387755   0  0.388889         1  0.388889
k=4 contam=0.01              0  0.388889         1  0.388889
k=4 contam=0.0118367346939   0  0.388889         1  0.388889
k=4 contam=0.0136734693878   0  0.38888

In [31]:
should_scale = True
predictions = []
for feature_name in global_features:
    predicted = []
    normal_features = generate_normal_features(feature_name, dataset=normal_dataset)    

    anomaly_features = generate_anomaly_features(feature_name, dataset=anomaly_dataset)

    labels_anomaly = np.array([-1]*anomaly_features.shape[0])
    anomaly_count = labels_anomaly.shape[0]

    X_train, X_val, labels_train, labels_val = train_test_split(normal_features, [1]*len(normal_features), test_size=0.2, random_state=42)
    X_train, X_test, labels_train, labels_test = train_test_split(X_train, labels_train, test_size=0.25, random_state=42)
    
    if should_scale:
            scaler = StandardScaler(with_std=True, with_mean=True).fit(X_train)
            X_train = scaler.transform(X_train)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)
            
    np.random.seed(42)
    idx= np.random.choice(range(anomaly_count),int(anomaly_count/2), replace=False)
    anomaly_validation = anomaly_features[idx,:]
    
    if should_scale:
            anomaly_validation = scaler.transform(anomaly_validation)
    idx = [x for x in range(anomaly_count) if x in set(idx)]
    anomaly_test = anomaly_features[idx,:]
    if should_scale:
            anomaly_test = scaler.transform(anomaly_test)
    
    labels_val = np.append(np.array(labels_test), np.array([-1]*anomaly_validation.shape[0]))
    labels_test = np.append(np.array(labels_test), np.array([-1]*anomaly_test.shape[0]))
    
    
    benign_test_range = range(0, labels_test.shape[0]-anomaly_test.shape[0])
    anomaly_test_range = range(labels_test.shape[0]-anomaly_test.shape[0], labels_test.shape[0])

    
    model = LocalOutlierFactor(n_neighbors=model_params[feature_name]['k'], contamination=model_params[feature_name]['contamination'], n_jobs=4)
    
    for x in X_test:
        label = model.fit_predict(np.append(X_train,x.reshape(1,-1),axis=0))[-1]
        predicted.append(label)

    for an in anomaly_test:
        label = model.fit_predict(np.append(X_train,an.reshape(1,-1),axis=0))[-1]
        predicted.append(label)

    predicted=np.array(predicted)
    predictions.append(predicted)
    
    
    true_positive, false_positive, true_negative, false_negative = \
                                                get_evaluation_matrix(labels=labels_val, predicted=predicted, 
                                                          benign_range=benign_test_range, anomaly_range=anomaly_test_range)
    
    precision, recall, accuracy = compute_precision_recall_accuracy(true_positive=true_positive,
                                                                        true_negative=true_negative,
                                                                        false_positive=false_positive,
                                                                        false_negative=false_negative)
    
    fpr = false_positive/(false_positive+true_negative)
    tpr = true_positive/(true_positive+false_negative) 
    print('======================')
    print(feature_name)
    print('FPR={}\nTPR={}\nPrecision={}'.format(fpr, tpr, precision))

clientDestinationPortTotalBytesUDPEstablished
FPR=0.01098901098901099
TPR=0.3888888888888889
Precision=0.875
clientDestinationPortNumberOfFlowsTCPEstablished
FPR=0.01098901098901099
TPR=0.05555555555555555
Precision=0.5
clientDestinationPortNumberOfFlowsUDPNotEstablished
FPR=0.02197802197802198
TPR=0.3333333333333333
Precision=0.75
clientDestinationPortTotalPacketsTCPEstablished
FPR=0.0
TPR=0.05555555555555555
Precision=1.0


  precision = true_positive/(true_positive + false_positive)


clientDestinationPortNumberOfFlowsUDPEstablished
FPR=0.0
TPR=0.0
Precision=nan
clientDestinationPortTotalPacketsTCPNotEstablished
FPR=0.02197802197802198
TPR=1.0
Precision=0.9
clientDestinationPortTotalBytesUDPNotEstablished
FPR=0.01098901098901099
TPR=0.3333333333333333
Precision=0.8571428571428571
clientDestinationPortTotalBytesTCPEstablished
FPR=0.02197802197802198
TPR=0.7777777777777778
Precision=0.875
clientDestinationPortTotalPacketsUDPNotEstablished
FPR=0.02197802197802198
TPR=0.2777777777777778
Precision=0.7142857142857143




clientDestinationPortNumberOfFlowsTCPNotEstablished
FPR=0.02197802197802198
TPR=1.0
Precision=0.9
clientDestinationPortTotalBytesTCPNotEstablished
FPR=0.02197802197802198
TPR=1.0
Precision=0.9
clientDestinationPortTotalPacketsUDPEstablished
FPR=0.0
TPR=0.2222222222222222
Precision=1.0


In [32]:
majority_voting = sum(predictions)
majority_voting[majority_voting>=0] = 1
majority_voting[majority_voting<0] = -1


true_positive, false_positive, true_negative, false_negative = \
                                            get_evaluation_matrix(labels=labels_test, predicted=majority_voting, 
                                                      benign_range=benign_test_range, anomaly_range=anomaly_test_range)

precision, recall, accuracy = compute_precision_recall_accuracy(true_positive=true_positive,
                                                                        true_negative=true_negative,
                                                                        false_positive=false_positive,
                                                                        false_negative=false_negative)
fpr = false_positive/(false_positive+true_negative)
tpr = true_positive/(true_positive+false_negative)
print('FPR={}\nTPR={}\nPrecision={}'.format(fpr, tpr, precision))

FPR=0.0
TPR=0.3333333333333333
Precision=1.0
