In [None]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random

In [None]:
# Locations of the NSL-KDD train / test splits, relative to the notebook.
train_nsl_kdd_dataset_path, test_nsl_kdd_dataset_path = "KDDTrain+.txt", "KDDTest+.txt"

In [None]:
# Column names applied to the header-less files, in file order.
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "label", "difficulty_level",
]


def _read_nsl_kdd(path):
    """Read one header-less NSL-KDD file, naming its columns from ``col_names``."""
    return pd.read_csv(path, header=None, names=col_names)


df = _read_nsl_kdd(train_nsl_kdd_dataset_path)
df_test = _read_nsl_kdd(test_nsl_kdd_dataset_path)

print('Dimensions of the Training set:', df.shape)
print('Dimensions of the Test set:', df_test.shape)

In [None]:
# Show every column when a wide frame is displayed (no truncation with "...").
pd.options.display.max_columns = None

In [None]:
# 'difficulty_level' is removed from both splits before any further processing.
df, df_test = (frame.drop(columns='difficulty_level') for frame in (df, df_test))

In [None]:
# Any label that does not start with 'normal' is rewritten to 'anomaly',
# leaving a two-class target in the training frame.
non_normal_label = '^(?!normal).*$'
train_label_rewrite = {'label': {non_normal_label: 'anomaly'}}
df = df.replace(to_replace=train_label_rewrite, regex=True)

In [None]:
# Same two-class rewrite for the test frame: every label that does not start
# with 'normal' becomes 'anomaly'.
non_normal_label = '^(?!normal).*$'
test_label_rewrite = {'label': {non_normal_label: 'anomaly'}}
df_test = df_test.replace(to_replace=test_label_rewrite, regex=True)

In [None]:
# How many training rows fall under each protocol.
df.protocol_type.value_counts()

In [None]:
# Tabulate the test split's num_outbound_cmds values against the label.
pd.crosstab(index=df_test.num_outbound_cmds, columns=df_test.label)

In [None]:
# Remove 'num_outbound_cmds' from both splits.
df = df.drop('num_outbound_cmds', axis=1)
df_test = df_test.drop('num_outbound_cmds', axis=1)

In [None]:
# Peek at the three string-valued columns that will be encoded below.
df.loc[:, ['protocol_type', 'service', 'flag']].head()

In [None]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# The string-valued features; they are pulled out of each split so they can be
# encoded separately from the numeric columns.
categorical_columns = ['protocol_type', 'service', 'flag']
df_categorical_values = df.loc[:, categorical_columns]
testdf_categorical_values = df_test.loc[:, categorical_columns]


In [None]:
def _with_prefix(prefix, values):
    """Return ``values`` as a list with ``prefix`` prepended to each entry."""
    return [prefix + value for value in values]


# Prefixes used to build the one-hot column names.
string1, string2, string3 = 'Protocol_type_', 'service_', 'flag_'

# protocol type
unique_protocol = sorted(df['protocol_type'].unique())
unique_protocol2 = _with_prefix(string1, unique_protocol)
print(unique_protocol2)

# service
unique_service = sorted(df['service'].unique())
unique_service2 = _with_prefix(string2, unique_service)
print(unique_service2)

# flag
unique_flag = sorted(df['flag'].unique())
unique_flag2 = _with_prefix(string3, unique_flag)
print(unique_flag2)

# Training dummy-column names: protocol, then service, then flag.
dumcols = unique_protocol2 + unique_service2 + unique_flag2

# Test dummy-column names: only the service part is recomputed from the test
# split; the protocol and flag names are reused from the training split.
unique_service_test = sorted(df_test['service'].unique())
unique_service2_test = _with_prefix(string2, unique_service_test)
testdumcols = unique_protocol2 + unique_service2_test + unique_flag2

In [None]:
# Turn every categorical column into integer codes. apply() hands the columns
# to fit_transform one at a time, so the encoder is refit for each column.
train_label_encoder = LabelEncoder()
df_categorical_values_enc = df_categorical_values.apply(train_label_encoder.fit_transform)

print(df_categorical_values.head())
print('--------------------')
print(df_categorical_values_enc.head())

# The test split gets its own encoder, fit on the test values alone, so its
# integer codes are not tied to the training codes.
test_label_encoder = LabelEncoder()
testdf_categorical_values_enc = testdf_categorical_values.apply(test_label_encoder.fit_transform)

In [None]:

# One-hot encode the integer-coded categoricals of each split and wrap the
# result in a DataFrame whose columns carry the names built earlier
# (dumcols for train, testdumcols for test).
enc = OneHotEncoder(categories='auto')
df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
# .toarray() converts the encoder output to a dense array for the DataFrame.
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(),columns=dumcols)


# test set
# NOTE: fit_transform refits the same `enc` on the test codes, so the test
# columns reflect the categories present in the test split only.
testdf_categorical_values_encenc = enc.fit_transform(testdf_categorical_values_enc)
testdf_cat_data = pd.DataFrame(testdf_categorical_values_encenc.toarray(),columns=testdumcols)

df_cat_data.head()

In [None]:
# Services present in the training split but absent from the test split,
# expressed as the dummy-column names they would have.
trainservice = df.service.tolist()
testservice = df_test.service.tolist()
string = 'service_'
train_only_services = set(trainservice).difference(set(testservice))
difference = [string + name for name in train_only_services]
difference

In [None]:
# Give the test dummies an all-zero column for each service in `difference`;
# the new columns are appended after the existing ones.
testdf_cat_data = testdf_cat_data.assign(**{col: 0 for col in difference})

print(df_cat_data.shape)
print(testdf_cat_data.shape)

In [None]:
# Shape of the training frame (categorical columns still in their raw form).
print(df.shape)

In [None]:
# Shape of the test frame (categorical columns still in their raw form).
print(df_test.shape)

In [None]:
# The reverse check: services that occur in the test split only, as
# dummy-column names.
string = 'service_'
test_only_services = set(testservice).difference(set(trainservice))
difference = [string + name for name in test_only_services]
difference

In [None]:
# Give the training dummies an all-zero column for each service in
# `difference`; the new columns are appended after the existing ones.
df_cat_data = df_cat_data.assign(**{col: 0 for col in difference})

print(df_cat_data.shape)
print(testdf_cat_data.shape)

In [None]:
# Attach the one-hot columns to each split (index-aligned join) and discard
# the raw categorical columns they replace.
encoded_away = ['flag', 'protocol_type', 'service']

newdf = df.join(df_cat_data).drop(columns=encoded_away)

# test data
newdf_test = df_test.join(testdf_cat_data).drop(columns=encoded_away)

print(newdf.shape)
print(newdf_test.shape)

In [None]:
# Separate the features from the target column. `columns=` is spelled out
# because pandas 2.0 no longer accepts `axis` as a positional argument to drop().
x_train = newdf.drop(columns='label')
y_train = newdf.label

x_test = newdf_test.drop(columns='label')
y_test = newdf_test.label

# The service dummies that one split lacked were appended at the END of that
# split's frame, so the two frames hold the same columns in different orders.
# Everything downstream works on bare arrays, where position is the only link
# between a train feature and a test feature, so the order is aligned here.
assert set(x_test.columns) == set(x_train.columns), \
    "train and test feature columns differ"
x_test = x_test[x_train.columns]

In [None]:
# Keep the feature names around; the frames are about to become plain arrays.
colNames = x_train.columns.tolist()
colNames_test = x_test.columns.tolist()

In [None]:
from sklearn import preprocessing

# Learn the standardisation statistics from the training features only.
scaler1 = preprocessing.StandardScaler().fit(x_train)

# Put the test columns in training order while both are still DataFrames;
# after transform() they are bare arrays and columns are matched by position.
x_test = x_test[list(x_train.columns)]

x_train = scaler1.transform(x_train)

# Scale the test set with the TRAINING statistics. Fitting a second scaler on
# the test set would standardise it differently from the data the model was
# trained on, and would use test-set information during preprocessing.
x_test = scaler1.transform(x_test)

# Kept so that any later code referring to `scaler2` still works.
scaler2 = scaler1

In [None]:
# Encode the two classes as integers. The explicit astype(int) guarantees an
# integer dtype instead of relying on replace() to downcast the object column,
# which newer pandas versions deprecate; an object-dtype target is rejected by
# the scikit-learn metric functions.
label_codes = {'normal': 0, 'anomaly': 1}
y_train = y_train.replace(label_codes).astype(int)
y_test = y_test.replace(label_codes).astype(int)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Fixed seed so the baseline forest (and every metric derived from it) is the
# same on each Restart & Run All.
RANDOM_SEED = 42

clf = RandomForestClassifier(n_estimators=10, n_jobs=2, random_state=RANDOM_SEED)
# Alternative baselines:
# clf = SVC(kernel='linear', C=1.0, random_state=0)
# clf = KNeighborsClassifier()
clf.fit(x_train, y_train.astype(int))

In [None]:
# Baseline predictions of the fitted classifier on the test features.
y_pred = clf.predict(x_test)

In [None]:
# Confusion table: true labels down the rows, predicted labels across.
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual attacks'],
    colnames=['Predicted attacks'],
)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# Hold-out scores for the baseline predictions. (An earlier 10-fold
# cross_val_score variant of these four metrics was left disabled here and is
# not part of the run.)
accuracy_score = metrics.accuracy_score(y_test, y_pred)
recall_score = metrics.recall_score(y_test, y_pred)
precision_score = metrics.precision_score(y_test, y_pred)
f_score = metrics.f1_score(y_test, y_pred)

print("Accuracy score: %0.5f" % (accuracy_score))
print("Recall score: %0.5f" % (recall_score))
print("Precision score: %0.5f" % (precision_score))
print("F1 score: %0.5f" % (f_score))

In [None]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling, margin_sampling

from IPython import display
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# The active-learning code indexes the targets positionally, so hand it plain
# NumPy arrays rather than pandas Series.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Seed both the initial sample and the forest so the learning curves are
# reproducible from a fresh kernel.
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

# Number of labelled rows the learner starts from.
n_initial = 200

# Draw the starting rows without replacement; the rest form the query pool.
initial_idx = rng.choice(len(x_train), size=n_initial, replace=False)

x_initial, y_initial = x_train[initial_idx], y_train[initial_idx]
x_pool, y_pool = np.delete(x_train, initial_idx, axis=0), np.delete(y_train, initial_idx, axis=0)

learner = ActiveLearner(
    estimator=RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=RANDOM_SEED),
    query_strategy=margin_sampling,
    X_training=x_initial, y_training=y_initial
)

# Number of query/teach rounds the active-learning loop will run.
n_queries = 500

# Score the initial model on the test set; each list holds one value per
# model state (index 0 = before any query).
pred = learner.predict(x_test)

accuracy_scores = [learner.score(x_test, y_test)]
f1_scores = [metrics.f1_score(y_test, pred)]
precision_scores = [metrics.precision_score(y_test, pred)]
recall_scores = [metrics.recall_score(y_test, pred)]

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever spelling is installed.
PLOT_STYLE = ('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available
              else 'seaborn-white')


def _plot_history(ax, scores, title, ylabel, label):
    """Draw one metric's history on ``ax`` as a line with point markers.

    The x axis is the number of queries made so far (0 = initial model).
    """
    steps = range(len(scores))
    ax.plot(steps, scores, label=label)
    ax.scatter(steps, scores)
    ax.set_title(title)
    ax.set_xlabel('number of queries')
    ax.set_ylabel(ylabel)


def _show_progress(accuracy, f1, precision, recall):
    """Display a 2x2 dashboard of the four metric histories, then close it."""
    panels = (
        (accuracy, 'Accuracy of your model', 'accuracy'),
        (f1, 'F1 of your model', 'f1'),
        (precision, 'Precision of your model', 'precision'),
        (recall, 'Recall of your model', 'recall'),
    )
    with plt.style.context(PLOT_STYLE):
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        for ax, (scores, title, name) in zip(axes.ravel(), panels):
            _plot_history(ax, scores, title, ylabel=name, label=name)
        display.display(fig)
        # Close figures each round so hundreds of them do not pile up in memory.
        plt.close('all')


for i in range(n_queries):
    display.clear_output(wait=True)
    query_idx, query_inst = learner.query(x_pool)

    _show_progress(accuracy_scores, f1_scores, precision_scores, recall_scores)

    pred = learner.predict(query_inst)
    actual = y_pool[query_idx]
    print('Prediction: {}'.format(pred))
    print('Actual: {}'.format(actual))

    # If you want to use Human-in-the-loop version, uncomment it and comment out automated version block.

    #Human-in-the-Loop version:
#     print("Is it correct (+) or false (-)?")
#     print("Is it normal (0) or anomaly (1)?")
#     if input() == '+':
#         y_new = np.array(pred, dtype=int)
#     else:
#         y_new = np.array(actual, dtype=int)

    # Automated version for experiments with large number of queries:
    # the pool's true label plays the role of the oracle.
    y_new = np.array(actual, dtype=int)
    learner.teach(query_inst, y_new)
    # The queried row is now labelled; take it out of the pool.
    x_pool, y_pool = np.delete(x_pool, query_idx, axis=0), np.delete(y_pool, query_idx, axis=0)

    # Predict the test set once and derive all four metrics from it
    # (accuracy_score on these predictions is what learner.score reports).
    pred = learner.predict(x_test)
    accuracy_scores.append(metrics.accuracy_score(y_test, pred))
    f1_scores.append(metrics.f1_score(y_test, pred))
    precision_scores.append(metrics.precision_score(y_test, pred))
    recall_scores.append(metrics.recall_score(y_test, pred))

# Final learning curves, one figure per metric.
final_curves = (
    (accuracy_scores, 'Accuracy', 'accuracy', 'accuracy'),
    (precision_scores, 'Precision', 'precision', 'precision'),
    (f1_scores, 'F1', 'F1', 'f1'),
    (recall_scores, 'Recall', 'recall', 'recall'),
)
for scores, metric_title, ylabel, label in final_curves:
    with plt.style.context(PLOT_STYLE):
        fig, ax = plt.subplots(figsize=(10, 5))
        _plot_history(
            ax, scores,
            '{} of the classifier during the active learning'.format(metric_title),
            ylabel, label,
        )
        plt.show()