In [66]:
from google.colab import files
import io
import numpy as np
import pandas as pd
%matplotlib inline
from sklearn import preprocessing
import random

In [67]:
#Splitting of Train data and Test data
def split_train_test(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

In [92]:
#Determine Accuracy for the models implemented 
def calculate_accuracy(predictions, labels):
    predictions_correct = predictions == labels
    accuracy = predictions_correct.mean()
    print (predictions)
    return accuracy

Decision Tree Algorithm

In [93]:
# Find purity of the data
def purity_check(data):
    
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

    
# Classification of the data
def data_classification(data):
    
    label_column = data[:, -1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)

    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    
    return classification


#Find potential splits for the decision tree
def get_potential_splits(data):
    
    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):  # excluding the last column which is the label
        values = data[:, column_index]
        unique_values = np.unique(values)
        
        potential_splits[column_index] = unique_values
    
    return potential_splits


# Determine the Lowest Overall Entropy?
def entropy_calculation(data):
    
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
     
    return entropy


def overall_entropy_calculation(data_below, data_above):
    
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n

    overall_entropy =  (p_data_below * entropy_calculation(data_below) 
                      + p_data_above * entropy_calculation(data_above))
    
    return overall_entropy


def determine_best_split(data, potential_splits):
    
    overall_entropy = 9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = overall_entropy_calculation(data_below, data_above)
            
            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value


# Spliting data into continuous and categorical features
def split_data(data, split_column, split_value):
    
    split_column_values = data[:, split_column]

    type_of_feature = FEATURE_TYPES[split_column]
    if type_of_feature == "continuous":
        data_below = data[split_column_values <= split_value]
        data_above = data[split_column_values >  split_value]
    
    # feature is categorical   
    else:
        data_below = data[split_column_values == split_value]
        data_above = data[split_column_values != split_value]
    
    return data_below, data_above


# Decision Tree Algorithm
def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5, random_subspace=None):
    
    # data preparations
    if counter == 0:
        global COLUMN_HEADERS, FEATURE_TYPES
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df           
    
    
    # base cases
    if (purity_check(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = data_classification(data)
        
        return classification

    
    # recursive part
    else:    
        counter += 1

        # helper functions 
        potential_splits = get_potential_splits(data)
        split_column, split_value = determine_best_split(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)
        
        # check for empty data
        if len(data_below) == 0 or len(data_above) == 0:
            classification = data_classification(data)
            return classification
        
        # determine question
        feature_name = COLUMN_HEADERS[split_column]
        type_of_feature = FEATURE_TYPES[split_column]
        if type_of_feature == "continuous":
            question = "{} <= {}".format(feature_name, split_value)
            
        # feature is categorical
        else:
            question = "{} = {}".format(feature_name, split_value)
        
        # instantiate sub-tree
        sub_tree = {question: []}
        
        # find answers (recursion)
        yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth, random_subspace)
        no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth, random_subspace)
        
        # If the answers are the same, then there is no point in asking the qestion.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base case).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        
        return sub_tree


# Make predictions
# One example prediction 
def predict_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    
    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return predict_example(example, residual_tree)

    
# All examples of the test data
def decision_tree_predictions(test_df, tree):
    predictions = test_df.apply(predict_example, args=(tree,), axis=1)
    return predictions

# Distinguish categorical and continuous features
def determine_type_of_feature(df):
    
    feature_types = []
    n_unique_values_treshold = 15
    for feature in df.columns:
        if feature != "label":
            unique_values = df[feature].unique()
            example_value = unique_values[0]

            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")
    
    return feature_types    

Random Forest Algorithm

In [94]:
#Bootstrapping the the data for randomization across rows
def bootstrapping(train_df, n_bootstrap):
    bootstrap_indices = np.random.randint(low=0, high=len(train_df), size=n_bootstrap)
    df_bootstrapped = train_df.iloc[bootstrap_indices]
    
    return df_bootstrapped

#Random subspace method to randomize the data across columns
def random_forest_algorithm(train_df, n_trees, n_bootstrap, n_features, dt_max_depth):
    forest = []
    for i in range(n_trees):
        df_bootstrapped = bootstrapping(train_df, n_bootstrap)
        tree = decision_tree_algorithm(df_bootstrapped, max_depth=dt_max_depth, random_subspace=n_features)
        forest.append(tree)
    
    return forest


def random_forest_predictions(test_df, forest):
    df_predictions = {}
    for i in range(len(forest)):
        column_name = "tree_{}".format(i)
        predictions = decision_tree_predictions(test_df, tree=forest[i])
        df_predictions[column_name] = predictions

    df_predictions = pd.DataFrame(df_predictions)
    random_forest_predictions = df_predictions.mode(axis=1)[0]
    
    return random_forest_predictions

Import Data

In [95]:
uploaded_train = files.upload()
df_train = pd.read_csv(io.BytesIO(uploaded_train['Train_data.csv']))

Saving Train_data.csv to Train_data (3).csv


In [97]:
df_train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.00,0.00,0.00,150,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.00,255,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.00,255,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.00,0.00,0.00,30,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.00,0.00,0.09,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,0,tcp,exec,RSTO,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,7,0.0,0.0,1.0,1.0,0.07,0.07,0.00,255,7,0.03,0.06,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25188,0,tcp,ftp_data,SF,334,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.00,0.00,0.00,1,39,1.00,0.00,1.00,0.18,0.00,0.00,0.00,0.00,anomaly
25189,0,tcp,private,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105,7,0.0,0.0,1.0,1.0,0.07,0.07,0.00,255,13,0.05,0.07,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25190,0,tcp,nnsp,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,129,18,1.0,1.0,0.0,0.0,0.14,0.06,0.00,255,20,0.08,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


In [98]:
features_int = df_train.columns[df_train.dtypes == 'int64']
df_train.loc[: , features_int]

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,dst_host_count,dst_host_srv_count
0,0,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,150,25
1,0,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,255,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,255,26
3,0,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,30,255
4,0,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,255,255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,7,255,7
25188,0,334,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,39
25189,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105,7,255,13
25190,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,129,18,255,20


In [99]:
df_train = df_train.select_dtypes(include=[object])
df_train.shape
df_train.columns

Index(['protocol_type', 'service', 'flag', 'class'], dtype='object')

In [100]:
le = preprocessing.LabelEncoder()
df_train_fit = df_train.apply(le.fit_transform)
df_train_fit.head()

Unnamed: 0,protocol_type,service,flag,class
0,1,19,9,1
1,2,41,9,1
2,1,46,5,0
3,1,22,9,1
4,1,22,9,1


In [101]:
df_train_fit = df_train_fit.rename(columns={"class": "label"})
df_train_fit

Unnamed: 0,protocol_type,service,flag,label
0,1,19,9,1
1,2,41,9,1
2,1,46,5,0
3,1,22,9,1
4,1,22,9,1
...,...,...,...,...
25187,1,16,2,0
25188,1,19,9,0
25189,1,46,1,0
25190,1,38,5,0


In [134]:
train_df, test_df = split_train_test(df_train_fit, test_size=0.2)

In [138]:
df_train_fit.size

100768

In [115]:
forest = random_forest_algorithm(train_df, n_trees=4, n_bootstrap=800, n_features=2, dt_max_depth=4)

print(forest)

[{'flag = 9': [{'protocol_type = 0': [{'service <= 14': [0, 1]}, {'service <= 41': [1, {'service <= 46': [0, 1]}]}]}, {'service <= 22': [{'service <= 21': [{'service <= 0': [1, 0]}, {'flag = 1': [1, 0]}]}, 0]}]}, {'flag = 9': [{'protocol_type = 0': [{'service <= 14': [0, 1]}, {'service <= 40': [1, {'service <= 46': [0, 1]}]}]}, {'flag = 5': [0, {'service <= 22': [{'service <= 21': [0, 1]}, 0]}]}]}, {'flag = 9': [{'protocol_type = 0': [{'service <= 14': [0, 1]}, {'service <= 41': [1, {'service <= 46': [0, 1]}]}]}, {'flag = 5': [0, {'service <= 22': [{'service <= 21': [0, 1]}, 0]}]}]}, {'flag = 9': [{'protocol_type = 0': [{'service <= 14': [0, 1]}, {'service <= 44': [1, {'service <= 46': [0, 1]}]}]}, {'flag = 5': [0, {'service <= 22': [{'service <= 21': [0, 1]}, 0]}]}]}]


In [104]:
predictions = random_forest_predictions(test_df, forest)
print(predictions)

13500    1.0
13702    0.0
24633    1.0
12416    0.0
3090     1.0
        ... 
13781    1.0
15034    0.0
19823    1.0
16873    1.0
2117     1.0
Name: 0, Length: 5038, dtype: float64


In [105]:
type(predictions)

pandas.core.series.Series

In [112]:
predictions_predicted=predictions.to_numpy()
predictions_predicted
predictions_predicted.size

5038

In [106]:
accuracy = calculate_accuracy(predictions, test_df.label)
print (accuracy)

13500    1.0
13702    0.0
24633    1.0
12416    0.0
3090     1.0
        ... 
13781    1.0
15034    0.0
19823    1.0
16873    1.0
2117     1.0
Name: 0, Length: 5038, dtype: float64
0.9589122667725288


Confusion Matrix for Random Forest

In [117]:
from sklearn import metrics

#create confusion matrix
c_matrix = metrics.confusion_matrix(test_df.label, predictions_predicted)

#print confusion matrix
print(c_matrix)

[[2215  112]
 [  95 2616]]


In [119]:
#print accuracy of model
print(metrics.accuracy_score(test_df.label, predictions_predicted))


#print precision value of model
print(metrics.precision_score(test_df.label, predictions_predicted))

#print recall value of model
print(metrics.recall_score(test_df.label, predictions_predicted))

0.9589122667725288
0.9589442815249267
0.9649575802286979


In [142]:
from sklearn.metrics import f1_score
#calculate F1 score
print(f1_score(test_df.label, predictions_predicted))


0.5381750465549348
