In [2]:
import os
import pickle as pkl

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.manifold import TSNE
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Directory that holds the X/Y train/test CSV files. The original absolute
# path is kept as the default; set the DDOS_DATASET_ROOT environment variable
# to point the notebook at another location without editing this cell.
DATASET_ROOT_PATH = os.environ.get(
    'DDOS_DATASET_ROOT', '/home/ubuntu/projects/ddos/cicddos_data/core')

# Feature (X) and label (Y) files for the train and test splits.
X_TRAIN_PATH = os.path.join(DATASET_ROOT_PATH, 'X_train.csv')
Y_TRAIN_PATH = os.path.join(DATASET_ROOT_PATH, 'Y_train.csv')
X_TEST_PATH = os.path.join(DATASET_ROOT_PATH, 'X_test.csv')
Y_TEST_PATH = os.path.join(DATASET_ROOT_PATH, 'Y_test.csv')

In [4]:
# Training split: feature table first, then its label table.
X_train, Y_train = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (X_TRAIN_PATH, Y_TRAIN_PATH)
)

# Test split, read in the same order (features, then labels).
X_test, Y_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (X_TEST_PATH, Y_TEST_PATH)
)

In [5]:
# Name of the target column.
label_cols = ['Label']

# Columns fed to the numeric branch of the preprocessing. The order here is
# the order of the numeric features in the transformed matrix.
numeric_cols = [
    'URG Flag Count',
    'Fwd PSH Flags',
    'RST Flag Count',
    'CWE Flag Count',
    'Packet Length Std',
    'Init_Win_bytes_forward',
    'Fwd Packet Length Std',
    'Active Mean',
    'Idle Mean',
    'Avg Fwd Segment Size',
    'Fwd Packet Length Mean',
    'Fwd Packets/s',
    'Packet Length Mean',
    'Average Packet Size',
    'Fwd IAT Min',
    'Flow Duration',
    'Fwd IAT Total',
    'Active Std',
    'SYN Flag Count',
    'Fwd IAT Std',
    'Flow IAT Std',
    'Subflow Fwd Bytes',
    'Total Length of Fwd Packets',
]

# Columns fed to the one-hot-encoded branch of the preprocessing.
categoric_cols = ['Protocol', 'Inbound']

## Custom Transformers

In [6]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a fixed set of columns from its input.

    Parameters
    ----------
    col_names
        Column labels used to index the input in ``transform`` (this
        notebook passes lists of column names).
    """

    def __init__(self, col_names):
        self.col_names = col_names

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X):
        """Return ``X[self.col_names].values`` for the given input ``X``."""
        selected_columns = X[self.col_names]
        return selected_columns.values

# Pipeline

#### individual pipelines

In [7]:
# Numeric branch: select the numeric columns, then apply StandardScaler.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(numeric_cols)),
    ('std_scaler', StandardScaler()),
])

In [8]:
def _dense_one_hot_encoder():
    """Build a OneHotEncoder that returns a dense array.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    later releases removed ``sparse``, so the new keyword is tried first and
    the old one is used only when the installed version rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn: only the legacy keyword exists.
        return OneHotEncoder(sparse=False)


# Categorical branch: select the categorical columns, then one-hot encode
# them into a dense array.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(categoric_cols)),
    ('one_hot_encoder', _dense_one_hot_encoder())
])

#### Feature Union

In [9]:
# Combine both branches; FeatureUnion concatenates their outputs in the
# order listed (numeric features first, one-hot features after).
full_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

# Run pipeline

In [10]:
# Fit the preprocessing union on the training features and keep the
# transformed training matrix in X (used below to fit and score the model).
X = full_pipeline.fit_transform(X_train)

# Train

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Fix the seed so the fitted forest and every score derived from it are
# reproducible across kernel restarts; other hyperparameters keep their
# library defaults.
rf_clf = RandomForestClassifier(random_state=42)

In [22]:
# Fit on the preprocessed training matrix; the label frame is flattened to a
# 1-D array with one entry per row.
rf_clf.fit(X, Y_train.values.reshape(len(Y_train)))

RandomForestClassifier()

# Validation

#### Calculating error (10-fold cross-validated RMSE on the test split)

In [14]:
# Transform the test features with the already-fitted preprocessing instead
# of calling fit_transform: re-fitting here would overwrite the fitted state
# of full_pipeline with test-split statistics and encode the test features
# differently from the matrix rf_clf was trained on.
X_test_prepared = full_pipeline.transform(X_test)
y_test = Y_test.values.reshape(Y_test.shape[0], )

# cross_val_score clones rf_clf and re-fits the clone on each fold, so these
# are 10-fold scores computed within the test split. The values are negated
# MSE, which requires numeric labels.
scores = cross_val_score(rf_clf, X_test_prepared, y_test,
                         scoring='neg_mean_squared_error', cv=10)

In [15]:
# The scorer returns negated MSE, so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(np.negative(scores))

In [16]:
# Show the per-fold RMSE values as the cell output.
tree_rmse_scores

array([0.02132007, 0.02132007, 0.02132007, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.03015113, 0.        ])

#### Calculating accuracy (10-fold cross-validation)

In [18]:
# Transform the test features with the already-fitted preprocessing rather
# than re-fitting it on the test split (fit_transform would overwrite the
# fitted state of full_pipeline).
X_test_prepared = full_pipeline.transform(X_test)
# Size the label vector from Y_test itself, not from X_test.
y_test = Y_test.values.reshape(Y_test.shape[0], )

# 10-fold cross-validated accuracy within the test split; cross_val_score
# re-fits a clone of rf_clf on each fold.
accuracies = cross_val_score(rf_clf, X_test_prepared, y_test, cv=10, scoring='accuracy')
print("Model Accuracy:")
for i in accuracies:
    print(i)

Model Accuracy:
0.9995454545454545
0.9995454545454545
0.9995454545454545
1.0
1.0
1.0
1.0
0.9995454545454545
0.9986363636363637
1.0


In [19]:
# 10-fold cross-validated accuracy on the preprocessed training matrix.
accuracies = cross_val_score(
    rf_clf,
    X,
    Y_train.values.reshape(len(Y_train)),
    cv=10,
    scoring='accuracy',
)
print("Model Accuracy:")
# One line per fold.
for fold_accuracy in accuracies:
    print(fold_accuracy)

Model Accuracy:
0.9999863636363636
0.9999636363636364
0.9999772727272728
0.9999681818181818
0.999990909090909
0.9999681818181818
0.9999818181818182
0.9999363636363636
0.99995
0.9999


# Pickling the model

In [23]:
# Directory for serialized models. The original absolute path is kept as the
# default; set the DDOS_MODEL_PICKLE_ROOT environment variable to write
# somewhere else without editing this cell.
MODEL_PICKLE_ROOT = os.environ.get(
    'DDOS_MODEL_PICKLE_ROOT', '/home/ubuntu/projects/ddos/pkls/models')

# Target file for the fitted random forest.
PICKLE_FILE_PATH = os.path.join(MODEL_PICKLE_ROOT, 'rf_28.pkl')

In [24]:
# Create the target directory first so a fresh checkout/machine does not fail
# with FileNotFoundError when opening the file for writing.
pickle_dir = os.path.dirname(PICKLE_FILE_PATH)
if pickle_dir:
    os.makedirs(pickle_dir, exist_ok=True)

# NOTE: only the classifier is serialized here; the fitted preprocessing
# (full_pipeline) is not saved by this cell.
with open(PICKLE_FILE_PATH, 'wb') as f:
    pkl.dump(rf_clf, f)