In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import math
import zipfile
import json
import collections


def __preprocess_dataframe(df, features, metadata=None):
    """
    Normalise the column names of ``df`` and keep only the requested features.

    Column names are stripped and their inner spaces replaced by underscores,
    so ``features`` must already use that normalised spelling. The input
    dataframe itself is not modified; a new dataframe is returned.

    :param df: raw dataframe; it must contain a ``Label`` column once the
        column names have been normalised (a ``KeyError`` is raised otherwise)
    :param features: list of normalised feature column names to keep
    :param metadata: optional metadata dict. When passed, the ``Label`` column
        is kept and its values are translated through
        ``metadata['Label']['01-12->03-11']``; labels absent from that mapping
        are left untouched. When ``None``, the ``Label`` column is dropped.
    :return: dataframe with the feature columns, followed by ``Label`` when
        ``metadata`` is passed
    """
    # Trim columns name, replace whitespaces from columns name
    df = df.rename(columns=lambda x: x.strip().replace(" ", "_"))
    # Keep only features and label
    df = df[list(features) + ["Label"]]
    if metadata is None:
        # Remove label column from features
        return df.drop(columns="Label")
    # Labels between the training and the test dataset may be different, map them using metadata
    label_map = metadata['Label']['01-12->03-11']
    # Translate every value in a single pass: each original label is looked up exactly
    # once, so a mapping such as a->b, b->c cannot cascade a into c. Building a new
    # column with assign() also avoids the chained in-place replace, which is not
    # guaranteed to write back to the frame (and never does under copy-on-write).
    return df.assign(Label=df['Label'].map(lambda label: label_map.get(label, label)))


def __get_features(archive, metadata):
    """
    Build the TensorFlow numeric feature columns for the best features listed in the metadata.

    The feature names come from ``metadata['Label']['Features']`` (one list per
    dataset label). They are normalised like the dataframe columns (stripped,
    spaces replaced by underscores) and de-duplicated in first-seen order, so
    the resulting column order is the same on every run.

    :param archive: opened ``zipfile.ZipFile`` containing at least one ``.csv`` member
    :param metadata: metadata dict loaded from ``metadata.json``
    :return: list of ``tf.feature_column.numeric_column``, one per selected feature
    :raises ValueError: if the archive holds no ``.csv`` member
    :raises KeyError: if a listed feature (or ``Label``) is not a column of the csv
    """
    # Get a flat list of all best features, normalised the same way as the csv columns
    features = []
    for dataset_label in metadata['Label']['Features']:
        features += [fc.strip().replace(" ", "_") for fc in metadata['Label']['Features'][dataset_label]]
    # dict.fromkeys removes duplicates while keeping a deterministic order (a set does not)
    features = list(dict.fromkeys(features))
    # Use the first csv file in the archive to extract column features
    file = next((member for member in archive.namelist() if member.endswith(".csv")), None)
    if file is None:
        raise ValueError("No .csv file found in the archive")
    # Only the header row is needed to validate the columns: nrows=0 avoids
    # parsing a whole (potentially multi-gigabyte) csv just to read its column names
    with archive.open(file) as csv_file:
        header = pd.read_csv(csv_file, dtype={85: str}, nrows=0)
    df = __preprocess_dataframe(header, features)
    return [tf.feature_column.numeric_column(key=key) for key in df.keys()]


def _archive_member_selected(member, selected):
    """Return True when the archive member ``member`` is one of the ``selected`` csv names."""
    if isinstance(selected, str):
        # A single name passed as a plain string would otherwise be iterated char by char
        selected = [selected]
    # Match on whole path components: "UDP.csv" must not also select "DrDoS_UDP.csv"
    return any(member == name or member.endswith("/" + name) for name in selected)


def load_data(data_path=".", train_csv=None, test_csv=None, chunk_size=10**10):
    """
    Load the training and/or test datasets from the zip archives found in ``data_path``.

    ``data_path`` must contain ``metadata.json`` plus ``CSV-01-12.zip`` (training
    archive, read when ``train_csv`` is given) and/or ``CSV-03-11.zip`` (test
    archive, read when ``test_csv`` is given).

    :param data_path: directory holding the metadata file and the archives
    :param train_csv: csv file names to load from the training archive, or ``None`` to skip training data
    :param test_csv: csv file names to load from the test archive, or ``None`` to skip test data
    :param chunk_size: maximum number of rows per returned chunk
    :return: ``LoadedData`` namedtuple with

        * ``feature_columns``: TensorFlow feature columns (``None`` if nothing was loaded)
        * ``labels``: list of the class names
        * ``train_dfs``: list of ``{"labels": Series, "features": DataFrame}`` chunks
          taken from the shuffled concatenation of the training files, or ``None``
        * ``test_dfs``: list of ``{"file": name, "dataframe": [chunks]}``, one entry per
          test file, each chunk shaped like a training chunk, or ``None``
    :raises ValueError: if ``train_csv`` selects no file of the training archive
    """
    labels = ["BENIGN", "Syn", "UDPLag", "UDP", "LDAP", "MSSQL", "NetBIOS", "WebDDoS"]
    LoadedData = collections.namedtuple("LoadedData", "feature_columns labels train_dfs test_dfs")

    # Load metadata
    with open(data_path + "/metadata.json") as metadata_file:
        metadata = json.load(metadata_file)

    train_dfs = None
    feature_columns = None
    if train_csv is not None:
        train_sets = []
        # The context managers close the archive and its members once they have been read
        with zipfile.ZipFile(data_path + "/CSV-01-12.zip", 'r') as train_archive:
            # Feature columns describe how to use the input
            feature_columns = __get_features(train_archive, metadata)
            feature_names = [fc.key.replace(" ", "_") for fc in feature_columns]
            for file in train_archive.namelist():
                if _archive_member_selected(file, train_csv):
                    # Load csv to dataframe
                    with train_archive.open(file) as csv_file:
                        train_sets.append(__preprocess_dataframe(
                            df=pd.read_csv(csv_file, dtype={85: str}),
                            features=feature_names,
                            metadata=metadata
                        ))
        if not train_sets:
            raise ValueError("None of the requested training csv files was found in the archive")
        # Merge the dataframes into a single one and shuffle it, random_state assures reproducibility
        train_sets = pd.concat(train_sets).sample(frac=1, random_state=1)
        n_rows = train_sets.shape[0]
        print("Number of rows: ", n_rows)
        # Split the dataframe in consecutive chunks of at most chunk_size rows.
        # Plain positional slicing replaces np.split, whose DataFrame support is deprecated.
        # max(n_rows, 1) keeps a single (empty) chunk when there are no rows at all.
        train_dfs = []
        for start in range(0, max(n_rows, 1), chunk_size):
            train_chunk = train_sets.iloc[start:start + chunk_size]
            train_dfs.append({
                "labels": train_chunk.pop("Label"),
                "features": train_chunk
            })
        del train_sets

    test_dfs = None
    if test_csv is not None:
        with zipfile.ZipFile(data_path + "/CSV-03-11.zip", 'r') as test_archive:
            # Feature columns describe how to use the input
            if feature_columns is None:
                feature_columns = __get_features(test_archive, metadata)
            feature_names = [fc.key.replace(" ", "_") for fc in feature_columns]
            test_dfs = []
            for file in test_archive.namelist():
                if _archive_member_selected(file, test_csv):
                    file_test_dfs = []
                    with test_archive.open(file) as csv_file:
                        # Test files are read chunk by chunk and kept separated per file
                        for chunk in pd.read_csv(csv_file, dtype={85: str}, chunksize=chunk_size):
                            df = __preprocess_dataframe(
                                chunk,
                                features=feature_names,
                                metadata=metadata
                            )
                            file_test_dfs.append({
                                "labels": df.pop("Label"),
                                "features": df
                            })
                    test_dfs.append({
                        "file": file,
                        "dataframe": file_test_dfs
                    })

    return LoadedData(feature_columns, labels, train_dfs, test_dfs)

In [2]:
# Preprocess the dataset: load the selected training / test csv files,
# split in chunks of 9**6 rows
loaded_data = load_data(
    data_path="/content/drive/My Drive/Colab Notebooks/Research_project/data",
    train_csv=[
        'UDPLag.csv',
        'Syn.csv',
        'DrDoS_UDP.csv',
        'DrDoS_NetBIOS.csv',
        'DrDoS_MSSQL.csv',
        'DrDoS_LDAP.csv',
    ],
    test_csv=['Syn.csv', 'UDPLag.csv', 'UDP.csv', 'LDAP.csv', 'MSSQL.csv', 'NetBIOS.csv'],
    chunk_size=9**6,
)
# Unpack the namedtuple fields into the names used by the following cells
feature_columns, labels, train_dfs, test_dfs = loaded_data

Number of rows:  15891114


In [3]:
# Instantiate the model

def _decayed_adam():
    """
    Build the Adam optimizer with an exponentially decaying learning rate.

    Passed to the estimator as a callable (not an instance), so the global
    step is looked up when the estimator invokes it.
    """
    decayed_learning_rate = tf.compat.v1.train.exponential_decay(
        learning_rate=0.1,
        global_step=tf.compat.v1.train.get_global_step(),
        decay_steps=10000,
        decay_rate=0.96,
    )
    return tf.keras.optimizers.Adam(learning_rate=decayed_learning_rate)


# Three hidden layers with batch normalisation; one output class per entry of `labels`
classifier = tf.estimator.DNNClassifier(
    hidden_units=[60, 30, 20],
    feature_columns=feature_columns,
    n_classes=len(labels),
    label_vocabulary=labels,
    batch_norm=True,
    optimizer=_decayed_adam,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpi693743e', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [4]:
def input_fn(df, training, batch_size=32):
    """
    Build the batched ``tf.data.Dataset`` used to train or evaluate the estimator.

    :param df: dict with a ``"features"`` dataframe and the matching ``"labels"``
    :param training: when true, the dataset is shuffled (buffer of 1000) and repeated
    :param batch_size: number of examples per batch
    :return: the batched dataset
    """
    # One tensor per feature column, keyed by column name
    feature_tensors = dict(df["features"])
    targets = df["labels"]
    dataset = tf.data.Dataset.from_tensor_slices((feature_tensors, targets))
    if not training:
        # Evaluation: a single ordered pass over the data
        return dataset.batch(batch_size)
    # Training: shuffle and repeat indefinitely
    return dataset.shuffle(1000).repeat().batch(batch_size)

In [5]:
# Train the model: 10**4 steps on each training chunk, one chunk after the other
for train_df in train_dfs:
    def train_input_fn():
        """Training input function bound to the current chunk."""
        return input_fn(train_df, training=True)

    classifier.train(input_fn=train_input_fn, steps=10**4)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:tensorflow:loss = 0.28482038, step = 71100 (0.715 sec)
INFO:tensorflow:global_step/sec: 136.828
INFO:tensorflow:loss = 0.14650893, step = 71200 (0.730 sec)
INFO:tensorflow:global_step/sec: 137.126
INFO:tensorflow:loss = 0.3123533, step = 71300 (0.731 sec)
INFO:tensorflow:global_step/sec: 138.88
INFO:tensorflow:loss = 0.7919556, step = 71400 (0.720 sec)
INFO:tensorflow:global_step/sec: 136.298
INFO:tensorflow:loss = 0.36732227, step = 71500 (0.733 sec)
INFO:tensorflow:global_step/sec: 134.918
INFO:tensorflow:loss = 0.82930887, step = 71600 (0.739 sec)
INFO:tensorflow:global_step/sec: 136.579
INFO:tensorflow:loss = 0.46855998, step = 71700 (0.736 sec)
INFO:tensorflow:global_step/sec: 136.911
INFO:tensorflow:loss = 0.59718525, step = 71800 (0.728 sec)
INFO:tensorflow:global_step/sec: 132.449
INFO:tensorflow:loss = 0.24741675, step = 71900 (0.753 sec)
INFO:tensorflow:global_step/sec: 139.438
INFO:tensorflow:loss = 0.5080

In [6]:
# Test the model: evaluate every chunk of every test file and
# collect the measures per file
metrics = []
for file_test_df in test_dfs:
    file_measures = [
        classifier.evaluate(input_fn=lambda: input_fn(test_df, training=False))
        for test_df in file_test_df["dataframe"]
    ]
    # Appended file by file, so partial results stay available if the run is interrupted
    metrics.append({"file": file_test_df["file"], "measures": file_measures})

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-09T08:16:47Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpi693743e/model.ckpt-300000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 89.62413s
INFO:tensorflow:Finished evaluation at 2020-07-09-08:18:17
INFO:tensorflow:Saving dict for global step 300000: accuracy = 0.7929027, average_loss = 228.7457, global_step = 300000, loss = 228.73926
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 300000: /tmp/tmpi693743e/model.ckpt-300000
INFO:tensorflow:Calli

In [7]:
# Display the evaluation measures collected for each test file
metrics

[{'file': '03-11/UDPLag.csv',
  'measures': [{'accuracy': 0.7929027,
    'average_loss': 228.7457,
    'global_step': 300000,
    'loss': 228.73926},
   {'accuracy': 0.996717,
    'average_loss': 3.8815887,
    'global_step': 300000,
    'loss': 3.881512}]},
 {'file': '03-11/UDP.csv',
  'measures': [{'accuracy': 0.69588536,
    'average_loss': 20.373722,
    'global_step': 300000,
    'loss': 20.373165},
   {'accuracy': 0.70726573,
    'average_loss': 2.758233,
    'global_step': 300000,
    'loss': 2.758173},
   {'accuracy': 0.71398145,
    'average_loss': 1.3683,
    'global_step': 300000,
    'loss': 1.3682741},
   {'accuracy': 0.7011089,
    'average_loss': 1.4906925,
    'global_step': 300000,
    'loss': 1.4906626},
   {'accuracy': 0.7002057,
    'average_loss': 2.5152645,
    'global_step': 300000,
    'loss': 2.5152142},
   {'accuracy': 0.70382226,
    'average_loss': 3.1241224,
    'global_step': 300000,
    'loss': 3.12404},
   {'accuracy': 0.68840003,
    'average_loss': 2.3

In [8]:
# Display the feature columns the classifier was built with
feature_columns

[NumericColumn(key='Fwd_IAT_Mean', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Destination_Port', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Average_Packet_Size', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Flow_Duration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Fwd_Packets/s', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Fwd_Header_Length.1', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Fwd_Packet_Length_Std', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Fwd_IAT_Total', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Fwd_Header_Length', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericCol

In [10]:
EXPORT = True
if EXPORT:
    # Serving input receiver that parses serialized tf.Example protos
    # according to the feature columns
    parse_example_spec = tf.feature_column.make_parse_example_spec(feature_columns)
    serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(parse_example_spec)
    # Save the model
    estimator_path = classifier.export_saved_model(
        "/content/drive/My Drive/Colab Notebooks/Research_project/",
        serving_input_fn,
    )

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Signatures INCLUDED in export for Classify: ['serving_default', 'classification']
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from /tmp/tmpi693743e/model.ckpt-300000
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /content/drive/My Drive/Colab Notebooks/Research_project/temp-1594286133/saved_model.pb


In [11]:
# Display the path returned by export_saved_model
estimator_path

b'/content/drive/My Drive/Colab Notebooks/Research_project/1594286133'

# References
* https://www.tensorflow.org/tutorials/load_data/pandas_dataframe
* https://www.tensorflow.org/tutorials/structured_data/feature_columns
* https://www.tensorflow.org/tutorials/estimator/premade
* https://www.tensorflow.org/datasets/performances
* https://www.tensorflow.org/guide/data#batching_dataset_elements