# Generative Adversarial Network

In this notebook, we'll be building a generative adversarial network (GAN) trained on the network flow dataset.

## Import Libraries

In [1]:
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from importlib import reload
import os
import time
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

print("TensorFlow version: {}".format(tf.VERSION))
print("Eager execution: {}".format(tf.executing_eagerly()))

  from ._conv import register_converters as _register_converters


TensorFlow version: 1.8.0
Eager execution: True


In [2]:
import utils, models
reload(utils)
reload(models)
from models import Generator, Discriminator
from utils import max_norm, parse_feature_label, train_one_epoch

## Define Constants

In [3]:
# 输出的summary的保存目录:不要改
OUTPUT_DIR = 'SUMMARY/'

# 输出的模型的保存目录:不要改
CHECKPOINT_DIR = 'CHECKPOINT/'

# 相关feature:不太需要改应该
RELEVANT_FEATURES = [' Source Port', ' Destination Port', ' Flow Duration', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', 'Bwd Packet Length Max', ' Bwd Packet Length Min', 'Flow Bytes/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Fwd Header Length', ' Bwd Packets/s', ' Packet Length Mean', ' ACK Flag Count', ' Down/Up Ratio', ' Avg Fwd Segment Size', ' Fwd Header Length.1', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Bwd Avg Bytes/Bulk', 'Bwd Avg Bulk Rate', 'Subflow Fwd Packets', ' Subflow Fwd Bytes', 'Init_Win_bytes_forward', ' act_data_pkt_fwd', ' Active Std', ' Active Min', ' Idle Max']

# Label的名字:不太需要改应该
LABEL_NAME = ' Label'

# 记录summary的频率:不太需要改应该
LOG_INTERVAL = 100

# 数据集路径
IDS_DATASET = os.path.join('data', 'ids2017_sampled.csv')

# benign flow的label
BENIGN_LABEL = 0

# attack flow的label
ATTACK_LABEL = 2

# training dataset占总dataset的比例
TRAIN_FRAC = 0.3

# 要修改的feature的数量
FEATURE_NUM_MODIFIED = 2

# learning rate
LEARNING_RATE = 0.01

# 训练的epochs数
EPOCHS = 20

## Load and normalize data

In [4]:
# quick view of dataset
!head -n5 {IDS_DATASET}

Flow ID, Source IP, Source Port, Destination IP, Destination Port, Protocol, Timestamp, Flow Duration, Total Fwd Packets, Total Backward Packets,Total Length of Fwd Packets, Total Length of Bwd Packets, Fwd Packet Length Max, Fwd Packet Length Min, Fwd Packet Length Mean, Fwd Packet Length Std,Bwd Packet Length Max, Bwd Packet Length Min, Bwd Packet Length Mean, Bwd Packet Length Std,Flow Bytes/s, Flow Packets/s, Flow IAT Mean, Flow IAT Std, Flow IAT Max, Flow IAT Min,Fwd IAT Total, Fwd IAT Mean, Fwd IAT Std, Fwd IAT Max, Fwd IAT Min,Bwd IAT Total, Bwd IAT Mean, Bwd IAT Std, Bwd IAT Max, Bwd IAT Min,Fwd PSH Flags, Bwd PSH Flags, Fwd URG Flags, Bwd URG Flags, Fwd Header Length, Bwd Header Length,Fwd Packets/s, Bwd Packets/s, Min Packet Length, Max Packet Length, Packet Length Mean, Packet Length Std, Packet Length Variance,FIN Flag Count, SYN Flag Count, RST Flag Count, PSH Flag Count, ACK Flag Count, URG Flag Count, CWE Flag Count, ECE Flag Count, Down/Up Ratio, Average Packet Size, Av

In [5]:
# read csv
df = pd.read_csv(IDS_DATASET)

In [6]:
# extract relevant features and label name
df = df[RELEVANT_FEATURES + [LABEL_NAME]]

In [7]:
# extract bengin and attack flows we want
benign_df, attack_df = df[(df[LABEL_NAME] == BENIGN_LABEL)], df[(df[LABEL_NAME] == ATTACK_LABEL)]

In [8]:
# rewrite label values
benign_df.loc[:, LABEL_NAME] = 0
attack_df.loc[:, LABEL_NAME] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
# convert to numpy
benign, attack = benign_df.values, attack_df.values

In [10]:
# max normalization
benign[:, :len(RELEVANT_FEATURES)] = max_norm(benign[:, :len(RELEVANT_FEATURES)])
attack[:, :len(RELEVANT_FEATURES)] = max_norm(attack[:, :len(RELEVANT_FEATURES)])

In [11]:
# do train, test split separately on benign and attack
benign_train, benign_test = train_test_split(benign, train_size=TRAIN_FRAC)
attack_train, attack_test = train_test_split(attack, train_size=TRAIN_FRAC)



In [12]:
# concat to get testing data
test_np = np.concatenate([benign_test, attack_test])

## Generate Dataset

In [13]:
# convert to benign train dataset and attack train dataset
benign_train_dataset, attack_train_dataset = tf.data.Dataset.from_tensor_slices(benign_train), tf.data.Dataset.from_tensor_slices(attack_train)

In [14]:
# deal with benign train dataset
benign_train_dataset = benign_train_dataset.map(parse_feature_label)
benign_train_dataset = benign_train_dataset.shuffle(buffer_size=benign_train.shape[0] * 5)  # randomize
benign_train_dataset = benign_train_dataset.batch(100) # make batch

# View a single example entry from a batch
benign_features, benign_label = iter(benign_train_dataset).next()
print("benign example features:", benign_features[0])
print("benign example label:", benign_label[0])

benign example features: tf.Tensor(
[7.74075883e-01 1.24760226e-03 4.48925052e-02 4.56819093e-03
 2.55811052e-04 1.58476027e-01 0.00000000e+00 1.19549734e-02
 3.65357181e-03 1.48063524e-02 4.12737115e-02 4.68018713e-08
 4.30117500e-03 5.46759534e-04 7.08206345e-04 1.54237288e-06
 4.41663417e-02 3.20820884e-03 2.05297597e-02 1.69491525e-08
 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.10567687e-04
 2.78477168e-06 4.55249622e-01 0.00000000e+00 1.42857143e-01
 3.32997940e-02 5.10567687e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 4.29000429e-04 4.56819093e-03
 1.25015259e-01 4.34971727e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00], shape=(41,), dtype=float64)
benign example label: tf.Tensor(0, shape=(), dtype=int32)


In [15]:
# deal with attack train dataset
attack_train_dataset = attack_train_dataset.map(parse_feature_label)
attack_train_dataset = attack_train_dataset.shuffle(buffer_size=attack_train.shape[0] * 5)  # randomize
attack_train_dataset = attack_train_dataset.batch(100) # make batch

# View a single example entry from a batch
attack_features, attack_label = iter(attack_train_dataset).next()
print("attack example features:", attack_features[0])
print("attack example label:", attack_label[0])

attack example features: tf.Tensor(
[3.37726757e-01 0.00000000e+00 9.73550970e-01 1.54320988e-01
 1.00000000e+00 3.74644243e-01 0.00000000e+00 9.82431870e-06
 2.27229637e-01 4.70989761e-01 8.44915253e-01 1.10229277e-04
 9.74789916e-01 4.91094141e-01 5.90236686e-01 3.84172109e-05
 9.74789916e-01 1.89215686e-01 8.12714777e-01 4.60784314e-07
 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.94230769e-01
 6.04510190e-08 5.32086707e-01 0.00000000e+00 5.00000000e-01
 1.75438596e-01 3.94230769e-01 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 4.16666667e-01 1.54320988e-01
 0.00000000e+00 3.33333333e-01 0.00000000e+00 3.44123615e-01
 8.44915254e-01], shape=(41,), dtype=float64)
attack example label: tf.Tensor(1, shape=(), dtype=int32)


## Build Model

In [16]:
generator = Generator(input_shape=len(RELEVANT_FEATURES), output_shape=FEATURE_NUM_MODIFIED)
discriminator = Discriminator()

## Build Optimizer

In [17]:
generator_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
discriminator_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)

## Build Tensorflow Training Environment

In [18]:
step_counter = tf.train.get_or_create_global_step()
summary_writer = tf.contrib.summary.create_file_writer(
      OUTPUT_DIR, flush_millis=1000)
checkpoint_prefix = os.path.join(CHECKPOINT_DIR, 'ckpt')
latest_cpkt = tf.train.latest_checkpoint(CHECKPOINT_DIR)
if latest_cpkt:
    print('Using latest checkpoint at ' + latest_cpkt)
model_objects = {
    'generator': generator,
    'discriminator': discriminator,
    'generator_optimizer': generator_optimizer,
    'discriminator_optimizer': discriminator_optimizer,
    'step_counter': step_counter
}
checkpoint = tfe.Checkpoint(**model_objects)
# Restore variables on creation if a checkpoint exists.
checkpoint.restore(latest_cpkt)

<tensorflow.contrib.eager.python.checkpointable_utils.InitializationOnlyStatus at 0x2af817cb6278>

## Train

In [19]:
with tf.device('/gpu:0'):
    train_start = time.time()
    for _ in range(EPOCHS):
        start = time.time()
        with summary_writer.as_default():
            train_one_epoch(benign_dataset=benign_train_dataset,
                        attack_dataset=attack_train_dataset,
                        log_interval=LOG_INTERVAL,
                        modified_feature_num=FEATURE_NUM_MODIFIED,
                        **model_objects)
        end = time.time()
        checkpoint.save(checkpoint_prefix)
        print('\nTrain time for epoch #%d (step %d): %f' %
            (checkpoint.save_counter.numpy(),
             checkpoint.step_counter.numpy(),
             end - start))
    print('\nTotal training time for {epoch} epoch(s) is {second}'.format(
        second=time.time() - train_start,
        epoch=EPOCHS
    ))


Train time for epoch #1 (step 15): 1.396186

Train time for epoch #2 (step 30): 1.259199

Train time for epoch #3 (step 45): 1.293359

Train time for epoch #4 (step 60): 1.380395

Train time for epoch #5 (step 75): 1.286117

Train time for epoch #6 (step 90): 1.241150

Train time for epoch #7 (step 105): 1.300222

Train time for epoch #8 (step 120): 1.239007

Train time for epoch #9 (step 135): 1.161514

Train time for epoch #10 (step 150): 1.287961

Train time for epoch #11 (step 165): 1.327828

Train time for epoch #12 (step 180): 1.202986

Train time for epoch #13 (step 195): 1.290864

Train time for epoch #14 (step 210): 1.241826

Train time for epoch #15 (step 225): 1.132176

Train time for epoch #16 (step 240): 1.208208

Train time for epoch #17 (step 255): 1.298326

Train time for epoch #18 (step 270): 1.252905

Train time for epoch #19 (step 285): 1.265866

Train time for epoch #20 (step 300): 1.246461

Total training time for 20 epoch(s) is 25.76100778579712
