This file contains code for sampling cleaned CIC-IDS-2017 dataset to be used for experiments.

In [12]:
import sys
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.utils import shuffle, resample
from sklearn.utils.random import sample_without_replacement

script_dir = os.path.dirname(os.path.abspath("experiments"))
sys.path.append(os.path.dirname(script_dir))

from experiments.predictions import labels

import warnings
warnings.filterwarnings("ignore")

'.'

In [2]:
#clean = pd.read_csv("./clean_2018.csv")
clean = pd.read_csv("./clean1.csv")

`preproc_data` function implements data preprocessing steps from `preprocessing.ipynb` file - but as a single function. 

In [3]:
def preproc_data(dataset, train_sample: float, pca_dim=31):
    
    # Label encode
    le = LabelEncoder()
    dataset['label'] = le.fit_transform(dataset['label'])
    
    # Train test split
    x_train, x_test, y_train, y_test = train_test_split(dataset.iloc[:,:-1], 
                                                        dataset['label'], 
                                                        test_size=1-train_sample, 
                                                        random_state=0)
    # Standard scaling
    ss = StandardScaler().fit(x_train)

    x_train = ss.transform(x_train)
    x_test = ss.transform(x_test)
    
    # PCA
    pca = PCA(n_components=31).fit(x_train)

    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)
    
    # Normalization
    norm = Normalizer().fit(x_train)

    x_train = norm.transform(x_train)
    x_test = norm.transform(x_test)
    
    # Reshaping 
    y_train = y_train.values.reshape(-1,1)
    y_test = y_test.values.reshape(-1,1)
    
    return x_train, x_test, y_train, y_test

In [4]:
low_member_thresh = 2000

low_member_labels = list(clean['label'].value_counts()[clean['label'].value_counts() < low_member_thresh].index)
#low_member_labels = list(clean['Label'].value_counts()[clean['Label'].value_counts() < low_member_thresh].index)
low_member_labels

['Bot',
 'Web_Attack_Brute_Force',
 'Web_Attack_XSS',
 'Infiltration',
 'Web_Attack_Sql_Injection',
 'Heartbleed']

### 10%

This dataset sample contains only 10% of each class members except those which contain <2000 intances. They are taken in their entirety.

In [5]:
low_member_data = clean[clean.label.str.contains("|".join(low_member_labels))]
low_member_data

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
535861,8080,60202640,9,9,322,256,322,0,35.777778,107.333333,...,32,63678.2,22252.53596,103175,50911,10200000.0,34941.27201,10200000,10100000,Bot
536465,8080,57891,1,1,0,0,0,0,0.000000,0.000000,...,32,0.0,0.00000,0,0,0.0,0.00000,0,0,Bot
568134,8080,134812,4,3,206,134,194,0,51.500000,95.042096,...,20,0.0,0.00000,0,0,0.0,0.00000,0,0,Bot
568159,1841,84,1,1,6,6,6,6,6.000000,0.000000,...,20,0.0,0.00000,0,0,0.0,0.00000,0,0,Bot
568354,8080,127781,4,3,206,134,194,0,51.500000,95.042096,...,20,0.0,0.00000,0,0,0.0,0.00000,0,0,Bot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2732770,444,119259012,2801,2069,12264,7879536,4344,0,4.378436,83.107845,...,32,0.0,0.00000,0,0,0.0,0.00000,0,0,Heartbleed
2732956,444,119257653,2802,2067,20858,7812389,5792,0,7.443969,126.045811,...,32,0.0,0.00000,0,0,0.0,0.00000,0,0,Heartbleed
2733041,444,119299621,2805,2028,13712,7878627,5792,0,4.888414,110.120790,...,32,0.0,0.00000,0,0,0.0,0.00000,0,0,Heartbleed
2733047,444,119296592,2797,2006,13712,7878088,5792,0,4.902395,110.277907,...,32,0.0,0.00000,0,0,0.0,0.00000,0,0,Heartbleed


In [9]:
del(data)

In [10]:
data = clean[~clean.label.str.contains("|".join(low_member_labels))].sample(frac=.1, random_state=1)
data = pd.concat([data, low_member_data])
data

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
2256289,80,67429791,9,7,384,11595,384,0,42.666667,128.000000,...,32,2047.0,0.0,2047,2047,67300000.0,0.0,67300000,67300000,DoS_Hulk
1788392,53510,2653,2,1,0,0,0,0,0.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2794319,443,145276,1,1,0,0,0,0,0.000000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1970600,54230,146,1,1,0,0,0,0,0.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
339126,1218,79,1,1,2,6,2,2,2.000000,0.000000,...,24,0.0,0.0,0,0,0.0,0.0,0,0,PortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2732770,444,119259012,2801,2069,12264,7879536,4344,0,4.378436,83.107845,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Heartbleed
2732956,444,119257653,2802,2067,20858,7812389,5792,0,7.443969,126.045811,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Heartbleed
2733041,444,119299621,2805,2028,13712,7878627,5792,0,4.888414,110.120790,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Heartbleed
2733047,444,119296592,2797,2006,13712,7878088,5792,0,4.902395,110.277907,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Heartbleed


In [11]:
data = data.append(low_member_data, ignore_index=True)

In [59]:
x_train, x_test, y_train, y_test = preproc_data(data, train_sample=0.75, pca_dim=31)

In [61]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(214914, 31) (71638, 31) (214914, 1) (71638, 1)


In [13]:
folder = "data/preserve10_2018/"
if not os.path.exists(folder):
    os.makedirs(folder)
x_train.to_csv("data/preserve10_2018/x_train.csv", x_train)
pd.save("data/preserve10_2018/y_train.csv", y_train)
pd.save("data/preserve10_2018/x_test.csv", x_test)
pd.save("data/preserve10_2018/y_test.csv", y_test)

### 25%

In [20]:
low_member_data = clean[clean.label.str.contains("|".join(low_member_labels))]

In [21]:
data = clean[~clean.label.str.contains("|".join(low_member_labels))].sample(frac=.25, random_state=1)

In [22]:
data = data.append(low_member_data, ignore_index=True)

In [23]:
x_train, x_test, y_train, y_test = preproc_data(data, train_sample=0.75, pca_dim=31)

In [24]:
folder = "data/preserve25/"
if not os.path.exists(folder):
    os.makedirs(folder)
np.save("data/preserve25/x_train.npy", x_train)
np.save("data/preserve25/y_train.npy", y_train)
np.save("data/preserve25/x_test.npy", x_test)
np.save("data/preserve25/y_test.npy", y_test)

### 50%

In [33]:
data.shape

(1411846, 79)

In [31]:
low_member_data = clean[clean.label.str.contains("|".join(low_member_labels))]

In [32]:
data = clean[~clean.label.str.contains("|".join(low_member_labels))].sample(frac=.5, random_state=1)

In [34]:
data = data.append(low_member_data, ignore_index=True)

In [35]:
x_train, x_test, y_train, y_test = preproc_data(data, train_sample=0.75, pca_dim=31)

In [36]:
folder = "data/preserve50/"
if not os.path.exists(folder):
    os.makedirs(folder)
np.save("data/preserve50/x_train.npy", x_train)
np.save("data/preserve50/y_train.npy", y_train)
np.save("data/preserve50/x_test.npy", x_test)
np.save("data/preserve50/y_test.npy", y_test)

### 100%

In [30]:
x_train, x_test, y_train, y_test = preproc_data(clean, train_sample=0.75, pca_dim=31)

In [31]:
folder = "data/preserve100/"
if not os.path.exists(folder):
    os.makedirs(folder)
np.save("data/preserve100/x_train.npy", x_train)
np.save("data/preserve100/y_train.npy", y_train)
np.save("data/preserve100/x_test.npy", x_test)
np.save("data/preserve100/y_test.npy", y_test)

**csv_2017 / 2018 데이터 샘플링**

benign: 5만 attack: 5만

In [20]:
import pandas as pd

# CSV 파일 읽기
# cic 2017
#df = pd.read_csv('clean1.csv')
# cic 2018
df = pd.read_csv('clean_2018.csv')

# label 컬럼 치환 (benign은 0, 나머지는 1로 치환)
df['label'] = df['label'].apply(lambda x: 0 if x == 'Benign' or x == 0 else 1)
print(df)

# 라벨이 0인 데이터에서 5만개 샘플링
label_0_sample = df[df['label'] == 0].sample(n=50000, random_state=42)

# 라벨이 1인 데이터에서 5만개 샘플링
label_1_sample = df[df['label'] == 1].sample(n=50000, random_state=42)

# 두 샘플링 데이터를 합침
sampled_df = pd.concat([label_0_sample, label_1_sample])

# 샘플링된 데이터 확인
print(sampled_df)

# 샘플링된 데이터를 새로운 CSV 파일로 저장
sampled_df.to_csv('sampled_2018data.csv', index=False)




         dst_port  protocol  flow_duration  tot_fwd_pkts  tot_bwd_pkts  \
0             443         6         141385             9             7   
1           49684         6            281             2             1   
2             443         6         279824            11            15   
3             443         6            132             2             0   
4             443         6         274016             9            13   
...           ...       ...            ...           ...           ...   
5482412     20000         6              2             1             1   
5482413        23         6              3             1             1   
5482414       425         6              2             1             1   
5482415        23         6             22             1             1   
5482416       443         6              2             1             1   

         totlen_fwd_pkts  totlen_bwd_pkts  fwd_pkt_len_max  fwd_pkt_len_min  \
0                  553.0        

train, test 데이터를 csv로 추출

In [23]:
def train_to_csv(dataset, train_sample: float):

    # Label encode
    le = LabelEncoder()
    dataset['label'] = le.fit_transform(dataset['label'])

    # Train test split
    x_train, x_test, y_train, y_test = train_test_split(dataset.iloc[:,:-1],
                                                        dataset['label'],
                                                        test_size=1-train_sample,
                                                        random_state=0)
    return x_train, x_test, y_train, y_test

In [62]:
clean2018 = pd.read_csv("./sampled_cic2018_data.csv")
x_train, x_test, y_train, y_test = preproc_data(clean2018, train_sample=0.75, pca_dim=31)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

np.save("./train_to_csv2018/x_train.npy", x_train)
np.save("./train_to_csv2018/x_test.npy", x_test)
np.save("./train_to_csv2018/y_train.npy", y_train)
np.save("./train_to_csv2018/y_test.npy", y_test)

(75000, 31) (25000, 31) (75000, 1) (25000, 1)
