# CICIDS2017   
# 单独平衡了训练集
------------------------------------------------


使用所有的.csv文件建立一个干净的CICIDS2017数据集.


<div>
    <b>数据集描述:</b>  <a href="https://www.unb.ca/cic/datasets/ids-2017.html">加拿大网络安全研究所（CIC）</a>创建, 由标记的网络流组成。 CICIDS2017包含良性和最新的常见攻击。它由 2,830,743 条记录组成，共有 78 个特征。
</div>

In [1]:
import pandas as pd
import numpy as np
import glob
import os

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Plotting defaults: white seaborn theme with seaborn colour codes enabled.
sns.set_theme(style="white", color_codes=True)

# Display options for pandas output, set one option per call.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

## 清洗数据
------------------------------------------------

In [3]:
# Both folders are resolved one level above the current working directory.
PARENT_DIR = os.path.abspath("../")
DATA_DIR, IMAGE_DIR = (os.path.join(PARENT_DIR, sub) for sub in ("data", "images"))
print(DATA_DIR)

d:\桌面\苏媛媛-毕设代码\CICIDS2017\data


### 获取数据

In [4]:
def clean_column_name(column):
    """Normalise a raw CSV header into a lowercase, underscore-separated name.

    Leading and trailing spaces are removed, then every ``/`` and every
    remaining space is replaced by ``_``, and the result is lower-cased.
    """
    trimmed = column.strip(' ')
    for separator in ('/', ' '):
        trimmed = trimmed.replace(separator, '_')
    return trimmed.lower()

In [None]:
# Read every raw .csv file. glob returns paths in arbitrary, OS-dependent
# order, so sort them: the row order of the merged frame (and therefore the
# outcome of every later seeded split) is then reproducible across machines.
filenames = sorted(glob.glob(os.path.join(DATA_DIR, 'raw', '*.csv')))
if not filenames:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error further down.
    raise FileNotFoundError(
        "No .csv files found in " + os.path.join(DATA_DIR, 'raw')
    )
datasets = [pd.read_csv(filename) for filename in filenames]

# Strip surrounding spaces from the headers and normalise the column names.
for frame in datasets:
    frame.columns = [clean_column_name(column) for column in frame.columns]

# Stack all files into a single frame with a fresh 0..n-1 index.
dataset = pd.concat(datasets, axis=0, ignore_index=True)

对数据进行初步检查

In [None]:
# Print a concise summary (columns, dtypes, non-null counts) of the merged frame.
dataset.info()

In [None]:
dataset.head(5)  # First five rows

In [None]:
dataset.describe(include=[int, float])  # Summary statistics of the integer and float columns

In [None]:
dataset.describe(include=[object]).transpose()  # Summary statistics of the non-numeric (object) columns

In [None]:
# Number of rows per raw label.
dataset.label.value_counts()

###  处理重复项

首先在合并八个.csv文件后检查是否有重复项。

In [None]:
# True if at least one row duplicates another row of the merged frame.
dataset.duplicated().any()


删除重复项

In [None]:
# Report the row count before de-duplication.
print('删除具有重复值的实例之前的数据大小：', dataset.shape[0], end='\n\n')

# Drop duplicated rows in place and renumber the index.
# NOTE(review): keep=False removes *every* copy of a duplicated row, including
# the first occurrence; keep='first' would retain one copy. Confirm this is
# intended before changing it, since later class counts depend on it.
dataset.drop_duplicates(inplace=True, keep=False, ignore_index=True)

# Report the row count after de-duplication.
print('删除具有重复值的实例之后的数据大小： ', dataset.shape[0])


### 处理缺失值


检查每个特征中是否有缺失值

In [None]:
# Total number of missing cells across the whole frame.
dataset.isnull().sum().sum()

看到有334个缺失值

   有以下方法处理缺失值
1. 除去相应的例子（行）
2. 除去相应的属性（列）
3. 缺失值设置为0，平均值，或者中位数
4. 插补

In [None]:
# Fraction of missing values in each column.
dataset.isnull().sum() / dataset.shape[0]

 
由于数据集足够大，所以可以通过移除实例的方法处理  
首先确保缺失值与特定标签无关，然后删除该实例

In [None]:
# Names of the columns that contain at least one missing value.
dataset.columns[dataset.isnull().any()]


如上，缺失值来源：`flow_bytes_s`流字节率（即每秒传输的数据包字节数）。该特征不影响标签分类

In [None]:
# Row count before removing rows that contain missing values.
print('删除具有缺失值的实例之前的数据大小： ', len(dataset), end='\n\n')

# Remove, in place, every row holding at least one missing value.
dataset.dropna(how="any", axis=0, inplace=True)

# Row count after the removal.
print('删除具有缺失的实例之后的数据大小： ', len(dataset))

### 处理无穷值

检查是否所有值都是有限的

In [None]:
# True only if every feature value (label column excluded) is finite.
np.all(np.isfinite(dataset.drop(['label'], axis=1)))

In [None]:
# Turn +/-inf into NaN so that infinite values can be dropped like missing ones.
dataset.replace([-np.inf, np.inf], np.nan, inplace=True)

# Show which labels occur on rows where both rate features are now NaN.
both_rates_missing = dataset['flow_bytes_s'].isnull() & dataset['flow_packets_s'].isnull()
dataset[both_rates_missing].label.unique()

In [None]:
# Row count before removing the rows that held infinite values.
print('删除具有无穷大值的实例之前的数据大小：', len(dataset), end='\n\n')

# The infinities were replaced by NaN earlier, so dropping NaN rows removes them.
dataset.dropna(how='any', axis=0, inplace=True)

# Row count after the removal.
print('删除具有无穷大值的实例之后的数据大小：', len(dataset))

### 处理具有准零标准偏差（quasi null std deviation）的特征

Standard deviation, denoted by sigma (σ), is the square root of the average of the squared differences from the mean.

In [None]:
# Standard deviation of every numeric column (non-numeric columns are skipped).
dataset_std = dataset.std(numeric_only=True)
dataset_std

In [None]:
# Collect the features whose standard deviation is below the 0.01 threshold,
# i.e. columns that are (almost) constant across the dataset.
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent iterator and is available on old and new pandas versions alike.
constant_features = [column for column, std in dataset_std.items() if std < 0.01]

# Drop these quasi-constant features in place.
dataset.drop(labels=constant_features, axis=1, inplace=True)

例如特征： `bwd_psh_flags`, `fwd_urg_flags`, `bwd_urg_flags`, `cwe_flag_count`, `fwd_avg_bytes_bulk`, `fwd_avg_packets_bulk`, `fwd_avg_bulk_rate`, `bwd_avg_bytes_bulk`, `bwd_avg_packets_bulk`, `bwd_avg_bulk_rate`  并不改变

### 观察异常值

In [None]:
# The IQR rule is only defined for numeric columns. The frame still contains
# the string `label` column, which makes quantile() and the comparisons below
# fail on pandas >= 2.0, so restrict the computation to numeric columns.
numeric_data = dataset.select_dtypes(include=[np.number])

Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1

# Flag values outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] and print the number
# of flagged values per feature.
filt = (numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))
print(filt.sum())

In [None]:
fig = plt.figure(figsize=(15, 8))
# Horizontal box plots of two features, used to visualise their outliers.
sns.boxplot(data=dataset[["average_packet_size", "avg_bwd_segment_size"]], orient="h")

plt.title('Summary of some variables containing outliers', fontsize=18)
plt.show()

转换特征类型

In [None]:
# Convert the two rate features to numeric dtypes.
dataset[['flow_bytes_s', 'flow_packets_s']] = dataset[['flow_bytes_s', 'flow_packets_s']].apply(pd.to_numeric)

## 数据探索
------------------------------------------------

### 相关矩阵（Correlation Matrix）

In [None]:
# Pairwise correlation between the numeric features (NA/null values are
# excluded). The string label column is filtered out explicitly because
# DataFrame.corr() no longer drops non-numeric columns silently on
# pandas >= 2.0 and raises instead.
dataset_corr = dataset.select_dtypes(include=[np.number]).corr()
dataset_corr.head(5)

In [None]:
# `colors` is only needed by the commented-out custom colormap below.
import matplotlib.colors as colors
fig = plt.figure(figsize=(15, 15))
sns.set(font_scale=1.0)

# Alternative two-colour map (disabled):
# cmap = colors.LinearSegmentedColormap.from_list('mycmap', ['#ADD8E6', '#00008B'])
# Heatmap of the full correlation matrix, without per-cell annotations.
ax = sns.heatmap(dataset_corr, annot=False,cmap='Blues')
# fig.savefig(os.path.join(IMAGE_DIR, 'correlation matrix.pdf'))

  
可以看到，某些特征似乎是高度相关的。因此，可能需要删除它们，因为会带来多余的信息

In [None]:
# Build a boolean mask covering the diagonal and the upper triangle, then
# blank those cells so that every feature pair is inspected exactly once.
mask = np.triu(np.ones_like(dataset_corr, dtype=bool))
tri_df = dataset_corr.mask(mask)

# A column is considered redundant when any of its remaining (lower-triangle)
# correlations exceeds 0.98.
above_threshold = (tri_df > 0.98).any(axis=0)
correlated_features = above_threshold[above_threshold].index.tolist()

# Remove the highly correlated features from the dataset in place.
dataset.drop(columns=correlated_features, inplace=True)

In [None]:
fig = plt.figure(figsize=(15, 15))
sns.set(font_scale=1.0)
# Heatmap of the masked (lower-triangle) correlation matrix.
# NOTE(review): tri_df is not recomputed after the correlated features are
# dropped from `dataset`, so this plot still includes them; confirm intent.
ax = sns.heatmap(tri_df, annot=False)
# fig.savefig(os.path.join(IMAGE_DIR, 'correlation matrix_dropped.pdf'))

### 标签分布 

In [None]:
fig = plt.figure(figsize=(30, 10))

# Number of instances per raw label.
attack = dataset['label'].value_counts()

attack_count = attack.values
attack_type = attack.index

# One bar per label.
bar = plt.bar(attack_type, attack_count, align='center')

# Write the thousands-separated count on top of every bar.
for rect in bar:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2.0, height, format(height, ','), ha='center', va='bottom')
# Distribution of the different types of network activity in the dataset.
plt.title('Distribution of different type of network activity in the dataset')
plt.xlabel('Network activity')
plt.ylabel('Number of instances')
plt.grid(True)
plt.show()
# fig.savefig(os.path.join(IMAGE_DIR, 'network_activity.pdf'))

 
数据集显然是不均衡的  
可以合并几个具有相似特征和行为的少数类，形成新的攻击类。

In [None]:
# The web-attack labels contain a dash that is frequently mis-decoded (for
# example as U+FFFD). Matching one specific garbled character only works for
# one particular decoding, so collapse whatever non-word separator follows the
# "Web Attack" prefix into a single space instead.
dataset['label'] = dataset['label'].str.replace(r'^Web Attack\W+', 'Web Attack ', regex=True)

# Mapping from raw label to the coarser attack category used for modelling.
attack_group = {
    'BENIGN': 'Benign',
    'PortScan': 'PortScan',
    'DDoS': 'DDoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'Heartbleed': 'DoS',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'Bot': 'Botnet ARES',
    'Web Attack Brute Force': 'Web Attack',
    'Web Attack Sql Injection': 'Web Attack',
    'Web Attack XSS': 'Web Attack',
    'Infiltration': 'Infiltration'
}

# Build the grouped label column. Unknown labels still raise KeyError, as
# before, but the error now lists every offending label at once.
label_category = dataset['label'].map(attack_group)
unmapped = dataset.loc[label_category.isnull(), 'label'].unique()
if len(unmapped) > 0:
    raise KeyError(
        "labels without an attack group: " + ", ".join(sorted(repr(x) for x in unmapped))
    )
dataset['label_category'] = label_category
dataset['label_category'].value_counts()

In [None]:
fig = plt.figure(figsize=(12, 5))

# Number of instances per grouped attack category.
attack = dataset['label_category'].value_counts()

attack_count = attack.values
attack_type = attack.index

# One bar per category.
bar = plt.bar(attack_type, attack_count, align='center')

# Write the thousands-separated count on top of every bar.
for rect in bar:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2.0, height, format(height, ','), ha='center', va='bottom', fontsize=12)

plt.title('Distribution of different type of network activity in the dataset', fontsize=18)
plt.xlabel('Network activity', fontsize=16)
plt.ylabel('Number of instances', fontsize=16)
plt.grid(True)
plt.show()
# fig.savefig(os.path.join(IMAGE_DIR, 'network_activity_category.pdf'))

In [None]:
# Names of all non-object columns of the cleaned dataset, as an Index and as a list.
numeric_features = dataset.select_dtypes(exclude=[object]).columns
columns = numeric_features.tolist()
# Disabled: export of the cleaned (untransformed) features and labels.
# X_data = pd.DataFrame(dataset, columns=columns)
# y_data = pd.DataFrame(dataset, columns=["label_category"])
# # Save the cleaned data
# X_data.to_pickle(os.path.join(DATA_DIR, 'processed', 'raw/data_features.pkl'))
# y_data.to_pickle(os.path.join(DATA_DIR, 'processed', 'raw/data_labels.pkl'))


## 划分并平衡数据
------------------------------------------------

### 划分数据


按照 6：2：2 将清洗后的数据集划分成训练、验证和测试集。


In [None]:
# Target: the grouped attack category. Inputs: every column except the raw
# and the grouped label.
label_columns = ['label', 'label_category']
labels = dataset['label_category']
features = dataset.drop(columns=label_columns)

In [None]:
from sklearn.model_selection import train_test_split

# 60% training / 40% hold-out, preserving the class proportions.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42, stratify=labels)
# Split the hold-out part half-and-half into test and validation sets.
# Stratify this split as well: without it the very rare classes can end up
# unevenly distributed between the test and validation sets, which is
# inconsistent with the stratified first split.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

In [None]:
# (rows, columns) of the training, test and validation feature sets.
X_train.shape, X_test.shape, X_val.shape

### 缩放数据特征的范围

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, QuantileTransformer
from sklearn.compose import ColumnTransformer

In [None]:
# Columns that are neither int64 nor float64 are treated as categorical;
# columns that are not of object dtype are treated as numeric.
# NOTE(review): these two selections are not exact complements (e.g. an int32
# column would match both); presumably all features are int64/float64 here.
categorical_features = features.select_dtypes(exclude=["int64", "float64"]).columns
numeric_features = features.select_dtypes(exclude=[object]).columns

# One-hot encode the categorical columns (first level dropped) and
# quantile-transform the numeric columns.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output`
# in newer scikit-learn releases; confirm against the installed version.
preprocessor = ColumnTransformer(transformers=[
    ('categoricals', OneHotEncoder(drop='first', sparse=False, handle_unknown='error'), categorical_features),
    ('numericals', QuantileTransformer(), numeric_features)
])

预处理特征

In [None]:
# NOTE(review): the numeric feature names are used as the output column names,
# which assumes the one-hot branch contributes no columns; verify that
# categorical_features is empty.
columns = numeric_features.tolist()

X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=columns)  # fit on the training set, then transform it
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=columns)  # transform only, using the training fit
X_val = pd.DataFrame(preprocessor.transform(X_val), columns=columns)  # transform only, using the training fit


In [None]:

from sklearn.base import clone

# Also transform the complete cleaned dataset. Fit an unfitted *copy* of the
# preprocessor for this: calling fit_transform on the shared `preprocessor`
# would overwrite its training-set fit, so any later preprocessor.transform()
# call would silently use statistics that include validation and test rows.
full_preprocessor = clone(preprocessor)
X_data = pd.DataFrame(full_preprocessor.fit_transform(dataset), columns=columns)

预处理标签

In [None]:
# Encode the category names as integers; the encoder is fitted on the
# training labels only and then reused for the test and validation labels.
le = LabelEncoder()

y_train = pd.DataFrame(le.fit_transform(y_train), columns=["label"])
y_test = pd.DataFrame(le.transform(y_test), columns=["label"])
y_val = pd.DataFrame(le.transform(y_val), columns=["label"])


In [None]:
# Encode the category of every row of the cleaned dataset with the encoder
# fitted on the training labels. The 1-D column is passed directly: handing
# LabelEncoder a one-column DataFrame makes scikit-learn emit a
# DataConversionWarning, because it expects a 1-D array of labels.
y_data = pd.DataFrame(le.transform(dataset['label_category']), columns=["label"])

保存结果

In [None]:
# Save the cleaned and transformed full dataset.
raw_dir = os.path.join(DATA_DIR, 'processed', 'raw')
# to_pickle does not create missing folders, so make sure the target exists.
os.makedirs(raw_dir, exist_ok=True)
X_data.to_pickle(os.path.join(raw_dir, 'data_features.pkl'))
y_data.to_pickle(os.path.join(raw_dir, 'data_labels.pkl'))

In [None]:
# X_train.to_pickle(os.path.join(DATA_DIR, 'processed', 'train/train_features.pkl'))
# X_val.to_pickle(os.path.join(DATA_DIR, 'processed', 'val/val_features.pkl'))
# X_test.to_pickle(os.path.join(DATA_DIR, 'processed', 'test/test_features.pkl'))

# y_train.to_pickle(os.path.join(DATA_DIR, 'processed', 'train/train_labels.pkl'))
# y_val.to_pickle(os.path.join(DATA_DIR, 'processed', 'val/val_labels.pkl'))
# y_test.to_pickle(os.path.join(DATA_DIR, 'processed', 'test/test_labels.pkl'))

In [None]:
# (rows, columns) of the transformed training features.
X_train.shape

In [None]:
# Number of training instances per encoded class, before balancing.
y_train.value_counts()

   
使用`SMOTE`和`RandomUnderSampler`的组合来平衡训练集
------------------------

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

def balance_dataset(X, y, undersampling_strategy, oversampling_strategy, random_state=0):
    """Balance a dataset by random under-sampling followed by SMOTE.

    Parameters
    ----------
    X : the feature matrix to resample.
    y : the class label of every row of ``X``.
    undersampling_strategy : dict mapping a class label to the number of
        samples to keep for it during under-sampling.
    oversampling_strategy : dict mapping a class label to the number of
        samples that class should have after over-sampling.
    random_state : seed shared by both samplers. The default of 0 keeps the
        under-sampling identical to earlier runs; SMOTE used to be unseeded,
        which made the balanced set differ from run to run.

    Returns
    -------
    (X_bal, y_bal) : the resampled features and labels.
    """
    # Step 1: shrink the requested classes by random sampling.
    under_sampler = RandomUnderSampler(sampling_strategy=undersampling_strategy, random_state=random_state)
    X_under, y_under = under_sampler.fit_resample(X, y)

    # Step 2: grow the requested classes on the under-sampled data with SMOTE.
    over_sampler = SMOTE(sampling_strategy=oversampling_strategy, random_state=random_state)
    X_bal, y_bal = over_sampler.fit_resample(X_under, y_under)

    return X_bal, y_bal

***Label Encoder Transformation***
```json
{
    'Benign': 0,
    'DoS': 4,  
    'DDoS':3,
    'PortScan': 7,
    'Brute Force': 2,
    'Web Attack': 8,
    'Botnet ARES': 1,
    'Infiltration':6,
    'Heartbleed':5

}
```
```json
{
    'Benign': 0,
    'DoS': 4,
    'DDoS': 3,
    'PortScan': 6,
    'Brute Force': 2,
    'Web Attack': 7,
    'Botnet ARES': 1,
    'Infiltration':5

}
```


Benign          2035505 
DoS              192264
DDoS             128005
PortScan          57305
Brute Force        8551
Web Attack         2118
Botnet ARES        1943
Infiltration         36

label
0        1221303
4         115358
3          76803
6          34383
2           5130
7           1271
1           1166
5             22

In [None]:
# Samples to keep per encoded class after under-sampling. Only the majority
# class (0) is reduced; the other values equal the current training counts.
undersampling_strategy = {
    0: 600000,   # Benign
    4: 115358,   # DoS
    3: 76803,    # DDoS
    6: 34383,    # PortScan
    2: 5130,     # Brute Force
    7: 1271,     # Web Attack
    1: 1166,     # Botnet ARES
    5: 22,       # Infiltration
}

# Samples per encoded class after over-sampling. Only the four smallest
# classes (2, 7, 1 and 5) are grown.
oversampling_strategy = {
    0: 600000,   # Benign
    4: 115358,   # DoS
    3: 76803,    # DDoS
    6: 34383,    # PortScan
    2: 25130,    # Brute Force
    7: 21271,    # Web Attack
    1: 21166,    # Botnet ARES
    5: 522,      # Infiltration
}

# Balance the training set
X_train_bal, y_train_bal = balance_dataset(X_train, y_train, undersampling_strategy, oversampling_strategy)

# Save the balanced training set. to_pickle does not create missing folders,
# so make sure the target directory exists first.
train_dir = os.path.join(DATA_DIR, 'processed', 'train')
os.makedirs(train_dir, exist_ok=True)
X_train_bal.to_pickle(os.path.join(train_dir, 'train_features_balanced.pkl'))
y_train_bal.to_pickle(os.path.join(train_dir, 'train_labels_balanced.pkl'))

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# Instances per encoded class before balancing. np.ravel makes the counting
# work whether the labels are held in a one-column DataFrame or a Series.
imbalanced = pd.Series(np.ravel(y_train)).value_counts()
# Align the balanced counts to the same class order. Two independent
# value_counts() results are each sorted by their own frequencies, so pairing
# them by position compares different classes whenever the rankings differ.
balanced = pd.Series(np.ravel(y_train_bal)).value_counts().reindex(imbalanced.index, fill_value=0)

# One bar group per class actually present, instead of a hard-coded 8.
indexes = np.arange(len(imbalanced))
width = 0.4
rect1 = plt.bar(indexes, imbalanced.values, width, color="steelblue", label="imbalanced")
rect2 = plt.bar(indexes + width, balanced.values, width, color="indianred", label="balanced")

def add_text(rect):
    """Add text to top of each bar."""
    for r in rect:
        h = r.get_height()
        plt.text(r.get_x() + r.get_width()/2, h*1.01, s=format(h, ",") ,fontsize=12, ha='center', va='bottom')

add_text(rect1)
add_text(rect2)

ax.set_xticks(indexes + width / 2)
# Derive the tick labels from the fitted label encoder so that they always
# match the class shown by each bar group.
ax.set_xticklabels(le.inverse_transform(np.asarray(imbalanced.index)))
plt.xlabel('Traffic Activity', fontsize=16)
plt.ylabel('Instances', fontsize=16)
plt.legend()
plt.grid()
plt.show()
# savefig does not create missing folders, so make sure the target exists.
os.makedirs(IMAGE_DIR, exist_ok=True)
fig.savefig(os.path.join(IMAGE_DIR, 'balanced_dataset.pdf'))