In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 加载数据集  
train_df = pd.read_csv('./数据/KDDTrain+afterP.csv') 
# train_df = pd.read_csv('../KDDTrain_ADASYN.csv') 
test_df = pd.read_csv('./数据/KDDTest+afterP.csv')

# 划分训练集和测试集的标签和特征
y_train = train_df['attack_type']
X_train = train_df.drop(columns = ['attack_type'])
y_test = test_df['attack_type']
X_test = test_df.drop(columns = ['attack_type'])

# 需要把类别特征进行数值化：['dos','normal','probe','r2l','u2r']分别映射为0 1 2 3 4
from sklearn.preprocessing import LabelEncoder  
# 初始化LabelEncoder  
le = LabelEncoder()  
  
# 对y_train中的类别特征进行数值化  
y_train_encoded = le.fit_transform(y_train)  

# 对y_test中的类别特征进行数值化  
y_test_encoded = le.fit_transform(y_test)  

# 将数据转换为PyTorch张量格式,才能进行后续的运算 （需要Numpy格式进行转换）
X_train = torch.tensor(X_train.values, dtype=torch.float32)  
y_train_encoded = torch.tensor(y_train_encoded, dtype=torch.long)  
X_test = torch.tensor(X_test.values, dtype=torch.float32)  
y_test_encoded = torch.tensor(y_test_encoded, dtype=torch.long) 

# 使用one_hot函数将类别数字转换为独热编码, 条件变分自编码器需要输入标签最好是进行了独热编码  
y_train_encoded = F.one_hot(y_train_encoded, num_classes=5) 

In [3]:
# print(y_train_encoded)

In [4]:
# 创建数据加载器  
train_dataset = TensorDataset(X_train, y_train_encoded)  
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)  

test_dataset = TensorDataset(X_test, y_test_encoded)  
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False) 

In [2]:
# 条件批量归一化：把数据的类别信息融合到归一化过程中，用类别信息控制缩放（weight）和bias
# 公式结合了标准批归一化和类别特定的偏移和缩放，使得归一化过程能够依赖于输入的类别标签
class ConditionalBatchNorm(nn.Module):  
    def __init__(self, num_features, num_classes):  
        super(ConditionalBatchNorm, self).__init__()  
        # num_features: 批归一化层需要处理的特征数量。
        # num_classes: 类别标签的数量。
        self.num_features = num_features  
        self.num_classes = num_classes  

        # gamma: 全局可学习的缩放因子，大小为num_features。
        # beta: 全局可学习的偏移量，初始化为零，大小为num_features。
        # weight: 类别特定的缩放因子，大小为num_classes x num_features。
        # bias: 类别特定的偏移量，大小为num_classes x num_features。
        # 这些都可以通过参数进行学习
        self.gamma = nn.Parameter(torch.Tensor(num_features))
        #######使用正态分布初始化缩放因子，防止输出过大或者过小
        nn.init.normal_(self.gamma, mean=0, std=1)
        self.beta = nn.Parameter(torch.zeros(num_features))  
        self.weight = nn.Parameter(torch.Tensor(num_classes, num_features))  # 这些全都是可以训练的参数 一个表格存储各个类的w和b
        self.bias = nn.Parameter(torch.zeros(num_classes, num_features)) 
        ####### 使用 Xavier Uniform 初始化，防止初始化值异常导致输出异常，出现损失函数异常
        nn.init.xavier_uniform_(self.weight)  
  
    def forward(self, x, y):  
        batch_size = x.size(0)  
        # 通过y索引相应类别的w和b；重塑w和b与x相同形状
        # 使用矩阵乘法实现通过独热编码y选择
        y = y.float()
        weight = torch.matmul(y, self.weight) 
        bias = torch.matmul(y, self.bias) 
        # 沿着批量维度做归一化（减去均值除以方差）
        x_mean = x.mean(dim = 0, keepdim=True)
        # print(x_mean.shape) 1,feature_dim
        x_var = x.var(dim = 0, keepdim=True)
        # 特定类别的weight和bias缩放
        return self.gamma * (x - x_mean) / torch.sqrt(x_var + 1e-5) + self.beta + weight * x_mean + bias 

In [3]:
# 定义Variational Autoencoder模型

# 条件变分自编码器：定义，需要输入标签
class ConditionalEncoder(nn.Module):  
    def __init__(self, input_dim, label_dim, latent_dim, hidden_dim):  
        super(ConditionalEncoder, self).__init__()  
        self.fc1 = nn.Linear(input_dim + label_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim,hidden_dim)
        self.fc22 = nn.Linear(hidden_dim,hidden_dim)
        self.fc23 = nn.Linear(hidden_dim,hidden_dim)
        self.bn1 = ConditionalBatchNorm(hidden_dim,label_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc_mean = nn.Linear(hidden_dim, latent_dim)
        self.fc_log_var = nn.Linear(hidden_dim, latent_dim)
        init.kaiming_normal_(self.fc1.weight, mode='fan_out', nonlinearity='relu')
        init.constant_(self.fc1.bias, 0)
        init.kaiming_normal_(self.fc21.weight, mode='fan_out', nonlinearity='relu')
        init.constant_(self.fc21.bias, 0)
        init.kaiming_normal_(self.fc22.weight, mode='fan_out', nonlinearity='relu')
        init.constant_(self.fc22.bias, 0)
        init.kaiming_normal_(self.fc23.weight, mode='fan_out', nonlinearity='relu')
        init.constant_(self.fc23.bias, 0)
        init.kaiming_normal_(self.fc_mean.weight, mode='fan_out', nonlinearity='linear')  
        init.constant_(self.fc_mean.bias, 0)  
        init.kaiming_normal_(self.fc_log_var.weight, mode='fan_out', nonlinearity='linear')  
        init.constant_(self.fc_log_var.bias, 0) 



    def forward(self, x, labels):  
        combined = torch.cat((x, labels), 1)  # 将标签和特征拼接
        x = self.fc1(combined)
        x = self.bn1(x,labels)
        x = F.leaky_relu(x)
        x = self.fc21(x)
        x = self.bn2(x)        
        x = F.leaky_relu(x)
        x = self.fc22(x)
        x = self.bn1(x,labels)
        x = F.leaky_relu(x)
        x = self.fc23(x)
        x = self.bn1(x, labels)
        mean = self.fc_mean(x)  
        log_var = self.fc_log_var(x)  
        return mean, log_var  
    
    
    
class ConditionalDecoder(nn.Module):  
    def __init__(self, latent_dim, label_dim, output_dim, hidden_dim):  
        super(ConditionalDecoder, self).__init__()  
        self.fc1 = nn.Linear(latent_dim + label_dim, hidden_dim) 
        self.fc21 = nn.Linear(hidden_dim,hidden_dim)
        self.fc22 = nn.Linear(hidden_dim,hidden_dim)
        self.fc23 = nn.Linear(hidden_dim,hidden_dim)
        self.bn1 = ConditionalBatchNorm(hidden_dim,label_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)  
        init.kaiming_normal_(self.fc1.weight, mode='fan_out', nonlinearity='relu')
        init.constant_(self.fc1.bias, 0)
        init.kaiming_normal_(self.fc21.weight, mode='fan_out', nonlinearity='relu')
        init.constant_(self.fc21.bias, 0)
        init.kaiming_normal_(self.fc22.weight, mode='fan_out', nonlinearity='relu')
        init.constant_(self.fc22.bias, 0)
        init.kaiming_normal_(self.fc23.weight, mode='fan_out', nonlinearity='relu')
        init.constant_(self.fc23.bias, 0)
  
    def forward(self, z, labels):  
        combined = torch.cat((z, labels), 1)  # 把隐藏和标签拼接
        x = self.fc1(combined)  
        x = self.bn1(x, labels)  
        x = F.leaky_relu(x)  
        x = self.fc21(x)
        x = self.bn2(x)
        x = F.leaky_relu(x)
        x = self.fc22(x)
        x = self.bn1(x, labels)
        x = F.leaky_relu(x)
        x = self.fc23(x)
        x = self.bn1(x, labels)
        x = self.fc3(x)  
        return x 

In [23]:
# cbn = ConditionalBatchNorm(60,label_dim)
# x = torch.randn(1,60)
# labels = torch.randn(1,5)
# flops, params = profile(cbn, inputs=(x,labels))
# flops, params = clever_format([flops, params], '%.1f')

# print('Network Parameters：',params)
# print('FLOPs per sample：',flops)

In [4]:
# 条件变分自编码器
class CVAE(nn.Module):  
    def __init__(self, input_dim, latent_dim, label_dim, hidden_dim):  
        super(CVAE, self).__init__()  
        self.encoder = ConditionalEncoder(input_dim, label_dim, latent_dim, hidden_dim)  
        self.decoder = ConditionalDecoder(latent_dim, label_dim, input_dim, hidden_dim)  
  
    def forward(self, x, labels):  
        mean, log_var = self.encoder(x, labels)  
        z = self.sample_latent(mean, log_var)  
        x_recon = self.decoder(z, labels)  
        return x_recon, mean, log_var  
  
    def sample_latent(self, mean, log_var):  
        epsilon = torch.randn_like(mean)  
        return mean + torch.exp(0.5 * log_var) * epsilon  

# 定义VAE模型和优化器
hidden_dim = 60
input_dim =  123
latent_dim = 32 # 隐变量维度  
label_dim = 5 # 标签维度（进行了独热编码）
  
# 定义CVAE模型和优化器  
cvae = CVAE(input_dim, latent_dim, label_dim, hidden_dim)  

# 定义CVAE损失函数
def cvae_loss(x, x_recon, mean, log_var, y):
    # 计算重构损失
    # 既然x重构是通过y和隐藏z计算出来的，那重构损失就已经包含了类比信息了
    recon_loss = F.mse_loss(x_recon, x, reduction='sum')
    # 计算KL散度
    kl_divergence = -0.5 * torch.sum(1 + log_var - mean.pow(2) - log_var.exp())
    # 考虑条件信息的重构损失
    # conditional_loss = F.cross_entropy(y, x_recon)
    # 总损失
    total_loss = recon_loss + kl_divergence
    return total_loss


In [6]:
from thop import profile
from thop import clever_format
x = torch.randn(1,123)
labels = torch.randn(1,5)
flops, params = profile(cvae.encoder, inputs=(x,labels))
flops, params = clever_format([flops, params], '%.3f')

print('Network Parameters：',params)
print('FLOPs per sample：',flops)

[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[INFO] Register count_normalization() for <class 'torch.nn.modules.batchnorm.BatchNorm1d'>.
Network Parameters： 22.744K
FLOPs per sample： 22.560K


In [8]:
optimizer = torch.optim.Adam(cvae.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

In [9]:

# 检查是否有GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 将模型移动到GPU上
cvae = cvae.to(device)
# 将数据移动到GPU上
X_train = X_train.to(device)
y_train_encoded = y_train_encoded.to(device)
X_test = X_test.to(device)
y_test_encoded = y_test_encoded.to(device)

In [210]:
#############加载模型
# 选择一个加载模型参数的文件路径  
load_path ='newcvae_model_para_cbn_100.pth'  
# 使用 torch.load() 函数加载模型参数  
loaded_parameters = torch.load(load_path)  
# 加载模型参数到模型中  
cvae.load_state_dict(loaded_parameters)   
# 打印加载成功的消息  
print(f"AutoEncoder model parameters have been loaded from {load_path}")

AutoEncoder model parameters have been loaded from newcvae_model_para_cbn_120.pth


In [306]:
# 训练模型
num_epochs = 20

# 全加了batchNorm之后重构的x和loss就不是nan

for epoch in range(num_epochs):
    cvae.train()  # 设置模型为训练模式  
    train_loss = 0  # 初始化训练损失  
    for i,batch in enumerate(train_loader):  
        # 获取批次数据  
        data, labels = batch
        data = data.to(device)
        labels = labels.to(device)
        # 清零梯度
        optimizer.zero_grad()
        # 前向传播
        x_recon, mean, log_var = cvae(data,labels)
        # print(x_recon)
        # print(x_recon.shape)
        # 计算损失
        loss = cvae_loss(data, x_recon, mean, log_var, labels)
        # 反向传播
        loss.backward()
        # 更新权重  
        optimizer.step()   
        # 累加损失以便后续打印平均损失  
        train_loss += loss.item()  
        # if(i+1)%500 == 0:
        #     print(f'i [{i+1}], Loss: {loss.item():.4f}') 
    scheduler.step()
    # 打印每个epoch的平均损失  
    train_loss /= len(train_loader.dataset)  
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')  

Epoch [1/20], Loss: 90876431662511.1094
Epoch [2/20], Loss: 50742423482288.9062
Epoch [3/20], Loss: 50741959937335.5625
Epoch [4/20], Loss: 50728897495560.1953
Epoch [5/20], Loss: 50615217514560.2891
Epoch [6/20], Loss: 50630132849350.8438
Epoch [7/20], Loss: 50607756862758.9453
Epoch [8/20], Loss: 50748685567006.2891
Epoch [9/20], Loss: 50629530390447.9844
Epoch [10/20], Loss: 50577263103877.2891
Epoch [11/20], Loss: 50559698770831.1250
Epoch [12/20], Loss: 50516246738038.1953
Epoch [13/20], Loss: 50489963836058.1016
Epoch [14/20], Loss: 50363764157277.7422
Epoch [15/20], Loss: 50590268250981.8516
Epoch [16/20], Loss: 50406724756417.4688
Epoch [17/20], Loss: 50154959613598.1328
Epoch [18/20], Loss: 50096604365851.1797
Epoch [19/20], Loss: 50032378578137.8750
Epoch [20/20], Loss: 50190733167939.7500


In [300]:
# 假设 CVAE 已经被训练好了  
# 获取模型参数  
cvae_model_parameters = cvae.state_dict()  
# 选择一个保存模型参数的文件路径  
save_path = 'newcvae_model_para_cbn_100.pth'  
# 使用 torch.save() 函数保存模型参数  
torch.save(cvae_model_parameters, save_path)  
# 打印保存成功的消息  
print(f"cvae model parameters have been saved to {save_path}")

cvae model parameters have been saved to newcvae_model_para_cbn_100.pth


In [307]:
from torch.distributions import Normal

# 通过cvae可以指定类别生成新数据：与训练所用数据相似但不相同  
def generate_new_data(cvae,target_category, latent_dim, num_samples = 1):  
    cvae.eval()
    # 准备目标类别的独热编码y  
    # target_category = 2  #（索引从0开始） 
    num_classes = 5
    y_onehot = torch.zeros(num_samples, num_classes)  # num_classes是类别的总数  
    y_onehot[:, target_category] = 1  # 设置每一个样本目标类别的位置为1  
    y_onehot = y_onehot.to(device)
    # 从标准正态分布中随机采样隐变量  
    z = Normal(0, 1).sample((num_samples, latent_dim))  
    z = z.to(device)
    # 使用解码器生成数据  
    with torch.no_grad():  
        # 解码隐变量和标签以生成数据  
        generated_data = cvae.decoder(z, y_onehot)  
  
    return generated_data
  
# 设置要生成的数据数量  
num_samples = 10  
target_category = 0
# 生成新数据  
new_data = generate_new_data(cvae, target_category, latent_dim, num_samples)  
  
# 打印生成的数据的形状  
print(new_data.shape)

torch.Size([10, 123])


In [308]:
# 分别生成 DOS 0类型样本 21416 个，Probe 2类型样本55687 个
# R2L 3类型样本 66348，U2R 4类型样本 67291 个
# 生成的样本与原样本混合，形成 VAE 过采样后的训练集，使得每种类别样本的数量都为 67343个。
# 设置要生成的数据数量  
num_samples = 21416  
target_category = 0
# 生成新数据  
new_data_dos = generate_new_data(cvae, target_category, latent_dim, num_samples)  

#生成标签的一维Tensor
labels = torch.full((num_samples,), target_category, dtype=torch.long)    
labels = labels.view(num_samples, 1)  
labels = labels.to(device)
# 可以沿着最后一个维度拼接数据和标签  
new_data_dos = torch.cat((new_data_dos, labels), dim=-1)  
  
# 输出结果以验证  
print(new_data_dos)  


tensor([[  362.7616, 66005.1328, 19819.4531,  ...,    84.0847, -1418.9258,
             0.0000],
        [  362.5180, 66009.0156, 19838.7891,  ...,    84.0353, -1418.9094,
             0.0000],
        [  362.9855, 66002.9297, 19807.8945,  ...,    84.1126, -1418.9357,
             0.0000],
        ...,
        [  362.8918, 66002.5156, 19806.6387,  ...,    84.1174, -1418.9368,
             0.0000],
        [  362.3427, 66012.6484, 19856.4512,  ...,    83.9867, -1418.8964,
             0.0000],
        [  362.3982, 66009.4531, 19841.8340,  ...,    84.0362, -1418.9056,
             0.0000]], device='cuda:0')


In [309]:
# 设置要生成的数据数量  
num_samples = 55687  
target_category = 2
# 生成新数据  
new_data_probe = generate_new_data(cvae, target_category, latent_dim, num_samples)  

#生成标签的一维Tensor
labels = torch.full((num_samples,), target_category, dtype=torch.long)  
# 为了拼接将标签Tensor扩展到与数据Tensor相同的维度  
labels = labels.view(num_samples, 1)  
labels = labels.to(device)
# 沿着最后一个维度拼接数据和标签  
new_data_probe = torch.cat((new_data_probe, labels), dim=-1)  
  
# 输出结果以验证  
print(new_data_probe)  

tensor([[ 8.1738e+02,  8.8684e+05,  3.0966e+06,  ..., -8.6188e+01,
         -8.9344e+02,  2.0000e+00],
        [ 8.1725e+02,  8.8684e+05,  3.0965e+06,  ..., -8.6168e+01,
         -8.9345e+02,  2.0000e+00],
        [ 8.1738e+02,  8.8684e+05,  3.0965e+06,  ..., -8.6175e+01,
         -8.9345e+02,  2.0000e+00],
        ...,
        [ 8.1753e+02,  8.8684e+05,  3.0965e+06,  ..., -8.6148e+01,
         -8.9345e+02,  2.0000e+00],
        [ 8.1485e+02,  8.8688e+05,  3.0968e+06,  ..., -8.6856e+01,
         -8.9329e+02,  2.0000e+00],
        [ 8.1742e+02,  8.8684e+05,  3.0966e+06,  ..., -8.6198e+01,
         -8.9344e+02,  2.0000e+00]], device='cuda:0')


In [310]:
# 设置要生成的数据数量  
num_samples = 66348  
target_category = 3
# 生成新数据  
new_data_r2l = generate_new_data(cvae, target_category, latent_dim, num_samples)  

#生成标签的一维Tensor
labels = torch.full((num_samples,), target_category, dtype=torch.long)  
# 为了拼接将标签Tensor扩展到与数据Tensor相同的维度  
labels = labels.view(num_samples, 1)  
labels = labels.to(device)
# 沿着最后一个维度拼接数据和标签  
new_data_r2l = torch.cat((new_data_r2l, labels), dim=-1)  
  
# 输出结果以验证  
print(new_data_r2l)  

tensor([[-1.2041e+02,  1.3076e+04,  9.4091e+03,  ...,  1.2300e+00,
         -2.5358e+02,  3.0000e+00],
        [-1.2200e+02,  1.3107e+04,  9.5609e+03,  ...,  8.1371e-01,
         -2.5345e+02,  3.0000e+00],
        [-1.2087e+02,  1.3084e+04,  9.4499e+03,  ...,  1.1235e+00,
         -2.5354e+02,  3.0000e+00],
        ...,
        [-1.2124e+02,  1.3091e+04,  9.4851e+03,  ...,  1.0273e+00,
         -2.5351e+02,  3.0000e+00],
        [-1.2111e+02,  1.3089e+04,  9.4726e+03,  ...,  1.0608e+00,
         -2.5352e+02,  3.0000e+00],
        [-1.2119e+02,  1.3091e+04,  9.4821e+03,  ...,  1.0316e+00,
         -2.5352e+02,  3.0000e+00]], device='cuda:0')


In [311]:
# 设置要生成的数据数量  
num_samples = 67291  
target_category = 4
# 生成新数据  
new_data_u2r = generate_new_data(cvae, target_category, latent_dim, num_samples)  
print(new_data_u2r.shape)

#生成标签的一维Tensor
labels = torch.full((num_samples,), target_category, dtype=torch.long)  
# 为了拼接将标签Tensor扩展到与数据Tensor相同的维度  
labels = labels.view(num_samples, 1)  
labels = labels.to(device)
# 沿着最后一个维度拼接数据和标签  
new_data_u2r = torch.cat((new_data_u2r, labels), dim=-1)  
  
# 输出结果以验证  
print(new_data_u2r)  

torch.Size([67291, 123])
tensor([[ 1.7178e+03,  1.2175e+03, -7.4007e+04,  ...,  2.0418e+02,
         -7.9522e+02,  4.0000e+00],
        [ 1.7175e+03,  1.2241e+03, -7.3974e+04,  ...,  2.0408e+02,
         -7.9519e+02,  4.0000e+00],
        [ 1.7177e+03,  1.2195e+03, -7.3998e+04,  ...,  2.0415e+02,
         -7.9521e+02,  4.0000e+00],
        ...,
        [ 1.7181e+03,  1.2125e+03, -7.4032e+04,  ...,  2.0425e+02,
         -7.9524e+02,  4.0000e+00],
        [ 1.7173e+03,  1.2262e+03, -7.3964e+04,  ...,  2.0405e+02,
         -7.9518e+02,  4.0000e+00],
        [ 1.7175e+03,  1.2235e+03, -7.3977e+04,  ...,  2.0409e+02,
         -7.9519e+02,  4.0000e+00]], device='cuda:0')


In [312]:
# 沿着第0维（通常是批次维度）拼接数据  
combined_data = torch.cat((new_data_u2r, new_data_r2l,new_data_probe,new_data_dos), dim=0)  

In [313]:
combined_data = combined_data.cpu()
combined_data = combined_data.numpy()
# print(new_data[1])

In [314]:
combined_data = pd.DataFrame(combined_data)

In [315]:
train_df = pd.read_csv('./数据/KDDTrain+afterP.csv')  

In [316]:
# 添加特征属性表头
columns = train_df.columns
print(columns)
columns = columns.drop('attack_type')
columns = columns.append(pd.Index(['attack_type']))
combined_data.columns = columns

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       ...
       'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0',
       'flag_S1', 'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH'],
      dtype='object', length=124)


In [317]:
# 将最后一列的float类型转换为整数类型  
combined_data['attack_type'] = combined_data['attack_type'].astype(int)  
combined_data

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_type
0,1717.838623,1217.472656,-74007.390625,-912.247498,-222.200470,-343.736877,547.270264,1039.450806,894.286133,-505.291901,...,-635.063049,168.865021,179.309067,56.370983,1168.229614,19.720213,-28.625343,204.179352,-795.217224,4
1,1717.476196,1224.123535,-73974.398438,-912.182251,-221.793182,-343.740295,547.227173,1039.229858,894.282349,-505.650940,...,-635.472351,168.968445,179.032990,56.753258,1168.811890,19.537388,-28.489155,204.081268,-795.189819,4
2,1717.733887,1219.452881,-73997.625000,-912.228210,-222.078262,-343.737549,547.257202,1039.383667,894.286621,-505.400635,...,-635.186340,168.895981,179.225952,56.485699,1168.403809,19.665051,-28.584297,204.150208,-795.209290,4
3,1717.404907,1225.328735,-73968.476562,-912.170654,-221.720398,-343.741119,547.218811,1039.190308,894.281738,-505.717682,...,-635.545959,168.986923,178.982864,56.822346,1168.916626,19.505129,-28.464733,204.063797,-795.184387,4
4,1716.874146,1235.025879,-73920.679688,-912.076721,-221.126297,-343.745605,547.148376,1038.867798,894.276001,-506.252075,...,-636.142639,169.139114,178.575546,57.384777,1169.767822,19.242712,-28.265415,203.922821,-795.145996,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210737,362.676697,66007.906250,19832.576172,-1987.499268,-434.537720,416.925385,-160.259979,38.416546,1998.897339,85.749146,...,273.860107,-358.166779,-73.463745,80.422165,-77.307655,-456.718719,472.619934,84.043076,-1418.917969,0
210738,362.832672,66004.140625,19814.478516,-1987.536987,-434.775604,416.927338,-160.222198,38.547977,1998.909424,85.952538,...,274.094727,-358.226288,-73.301628,80.198029,-77.653091,-456.616211,472.543427,84.095963,-1418.930420,0
210739,362.891846,66002.515625,19806.638672,-1987.553101,-434.880737,416.930878,-160.212708,38.611187,1998.916748,86.038834,...,274.193970,-358.254974,-73.230949,80.102028,-77.794952,-456.571838,472.511810,84.117393,-1418.936768,0
210740,362.342651,66012.648438,19856.451172,-1987.459106,-434.270996,416.925415,-160.269730,38.275959,1998.907104,85.511299,...,273.580627,-358.091339,-73.649513,80.679153,-76.916855,-456.843353,472.713867,83.986679,-1418.896362,0


In [318]:
# 把attack_type的操作应用在原始数据集中
train_df = train_df[columns] 

In [319]:
last_column = 'attack_type'
  
# 定义字符串标签到整数的映射  
label_mapping = {'dos': 0, 'normal': 1, 'probe':2, 'r2l':3, 'u2r':4}  

train_df[last_column] = train_df[last_column].replace(label_mapping)


In [320]:
train_df

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_type
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
125969,8,105,145,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
125970,0,2231,384,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
125971,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [321]:
# 使用concat函数上下拼接  
result = pd.concat([train_df, combined_data])  

In [322]:
result #这就是用CVAE数据平衡化处理得到的最终结果，实现各个类别平衡，每个类别有67343个样本，总共有336715个样本

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_type
0,0.000000,491.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1
1,0.000000,146.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0.000000,232.000000,8153.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1
4,0.000000,199.000000,420.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210737,362.676697,66007.906250,19832.576172,-1987.499268,-434.537720,416.925385,-160.259979,38.416546,1998.897339,85.749146,...,273.860107,-358.166779,-73.463745,80.422165,-77.307655,-456.718719,472.619934,84.043076,-1418.917969,0
210738,362.832672,66004.140625,19814.478516,-1987.536987,-434.775604,416.927338,-160.222198,38.547977,1998.909424,85.952538,...,274.094727,-358.226288,-73.301628,80.198029,-77.653091,-456.616211,472.543427,84.095963,-1418.930420,0
210739,362.891846,66002.515625,19806.638672,-1987.553101,-434.880737,416.930878,-160.212708,38.611187,1998.916748,86.038834,...,274.193970,-358.254974,-73.230949,80.102028,-77.794952,-456.571838,472.511810,84.117393,-1418.936768,0
210740,362.342651,66012.648438,19856.451172,-1987.459106,-434.270996,416.925415,-160.269730,38.275959,1998.907104,85.511299,...,273.580627,-358.091339,-73.649513,80.679153,-76.916855,-456.843353,472.713867,83.986679,-1418.896362,0


In [323]:
# 保存新的数据集
result.to_csv('KDDTrain_CVAE_CBN_100_20new.csv', index=False)