In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# Mark the nominal columns and consolidate the data (extract all the nominal columns)
# 标出名词性列，整合数据（把名词性的列都提取出来）
def combine_dataset(files, col_names, processed=False):
    """Read several header-less CSV files and stack them into one DataFrame.

    Args:
        files: Iterable of CSV paths. Every file is read with ``header=None``
            and ``col_names`` as its column names.
        col_names: Column names, in file order.
        processed: When falsy, the nominal columns listed below are read as
            ``str`` and every other column as ``float32``. When truthy, every
            column is read as ``float32``.

    Returns:
        The row-wise concatenation of all files. Each file keeps its own
        0..n-1 index, so index labels repeat across files.
    """
    # Built once instead of once per column.
    nominal_names = {'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state',
                     'service', 'ct_ftp', 'label_10'}

    dtypes = {
        col_name: str if (not processed and col_name in nominal_names) else np.float32
        for col_name in col_names
    }

    records = [
        pd.read_csv(file, header=None, names=col_names, dtype=dtypes)
        for file in files
    ]

    return pd.concat(records)


## Make new col names for categorical features after one-hot encoding
# 为one-hot编码之后的列起个新列名
def get_nominal_names(dataset, cols_nominal):
    """Build the column names that one-hot encoding the nominal columns yields.

    For each column in ``cols_nominal`` (in the given order) the distinct
    values are sorted and turned into ``"<column>_<value>"``.

    Args:
        dataset: DataFrame containing the nominal columns.
        cols_nominal: Names of the nominal columns.

    Returns:
        A flat list of the generated column names.
    """
    new_col_names = []
    for col_name in cols_nominal:
        # str() lets non-string category values (e.g. integers) work too.
        new_col_names.extend(
            col_name + '_' + str(value)
            for value in sorted(dataset[col_name].unique())
        )

    return new_col_names


# Remove the unimportant feature, one-hot encoding, and convert the attack class to numeric
# 删除不重要的特征，one-hot编码，将攻击类别转换为数值型
def select_feature_and_encoding(dataset, cols_to_drop, cols_nominal, cols_nominal_all):
    """Drop unused features, one-hot encode nominal columns, encode labels.

    The input frame is modified in place (columns are dropped from it); the
    returned frame is a new object.

    Args:
        dataset: DataFrame holding the raw features plus the ``label_10`` and
            ``label_2`` columns.
        cols_to_drop: Columns to discard (e.g. source IP).
        cols_nominal: Nominal columns to one-hot encode.
        cols_nominal_all: Full list of expected one-hot column names. If the
            data lacks some of them, they are added as all-zero columns and
            the one-hot block is ordered as in this list.

    Returns:
        DataFrame with the remaining numeric features, then the one-hot
        columns, then ``label_10`` (numeric code) and ``label_2``. It carries
        the same index as ``dataset``.
    """
    # Drop the features that carry no meaning, such as the source IP.
    dataset.drop(list(cols_to_drop), axis=1, inplace=True)

    # Pull the labels out of the feature frame.
    label_10 = dataset.pop('label_10')
    label_2 = dataset.pop('label_2')

    # Map attack category names (including padded spellings) to codes.
    # A missing category is coded 0.
    replace_dict = {np.nan: 0, 'Analysis': 1, 'Backdoors': 2, 'Backdoor': 2, 'DoS': 3,
                    'Exploits': 4, ' Fuzzers': 5, ' Fuzzers ': 5, 'Generic': 6,
                    'Reconnaissance': 7, ' Shellcode ': 8, 'Shellcode': 8,
                    'Worms': 9, ' Reconnaissance ': 7, }
    new_label_10 = label_10.replace(replace_dict)
    del label_10

    # Replace missing values with 0.
    replace_dict = {np.nan: 0, ' ': 0}
    for cols in ['ct_ftp', 'ct_flw', 'is_ftp']:
        dataset[cols] = dataset[cols].replace(replace_dict)

    # 'is_ftp' should be a 0/1 flag: every non-zero value becomes 1. The old
    # loop only rebound its loop variable and never changed the column.
    # NOTE(review): assumes 'is_ftp' is numeric at this point - confirm.
    dataset.loc[dataset['is_ftp'] != 0, 'is_ftp'] = 1

    # Integer-code each nominal column, then one-hot encode the codes.
    data_nominal = dataset[cols_nominal]
    data_codes = data_nominal.apply(LabelEncoder().fit_transform)
    del data_nominal

    new_col_names = []
    for col_name in cols_nominal:
        new_col_names.extend(
            col_name + '_' + x for x in sorted(dataset[col_name].unique())
        )
    dataset.drop(list(cols_nominal), axis=1, inplace=True)

    data_sparse = OneHotEncoder().fit_transform(data_codes)
    del data_codes

    data_encoded = pd.DataFrame(data_sparse.toarray(), columns=new_col_names)
    del data_sparse

    # Add the one-hot columns this data set does not contain.
    if set(cols_nominal_all) - set(new_col_names):
        data_encoded = data_encoded.reindex(columns=cols_nominal_all, fill_value=0.)

    # Assemble by row position, not by index label: the one-hot frame has a
    # fresh 0..n-1 index, so label-based joins misalign (or duplicate rows)
    # when ``dataset`` has a non-default or repeated index.
    original_index = dataset.index
    result = pd.concat(
        [dataset.reset_index(drop=True),
         data_encoded,
         new_label_10.reset_index(drop=True),
         label_2.reset_index(drop=True)],
        axis=1,
    )
    result.index = original_index

    return result  # Complete data set (features and labels)
    # 完整的数据集（包括数据和标签）

In [None]:
# Split the training set and test set and save the file as a CSV file
# 分裂训练集和测试集,并将文件保存成CSV文件
def split_dataset(dataset, file_train, file_test):
    """Split ``dataset`` 80/20, stratified on ``label_10``, and write both CSVs.

    Args:
        dataset: DataFrame containing a ``label_10`` column.
        file_train: Output path for the training part.
        file_test: Output path for the test part.
    """
    column_names = dataset.columns
    parts = train_test_split(dataset, test_size=0.2, random_state=40,
                             stratify=dataset['label_10'])

    # Training part is written first, then the test part; the index is kept.
    for part, destination in zip(parts, (file_train, file_test)):
        pd.DataFrame(part, columns=column_names).to_csv(destination)


# Standardize, and save the file in CSV and tf formats
# 标准化，并将文件保存成csv格式和tf格式
def _scale_and_save(files, scaler, cols, cols_keep, col_names_scaling,
                    file_folder, tag):
    """Scale each CSV in ``files`` with a fitted scaler; write CSV + TFRecords."""
    for file in files:
        # Deliberately read WITHOUT index_col: the saved index column becomes
        # an ordinary column (dropped below by selecting ``cols``) and the
        # frame gets a 0..n-1 index matching the scaled frame built next.
        raw = pd.read_csv(file, dtype=np.float32)
        scaled = pd.DataFrame(scaler.transform(raw[col_names_scaling]),
                              columns=col_names_scaling)
        final = pd.concat((scaled, raw[cols_keep]), axis=1)[cols]
        print(tag, final.shape)

        # Write the SCALED frame (previously the unscaled input was written).
        # The index is written so the file keeps the "index column first"
        # layout of the input CSVs.
        final.to_csv(file_folder + file)

        file_tfr = file_folder + file[:-len('.csv')] + '.tfrecords'
        make_tfrecords(final, file_tfr)


def scaling(files_train, files_test, col_names_scaling, scaling_type):
    """Fit a scaler on the training CSVs, then scale and export all files.

    Args:
        files_train: Training CSV paths; column 0 of each file is the index.
        files_test: Test CSV paths with the same layout.
        col_names_scaling: Columns to scale; all other columns pass through.
        scaling_type: ``'min_max'`` selects ``MinMaxScaler`` (output folder
            ``min_max/``); anything else selects ``StandardScaler`` (output
            folder ``normalized/``).

    Raises:
        ValueError: If ``files_train`` is empty (nothing to fit the scaler on).

    Side effects:
        For every input file, writes ``<folder><file>`` (scaled CSV) and a
        ``.tfrecords`` file next to it, and prints the scaled shape.
    """
    if scaling_type == 'min_max':
        scaler = MinMaxScaler()
        file_folder = 'min_max/'
    else:
        scaler = StandardScaler()
        file_folder = 'normalized/'

    if not files_train:
        raise ValueError('files_train must contain at least one file')

    os.makedirs(file_folder, exist_ok=True)

    # First pass: fit incrementally so no more than one file is in memory.
    cols = None
    for file in files_train:
        # col 0 is the index in the file
        trainset = pd.read_csv(file, index_col=0, dtype=np.float32)
        if cols is None:
            cols = trainset.columns
        scaler.partial_fit(trainset[col_names_scaling])
    del trainset

    cols_keep = list(set(cols) - set(col_names_scaling))

    # Second pass: transform and export.
    _scale_and_save(files_train, scaler, cols, cols_keep, col_names_scaling,
                    file_folder, "train:")
    _scale_and_save(files_test, scaler, cols, cols_keep, col_names_scaling,
                    file_folder, "test:")


# Save the file in tf format
# 将文件保存成tf格式
def make_tfrecords(dataset, file_to_save):
    """Write a 2-D table to a TFRecord file, one ``tf.train.Example`` per row.

    Args:
        dataset: A DataFrame (its ``.values`` are used) or an array-like of
            rows. In each row the last two entries are read as ``label_10``
            and ``label_2`` (in that order); everything before them is the
            feature vector.
        file_to_save: Output ``.tfrecords`` path.
    """
    # Use the underlying array when there is one. This replaces a bare
    # ``except:`` that would also have hidden unrelated errors.
    data = getattr(dataset, 'values', dataset)

    with tf.compat.v1.python_io.TFRecordWriter(file_to_save) as writer:  # python_io lives in the TF1 API
        for row in data:
            features, label_10, label_2 = row[:-2], row[-2], row[-1]
            feature = {'features': tf.train.Feature(float_list=tf.train.FloatList(value=features)),
                       'label_2': tf.train.Feature(float_list=tf.train.FloatList(value=[label_2])),
                       'label_10': tf.train.Feature(float_list=tf.train.FloatList(value=[label_10]))}
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())


def next_batch(filename, batch_size):
    """Build TF1 graph ops that yield one batch from a TFRecord file.

    Args:
        filename: TFRecord path(s) passed to ``tf.data.TFRecordDataset``.
        batch_size: Number of records per batch.

    Returns:
        ``(features, label_10, label_2)`` tensors. Note the order:
        ``label_10`` comes before ``label_2``.
    """
    len_feature = 202  # Number of features (labels not included)
    len_label = 1  # Length of each label

    schema = {
        "features": tf.io.FixedLenFeature([len_feature], tf.float32),
        "label_2": tf.io.FixedLenFeature([len_label], tf.float32),
        "label_10": tf.io.FixedLenFeature([len_label], tf.float32),
    }

    def read_data(examples):
        parsed = tf.io.parse_single_example(examples, schema)
        return parsed['features'], parsed['label_2'], parsed['label_10']

    batches = tf.data.TFRecordDataset(filename).map(read_data).batch(batch_size)
    iterator = tf.compat.v1.data.make_one_shot_iterator(batches)
    batch_features, batch_label_2, batch_label_10 = iterator.get_next()

    return batch_features, batch_label_10, batch_label_2


# Integrate the four separate data sets
# 将分开的4个数据集整合到一起
def make_whole_datasets(tfrecords_train, num_train_example, tfrecords_test,
                        num_test_example):
    """Merge the test TFRecord files into ``normalized/test.tfrecords``.

    Only the test arguments are used; ``tfrecords_train`` and
    ``num_train_example`` are accepted but ignored.

    Args:
        tfrecords_train: Unused.
        num_train_example: Unused.
        tfrecords_test: TFRecord file(s) holding the test records.
        num_test_example: Batch size for the single read; one batch of this
            size is fetched.
    """
    # NOTE(review): the unpacking below assumes next_batch returns
    # (data, label_10, label_2) - confirm which next_batch is bound at call time.
    with tf.compat.v1.Session() as sess:
        fetches = list(next_batch(tfrecords_test, num_test_example))
        data, label_10, label_2 = sess.run(fetches)

    merged = np.concatenate([data, label_10, label_2], axis=1)
    print("test:", merged.shape)
    make_tfrecords(merged, 'normalized/test.tfrecords')


if __name__ == '__main__':
    # Script entry: merge the per-part test TFRecord files into one file.
    file_folder = 'F:/数据挖掘/SGM-CNN/UNSW_NB15/'  # Folder holding the original raw CSV files
    col_names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur',
                 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
                 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
                 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
                 'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt',
                 'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips',
                 'ct_state_ttl', 'ct_flw', 'is_ftp', 'ct_ftp', 'ct_srv_src',
                 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport',
                 'ct_dst_sport', 'ct_dst_src', 'label_10', 'label_2']  # Feature (column) names

    # NOTE: col_names, cols_to_drop, cols_nominal and files are defined but not
    # used by the active code below (the combine_dataset call is commented out).
    cols_to_drop = ['srcip', 'dstip', 'stime', 'ltime', 'sport', 'dsport']
    cols_nominal = ['proto', 'service', 'state']  # Nominal features

    files = [file_folder + 'UNSW-NB15_' + str(i + 1) + '.csv' for i in range(4)]
    # dataset = combine_dataset(files, col_names)
    file_folder = 'normalized/'  # Folder where the standardized data is stored
    files_train = [file_folder + str(x + 1) + '_train.tfrecords' for x in range(4)]
    files_test = [file_folder + str(x + 1) + '_test.tfrecords' for x in range(4)]
    num_train_example = 2032035  # trainset size
    num_test_example = 508012  # testset size
    make_whole_datasets(files_train, num_train_example, files_test, num_test_example)

In [None]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import heapq
import os
import matplotlib.pyplot as plt
from pandas import read_csv
from sklearn.model_selection import train_test_split
from tensorflow import compat


class DAE(object):
    """
    Denoising autoencoder built as a TF1 graph.

    Gaussian noise with standard deviation ``noise_std``, multiplied by
    ``noise_scale``, is added to the input before encoding. Encoder and
    decoder are single dense layers with tanh activations.

    Graph attributes created in ``__init__``:
        data: float64 placeholder of shape [None, n_feature].
        noise: the noise tensor added to ``data``.
        weight_encoder, bias_encoder: encoder parameters.
        data_recons: reconstruction of the input.
        loss_mse, loss_reg, l2_loss, loss: loss terms and their sum.
        weight_vector: per-input-feature sum of squared encoder weights.
    """

    def __init__(self, n_feature, n_hidden, noise_scale, noise_std, reg_lamda=0.01):
        # n_feature: input width; n_hidden: code width;
        # reg_lamda: weight of the encoder row-norm regularizer.
        self.n_hidden = n_hidden
        self.n_feature = n_feature
        self.reg_lamda = reg_lamda
        self.noise_scale = noise_scale
        self.noise_std = noise_std

        self.data = compat.v1.placeholder(shape=[None, n_feature],
                                          dtype=compat.v1.float64)
        # The noise has shape [n_feature], so one noise vector is broadcast
        # over every row of the batch.
        self.noise = self.noise_scale * compat.v1.random_normal([n_feature], dtype=compat.v1.float64,
                                                                stddev=self.noise_std)
        data_with_noise = self.data + self.noise

        # Variables use fixed names; NOTE(review): building two models in the
        # same graph/variable scope presumably collides on these names.
        self.weight_encoder = compat.v1.get_variable(name='weight_encoder',
                                                     shape=[self.n_feature, self.n_hidden],
                                                     dtype=compat.v1.float64)
        self.bias_encoder = compat.v1.Variable(compat.v1.zeros([self.n_hidden],
                                                               dtype=compat.v1.float64),
                                               name='bias_encoder')

        weight_decoder = compat.v1.get_variable(name='weight_decoder',
                                                shape=[self.n_hidden, self.n_feature],
                                                dtype=compat.v1.float64)
        bias_decoder = compat.v1.Variable(compat.v1.zeros([self.n_feature], dtype=compat.v1.float64),
                                          name='bias_decoder')

        with compat.v1.name_scope('Encoder'):
            data_encoded = compat.v1.add(compat.v1.matmul(data_with_noise, self.weight_encoder),
                                         self.bias_encoder)
            data_encoded = compat.v1.nn.tanh(data_encoded)

        with compat.v1.name_scope('Decoder'):
            data_recons = compat.v1.add(compat.v1.matmul(data_encoded, weight_decoder),
                                        bias_decoder)
            self.data_recons = compat.v1.tanh(data_recons)

        with compat.v1.name_scope('Loss'):
            # Reconstruction error is measured against the clean input.
            diff = self.data_recons - self.data
            # Half the batch mean of the per-example squared error.
            self.loss_mse = 0.5 * compat.v1.reduce_mean(compat.v1.reduce_sum(diff ** 2, axis=1))
            # Sum over input features of the L2 norm of that feature's
            # encoder weight row.
            loss_reg = compat.v1.reduce_sum(compat.v1.sqrt(compat.v1.reduce_sum(self.weight_encoder ** 2, axis=1)))
            self.loss_reg = self.reg_lamda * loss_reg
            # Fixed-weight (1e-3) L2 penalty on the decoder weights.
            self.l2_loss = compat.v1.nn.l2_loss(weight_decoder) * 1E-3

            self.loss = self.loss_mse + self.loss_reg + self.l2_loss

        with compat.v1.name_scope('weight_vector'):
            # One value per input feature: squared norm of its encoder row.
            self.weight_vector = compat.v1.reduce_sum(self.weight_encoder ** 2, axis=1)

In [None]:
class unbalanced_DAE(object):
    """
    Class-weighted variant of the denoising autoencoder, built as a TF1 graph.

    The per-example reconstruction error is multiplied by
    ``label * (posi_weight - 1) + 1``, i.e. by ``posi_weight`` where the label
    is 1 and by 1 where it is 0. Encoder and decoder use sigmoid activations.

    Graph attributes created in ``__init__``:
        data: float64 placeholder of shape [None, n_feature].
        label: float64 placeholder of shape [None, 1].
        noise: the noise tensor added to ``data``.
        weight_encoder, bias_encoder: encoder parameters.
        data_recons: reconstruction of the input.
        loss_mse, loss_reg, l2_loss, loss: loss terms and their sum.
        weight_vector: per-input-feature sum of squared encoder weights.
    """

    def __init__(self, n_feature, n_hidden, noise_scale, noise_std,
                 posi_weight=1.0, reg_lamda=0.00):
        # posi_weight: loss multiplier for examples whose label is 1;
        # reg_lamda: weight of the encoder row-norm regularizer.
        self.n_hidden = n_hidden
        self.n_feature = n_feature
        self.reg_lamda = reg_lamda
        self.noise_scale = noise_scale
        self.noise_std = noise_std
        self.posi_weight = posi_weight

        self.data = compat.v1.placeholder(shape=[None, n_feature], dtype=compat.v1.float64)
        self.label = compat.v1.placeholder(shape=[None, 1], dtype=compat.v1.float64)
        # The noise has shape [n_feature], so one noise vector is broadcast
        # over every row of the batch.
        self.noise = self.noise_scale * compat.v1.random_normal([n_feature], dtype=compat.v1.float64,
                                                                stddev=self.noise_std)
        data_with_noise = self.data + self.noise

        # Variables use fixed names; NOTE(review): building two models in the
        # same graph/variable scope presumably collides on these names.
        self.weight_encoder = compat.v1.get_variable(name='weight_encoder',
                                                     shape=[self.n_feature, self.n_hidden],
                                                     dtype=compat.v1.float64)
        self.bias_encoder = compat.v1.Variable(compat.v1.zeros([self.n_hidden],
                                                               dtype=compat.v1.float64),
                                               name='bias_encoder')

        weight_decoder = compat.v1.get_variable(name='weight_decoder',
                                                shape=[self.n_hidden, self.n_feature],
                                                dtype=compat.v1.float64)
        bias_decoder = compat.v1.Variable(compat.v1.zeros([self.n_feature], dtype=compat.v1.float64),
                                          name='bias_decoder')

        with compat.v1.name_scope('Encoder'):
            data_encoded = compat.v1.add(compat.v1.matmul(data_with_noise, self.weight_encoder),
                                         self.bias_encoder)
            data_encoded = compat.v1.nn.sigmoid(data_encoded)

        with compat.v1.name_scope('Decoder'):
            data_recons = compat.v1.add(compat.v1.matmul(data_encoded, weight_decoder),
                                        bias_decoder)
            self.data_recons = compat.v1.nn.sigmoid(data_recons)

        with compat.v1.name_scope('Loss'):
            # Reconstruction error is measured against the clean input.
            diff = self.data_recons - self.data
            # Per-example weight: posi_weight where label == 1, else 1.
            weights = self.label * (posi_weight - 1) + 1
            # Flatten [batch, 1] -> [batch] to match the per-example error.
            weights = compat.v1.reshape(weights, shape=[-1])
            self.loss_mse = 0.5 * compat.v1.reduce_mean(compat.v1.reduce_sum(diff ** 2, axis=1) * weights)
            # Sum over input features of the L2 norm of that feature's
            # encoder weight row.
            loss_reg = compat.v1.reduce_sum(compat.v1.sqrt(compat.v1.reduce_sum(self.weight_encoder ** 2, axis=1)))
            self.loss_reg = self.reg_lamda * loss_reg
            # Fixed-weight (1e-3) L2 penalty on the decoder weights.
            self.l2_loss = compat.v1.nn.l2_loss(weight_decoder) * 1E-3

            self.loss = self.loss_mse + self.loss_reg + self.l2_loss

        with compat.v1.name_scope('weight_vector'):
            # One value per input feature: squared norm of its encoder row.
            self.weight_vector = compat.v1.reduce_sum(self.weight_encoder ** 2, axis=1)

In [None]:
def P_R_F1(confusion_matrix):
    """Compute per-class precision, recall and F1 from a confusion matrix.

    Row ``i`` is summed for recall and column ``i`` for precision, so rows
    are taken as the true class and columns as the predicted class.

    Args:
        confusion_matrix: Square array-like of counts.

    Returns:
        ``(precision, recall, f1)``: three lists of floats, one entry per
        class. A metric whose denominator is zero is reported as 0.0 rather
        than nan/inf.
    """
    matrix = np.asarray(confusion_matrix, dtype=float)

    precision = []
    recall = []
    f1 = []
    for i in range(matrix.shape[0]):
        true_positive = matrix[i, i]
        predicted_total = matrix[:, i].sum()
        actual_total = matrix[i, :].sum()

        precision_i = true_positive / predicted_total if predicted_total else 0.0
        recall_i = true_positive / actual_total if actual_total else 0.0
        denominator = precision_i + recall_i
        f1_i = 2 * precision_i * recall_i / denominator if denominator else 0.0

        precision.append(float(precision_i))
        recall.append(float(recall_i))
        f1.append(float(f1_i))

    return precision, recall, f1


# shaping labels to one-hot vectors for trainning
def label_coding(label, batch_size, category):
    """Turn a batch of numeric labels into one-hot rows.

    Args:
        label: Tensor of class ids holding ``batch_size`` values.
        batch_size: Number of labels in the batch.
        category: Number of classes (one-hot depth).

    Returns:
        Tensor of shape [batch_size, category].
    """
    class_ids = compat.v1.reshape(
        compat.v1.cast(label, dtype=compat.v1.int32), [batch_size, 1])
    encoded = compat.v1.one_hot(class_ids, depth=category)
    # one_hot adds a trailing axis; collapse [batch, 1, category] to 2-D.
    return compat.v1.reshape(encoded, [batch_size, category])


# get next batch of data and label
def next_batch(filename, batch_size, conf, buffer_size=0):
    """Build graph ops yielding endlessly repeated batches from TFRecords.

    Args:
        filename: TFRecord path(s).
        batch_size: Records per batch.
        conf: Object providing ``len_feature``, ``len_label``,
            ``num_classes`` and ``one_hot_encoding``.
        buffer_size: Shuffle buffer size; 0 disables shuffling.

    Returns:
        ``(features, label_2, label_10)`` tensors. When
        ``conf.one_hot_encoding`` is True, ``label_2`` is one-hot encoded if
        ``conf.num_classes == 2``; otherwise ``label_10`` is.
    """
    schema = {
        "features": compat.v1.FixedLenFeature([conf.len_feature], compat.v1.float32),
        "label_2": compat.v1.FixedLenFeature([conf.len_label], compat.v1.float32),
        "label_10": compat.v1.FixedLenFeature([conf.len_label], compat.v1.float32),
    }
    num_classes = conf.num_classes
    one_hot_encoding = conf.one_hot_encoding

    def read_data(examples):
        parsed = compat.v1.parse_single_example(examples, schema)
        return parsed['features'], parsed['label_2'], parsed['label_10']

    records = compat.v1.data.TFRecordDataset(filename).map(read_data)
    if buffer_size != 0:
        records = records.shuffle(buffer_size=buffer_size)
    batches = records.repeat().batch(batch_size)

    next_data, next_label_2, next_label_10 = batches.make_one_shot_iterator().get_next()

    if one_hot_encoding == True:
        if num_classes == 2:
            next_label_2 = label_coding(next_label_2, batch_size, num_classes)
        else:
            next_label_10 = label_coding(next_label_10, batch_size, num_classes)

    return next_data, next_label_2, next_label_10


def trans_dataset(file_tfr, file_txt, num_examples, num_classes):
    """Dump a TFRecord file to a text file of features plus one-hot labels.

    Args:
        file_tfr: Input TFRecord path.
        file_txt: Output text path (written with ``np.savetxt``).
        num_examples: Number of records to read in one batch.
        num_classes: One-hot depth. 2 selects the ``label_2`` column; any
            other value selects ``label_10``.
    """
    with compat.v1.Session() as sess:
        # next_batch returns three tensors (data, label_2, label_10); the old
        # two-name unpacking raised ValueError.
        all_data, label_2, label_10 = next_batch(file_tfr, num_examples)
        all_label = label_2 if num_classes == 2 else label_10
        all_label = label_coding(all_label, num_examples, num_classes)

        # Fetch both tensors in ONE run: each sess.run advances the iterator,
        # so separate runs would pair features with labels of another batch.
        data_values, label_values = sess.run([all_data, all_label])

    record = np.concatenate([data_values, label_values], axis=1)
    np.savetxt(file_txt, record, fmt='%.6e')


def split_dataset(file_train, file_test, k, file_folder_new):
    """Pool two text data sets and split the pool into ``k`` fold files.

    Folds are written to ``<file_folder_new><i>.txt`` for ``i`` in 0..k-1.
    Each round carves ``1 / (k - i)`` of the remaining rows off as fold ``i``;
    what is left after the last round becomes fold ``k - 1``.

    Args:
        file_train: Text file readable by ``np.loadtxt``.
        file_test: Text file readable by ``np.loadtxt``.
        k: Number of folds.
        file_folder_new: Prefix (folder path) for the fold files.
    """
    trainset = np.loadtxt(file_train)
    remaining = np.concatenate((trainset, np.loadtxt(file_test)))

    for fold in range(k - 1):
        trainset, held_out = train_test_split(remaining, test_size=1 / (k - fold))
        remaining = trainset
        np.savetxt(file_folder_new + str(fold) + '.txt', held_out)

    # With k == 1 the loop never runs and this writes only the rows loaded
    # from file_train.
    np.savetxt(file_folder_new + str(k - 1) + '.txt', trainset)


def get_dataset(file_folder, index_test, indices_train):
    """Load one fold as the test set and stack the other folds for training.

    Fold ``i`` is read from ``<file_folder><i>.txt`` with ``np.loadtxt``.

    Args:
        file_folder: Prefix (folder path) of the fold files.
        index_test: Index of the fold used as the test set.
        indices_train: Indices of the folds stacked, in order, into the
            training set.

    Returns:
        ``(trainset, testset)`` as numpy arrays.

    Raises:
        ValueError: If ``indices_train`` is empty.
    """
    testset = np.loadtxt(file_folder + str(index_test) + '.txt')

    folds = [np.loadtxt(file_folder + str(index) + '.txt')
             for index in indices_train]
    if not folds:
        raise ValueError('indices_train must contain at least one fold index')

    # One concatenate instead of re-copying the growing array per fold.
    trainset = folds[0] if len(folds) == 1 else np.concatenate(folds)

    return trainset, testset


def parse_pos_neg(dataset):
    """Split a 2-D array into two arrays by the value in its last column.

    Args:
        dataset: 2-D numpy array whose last column is the label.

    Returns:
        ``(posi, neg)``: rows whose label equals 0, then all other rows. Row
        order is preserved. If one group has no rows, an array with zero rows
        and the same number of columns is returned for it.
    """
    # NOTE: rows with label 0 are the ones returned as ``posi``.
    is_zero_label = dataset[:, -1] == 0.

    # Boolean masks replace the per-row Python loop and also work when one
    # group is empty (concatenating an empty list used to raise).
    posi = dataset[is_zero_label]
    neg = dataset[~is_zero_label]

    return posi, neg


# Reset the global default TF1 graph at this point (runs at module level).
compat.v1.reset_default_graph()

In [None]:
# system parameters

class Configures(object):
    """Container for record-layout and training settings."""

    def __init__(self):
        # --- record layout ---
        self.len_feature, self.len_label = 202, 1
        self.num_classes = 10  # the alternative setting is 2
        self.one_hot_encoding = False
        self.num_records_train, self.num_records_test = 1625628, 508012

        # --- training ---
        self.batch_size, self.batch_size_test = 256, 2048
        self.training_epochs = 2
        self.learn_rate_start = 1E-4

        # Whole batches per pass; the remainder is dropped by the floor
        # division.
        self.batch_train = self.num_records_train // self.batch_size
        self.batch_test = self.num_records_test // self.batch_size_test


# Module-level settings.
# NOTE(review): the names match the DAE / unbalanced_DAE constructor
# parameters, so they are presumably passed there - confirm against the
# training code, which is not in this file.
n_hidden = 64
noise_scale = 0.
noise_std = 0.1
conf = Configures()


# training op


def get_indices(num_select, num_feature, file_weights):
    """
    Select the ``num_select`` features with the largest weights.

    The weights are also drawn as a scatter plot, which is saved to
    'weights_dis.eps' and then shown.

    Params:
        num_select: An integer, the number of features to select
        num_feature: An integer, the number of original features; it must
                     equal the number of weights in the file, because it
                     sets the x axis of the plot
        file_weights: A txt file storing a numpy array. Each row of the
                      array is the weight for a feature
    Return:
        a list containing the indices of the selected features, ordered by
        decreasing weight
    """

    x = np.arange(1, num_feature + 1)
    y = np.loadtxt(file_weights)
    indices = heapq.nlargest(num_select, range(len(y)), y.take)

    plt.scatter(x, y)
    # Save BEFORE show: once the window shown by plt.show() is closed the
    # figure is gone, and a later savefig writes an empty canvas.
    plt.savefig('weights_dis.eps', format='eps')
    plt.show()
    print(indices)

    return indices


def read_data(examples):
    """Parse one serialized example into (features, label_2, label_10).

    Reads the module-level names ``num_feature`` and ``len_label`` for the
    vector lengths, so both must be defined before this is called.
    """
    def float_vector(length):
        return compat.v1.FixedLenFeature([length], compat.v1.float32)

    schema = {
        "features": float_vector(num_feature),
        "label_2": float_vector(len_label),
        "label_10": float_vector(len_label),
    }
    parsed = compat.v1.parse_single_example(examples, schema)
    return parsed['features'], parsed['label_2'], parsed['label_10']


# get next batch of data and label
def next_batch(filename, num_examples):
    """Build graph ops that yield one batch of ``num_examples`` records.

    Args:
        filename: TFRecord path(s).
        num_examples: Batch size.

    Returns:
        ``(features, label_2, label_10)`` tensors.
    """
    batches = (
        compat.v1.data.TFRecordDataset(filename)
        .map(read_data)
        .batch(num_examples)
    )
    batch_features, batch_label_2, batch_label_10 = \
        batches.make_one_shot_iterator().get_next()
    return batch_features, batch_label_2, batch_label_10


def make_tfrecords(dataset, file_to_save):
    """Write (features, label_2, label_10) arrays to a TFRecord file.

    Args:
        dataset: Sequence of three 2-D arrays ``[features, label_2,
            label_10]`` with one row per example.
        file_to_save: Output ``.tfrecords`` path.
    """
    features, label_2, label_10 = dataset
    train = compat.v1.train

    def as_float_feature(values):
        return train.Feature(float_list=train.FloatList(value=values))

    with compat.v1.python_io.TFRecordWriter(file_to_save) as writer:
        for row in range(features.shape[0]):
            record = {
                'features': as_float_feature(features[row, :]),
                'label_2': as_float_feature(label_2[row, :]),
                'label_10': as_float_feature(label_10[row, :]),
            }
            example = train.Example(features=train.Features(feature=record))
            writer.write(example.SerializeToString())


def selection(data, indices):
    """
    Return the columns of ``data`` (one column per feature) picked by
    ``indices``, in the order given.
    """

    column_picker = (slice(None), indices)
    return data[column_picker]


def select_feature(file, num_examples, indices):
    """
    The main function of feature selection.

    Params:
      file: The .tfrecords file containing the original data
      num_examples: The number of examples in the file
      indices: The indices of the features to be selected

    Return:
      None
      Three .npy files are written, named after the input file with its
      extension removed:
        <name>_select_<k>_data_.npy      selected feature columns
        <name>_select_<k>_label_2_.npy   label_2 column
        <name>_select_<k>_label_10_.npy  label_10 column
      where <k> is len(indices).
    """

    with compat.v1.Session() as sess:
        data, label_2, label_10 = sess.run(next_batch(file, num_examples))

    data_select = selection(data, indices)

    file_name = file.split('\\')[-1]
    # Strip the real extension. The old code cut len('.npy') == 4 characters,
    # which turned 'train.tfrecords' into 'train.tfrec'.
    stem = os.path.splitext(file_name)[0]
    prefix = stem + '_select_' + str(len(indices))

    np.save(prefix + '_data_.npy', data_select)
    np.save(prefix + '_label_2_.npy', label_2)
    np.save(prefix + '_label_10_.npy', label_10)
    # make_tfrecords([data_select, label_2, label_10], file_to_save)
    # make_tfrecords([data_select, label_2, label_10], file_to_save)


def show_feature_name(indices):
    """
    Print the feature name for each index in ``indices``.

    Names are the column headers of 'normalized/1_test.csv', read with the
    first CSV column as the index.

    Params:
      indices: the indices of the features
    Return:
      None. One feature name is printed per line.
    """

    csv_path = os.path.join('normalized/', '1_test.csv')
    column_names = read_csv(csv_path, index_col=0).columns

    for position in indices:
        print(column_names[position])

In [None]:
if __name__ == '__main__':
    # Script entry: pick the top-weighted features and export the selected
    # columns of the train and validation sets.
    num_select = 12  # The number of selected features
    # num_feature and len_label are module globals read by read_data().
    num_feature = 202
    len_label = 1
    file_weights = 'F:/数据挖掘/SGM-CNN/normalized/weights_new_3.5.txt'

    indices = get_indices(num_select, num_feature, file_weights)
    # print(indices)
    show_feature_name(indices)

    file_train = 'F:/数据挖掘/SGM-CNN/normalized/train.tfrecords'
    file_valid = 'F:/数据挖掘/SGM-CNN/normalized/validation.tfrecords'
    num_examples_train = 1778030
    num_examples_validation = 254005
    file_test = 'F:/数据挖掘/SGM-CNN/normalized/test.tfrecords'
    num_examples_test = 508012

    select_feature(file_train, num_examples_train, indices)
    select_feature(file_valid, num_examples_validation, indices)
    #select_feature(file_test, num_examples_test, indices)

# NOTE(review): this call sits outside the __main__ guard and repeats the
# get_indices call above; it depends on names defined only inside the guard,
# so it raises NameError if this module is imported.
get_indices(num_select, num_feature, file_weights)



In [None]:
# SMOTE all minority classes to (number of data set samples / classes)

# Binary-label resampling script:
#   1. SMOTE over-samples class 1 up to a fixed size.
#   2. The class-0 rows are clustered with a Gaussian mixture and a subset of
#      the clusters is kept as the new class-0 data.
#   3. Both parts are concatenated and saved as .npy files.

import time
from collections import Counter

import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.mixture import GaussianMixture

data = np.load('F:/数据挖掘/SGM-CNN/新建文件夹/train_select_12_data.npy')
label = np.load('F:/数据挖掘/SGM-CNN/新建文件夹/train_select_12_label_2.npy')

X = np.array(data)
b = np.array(label)
y2 = np.int32(b.reshape(b.shape[0], ))  # flatten labels to 1-D integers

print(X.shape)
print(sorted(Counter(y2).items()))

# --- SMOTE: over-sample class 1 ---
time_start = time.time()
a = 889015  # target number of class-1 samples

smo = SMOTE(sampling_strategy={1: a}, random_state=42)
# fit_resample replaces fit_sample, which newer imbalanced-learn releases
# no longer provide.
X_smo, y_smo = smo.fit_resample(X, y2)
print(sorted(Counter(y_smo).items()))

# The elapsed time gets its own name: assigning it to ``time`` shadowed the
# module.
elapsed = time.time() - time_start
print("time:", elapsed)

print(X_smo.shape[0])

# --- Separate class-0 rows from all other rows ---
is_class_0 = y_smo == 0
data0 = X_smo[is_class_0]
data1 = X_smo[~is_class_0]
label1 = y_smo[~is_class_0]

label11 = label1.reshape(label1.shape[0], )

print("Normal class data shape：", data0.shape)
print("Attack class data shape：", data1.shape)
print("Attack class label shape：", label11.shape)

# --- Cluster the class-0 data ---
time_start = time.time()

estimator = GaussianMixture(n_components=10)
estimator.fit(data0)

elapsed = time.time() - time_start
print("time:", elapsed)

label_pred = estimator.predict(data0)

print(sorted(Counter(label_pred).items()))

# --- Build the new class-0 data from the clusters ---
# NOTE(review): the mixture has 10 components but only clusters 0 and 1 are
# used, and only cluster 1 is capped at ``a`` rows. Kept exactly as before;
# confirm this is intended.
a = 444507
c00 = data0[label_pred == 0]
c11 = data0[label_pred == 1][:a]

q = np.concatenate((c00, c11), axis=0)

label_zc = np.zeros((q.shape[0],), dtype=int)

data_end = np.concatenate((q, data1), axis=0)
label_end = np.concatenate((label_zc, label1), axis=0)

print(sorted(Counter(label_end).items()))

label_end = label_end.reshape(label_end.shape[0], 1)

np.save("F:/数据挖掘/SGM-CNN/新建文件夹/SGM_data_train.npy", data_end)
np.save("F:/数据挖掘/SGM-CNN/新建文件夹/SGM_label2_train.npy", label_end)


In [None]:
# SMOTE all minority classes to (number of data set samples / classes)

# 10-class resampling script:
#   1. SMOTE over-samples classes 1..9 up to a fixed size each.
#   2. The class-0 rows are clustered with a Gaussian mixture and the
#      clusters are capped to form the new class-0 data.
#   3. Both parts are concatenated and saved as .npy files.

import time
from collections import Counter

import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.mixture import GaussianMixture

data = np.load('F:/数据挖掘/SGM-CNN/新建文件夹/train_select_12_data.npy')
label = np.load('F:/数据挖掘/SGM-CNN/新建文件夹/train_select_12_label_10.npy')

X = np.array(data)
b = np.array(label)
y10 = np.int32(b.reshape(b.shape[0], ))  # flatten labels to 1-D integers

print(X.shape)
print(sorted(Counter(y10).items()))

# --- SMOTE: over-sample classes 1..9 ---
time_start = time.time()
a = 177803  # target number of samples for each of classes 1..9

smo = SMOTE(sampling_strategy={minority: a for minority in range(1, 10)},
            random_state=42)
# fit_resample replaces fit_sample, which newer imbalanced-learn releases
# no longer provide.
X_smo, y_smo = smo.fit_resample(X, y10)
print(sorted(Counter(y_smo).items()))

# The elapsed time gets its own name: after ``time = ...`` the next
# ``time.time()`` call failed because ``time`` was a float.
elapsed = time.time() - time_start
print("time:", elapsed)

print(X_smo.shape[0])

# --- Separate class-0 rows from all other rows ---
is_class_0 = y_smo == 0
data0 = X_smo[is_class_0]
data1 = X_smo[~is_class_0]
label1 = y_smo[~is_class_0]

label11 = label1.reshape(label1.shape[0], )

print("Normal class data shape：", data0.shape)
print("Attack class data shape：", data1.shape)
print("Attack class label shape：", label11.shape)

# --- Cluster the class-0 data ---
n_clusters = 10

time_start = time.time()

estimator = GaussianMixture(n_components=n_clusters)
estimator.fit(data0)

elapsed = time.time() - time_start
print("time:", elapsed)

label_pred = estimator.predict(data0)
print(sorted(Counter(label_pred).items()))

# --- Build the new class-0 data from the clusters ---
a = 17780  # maximum rows kept per capped cluster

# Boolean masks keep every cluster 2-D, so an empty cluster no longer breaks
# the concatenate below.
clusters = [data0[label_pred == cluster] for cluster in range(n_clusters)]
# NOTE(review): cluster 0 is kept whole and only clusters 1..9 are capped,
# exactly as before - confirm whether cluster 0 should be capped too.
clusters[1:] = [members[:a] for members in clusters[1:]]

q = np.concatenate(clusters, axis=0)
label_zc = np.zeros((q.shape[0],), dtype=int)
data_end = np.concatenate((q, data1), axis=0)
label_end = np.concatenate((label_zc, label1), axis=0)

print(sorted(Counter(label_end).items()))

label_end = label_end.reshape(label_end.shape[0], 1)

# NOTE(review): the data file name is the same one the binary-label script
# writes, so whichever script runs last overwrites the other's data - confirm.
np.save("F:/数据挖掘/SGM-CNN/新建文件夹/SGM_data_train.npy", data_end)
np.save("F:/数据挖掘/SGM-CNN/新建文件夹/SGM_label10_train.npy", label_end)