# Data Cutting

In [5]:
import os
from tqdm import tqdm
import numpy as np
from keras import Input, Model, models
import tensorflow as tf
from keras.layers import GRU, Bidirectional, Dropout, TimeDistributed, Dense, Activation, GlobalAveragePooling1D,
    Reshape, MaxPooling1D, Multiply


def DataCutting(irpath, labelpath, datafile):
    """数据切割

    :param irpath: 待切割ir数据地址
    :param labelpath: 待切割标签地址
    :param datafile: 切割后数据存放文件夹
    :return:
    """

    #切割完后存放标签的地址
    train_label_path = datafile + '/train_label.txt'
    test_label_path = datafile + '/test_label.txt'
    validation_label_path = datafile + '/validation_label.txt'
    #切割完后存放IR向量地址
    train_IR_path = datafile + '/train_IR.txt'
    test_IR_path = datafile + '/test_IR.txt'
    validation_IR_path = datafile + '/validation_IR.txt'

    #切割方式为每十行都把数据划分为6:2:2，分别为训练集、验证集、测试集
    #pbar用于进度条显示
    with tqdm(total=os.path.getsize(labelpath)) as pbar:
        with open(labelpath, 'r') as f1:
            train_label = open(train_label_path, 'a', encoding='UTF-8')
            test_label = open(test_label_path, 'a', encoding='UTF-8')
            validation_label = open(validation_label_path, 'a', encoding='UTF-8')
            t = 0
            train_num = 0
            test_num = 0
            validation_num = 0
            for line in f1:
                t += 1
                pbar.update(len(line))
                if t <= 6:
                    train_label.write(line)
                    train_num += 1
                elif t <= 8:
                    test_label.write(line)
                    test_num += 1
                elif t <= 10:
                    validation_label.write(line)
                    validation_num += 1
                    if t == 10:
                        t = 0

            train_label.close()
            test_label.close()
            validation_label.close()
            print("train_num: " + str(train_num))
            print("test_num: " + str(test_num))
            print("validation_num: " + str(validation_num))

    with tqdm(total=os.path.getsize(irpath)) as pbar:
        with open(irpath, 'r') as f1:
            train_IR = open(train_IR_path, 'a', encoding='UTF-8')
            test_IR = open(test_IR_path, 'a', encoding='UTF-8')
            validation_IR = open(validation_IR_path, 'a', encoding='UTF-8')
            t = 0
            k = 0
            for line in f1:
                pbar.update(len(line))
                if t < 6:
                    train_IR.write(line)
                elif t < 8:
                    test_IR.write(line)
                elif t < 10:
                    validation_IR.write(line)
                if line.find('#') != -1:
                    t += 1
                    k += 1
                    if t == 10:
                        t = 0

            print("data_sum: " + str(k))

            train_IR.close()
            test_IR.close()
            validation_IR.close()


#原始数据地址
label_path = '../data/label_Juliet2.txt'
IR_path = '../data/data_Juliet2.txt'
#切割后数据存放文件夹
data_file = '../data/Juliet'
DataCutting(IR_path, label_path, data_file)

100%|██████████| 24235/24235 [00:00<00:00, 1349937.68it/s]


train_num: 600
test_num: 200
validation_num: 200


100%|██████████| 293765750/293765750 [00:05<00:00, 55946300.52it/s]

1000 0





# Data Generator

## TrainDataGenerator

In [10]:

def TrainDataGenerator(data_path, label_path, batch_size=64, maxlen=1000):
    """训练数据集生成器

    :param data_path: 数据集路径
    :param label_path: 行号标签路径
    :param batch_size: 批次大小
    :param maxlen: 时间步维度，即最大保留多少行号
    :return: 一个generator,形式为([dataSet,matrixSet], labelSet)
        dataSet:IR向量数据集
        matrixSet:注意力矩阵
        labelSet:行号标签
    """

    fd = open(data_path)
    fl = open(label_path)
    datas = fd.readlines()
    datas_tag = 0
    labels = fl.readlines()
    # print(len(datas), len(labels))
    iter_num = int(len(labels) / batch_size)
    print(iter_num)
    i = 0
    while iter_num:
        irLine = []  #每行ir向量
        irList = []  #ir切片向量列表
        labelList = []  #label列表
        vulList = []  #漏洞列表
        matrixList = []
        label_line = labels[i:i + batch_size]

        for line in label_line:
            line = line.strip()
            a = line.split()
            a = list(map(float, a))
            if a[0] != 0:
                vulList.append(a)
                labelList.append(1)
            else:
                vulList.append(0)
                labelList.append(0)

        for vp in range(len(vulList)):
            #先求漏洞行号标注在一个一维向量上
            if not vulList[vp]:
                attentionLine = [1] * maxlen
            else:
                attentionLine = [0] * maxlen
                for vul in vulList[vp]:
                    if int(vul) > maxlen:
                        continue
                    attentionLine[int(vul) - 1] = 1
            #再将其转化为矩阵
            attentionmatrix = np.diag(attentionLine)
            matrixList.append(attentionmatrix)

        while len(irList) < batch_size:
            line = datas[datas_tag]
            datas_tag += 1
            #逐行遍历：行内字段按'\t'分隔符分隔，转换为列表
            line = line.strip()
            a = line.split('\t')
            if '#' not in a[0]:
                a = list(map(float, a))
                irLine.append(a)
            else:
                x = [0 for t in range(300)]
                while len(irLine) < maxlen:
                    irLine.append(x)
                irList.append(irLine)
                irLine = []
                continue

        dataSet = np.array(irList)
        labelSet = np.array(labelList)
        matrixSet = np.array(matrixList)
        # print(i)
        # print(dataSet.ndim)
        if dataSet.ndim != 3:
            i += batch_size
            iter_num -= 1
            if iter_num == 0:
                iter_num = int(len(labels) / batch_size)
                datas_tag = 0
                i = 0
            continue
        yield [dataSet, matrixSet], labelSet
        i += batch_size

        iter_num -= 1
        if iter_num == 0:
            iter_num = int(len(labels) / batch_size)
            datas_tag = 0
            i = 0

## TestDataGenerator

In [7]:
def TestDataGenerator(data_path, label_path, batch_size=64, maxlen=1000):
    """测试数据集生成器

    :param data_path: 数据集路径
    :param label_path: 行号标签路径
    :param batch_size: 批次大小
    :param maxlen: 时间步维度，即最大保留多少行号
    :return: 一个generator,形式为([dataSet,matrixSet], labelSet,vulList)
        dataSet:IR向量数据集
        matrixSet:注意力矩阵
        labelSet:行号01标签
        vulList:行号标签

    """

    fd = open(data_path)
    fl = open(label_path)
    datas = fd.readlines()
    labels = fl.readlines()
    # print(len(datas), len(labels))
    iter_num = int(len(labels) / batch_size)
    print(iter_num)
    datas_tag = 0
    i = 0
    while iter_num:
        irLine = []  #每行ir向量
        irList = []  #ir切片向量列表
        labelList = []  #label列表
        vulList = []  #漏洞列表
        matrixList = []
        label_line = labels[i:i + batch_size]

        for line in label_line:
            line = line.strip()
            a = line.split()
            a = list(map(float, a))
            if a[0] != 0:
                vulList.append(a)
                labelList.append(1)
            else:
                vulList.append(0)
                labelList.append(0)

        for vp in range(len(vulList)):
            #先求漏洞行号标注在一个一维向量上
            if not vulList[vp]:
                attentionLine = [1] * maxlen
            else:
                attentionLine = [0] * maxlen
                for vul in vulList[vp]:
                    if int(vul) > maxlen:
                        continue
                    attentionLine[int(vul) - 1] = 1
            #再将其转化为矩阵
            attentionmatrix = np.diag(attentionLine)
            matrixList.append(attentionmatrix)

        while len(irList) < batch_size:
            line = datas[datas_tag]
            datas_tag += 1
            #逐行遍历：行内字段按'\t'分隔符分隔，转换为列表
            line = line.strip()
            a = line.split('\t')
            if '#' not in a[0]:
                a = list(map(float, a))
                irLine.append(a)
            else:
                x = [0 for t in range(300)]
                while len(irLine) < maxlen:
                    irLine.append(x)
                irList.append(irLine)
                irLine = []
                continue

        dataSet = np.array(irList)
        labelSet = np.array(labelList)
        matrixSet = np.array(matrixList)
        # print(i)
        # print(dataSet.ndim)
        if dataSet.ndim != 3:
            i += batch_size
            iter_num -= 1
            if iter_num == 0:
                iter_num = int(len(labels) / batch_size)
                datas_tag = 0
                i = 0
            continue
        yield [dataSet, matrixSet], labelSet, vulList
        i += batch_size

        iter_num -= 1
        if iter_num == 0:
            iter_num = int(len(labels) / batch_size)
            datas_tag = 0
            i = 0

## cs

In [12]:
train_IR_path = '../data/Juliet/train_IR.txt'
test_IR_path = '../data/Juliet/test_IR.txt'
validation_IR_path = '../data/Juliet/validation_IR.txt'

train_label_path = '../data/Juliet/train_label.txt'
test_label_path = '../data/Juliet/test_label.txt'
validation_label_path = '../data/Juliet/validation_label.txt'

label_path = '../data/label_Juliet2.txt'
data_path = '../data/data_Juliet2.txt'

for i, ([data, attMatrix], label) in enumerate(TrainDataGenerator(train_IR_path, train_label_path, batch_size=64)):
    print(i, data.shape, label.shape, attMatrix.shape)

9
0 (64, 1000, 300) (64,) (64, 1000, 1000)
1 (64, 1000, 300) (64,) (64, 1000, 1000)


KeyboardInterrupt: 

# Model Build

In [16]:
def build_model(maxlen, dropout, units):
    """

    :param maxlen:最大时间步(最大行号)
    :param dropout:随机让神经元停止工作的概率
    :param units:GRU神经元数量
    :return:
    """
    inputs = Input(shape=(maxlen, 300))
    bgru_1 = Bidirectional(GRU(units=units,
                               activation='tanh',
                               recurrent_activation='sigmoid',
                               return_sequences=True),
                           name='bgru_1')(inputs)
    dropout_1 = Dropout(rate=dropout, name='dropout_1')(bgru_1)
    bgru_2 = Bidirectional(GRU(units=units,
                               activation='tanh',
                               recurrent_activation='sigmoid',
                               return_sequences=True),
                           name='bgru_2')(dropout_1)
    dropout_2 = Dropout(rate=dropout, name='dropout_2')(bgru_2)

    dense_1 = TimeDistributed(Dense(1), name='dense1')(dropout_2)
    activation_1 = Activation('sigmoid', name='activation_1')(dense_1)

    att_Matrix_1 = Input(shape=(maxlen, maxlen), name='att_Matrix')
    multiply_1 = Multiply(name='multiply_1')([att_Matrix_1, activation_1])
    reshape_1 = Reshape((1, maxlen ** 2))(multiply_1)

    k_max_1 = MaxPooling1D(pool_size=maxlen ** 2, data_format='channels_first')(reshape_1)
    average_1 = GlobalAveragePooling1D(name='average_1')(k_max_1)

    model = Model(inputs=[inputs, att_Matrix_1], outputs=average_1)
    model.summary()

    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=["accuracy",
                           "Precision",
                           "Recall",
                           "TruePositives",
                           "TrueNegatives",
                           "FalsePositives",
                           "FalseNegatives"])
    return model


model = build_model(maxlen=1000, dropout=0.4, units=64)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1000, 300)]  0                                            
__________________________________________________________________________________________________
bgru_1 (Bidirectional)          (None, 1000, 128)    140544      input_3[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 1000, 128)    0           bgru_1[0][0]                     
__________________________________________________________________________________________________
bgru_2 (Bidirectional)          (None, 1000, 128)    74496       dropout_1[0][0]                  
______________________________________________________________________________________________

# Model Train

In [17]:
def train_model(model, TrainGenerator, ValidGenerator):
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            # 模型路径
            filepath='./model/model_{epoch:02d}-{val_accuracy:.2f}.h5',
            # 是否保存最佳
            save_best_only=True,
            # 监控指标
            monitor='val_accuracy',
            # 进度条类型
            verbose=1
        )
    ]

    model.fit(TrainGenerator,
              steps_per_epoch=227,  #每批次总步数
              epochs=10,  #批次数
              validation_data=ValidGenerator,  #验证集
              validation_steps=5,  #每次验证步数
              validation_batch_size=64,  #验证集每批次数据大小
              verbose=1,
              callbacks=callbacks)


train_IR_path = '../data/Juliet/train_IR.txt'
validation_IR_path = '../data/Juliet/validation_IR.txt'

train_label_path = '../data/Juliet/train_label.txt'
validation_label_path = '../data/Juliet/validation_label.txt'

tdg = TrainDataGenerator(train_IR_path, train_label_path, batch_size=64)
vdg = TrainDataGenerator(validation_IR_path, validation_label_path, batch_size=64)
train_model(model, tdg, vdg)

9
Epoch 1/10
  2/227 [..............................] - ETA: 28:41 - loss: 0.7225 - accuracy: 0.4688 - precision: 0.2031 - recall: 0.4333 - true_positives: 13.0000 - true_negatives: 47.0000 - false_positives: 51.0000 - false_negatives: 17.0000    

KeyboardInterrupt: 

# Model Test

In [43]:

def get_predict_line(value_sequence, threshold_value=0.5):
    value_sequence = list(value_sequence)
    vs = len(value_sequence) - 1
    while value_sequence[vs] == value_sequence[-1]:
        vs -= 1
    value_sequence = value_sequence[:vs + 2]

    predict_line = []
    for i in range(len(value_sequence)):
        if value_sequence[i] > threshold_value:
            predict_line.append(i)
    return predict_line


def test_model(model, datapath, labelpath, result_path):
    TP, TN, FP, FN = 0, 0, 0, 0
    TP_l, TN_l, FP_l, FN_l = 0, 0, 0, 0
    loU_list = []
    partial_model = Model(inputs=model.layers[0].input, outputs=model.layers[7].output)
    test_data = TestDataGenerator(datapath, labelpath, batch_size=64)
    batch_size = 64
    iter_num = 92
    for i in range(4):
        print("epochs: " + str(i))
        td = next(test_data)
        output_test = partial_model([td[0][0]], training=False)
        label = td[1]
        vul_line = td[2]
        # print(output_test.shape,label.shape,len(vul_line))
        for j in range(batch_size):
            predict_line = get_predict_line(output_test[j])
            # print(output_test[i])
            # print(predict_line)
            # print(label)
            if predict_line:
                label_pred = 1
            else:
                label_pred = 0
            # print(label_pred,label[j])
            if label_pred == 0 and label[j] == 0:
                TN += 1
                TN_l += 1
            if label_pred == 0 and label[j] == 1:
                FN += 1
                FN_l += 1
            if label_pred == 1 and label[j] == 0:
                FP += 1
                FP_l += 1
            if label_pred == 1 and label[j] == 1:
                TP += 1
                flag_l = False
                for pred in predict_line:
                    if pred in vul_line[j]:
                        flag_l = True
                        break
                if flag_l:
                    TP_l += 1
                else:
                    FN_l += 1
                overlap_line = list(set(predict_line).intersection(set(vul_line[j])))
                union_line = list(set(predict_line).union(set(vul_line[j])))
                loU = len(overlap_line) / len(union_line)
                loU_list.append(loU)
        print('TP:' + str(TP) + ' FP:' + str(FP) + ' FN:' + str(FN) + ' TN:' + str(TN))
        print('TP_l:' + str(TP_l) + ' FP_l:' + str(FP_l) + ' FN_l:' + str(FN_l) + ' TN:' + str(TN_l))
    FPR = FP / (FP + TN)
    FNR = FN / (TP + FN)
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1_score = (2 * precision * recall) / (precision + recall)

    FPR_line = FP_l / (FP_l + TN_l)
    FNR_line = FN_l / (TP_l + FN_l)
    accuracy_line = (TP_l + TN_l) / (TP_l + FP_l + FN_l + TN_l)
    precision_line = TP_l / (TP_l + FP_l)
    recall_line = TP_l / (TP_l + FN_l)
    F1_score_line = (2 * precision_line * recall_line) / (precision_line + recall_line)

    loU = np.mean(loU_list)
    # print('test_samples_num: ' + str(len(data)) + '\n')
    print('TP:' + str(TP) + ' FP:' + str(FP) + ' FN:' + str(FN) + ' TN:' + str(TN) + '\n')
    print('FPR: ' + str(FPR) + '\n')
    print('FNR: ' + str(FNR) + '\n')
    print('accuracy: ' + str(accuracy) + '\n')
    print('precision: ' + str(precision) + '\n')
    print('recall: ' + str(recall) + '\n')
    print('F1_score: ' + str(F1_score) + '\n\n')

    print('TP_l:' + str(TP_l) + ' FP_l:' + str(FP_l) + ' FN_l:' + str(FN_l) + ' TN:' + str(TN_l) + '\n')
    print('FPR_location: ' + str(FPR_line) + '\n')
    print('FNR_location: ' + str(FNR_line) + '\n')
    print('accuracy_location: ' + str(accuracy_line) + '\n')
    print('precision_location: ' + str(precision_line) + '\n')
    print('recall_location: ' + str(recall_line) + '\n')
    print('F1_score_location: ' + str(F1_score_line) + '\n\n')

    print('loU: ' + str(loU) + '\n')

    with open(result_path, 'a') as fwrite:
        # fwrite.write('test_samples_num: ' + str(len(data)) + '\n')
        fwrite.write('TP:' + str(TP) + ' FP:' + str(FP) + ' FN:' + str(FN) + ' TN:' + str(TN) + '\n')
        fwrite.write('FPR: ' + str(FPR) + '\n')
        fwrite.write('FNR: ' + str(FNR) + '\n')
        fwrite.write('accuracy: ' + str(accuracy) + '\n')
        fwrite.write('precision: ' + str(precision) + '\n')
        fwrite.write('recall: ' + str(recall) + '\n')
        fwrite.write('F1_score: ' + str(F1_score) + '\n\n')

        fwrite.write('TP_l:' + str(TP_l) + ' FP_l:' + str(FP_l) + ' FN_l:' + str(FN_l) + ' TN:' + str(TN_l) + '\n')
        fwrite.write('FPR_location: ' + str(FPR_line) + '\n')
        fwrite.write('FNR_location: ' + str(FNR_line) + '\n')
        fwrite.write('accuracy_location: ' + str(accuracy_line) + '\n')
        fwrite.write('precision_location: ' + str(precision_line) + '\n')
        fwrite.write('recall_location: ' + str(recall_line) + '\n')
        fwrite.write('F1_score_location: ' + str(F1_score_line) + '\n\n')

        fwrite.write('loU: ' + str(loU) + '\n')


modelPath = './model/model_10-0.97.h5'
resultPath = './result/result_model_10_0.97.txt'

# test_IR_path = '../data/Juliet/test_IR.txt'
# test_label_path = '../data/Juliet/test_label.txt'
test_IR_path = '../data/Juliet/data_Juliet2.txt'
test_label_path = '../data/Juliet/label_Juliet2.txt'

model = models.load_model(modelPath)
test_model(model, test_IR_path, test_label_path, resultPath)


epochs: 0
15
TP:15 FP:4 FN:2 TN:43
TP_l:15 FP_l:4 FN_l:2 TN:43
epochs: 1
TP:27 FP:4 FN:4 TN:93
TP_l:27 FP_l:4 FN_l:4 TN:93
epochs: 2
TP:40 FP:4 FN:8 TN:140
TP_l:40 FP_l:4 FN_l:8 TN:140
epochs: 3
TP:57 FP:5 FN:12 TN:182
TP_l:57 FP_l:5 FN_l:12 TN:182
TP:57 FP:5 FN:12 TN:182

FPR: 0.026737967914438502

FNR: 0.17391304347826086

accuracy: 0.93359375

precision: 0.9193548387096774

recall: 0.8260869565217391

F1_score: 0.8702290076335878


TP_l:57 FP_l:5 FN_l:12 TN:182

FPR_location: 0.026737967914438502

FNR_location: 0.17391304347826086

accuracy_location: 0.93359375

precision_location: 0.9193548387096774

recall_location: 0.8260869565217391

loU: 0.11617313027226238

loU: 0.11617313027226238



 ## cs

### 模型中间结果测试

In [15]:
test_data = TestDataGenerator(test_IR_path, test_label_path, batch_size=64)
td = next(test_data)
partial_model = Model(inputs=model.layers[0].input, outputs=model.layers[7].output)
output_test = partial_model([td[0][0]], training=False)
label = td[1]
vul_line = td[2]
print(output_test.shape, label.shape, len(vul_line))


15
(64, 1000, 1) (64,) 64
[0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1
 1 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0]


In [36]:
print(label)
print(vul_line[4])
# print(output_test[4][0:50])
predict_line = get_predict_line(output_test[4])
print(predict_line)
if predict_line:
    label_pred = 1
else:
    label_pred = 0
print(label_pred)

[0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1
 1 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0]
[26.0, 27.0, 28.0]
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]
1


### 模型最终结果显示

In [13]:
td = next(test_data)
output = model([td[0]], training=False)
label = td[1]
print(label)
print(output)


NameError: name 'test_data' is not defined

In [12]:
test_data2 = TrainDataGenerator(test_IR_path, test_label_path, batch_size=64)
model.evaluate(test_data2, batch_size=64, steps=100)

15
  9/100 [=>............................] - ETA: 4:04 - loss: 0.1923 - accuracy: 0.9306 - precision: 0.9638 - recall: 0.7917 - true_positives: 133.0000 - true_negatives: 403.0000 - false_positives: 5.0000 - false_negatives: 35.0000

KeyboardInterrupt: 