In [None]:
from gensim.models import Word2Vec
import os
import pandas as pd
import torch

import time
import math

import matplotlib.pyplot as plt

"""=================RNN====================="""
import torch.nn as nn

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs (Restart & Run All) on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    At every step the current input and the previous hidden state are
    concatenated along dim 1. One linear layer maps the concatenation to the
    next hidden state, another maps it to log-probabilities over
    ``output_size`` classes.

    Args:
        input_size: number of features in one input step.
        hidden_size: number of features in the hidden state.
        output_size: number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size

        # Attribute names (including the spelling ``i20``) are kept unchanged:
        # they are the submodule names and therefore the state_dict keys.
        self.i20 = nn.Linear(input_size + hidden_size, output_size)
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one recurrent step.

        Args:
            input: 2-D tensor whose dim 1 has ``input_size`` features
                (required by the concatenation and the linear layers).
            hidden: previous hidden state with the same dim 0 as ``input``
                and ``hidden_size`` features on dim 1.

        Returns:
            ``(output, hidden)`` where ``output`` holds log-probabilities
            (log-softmax over dim 1) and ``hidden`` is the next hidden state.
        """
        x = torch.cat((input, hidden), 1)
        # The next hidden state is the raw linear output; no activation is applied.
        hidden = self.i2h(x)
        output = self.i20(x)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``.

        The tensor is allocated on the same device as the layer weights, so it
        can be concatenated with inputs after the module has been moved with
        ``.to(device)``. For a module left on the CPU the result is unchanged.
        """
        return torch.zeros(1, self.hidden_size, device=self.i2h.weight.device)



In [None]:
"""======================================读取文件数据，数据处理========================================"""
# Alternative dataset locations (currently disabled):
# dataset_path = "C:/Users/77037/PycharmProjects/untitled/LSTM_gat_data/labeled_june-train-sub5000.csv"
# dataset_path = "../../../LSTM_gat_data/labeled_june-train-sub5000.csv"
# CSV file handed to data_option() below. The path is relative, so it is
# resolved against the current working directory of the kernel.
main_dataset_path = "../../../LSTM_gat_data/labeled-June-10min(P)-RAN-20191023.csv"
# my_data = pd.read_csv(dataset_path, header=0, index_col=None)


# Load the alarm CSV, keep only the IDs that contain a root-cause alarm, and
# return the trimmed frame plus the distribution of alarm counts per ID.
def data_option(data_path, drop_duplicates=False, padd_seq_len=None):
    """Read the alarm dataset and summarise how many alarms each ID contains.

    Args:
        data_path: path of a CSV file with at least the columns ``ID``,
            ``Alarm Name``, ``Label`` and ``Root Alarm``.
        drop_duplicates: when true, identical rows (over the four columns
            above) are removed before counting.
        padd_seq_len: accepted for backward compatibility; it does not
            influence the result. Previously any non-None value left the
            distribution undefined and the function failed with
            ``UnboundLocalError``.

    Returns:
        ``(dataset_target, alarm_seq_len_distribution)``:
        ``dataset_target`` is the filtered frame restricted to the four useful
        columns; ``alarm_seq_len_distribution`` is a Series whose index is
        "number of alarms in an ID" (sorted in descending order) and whose
        values are how many IDs have that many alarms. Its first index label
        is therefore the longest alarm sequence.
    """
    my_data = pd.read_csv(data_path, header=0, index_col=None)

    # Drop every ID that has no row with Label == 1 (per the original comment,
    # these are the groups without a root cause).
    ids_without_root = {
        group_id
        for group_id, group in my_data.groupby(by='ID')
        if 1 not in group['Label'].values
    }
    data = my_data[~my_data['ID'].isin(ids_without_root)]

    # Keep only the columns used by the following steps.
    useful_cols = ['ID', 'Alarm Name', 'Label', 'Root Alarm']
    dataset_target = data[useful_cols]
    if drop_duplicates:
        dataset_target = dataset_target.drop_duplicates()

    # Count alarms per ID, then count how many IDs share each sequence length
    # (e.g. ten IDs with five alarms each yield the entry 5 -> 10). Always
    # computed so that the return statement is valid for every argument.
    alarm_seq_len_distribution = (
        dataset_target.groupby(by='ID')['Alarm Name']
        .count()
        .value_counts()
        .sort_index(ascending=False)
    )

    return dataset_target, alarm_seq_len_distribution


# Load the filtered alarm table and the per-ID alarm-count distribution
# (duplicates are kept).
dataset_target, alarm_seq_len_distribution = data_option(
    main_dataset_path,
    drop_duplicates=False,
    padd_seq_len=None,
)
# The distribution is sorted by index in descending order, so its first index
# label is the largest number of alarms found in a single ID; that value is
# used as the padding length.
padd_seq_len = alarm_seq_len_distribution.index[0]



In [None]:

# Build the feature lists needed by the later training and validation steps.
def generate_feat_list(dataset_target):
    """Split the alarm table into per-ID sequences and a vocabulary.

    Args:
        dataset_target: DataFrame with at least the columns ``ID``,
            ``Alarm Name`` and ``Root Alarm``.

    Returns:
        A 5-tuple ``(alarm_name_list, root_list, singel_alarm_list, ids_list,
        alarm_dic)``:

        * ``alarm_name_list`` - one list of alarm names per ID, in row order.
        * ``root_list`` - the ``Root Alarm`` value of the first row of each ID.
        * ``singel_alarm_list`` - every value of the ``Alarm Name`` column in
          row order (duplicates included).
        * ``ids_list`` - the distinct IDs in order of first appearance; the
          two per-ID lists above are aligned with it.
        * ``alarm_dic`` - the distinct alarm names in order of first
          appearance, so the order is reproducible between runs.
    """
    # All distinct IDs, in order of first appearance.
    ids_list = dataset_target['ID'].unique().tolist()

    alarm_name_list = []
    root_list = []
    # Group once instead of re-filtering the whole frame for every ID.
    rows_by_id = dataset_target.groupby('ID', sort=False)
    for eid in ids_list:
        cur_df = rows_by_id.get_group(eid)
        alarm_name_list.append(cur_df['Alarm Name'].values.tolist())
        # Only the first row's root alarm is taken for the ID.
        root_list.append(cur_df['Root Alarm'].values.tolist()[0])

    # Computed once, outside the loop, so both names are defined even when the
    # frame has no rows.
    singel_alarm_list = dataset_target['Alarm Name'].values.tolist()
    # De-duplicate while keeping first-appearance order; list(set(...)) gives a
    # hash-dependent order that can change between interpreter sessions.
    alarm_dic = list(dict.fromkeys(singel_alarm_list))

    return alarm_name_list, root_list, singel_alarm_list, ids_list, alarm_dic


alarm_name_list, root_list, singel_alarm_list, ids_list, alarm_dic = generate_feat_list(dataset_target)

# Length of the flat alarm list returned above.
alarm_count = len(singel_alarm_list)

# Show the vocabulary under its heading, then the flat alarm list.
print("alarm dictionary", alarm_dic, sep="\n")
print(singel_alarm_list)