### 此notebook使用textcnn方法对根因1进行单独分类

In [1]:
# Directory holding the preprocessed training CSV files. Per the original author's note,
# the raw data was padded/truncated and z-score normalised before being written here.
Folder_Path = 'D:/data/rootcausecontest/train/z_score_train/z_score_train'

# Path the trained model checkpoint is written to (and later reloaded from).
model_name = 'D:/data/rootcausecontest/textcnn_with_attention_for_root1.h5'

# Path of the preprocessed label CSV.
label_path = 'D:/data/rootcausecontest/processed_label.csv'

In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
from keras.layers import *
from sklearn.metrics import accuracy_score, confusion_matrix
import keras
from keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical
from keras.optimizers import adam_v2
%matplotlib inline

In [3]:
## Selected feature columns, in the order they are laid out in the model input.
## The list is assembled from the scalar features, the three 8-way expanded feature
## families (28, 36, 61) and the three derived columns.
features = (
    ['feature{}'.format(k) for k in (0, 1, 2, 11, 12, 13, 15, 16, 17, 18, 19)]
    + ['feature28_{}'.format(k) for k in range(8)]
    + ['feature36_{}'.format(k) for k in range(8)]
    + ['feature60']
    + ['feature61_{}'.format(k) for k in range(8)]
    + ['feature_edge', 'feature_distance', 'length']
)

### 处理label

In [4]:
## Load the label file produced by the preprocessing stage.
## The file is read without a header row and its first column becomes the index.
label_all = pd.read_csv(label_path,index_col = 0,header=None)

In [5]:
## Collapse the six per-root-cause indicator columns into one integer code per sample:
##   0 = no indicator is positive
##   k = exactly one indicator is positive and it sits in the k-th column
##       (1 = root cause 1, 2 = root cause 2, 3 = root cause 3)
##   4 = more than one indicator is positive at the same time
## NOTE(review): a sample whose only positive indicator is the 4th column also gets
## code 4, and a lone 5th/6th column yields 5/6 - confirm such samples cannot occur.
##
## The columns are picked by position: the file is read with header=None, so the column
## labels are the integers 1..6 and a string slice such as '1':'6' cannot be resolved
## against them.
root_flags = label_all.iloc[:, :6].values > 0
active_count = root_flags.sum(axis=1)
# argmax returns the position of the first positive indicator (only used when exactly one is set)
first_active = root_flags.argmax(axis=1) + 1
label_list = np.where(active_count > 1, 4,
                      np.where(active_count == 1, first_active, 0)).tolist()

In [6]:
# Store the per-sample label code as a new column and pull it out as an array;
# `y` is the target vector used by the following cells.
label_all['labels'] = label_list
y =label_all['labels'].values

In [7]:
## Binarise the target for the dedicated root-cause-1 classifier:
## code 1 stays 1, every other code (0, 2, 3, 4 and anything unexpected) becomes 0.
## A new array is built instead of editing `y` in place, so the cell is idempotent and
## does not write back into `label_all` through the array returned by `.values`.
y = np.where(y == 1, 1, 0).astype(y.dtype)

### 处理特征

In [8]:
## Read every training CSV and flatten the selected feature columns into one row per file.
## Layout of a row: all values of features[0], then all values of features[1], and so on.
import os
files = os.listdir(Folder_Path)
print(len(files))
# File names are expected to be '<integer>.csv'; sort numerically so row order matches the labels.
files.sort(key=lambda x:int(x[:-4]))
all_feature = []
for filename in files:
    df = pd.read_csv(Folder_Path+'/'+filename,index_col = 0)
    list_tmp = []
    for nd in features:
        for i in df[nd].values:
            if isinstance(i, str):
                # A string cell holds one or more ';'-separated numbers: reduce it to their mean.
                # Single-number strings are parsed too, so no raw str ends up in the array
                # (which would turn it into a string array and break np.isnan later on).
                parts = [p for p in i.split(';') if p.strip()]
                i = np.array(parts).astype(float).mean() if parts else np.nan
            list_tmp.append(i)
    all_feature.append(list_tmp)
# dtype=float makes a ragged or non-numeric row fail here, loudly, instead of further downstream.
all_feature = np.array(all_feature, dtype=float)

1407


In [9]:
## Guard against missing values: overwrite every NaN entry with 0, in place.
np.putmask(all_feature, np.isnan(all_feature), 0)

### 数据增强

In [10]:
from imblearn.over_sampling import BorderlineSMOTE, ADASYN

In [11]:
## Data augmentation: oversample with BorderlineSMOTE so that classes 0 and 1 end up
## with the same number of samples.
## random_state is fixed so that Restart & Run All generates the same synthetic samples.
## NOTE(review): resampling is done before the train/test split further down, so synthetic
## points derived from one original sample can land on both sides of the split - confirm
## this is intended, since it makes the held-out score optimistic.
X_resampled, y_resampled = BorderlineSMOTE(random_state=42).fit_resample(all_feature, y)

In [12]:
from collections import Counter
# Show how many samples each class has after resampling, ordered by class label.
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

[(0, 1295), (1, 1295)]


In [13]:
## Split into a training set and a test set
from sklearn.model_selection import train_test_split
def dataprocess(teature, labels, test_size=0.2, random_state=42):
    """Randomly split features and labels into a training and a test portion.

    Args:
        teature: feature array; split along its first axis.
        labels: label array aligned with ``teature``.
        test_size: fraction of samples assigned to the test portion (default 0.2).
        random_state: seed for the shuffle, fixed by default so that re-running the
            notebook reproduces the same split; pass ``None`` for a fresh random split.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` - note the order differs from
        the one ``train_test_split`` itself returns.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        teature, labels, test_size=test_size, random_state=random_state)
    return x_train, y_train, x_test, y_test

In [14]:
# Turn each flat row back into a (n_features, 30) block: one line of 30 values per feature.
feature_final = np.reshape(X_resampled, (-1, len(features), 30))
# dataprocess returns (train features, train labels, test features, test labels).
X_train, y_train, X_test, y_test = dataprocess(feature_final, y_resampled)

In [18]:
## Attention block
def attention_3d_block(inputs, time_steps):
    """Re-weight a 3-D Keras tensor with a learned softmax attention map.

    The last two axes of ``inputs`` are swapped, a Dense layer with ``time_steps``
    units and softmax activation is applied along the new last axis, the axes are
    swapped back, and the resulting map multiplies ``inputs`` element-wise.

    Args:
        inputs: Keras tensor indexed as (batch, axis1, axis2).
        time_steps: number of units of the Dense layer. It has to match the size of
            axis1 of ``inputs`` so that the attention map and ``inputs`` line up in
            the final multiplication.

    Returns:
        Keras tensor: ``inputs`` multiplied by the attention map.

    Note:
        The last Permute layer carries the fixed name 'attention_vec', so building
        this block twice inside one model would reuse that layer name.
    """
    # (batch, axis1, axis2) -> (batch, axis2, axis1) so Dense acts along axis1
    a = Permute((2, 1))(inputs)
    a = Dense(time_steps, activation='softmax')(a)
    # back to the layout of `inputs`
    a_probs = Permute((2, 1), name='attention_vec')(a)
    print(a_probs.shape, inputs.shape)
    output_attention_mul = Multiply()([inputs, a_probs])
    return output_attention_mul

In [19]:
## Define the model: attention block followed by parallel Conv1D branches (TextCNN style)
from keras.layers import Input
# Public Keras API; the private tensorflow.python.keras copy should not be mixed with `keras` layers.
from keras import regularizers
pool_output = []
kernel_sizes = [3, 4, 5]
# Input shape is taken from the training tensor: (number of features, values per feature).
main_input = Input(shape=(X_train.shape[1], X_train.shape[2]), dtype='float64')
O_seq = attention_3d_block(main_input, X_train.shape[1])
# One branch per kernel size: Conv1D -> BatchNorm -> ReLU -> MaxPool -> Flatten
for kernel_size in kernel_sizes:
    c = Conv1D(filters=32, kernel_size=kernel_size, padding='same', strides=1)(O_seq)
    c = BatchNormalization()(c)
    c = Activation('relu')(c)
    p = MaxPooling1D(pool_size=2)(c)
    p = Flatten()(p)
    pool_output.append(p)
x_flatten = concatenate(pool_output)
x_flatten = Dropout(0.4)(x_flatten)
# Two-way softmax head with an L1 penalty on its kernel.
# Named `outputs` rather than `y` so the label array `y` built earlier is not overwritten,
# which would break re-running the resampling cell.
outputs = Dense(2,activation ='softmax',kernel_regularizer=regularizers.l1(0.01))(x_flatten)
model = Model(inputs=main_input, outputs=outputs)
# model.summary()

(None, 39, 30) (None, 39, 30)


In [20]:
## Train the model and write the best checkpoint to `model_name`
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# Halve the learning rate after 5 epochs without val_loss improvement, down to 1e-6.
Reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5,
                           mode='auto', cooldown=0, min_lr=0.000001, verbose = 1)
opt = adam_v2.Adam(learning_rate=0.001)
model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=0),
    ModelCheckpoint(model_name, monitor='val_loss', mode='min', save_best_only=True),
    Reduce
]
print('\nTrain...')
one_hot_labels = to_categorical(y_train, num_classes=2)
one_hot_labels_test = to_categorical(y_test, num_classes=2)
# Validation data is carved out of the training portion (its last 20 %, which is already
# shuffled by the split). Early stopping, LR scheduling and checkpoint selection therefore
# never see the test set, and the evaluation below stays an unbiased hold-out score.
history = model.fit(x = X_train, y = one_hot_labels,
                    batch_size=32,
                    epochs=300,
                    shuffle=True,
                    validation_split=0.2,
                    callbacks=callbacks)

print("\nTesting...")
# Reload the best checkpoint saved by ModelCheckpoint before evaluating.
model = load_model(model_name)
score, accuracy = model.evaluate(X_test, one_hot_labels_test,
                                 batch_size=64,
                                 verbose=1)
print("Test loss:  ", score)
print("Test accuracy:  ", accuracy)


Train...
Epoch 1/300
Epoch 2/300

  layer_config = serialize_layer_fn(layer)


Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300


Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 00059: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 00078: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 00089: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 00100: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/30

### 以下部分为读取模型生成根因1的csv文件

In [4]:
# Directory of the raw (unprocessed) test files.
Folder_Path1 = 'D:/data/rootcausecontest/test_600'

# Directory of the preprocessed test files; per the original note each file has 30 rows.
Folder_Path2 = 'D:/data/rootcausecontest/test/z_score_test/z_score_test'

# Model file used for inference.
# NOTE(review): this is the '_old' checkpoint, not the file written by the training cell - confirm.
model_name_test = 'D:/data/rootcausecontest/textcnn_with_attention_for_root1_old.h5'

In [5]:
## Per-file volatility of feature13 and feature15, measured as the standard deviation.
## According to the original author's observation, files in which these two features
## fluctuate strongly mostly contain several root causes.
import os
import pandas as pd
import numpy as np
feature13_std = []
feature15_std = []
files = os.listdir(Folder_Path1)
# File names are expected to be '<integer>.csv'; sort numerically.
files.sort(key=lambda x:int(x[:-4]))
for filename in files:
    df = pd.read_csv(Folder_Path1+'/'+filename,index_col = 0)
    feature13_std.append(df['feature13'].std())
    feature15_std.append(df['feature15'].std())
feature13_std = np.array(feature13_std, dtype=float)
feature15_std = np.array(feature15_std, dtype=float)
# std() is NaN when it cannot be computed (e.g. a single-row file); treat that as no fluctuation.
feature13_std[np.isnan(feature13_std)] = 0
feature15_std[np.isnan(feature15_std)] = 0
# Scale each series by its maximum. If the maximum is 0, every value is 0 already and
# the series is left as is, instead of dividing 0 by 0 and filling it with NaN.
max13 = np.max(feature13_std)
max15 = np.max(feature15_std)
if max13 > 0:
    feature13_std = feature13_std/max13
if max15 > 0:
    feature15_std = feature15_std/max15
# Combined volatility score per test file.
feature_fil = feature13_std+feature15_std

In [6]:
### Read the selected features of every preprocessed test file.
### The result is a list with one flat row per file: all values of features[0], then all
### values of features[1], and so on (len(features) * 30 values when a file has 30 rows).
import os
import pandas as pd
files = os.listdir(Folder_Path2)
# File names are expected to be '<integer>.csv'; sort numerically.
files.sort(key=lambda x:int(x[:-4]))
all_feature = []
for filename in files:
    df = pd.read_csv(Folder_Path2+'/'+filename,index_col = 0)
    list_tmp = []
    for nd in features:
        for i in df[nd].values:
            if isinstance(i, str):
                # A string cell holds one or more ';'-separated numbers: reduce it to their mean.
                # Single-number strings are parsed too, so no raw str is left in the row.
                parts = [p for p in i.split(';') if p.strip()]
                i = np.array(parts).astype(float).mean() if parts else np.nan
            list_tmp.append(i)
    all_feature.append(list_tmp)

In [7]:
# Reshape the flat rows into the layout fed to the model: [n_files, len(features), 30].
all_feature = np.array(all_feature, dtype=float).reshape(-1,len(features),30)
# Zero-fill missing values exactly as the training pipeline does, so that NaN inputs
# cannot reach model.predict and turn into NaN scores.
all_feature[np.isnan(all_feature)] = 0

In [8]:
# Load the saved Keras model from `model_name_test`.
from keras.models import load_model
model = load_model(model_name_test)

In [9]:
# Run the model on the test tensor; `res` holds one row of output scores per file
# (presumably the two class probabilities - the cells below read columns 0 and 1).
res = model.predict(all_feature)
# Hard decision: index of the largest score in each row.
result = res.argmax(axis=1)

In [10]:
## Root cause 1 can co-occur with other root causes, so its decision threshold is lowered:
## a sample is labelled 0 only when the score in column 0 exceeds 0.8, otherwise 1.
result_new = [0 if scores[0] > 0.8 else 1 for scores in res]

In [11]:
# Number of test samples labelled 1 by the thresholding above (sum of the 0/1 list).
np.sum(result_new)

318

In [12]:
# Build the submission table: one row per test sample, an 'ID' column followed by six
# 'Root cause k' indicator columns, of which only 'Root cause 1' is filled in here.
# The row count follows the number of predictions instead of a hard-coded 600, so a
# different number of test files no longer causes an out-of-range index.
n_samples = len(result_new)
submit = np.zeros((n_samples, 6))
submit[np.asarray(result_new) == 1, 0] = 1
submit_dataframe = pd.DataFrame(
    submit,
    columns=['Root cause {}'.format(k + 1) for k in range(submit.shape[1])])
# IDs are the 0-based sample positions; 'ID' is placed as the first column.
submit_dataframe.insert(0, 'ID', list(range(n_samples)))

In [13]:
## Predict every row of each test file on its own: a single row is repeated 30 times to
## form one (len(features), 30) model input, and the per-row results are summarised per file.
## Columns written: 'index' (file number), 'length' (rows in the file), 'pre=1' / 'pre=0'
## (rows predicted as class 1 / class 0), 'score' (mean of output column 1 over the rows).
## NOTE(review): the result is saved as 'pre_result_soft.csv', while the following cells
## read 'pre_result.csv' / 'pre_result_new.csv' - confirm which file is meant.
import os
test_files = sorted(os.listdir(Folder_Path2), key=lambda name: int(name[:-4]))
records = []
for name in test_files:
    m = int(name[:-4])
    raw = pd.read_csv(Folder_Path2 + '/' + name, index_col=0)
    # Rows are taken by position, so the CSV index does not have to be 0..n-1.
    rows = raw[features].values.astype(float)
    # Zero-fill missing values, as done for the training features.
    rows[np.isnan(rows)] = 0
    n_rows = len(rows)
    if n_rows == 0:
        # Nothing to predict for an empty file; the score is left undefined.
        records.append({'index': m, 'length': 0, 'pre=1': 0, 'pre=0': 0, 'score': np.nan})
        continue
    # (n_rows, n_features) -> (n_rows, n_features, 30): each value repeated along the last axis
    test_array = np.repeat(rows[:, :, np.newaxis], 30, axis=2)
    res2 = model.predict(test_array)
    result2 = np.argmax(res2, axis=1)
    n_pos = int(result2.sum())
    records.append({'index': m,
                    'length': n_rows,
                    'pre=1': n_pos,
                    'pre=0': n_rows - n_pos,
                    'score': float(res2[:, 1].sum() / n_rows)})
# Build the table once from the collected records; the frame index is the file number.
pre_result = pd.DataFrame(records,
                          columns=['index', 'length', 'pre=1', 'pre=0', 'score'],
                          index=[rec['index'] for rec in records])
pre_result.to_csv('D:/data/rootcausecontest/pre_result_soft.csv')

In [13]:
## The per-row prediction above is slow, so a previously saved result is loaded instead.
## This is the hard-decision file.
pre_result = pd.read_csv('D:/data/rootcausecontest/pre_result.csv')
# Hard-decision score: fraction of rows in the file that were predicted as class 1.
pre_result['score'] = pre_result['pre=1'].div(pre_result['length'])

In [14]:
##若用软判决则关闭上面两行代码
# pre_result = pd.read_csv('D:/data/rootcausecontest/pre_result_new.csv') ##软判决文件

In [15]:
# ##软判决门限
# add_1 = []
# for i in range(len(pre_result['score'])):
#     if pre_result['score'][i] == 1:
#         add_1.append(i)
# for val in np.where(np.array(feature_fil)>0.6)[0]:  ## 用于筛选特征13和15波动性比较大的样本
#     if pre_result.loc[val,'score']>0.14:
#         add_1.append(val)

In [16]:
## Hard-decision thresholds: collect the sample indices that additionally get root cause 1.
score_col = pre_result['score']
# 1) every sample whose score is exactly 1
add_1 = [idx for idx in range(len(score_col)) if score_col[idx] == 1]
# 2) among samples where feature13/feature15 fluctuate strongly (combined value > 0.6),
#    those whose score exceeds 0.2
volatile_idx = np.where(np.array(feature_fil) > 0.6)[0]
add_1.extend(val for val in volatile_idx if pre_result.loc[val, 'score'] > 0.2)

In [17]:
# Additionally set 'Root cause 1' to 1 for every row label collected in `add_1`.
submit_dataframe.loc[add_1,'Root cause 1'] = 1

In [None]:
# Write the root-cause-1 submission table to CSV without the DataFrame index.
submit_dataframe.to_csv('D:/data/rootcausecontest/submit_root1.csv',index = None)