In [1]:
import pandas as pd 
import os
import numpy as np
import warnings

In [2]:
## evaluate function
def score(pred,label):
    plus=np.sum(pred*label,axis=1)
    minus=np.sum(pred*(1-label),axis=1)
    return np.mean((plus-minus)/np.sum(label,axis=1)) 

In [3]:
##  file path
data_path = r'.\all_file'
df_test_label = pd.read_csv(os.path.join(data_path,'test_label.csv'),index_col='ID') ##  load label
df_submit = pd.read_csv(os.path.join(data_path,'submit.csv'),index_col='ID') ## load submit template

# classfier for rootcause1 by text-cnn

In [4]:
# load data
features = ['feature0', 'feature1', 'feature2', 'feature11', 'feature12','feature13', 'feature15',
        'feature16', 'feature17','feature18',
        'feature19',
        'feature28_0', 'feature28_1', 'feature28_2', 'feature28_3',
        'feature28_4', 'feature28_5', 'feature28_6', 'feature28_7',
        'feature36_0', 'feature36_1', 'feature36_2', 'feature36_3',
        'feature36_4', 'feature36_5', 'feature36_6', 'feature36_7', 'feature60',
        'feature61_0', 'feature61_1', 'feature61_2', 'feature61_3',
        'feature61_4', 'feature61_5', 'feature61_6', 'feature61_7','feature_edge','feature_distance','length']

train_data_process = 'train_for_textcnn'
test_data_process = 'test_for_textcnn'
test_data_raw = 'test'

files = os.listdir(os.path.join(data_path,test_data_process))
files.sort(key=lambda x:int(x[:-4]))
all_feature = []
for filename in files:
    df = pd.read_csv(os.path.join(data_path,test_data_process,filename),index_col = 0)
    list_tmp = []
    for nd in features:
        for i in df[nd].values:
            if type(i) == str:
                if len(i.split(';'))> 1:
                    i = np.array(i.split(';')).astype(float).mean()
            list_tmp.append(i)
    all_feature.append(list_tmp)
all_feature = np.array(all_feature).reshape(-1,len(features),30)

In [5]:
## load model
from keras.models import load_model
model_name = 'textcnn_with_attention_root1_old.h5'
model = load_model(os.path.join(data_path,model_name))

In [6]:
## predict result for mean
result_new = []
res = model.predict(all_feature)
for i in res:
    if i[0]>0.8:
        result_new.append(0)
    else:
        result_new.append(1)

## predict result for each row 
pre_result = pd.DataFrame(columns=['index','length','pre=1','pre=0','score'])
for m in range(600):
    test_array = []
    raw = pd.read_csv(os.path.join(data_path,test_data_process,'{}.csv'.format(m)) ,index_col=0)
    raw = raw[features]
    for i in range(len(raw)):
        line = raw.loc[i,:].to_frame().T
        line_30 = pd.DataFrame(np.repeat(line.values,30,axis=0))
        line_30.columns = line.columns
        test_array.append(line_30.T.values)
    test_array = np.array(test_array)
    res2 = model.predict(test_array)
    result2 = np.argmax(res2,axis=1)
    t = []
    t.append(m)
    t.append(len(result2))
    t.append(result2.sum())
    t.append(len(result2)-result2.sum())
    t.append(res2[:,1].sum()/len(result2))
    pre_result.loc[m,:]=t

In [7]:
##  key feature fluctuate
feature13_std = []
feature15_std = []
files = os.listdir(os.path.join(data_path,test_data_raw))
files.sort(key=lambda x:int(x[:-4]))
for filename in files:
    df = pd.read_csv(os.path.join(data_path,test_data_raw,filename),index_col = 0)
    feature13_std.append(df['feature13'].std())
    feature15_std.append(df['feature15'].std())
feature13_std = np.array(feature13_std)
feature15_std = np.array(feature15_std)
feature13_std[np.isnan(feature13_std)] = 0
feature15_std[np.isnan(feature15_std)] = 0
feature13_std = feature13_std/np.max(feature13_std)
feature15_std = feature15_std/np.max(feature15_std)
feature_fil = feature13_std+feature15_std

add_1 = []
for i in range(len(pre_result['score'])):
    if pre_result['score'][i] == 1:
        add_1.append(i)
for val in np.where(np.array(feature_fil)>0.6)[0]:  ## 用于筛选特征13和15波动性比较大的样本
    if pre_result.loc[val,'score']>0.14:
        add_1.append(val)

In [8]:
df_submit['Root1'] = result_new
df_submit.loc[add_1,['Root1']] = 1

# clssfier for rootcause2 & rootcause3 by ML

In [10]:
# load data 
feature_select =  ['feature0', 'feature1', 'feature2',
       'feature11', 'feature12','feature13', 'feature14', 'feature15', 
       'feature16','feature17','feature18', 'feature19', 'feature60',
       'feature20_distance','featureY_if','featureX_if','featureY_mean','featureX_mean'] 

df_train = pd.read_csv(os.path.join(data_path,'train_for_ml.csv'),index_col = 0)
df_test = pd.read_csv(os.path.join(data_path,'test_for_ml.csv'),index_col = 0)

df_train_label = pd.read_csv(os.path.join(data_path,'train_label.csv'),index_col='sample_index')
df_train_label['Root1'] = df_train_label['root-cause(s)'].apply(lambda x : 1 if 'rootcause1' in x else 0 )
df_train_label['Root2'] = df_train_label['root-cause(s)'].apply(lambda x : 1 if 'rootcause2' in x else 0 )
df_train_label['Root3'] = df_train_label['root-cause(s)'].apply(lambda x : 1 if 'rootcause3' in x else 0 )

data_train = df_train.groupby('sample_index').mean()
data_train['Root2'] = df_train_label.loc[:,'Root2']
data_train['Root3'] = df_train_label.loc[:,'Root3']
data_train.reset_index(inplace=True)
data_train.dropna(inplace=True)

data_test = df_test.groupby('sample_index').mean()
data_test['Root2'] = df_test_label.loc[:,'Root2'] 
data_test['Root3'] = df_test_label.loc[:,'Root3'] 
data_test.reset_index(inplace=True)
data_test.fillna(data_test.mean(),inplace=True)

## classfier for rootcause2

In [11]:
X_train = data_train.loc[:,feature_select]
y_train = data_train['Root2'] 
print(y_train.value_counts())
print(X_train.shape,y_train.shape)

X_test = data_test.loc[:,feature_select]
y_test = data_test['Root2']
print(y_test.value_counts())
print(X_test.shape,y_test.shape)

0    1297
1      65
Name: Root2, dtype: int64
(1362, 18) (1362,)
0    484
1    116
Name: Root2, dtype: int64
(600, 18) (600,)


In [12]:
## data standscaler
from sklearn.preprocessing import StandardScaler,MinMaxScaler
X = pd.concat([X_train,X_test])
scaler = StandardScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
## data augmentation
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter
print(sorted(Counter(y_train).items()))
X_train, y_train = SMOTEENN(random_state=0).fit_resample(X_train, y_train)
print(sorted(Counter(y_train).items()))

[(0, 1297), (1, 65)]
[(0, 1216), (1, 1297)]


In [14]:
## classifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

clf_list = [SVC(),RandomForestClassifier(),MLPClassifier(max_iter=800),GradientBoostingClassifier(),AdaBoostClassifier()]
clf = clf_list[-1]
clf.fit(X_train, y_train)
print('train model finished')

y_predict_list = {}     
threshold_1 = 7
threshold_2 = 0
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    for sample_index,df in df_test.groupby('sample_index'):
        df.fillna(df.interpolate(),inplace=True)
        df.fillna(data_test.mean(),inplace=True)
        if df['feature19'].std() > threshold_1 : # key feature fluctuate
            X_test_one = df.loc[:,feature_select]
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)
            if np.sum(y_predict_one) > threshold_2 :
                y_predict_list[sample_index] = 1
            else:
                y_predict_list[sample_index] = 0
        else:
            X_test_one =data_test.loc[data_test['sample_index']==sample_index,feature_select]
            X_test_one = X_test_one.values.reshape(1,-1)
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)[0]
            y_predict_list[sample_index] = y_predict_one
        # Too many missing values in some files
        if sample_index in [142, 152, 153, 284, 305, 355, 361, 373, 420]:
            y_predict_list[sample_index] = 1
print('test model finished')

train model finished
test model finished


In [15]:
df_submit['Root2'] = y_predict_list.values()

## classfier for rootcause3

In [16]:
X_train = data_train.loc[:,feature_select]
y_train = data_train['Root3'] 
print(y_train.value_counts())
print(X_train.shape,y_train.shape)

X_test = data_test.loc[:,feature_select]
y_test = data_test['Root3']
print(y_test.value_counts())
print(X_test.shape,y_test.shape)

1    1251
0     111
Name: Root3, dtype: int64
(1362, 18) (1362,)
1    376
0    224
Name: Root3, dtype: int64
(600, 18) (600,)


In [17]:
## data standscaler
from sklearn.preprocessing import StandardScaler,MinMaxScaler
X = pd.concat([X_train,X_test])
scaler = StandardScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
## data augmentation
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter
print(sorted(Counter(y_train).items()))
X_train, y_train = SMOTEENN(random_state=0).fit_resample(X_train, y_train)
print(sorted(Counter(y_train).items()))

[(0, 111), (1, 1251)]
[(0, 1248), (1, 1220)]


In [19]:
## test classifier

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

clf_list = [SVC(),RandomForestClassifier(),MLPClassifier(max_iter=800),GradientBoostingClassifier(),AdaBoostClassifier()]
clf = clf_list[0]
clf.fit(X_train, y_train)
print('train model finished')

y_predict_list = {}     
threshold_1 = 18000
threshold_2 = 0
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    for sample_index,df in df_test.groupby('sample_index'):
        df.fillna(df.interpolate(),inplace=True)
        df.fillna(data_test.mean(),inplace=True)
        if df['feature13'].std()> threshold_1 : # key feature fluctuate
            X_test_one = df.loc[:,feature_select]
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)
            if np.sum(y_predict_one) > threshold_2 :
                y_predict_list[sample_index] = 1
            else:
                y_predict_list[sample_index] = 0
        else:
            X_test_one =data_test.loc[data_test['sample_index']==sample_index,feature_select]
            X_test_one = X_test_one.values.reshape(1,-1)
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)[0]
            y_predict_list[sample_index] = y_predict_one
        # Too many missing values in some files 
        if sample_index in [142, 152, 153, 284, 305, 355, 361, 373, 420]:
            y_predict_list[sample_index] = 1
print('test model finished')

train model finished
test model finished


In [20]:
df_submit['Root3'] = y_predict_list.values()

# Final score

In [21]:
score(df_submit.values,df_test_label.values)

0.9191666666666667

In [None]:
df_submit.to_csv('result.csv')