In [1]:
import ast
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Data locations, all relative to the parent directory of the notebook.
# Built with os.path.join so the separator matches the running platform
# (a hard-coded backslash only resolves on Windows).
test_path = os.path.join(os.pardir, 'test150')
test_path_2 = os.path.join(os.pardir, 'test600')
train_path = os.path.join(os.pardir, 'train')

# Directory and file name of the training-label CSV.
label_path = os.pardir
train_label_file = 'train_label.csv'

## load dataset

In [2]:
# feature process
def feature60_process(s):
    """Collapse a ';'-separated feature60 cell into its mean.

    Parameters
    ----------
    s : str or any
        A cell value. Strings are expected to hold numbers separated by ';'
        (e.g. ``"1;2;3"``); anything that is not a string is returned as is.

    Returns
    -------
    The mean of the listed numbers rounded to 2 decimals for string input,
    otherwise ``s`` unchanged.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a plain literal (numbers / tuple of numbers).
    """
    if isinstance(s, str):
        # literal_eval only accepts Python literals, so text coming from the
        # CSV files can never execute code the way eval() would.
        values = ast.literal_eval(s.replace(';', ','))
        s = round(np.mean(values), 2)
    return s

def feature20_edge_count(feature20_series):
    """Count how many non-NaN values of the series are edge positions.

    Parameters
    ----------
    feature20_series : pandas.Series
        Numeric values, possibly containing NaN.

    Returns
    -------
    int or float
        Number of values found in the edge-position set, or ``np.nan`` when
        every value of the series is null (this includes an empty series).
    """
    # presumably the two outermost columns on each side of an 8-wide grid
    # -- TODO confirm against the data description
    edge_positions = (0, 1, 6, 7, 8, 9, 14, 15, 16, 17, 22, 23, 24, 25, 30, 31)
    hits = sum(
        1
        for position in feature20_series
        if not np.isnan(position) and position in edge_positions
    )
    return np.nan if feature20_series.isnull().all() else hits
    
def feature20_distance_count(feature20_series):
    """Sum the pairwise Euclidean distances between the positions in a row.

    Each non-NaN value ``m`` is mapped to the coordinate
    ``(m - 8 * (m // 8), m // 8)``, i.e. column and row for a width of 8.
    The result adds the distance for every ordered pair, so each unordered
    pair contributes twice and self-pairs contribute 0.

    Parameters
    ----------
    feature20_series : pandas.Series
        Numeric position values, possibly containing NaN. The series is not
        modified.

    Returns
    -------
    int or float
        The accumulated distance; 0 when there are no non-NaN values.
    """
    # dropna() returns a new series; the previous in-place drop altered the
    # object owned by the caller.
    positions = feature20_series.dropna()
    coordinate = [(m - 8 * (m // 8), m // 8) for m in positions]
    distance = 0
    for a in coordinate:
        for b in coordinate:
            distance = distance + ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5
    return distance

def featureXY_process(featureXY_series):
    """Mean and minimum of the Y-group and X-group columns of one row.

    A column belongs to a group when its label contains one of the group's
    feature names (substring match on the row's index).

    Parameters
    ----------
    featureXY_series : pandas.Series
        One data row whose index holds the column names.

    Returns
    -------
    list
        ``[featureY_mean, featureX_mean, featureY_min, featureX_min]``.
        NaN values are ignored; a group with no matching column or with only
        NaN values yields ``np.nan`` for both statistics.
    """
    featureY_list = ['feature28', 'feature36', 'feature44', 'feature52']
    featureX_list = ['feature61', 'feature69', 'feature77', 'feature85']

    def _group_values(name_list):
        # Gather the values name by name (same order as the old per-name
        # append) without DataFrame.append, which pandas 2.0 removed.
        chunks = [
            featureXY_series.loc[
                featureXY_series.index.str.contains(name)
            ].to_numpy(dtype=float)
            for name in name_list
        ]
        return np.concatenate(chunks)

    def _nan_mean_min(values):
        # Checked up front so numpy neither warns (all-NaN) nor raises
        # (zero-size input to nanmin).
        if values.size == 0 or np.isnan(values).all():
            return np.nan, np.nan
        return np.nanmean(values), np.nanmin(values)

    featureX_mean, featureX_min = _nan_mean_min(_group_values(featureX_list))
    featureY_mean, featureY_min = _nan_mean_min(_group_values(featureY_list))
    return [featureY_mean, featureX_mean, featureY_min, featureX_min]

def feature20XY_interference(feature20XY_series):
    """Average distance-weighted interference for the Y and X column groups.

    Returns two values, so a ``DataFrame.apply`` call using this function
    must set ``result_type='expand'``.

    For every feature name of a group, the columns whose label contains
    'feature20' are paired positionally with the columns whose label
    contains that name. Each pair becomes ``(x, y, value)`` with
    ``x = i - 8 * (i // 8)`` and ``y = i // 8``. The interference of the name
    is ``sum(other_value / distance)`` over all ordered pairs of distinct
    entries, divided by the number of entries and rounded to 2 decimals;
    a distance of 0 is replaced by 1.

    Parameters
    ----------
    feature20XY_series : pandas.Series
        One data row whose index holds the column names.

    Returns
    -------
    list
        ``[featureY_if, featureX_if]``: the NaN-ignoring mean of the
        per-name interference values of each group (``np.nan`` when no name
        of the group produced a value).
    """
    # Same column groups as featureXY_process; the earlier 'feature62' and
    # 'feature81' entries broke that step-of-8 pattern.
    # NOTE(review): confirm against the data description.
    featureY_list = ['feature28', 'feature36', 'feature44', 'feature52']
    featureX_list = ['feature61', 'feature69', 'feature77', 'feature85']

    labels = feature20XY_series.index
    feature20 = feature20XY_series.loc[labels.str.contains('feature20')]

    feature_if_dict = {}  # interference value per feature name
    for featureV_name in featureY_list + featureX_list:
        featureV = feature20XY_series.loc[labels.str.contains(featureV_name)]

        # Build [(column, row, value), ...]; zip stops at the shorter input.
        c_v_list = []
        for i, value in zip(feature20, featureV):
            # Skip the pair when EITHER side is missing; a single NaN would
            # otherwise turn the whole sum into NaN.
            if np.isnan(i) or np.isnan(value):
                continue
            c_v_list.append((i - 8 * (i // 8), i // 8, value))

        if not c_v_list:
            feature_if_dict[featureV_name] = np.nan
            continue

        interference = 0
        for item_pos, item in enumerate(c_v_list):
            for other_pos, other_item in enumerate(c_v_list):
                # Compare by position, not by value, so two entries that
                # share coordinates and value still count as distinct.
                if other_pos == item_pos:
                    continue
                distance = ((item[0] - other_item[0]) ** 2
                            + (item[1] - other_item[1]) ** 2) ** 0.5
                if distance == 0:
                    distance = 1
                interference += other_item[2] / distance
        feature_if_dict[featureV_name] = round(interference / len(c_v_list), 2)

    with warnings.catch_warnings():
        # nanmean warns when every entry is NaN; the NaN result is intended.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        featureY_if = np.nanmean([feature_if_dict[k] for k in featureY_list])
        featureX_if = np.nanmean([feature_if_dict[k] for k in featureX_list])
    return [featureY_if, featureX_if]

    
# def feature17_diff(feature17_series):
#     return feature17_series.diff()

In [3]:
#  label file
# Read the label CSV indexed by sample_index and map every labelled sample
# to the text of its 'root-cause(s)' column.
df_train_label = pd.read_csv(
    os.path.join(label_path, train_label_file),
    index_col='sample_index',
)
train_label_dict = df_train_label['root-cause(s)'].to_dict()

# Every index in range(2984) that has no label is recorded as 'unknown'.
unknown_label_dict = {}
for i in range(2984):
    if i in train_label_dict:
        continue
    unknown_label_dict[i] = 'unknown'

In [4]:
# select feature
# feature_all: every column kept when the raw files are loaded.
feature_all = [
    # raw columns
    'feature0', 'feature1', 'feature2',
    'feature3_1', 'feature3_2', 'feature3_3', 'feature3_4',
    'feature11', 'feature12', 'feature13', 'feature14', 'feature15',
    'feature16', 'feature17', 'feature18', 'feature19',
    'feature60',
    # engineered columns
    'feature20_distance', 'feature20_edge',
    'featureY_if', 'featureX_if',
    'featureY_mean', 'featureX_mean',
    'featureY_min', 'featureX_min',
]

# feature_select: the subset passed to the scaler and the classifiers.
feature_select = [
    'feature0', 'feature1', 'feature2',
    'feature11', 'feature12', 'feature13', 'feature14', 'feature15',
    'feature16', 'feature17', 'feature18', 'feature19',
    'feature60',
    'feature20_distance',
    'featureY_if', 'featureX_if',
    'featureY_mean', 'featureX_mean',
]

In [None]:
# preprocess dataset + feature engineering
def _add_engineered_features(df):
    """Add the engineered columns to one raw file and keep `feature_all`.

    `df` is modified in place (new columns are added); the returned frame is
    an independent copy restricted to the columns named in `feature_all`.
    """
    # Select the raw feature20 columns once, before any derived
    # 'feature20_*' column exists: the substring filter would otherwise also
    # pick up 'feature20_edge' and feed the edge count into the distance
    # feature as if it were a position.
    feature20_raw = df.loc[:, df.columns.str.contains('feature20')]

    df['feature60'] = df['feature60'].apply(feature60_process)
    df['feature20_edge'] = feature20_raw.apply(feature20_edge_count, axis=1)
    df['feature20_distance'] = feature20_raw.apply(feature20_distance_count, axis=1)
    df[['featureY_if', 'featureX_if']] = df.apply(
        feature20XY_interference, axis=1, result_type='expand')
    df[['featureY_mean', 'featureX_mean', 'featureY_min', 'featureX_min']] = df.apply(
        featureXY_process, axis=1, result_type='expand')
    # .copy() so the later insert() calls act on a real frame, not a slice.
    return df.loc[:, feature_all].copy()


def _combine_frames(frames):
    """Stack the per-file frames and drop columns that are entirely NaN."""
    # One concat at the end replaces the per-file DataFrame.append, which
    # pandas 2.0 removed and which re-copied the accumulated data each time.
    df_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return df_data.dropna(axis=1, how='all')


def load_dataset_for_train(data_path, label_dict):
    """Load and feature-engineer every labelled sample file.

    Parameters
    ----------
    data_path : str
        Directory containing one ``<sample_index>.csv`` per sample.
    label_dict : dict
        Maps sample_index to its root-cause label.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with 'causes_type' and 'sample_index' as the
        first two columns, followed by the `feature_all` columns that are
        not entirely NaN.
    """
    frames = []
    row_num = 0
    for sample_index, cause in label_dict.items():
        df = pd.read_csv(os.path.join(data_path, str(sample_index) + '.csv'))
        row_num += len(df)
        tmp = _add_engineered_features(df)
        tmp.insert(0, 'causes_type', cause)
        tmp.insert(1, 'sample_index', sample_index)
        frames.append(tmp)
    df_data = _combine_frames(frames)
    # Sanity check: the row count of the result should equal row_num.
    print('验证数据集维度', df_data.shape, row_num)
    return df_data


def load_dataset_for_test(data_path, file_num):
    """Load and feature-engineer the files ``0.csv`` .. ``<file_num - 1>.csv``.

    Parameters
    ----------
    data_path : str
        Directory containing the numbered CSV files.
    file_num : int
        Number of files to read.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with 'sample_index' inserted as the second
        column, restricted to the `feature_all` columns that are not
        entirely NaN.
    """
    frames = []
    row_num = 0
    for i in range(file_num):
        df = pd.read_csv(os.path.join(data_path, str(i) + '.csv'))
        row_num += len(df)
        tmp = _add_engineered_features(df)
        tmp.insert(1, 'sample_index', i)
        frames.append(tmp)
    df_data = _combine_frames(frames)
    # Sanity check: the row count of the result should equal row_num.
    print('验证数据集维度', df_data.shape, row_num)
    return df_data

# Build the three row-level datasets from the raw CSV directories.
# df_unknown = load_dataset_for_train(train_path,unknown_label_dict)
df_train = load_dataset_for_train(data_path=train_path, label_dict=train_label_dict)
df_val = load_dataset_for_test(data_path=test_path, file_num=150)
df_test = load_dataset_for_test(data_path=test_path_2, file_num=600)

In [5]:
# load processed data
# Uncomment the three to_csv lines once to write the processed frames; the
# reads below then restore them without repeating the preprocessing.

# df_train.to_csv('train.csv')
# df_val.to_csv('test.csv')
# df_test.to_csv('test_2.csv')

df_train, df_val, df_test = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('train.csv', 'test.csv', 'test_2.csv')
)

## binary classifier

In [6]:
# select causes type
# Root cause targeted by the binary classifier; exactly one line is active.
# The training label built further down is 1 when this string occurs in a
# sample's root-cause text, so after changing it the cells that build the
# labels and fit the models have to be re-run.

# cause_type = 'cause1'
cause_type = 'cause2'
# cause_type = 'cause3'

In [7]:
# Collapse the row-level training data to one row per sample (column means).
# numeric_only=True leaves the text column 'causes_type' out of the mean;
# pandas >= 2.0 raises on it instead of silently dropping it.
# data_train = df_train.groupby('sample_index').quantile(q=0.15)
data_train = df_train.groupby('sample_index').mean(numeric_only=True).reset_index()
# Re-attach the label text, then derive the binary target for `cause_type`.
data_train['causes_type'] = data_train['sample_index'].apply(lambda x: train_label_dict[x])
data_train['label'] = data_train['causes_type'].apply(lambda x: 1 if cause_type in x else 0)
# Training samples with any missing aggregate are discarded.
data_train = data_train.dropna()

# Same aggregation for the test data; here gaps are filled with the column
# mean over all test samples so that no test sample is lost.
# data_test = df_test.groupby('sample_index').quantile(q=0.5)
data_test = df_test.groupby('sample_index').mean(numeric_only=True).reset_index()
data_test = data_test.fillna(data_test.mean())

In [8]:
# Model inputs: the selected feature columns; target: the binary label.
X_train, y_train = data_train[feature_select], data_train['label']
print(X_train.shape, y_train.shape)

X_test = data_test[feature_select]
print(X_test.shape)

(1362, 18) (1362,)
(600, 18)


In [9]:
# data standard scaler
# The scaler statistics come from the training and test features stacked
# together; both sets are then replaced by their scaled arrays.
from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer
X = pd.concat([X_train, X_test])
scaler = StandardScaler().fit(X)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [10]:
# data augmentation
# Rebalance the training set: print the label counts, apply one
# imbalanced-learn resampler, then print the counts again.
# Only the SMOTEENN line is active; the commented-out lines are alternative
# resamplers kept for reference.
# NOTE: X_train / y_train are overwritten, so running this cell a second
# time resamples the already resampled data.
from imblearn.over_sampling import RandomOverSampler,SMOTE,ADASYN,BorderlineSMOTE,KMeansSMOTE,SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter

# label counts before resampling
print(sorted(Counter(y_train).items()))
# X_train, y_train = RandomOverSampler(random_state=0).fit_resample(X_train, y_train)
# X_train, y_train = RandomUnderSampler(random_state=0).fit_resample(X_train, y_train)
# X_train, y_train = SMOTE().fit_resample(X_train, y_train)
# X_train, y_train = ADASYN().fit_resample(X_train, y_train)
# X_train, y_train = BorderlineSMOTE().fit_resample(X_train, y_train)
# X_train, y_train = KMeansSMOTE().fit_resample(X_train, y_train)
# X_train, y_train = SVMSMOTE().fit_resample(X_train, y_train)
X_train, y_train = SMOTEENN(random_state=0).fit_resample(X_train, y_train)
# X_train, y_train = SMOTETomek(random_state=0).fit_resample(X_train, y_train)
# label counts after resampling
print(sorted(Counter(y_train).items()))

[(0, 1297), (1, 65)]
[(0, 1216), (1, 1297)]


In [11]:
# different classifier

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Candidate classifiers, all with default settings except the MLP iteration
# limit. Later cells pick a model by position (clf_list[9]), so the trailing
# comments give each entry's index; inserting or removing an entry shifts
# which model those cells use. LogisticRegression appears twice (4 and 12).
clf_list = [SVC(),LinearSVC(),  # 0, 1
            KNeighborsClassifier(),NearestCentroid(),  # 2, 3
            LogisticRegression(),  # 4
            BernoulliNB(),GaussianNB(),  # 5, 6
            DecisionTreeClassifier(),ExtraTreeClassifier(),  # 7, 8
            RandomForestClassifier(),  # 9
            LinearDiscriminantAnalysis(),  # 10
            MLPClassifier(max_iter=800),  # 11
            LogisticRegression(),  # 12
            QuadraticDiscriminantAnalysis(),  # 13
            RidgeClassifier(),  # 14
            GradientBoostingClassifier(),  # 15
            AdaBoostClassifier(),  # 16
            BaggingClassifier()  # 17
           ]

In [12]:
# feature importance
# Fit a default random forest on the current training set and show the
# importance of every selected feature, largest first.
clf = RandomForestClassifier().fit(X_train, y_train)
tmp = pd.Series(data=clf.feature_importances_, index=feature_select)
display(tmp.sort_values(ascending=False))

featureY_mean         0.341163
feature60             0.267044
feature19             0.228706
featureY_if           0.045305
feature1              0.029057
feature20_distance    0.019831
feature16             0.014264
featureX_mean         0.009002
feature0              0.007826
featureX_if           0.007704
feature18             0.007637
feature17             0.006314
feature2              0.003314
feature15             0.003025
feature11             0.002834
feature12             0.002609
feature14             0.002251
feature13             0.002112
dtype: float64

In [None]:
# time series fluctuation analyse and binary classifier for rootcauses 1
# NOTE: y_train is derived from `cause_type`, so that variable has to be set
# to 'cause1' (and the label/scaling/resampling cells re-run) before this
# cell gives cause-1 predictions.
clf = clf_list[9]
clf.fit(X_train, y_train)
print(clf)

y_predict_list = {}
# Column means over all test samples; constant during the loop, so computed
# once instead of once per sample.
test_column_means = data_test.mean()
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    for sample_index, df in df_test.groupby('sample_index'):
        # Fill gaps by interpolation first, then with the test-set means.
        # A new frame is built rather than filling the group chunk in place.
        df = df.fillna(df.interpolate()).fillna(test_column_means)
        feature15_range = np.nanmax(df['feature15']) - np.nanmin(df['feature15'])
        if feature15_range > 400:
            # Fluctuating file: classify every row and vote on the share of
            # positive rows.
            X_test_one = df.loc[:, feature_select]
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)
            positive_ratio = np.sum(y_predict_one) / len(y_predict_one)
            # Samples below index 500 need > 0.2 positive rows, the others > 0.3.
            ratio_cutoff = 0.2 if sample_index < 500 else 0.3
            y_predict_list[sample_index] = 1 if positive_ratio > ratio_cutoff else 0
        else:
            # Stable file: classify the per-sample mean row.
            X_test_one = data_test.loc[data_test['sample_index'] == sample_index, feature_select]
            X_test_one = X_test_one.values.reshape(1, -1)
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)[0]
            y_predict_list[sample_index] = y_predict_one

        # Too many missing values in some files
        if sample_index in [142, 152, 153, 284, 305, 355, 361, 373, 420]:
            y_predict_list[sample_index] = 0

In [None]:
# Write the per-sample predictions (sample_index -> 0/1) collected in
# y_predict_list by the preceding cell to 'cause1_tmp.csv'.
y_result = pd.Series(y_predict_list)
y_result.to_csv('cause1_tmp.csv')

In [13]:
# time series fluctuation analyse and binary classifier
# for rootcauses 2
# NOTE: y_train is derived from `cause_type`, so that variable has to be
# 'cause2' when the label/scaling/resampling cells were last run.
clf = clf_list[9]
clf.fit(X_train, y_train)
print(clf)
y_predict_list = {}
threshold_1 = 7  # feature19 std above which a file counts as fluctuating
threshold_2 = 0  # number of positive rows that must be exceeded
# Column means over all test samples; constant during the loop, so computed
# once instead of once per sample.
test_column_means = data_test.mean()
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    for sample_index, df in df_test.groupby('sample_index'):
        # Fill gaps by interpolation first, then with the test-set means.
        # A new frame is built rather than filling the group chunk in place.
        df = df.fillna(df.interpolate()).fillna(test_column_means)
        if df['feature19'].std() > threshold_1:
            # Fluctuating file: classify every row; positive when the number
            # of positive rows exceeds threshold_2.
            X_test_one = df.loc[:, feature_select]
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)
            y_predict_list[sample_index] = 1 if np.sum(y_predict_one) > threshold_2 else 0
        else:
            # Stable file: classify the per-sample mean row.
            X_test_one = data_test.loc[data_test['sample_index'] == sample_index, feature_select]
            X_test_one = X_test_one.values.reshape(1, -1)
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)[0]
            y_predict_list[sample_index] = y_predict_one

        # Too many missing values in some files
        if sample_index in [142, 152, 153, 284, 305, 355, 361, 373, 420]:
            y_predict_list[sample_index] = 1

BaggingClassifier()


In [14]:
# Write the per-sample predictions (sample_index -> 0/1) collected in
# y_predict_list by the preceding cell to 'cause2_tmp.csv'.
y_result = pd.Series(y_predict_list)
y_result.to_csv('cause2_tmp.csv')

In [None]:
# time series fluctuation analyse and binary classifier
# for rootcauses 3
# NOTE: y_train is derived from `cause_type`, so that variable has to be set
# to 'cause3' (and the label/scaling/resampling cells re-run) before this
# cell gives cause-3 predictions.
clf = clf_list[9]
clf.fit(X_train, y_train)
print(clf)
y_predict_list = {}

# Column means over all test samples; constant during the loop, so computed
# once instead of once per sample.
test_column_means = data_test.mean()
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    for sample_index, df in df_test.groupby('sample_index'):
        # Fill gaps by interpolation first, then with the test-set means.
        # A new frame is built rather than filling the group chunk in place.
        df = df.fillna(df.interpolate()).fillna(test_column_means)
        if df['feature15'].std() > 135 or df['feature13'].std() > 30000:
            # Fluctuating file: classify every row; positive as soon as any
            # row is predicted positive.
            X_test_one = df.loc[:, feature_select]
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)
            y_predict_list[sample_index] = 1 if np.sum(y_predict_one) > 0 else 0
        else:
            # Stable file: classify the per-sample mean row.
            X_test_one = data_test.loc[data_test['sample_index'] == sample_index, feature_select]
            X_test_one = X_test_one.values.reshape(1, -1)
            X_test_one = scaler.transform(X_test_one)
            y_predict_one = clf.predict(X_test_one)[0]
            y_predict_list[sample_index] = y_predict_one

        # Too many missing values in some files
        if sample_index in [142, 152, 153, 284, 305, 355, 361, 373, 420]:
            y_predict_list[sample_index] = 1

In [None]:
# Write the per-sample predictions (sample_index -> 0/1) to 'cause3_tmp.csv'.
# Built from y_predict_list in the same way as the cause-1 and cause-2 cells;
# the name `y_predict` used here before is not defined anywhere in the
# visible notebook and raised a NameError.
y_result = pd.Series(y_predict_list)
y_result.to_csv('cause3_tmp.csv')