In [1]:
import ast
import os
import warnings

import numpy as np
import pandas as pd

# Root folder of the per-sample CSV folders; it is joined with train_path /
# test_path when the train and test datasets are loaded.
data_path = r'.'
test_path = 'test600'
train_path = 'train'

# Folder and file name of the training label CSV (read with
# index_col='sample_index').
label_path = r'.'
train_label_file = 'train_label.csv'

## process data for ML

In [2]:
# feature process
def feature60_process(s):
    """Collapse one raw ``feature60`` cell into a single number.

    Parameters
    ----------
    s : str or any
        A string holding one or more numeric literals separated by ';'
        (e.g. ``"1;2;3"``), or an already non-string value (such as NaN).

    Returns
    -------
    The mean of the parsed numbers rounded to 2 decimals when ``s`` is a
    string; otherwise ``s`` unchanged.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a plain Python literal after ';' -> ','.
    """
    if isinstance(s, str):
        # The text comes from CSV files, so it must never be executed:
        # literal_eval only accepts literals (numbers, tuples, lists, ...)
        # and rejects calls / names that eval() would happily run.
        values = ast.literal_eval(s.replace(';', ','))
        s = round(np.mean(values), 2)
    return s

def feature20_edge_count(feature20_series):
    """Count how many values of the row fall in the fixed "edge" id set.

    Parameters
    ----------
    feature20_series : pandas.Series
        Numeric values of one row (NaN marks a missing entry).

    Returns
    -------
    int or float
        Number of non-NaN values that belong to the edge id set, or NaN
        when every entry of the series is null (including an empty series).
    """
    # Ids 0..31 whose value modulo 8 is 0, 1, 6 or 7 -- presumably the two
    # outermost columns on each side of an 8-wide grid (TODO confirm).
    edge_cells = (0, 1, 6, 7, 8, 9, 14, 15, 16, 17, 22, 23, 24, 25, 30, 31)
    hits = sum(
        1
        for cell in feature20_series
        if not np.isnan(cell) and cell in edge_cells
    )
    # A row without any value is reported as missing rather than as zero hits.
    return np.nan if feature20_series.isnull().all() else hits
    
def feature20_distance_count(feature20_series):
    """Sum the Euclidean distances between all ordered pairs of cells.

    Each non-NaN value ``m`` is mapped to the grid coordinate
    ``(m - 8 * (m // 8), m // 8)`` (column, row of an 8-wide grid), and the
    distance of every ordered pair ``(a, b)`` is accumulated, so each
    unordered pair contributes twice and self-pairs contribute 0.

    Parameters
    ----------
    feature20_series : pandas.Series
        Numeric cell ids of one row; NaN entries are ignored. The series is
        NOT modified.

    Returns
    -------
    int or float
        The accumulated distance; ``0`` when there is no non-NaN entry.
    """
    # dropna() returns a new Series. The previous in-place drop mutated the
    # caller's object (the row handed in by DataFrame.apply).
    cells = feature20_series.dropna()
    coordinate = [(m - 8 * (m // 8), m // 8) for m in cells]
    distance = 0
    for a in coordinate:
        for b in coordinate:
            distance = distance + ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5
    return distance

def featureXY_process(featureXY_series):
    """Summarise the Y and X feature groups of one row.

    Entries are selected by index label: labels containing 'feature28',
    'feature36', 'feature44' or 'feature52' form the Y group; labels
    containing 'feature61', 'feature69', 'feature77' or 'feature85' form the
    X group.

    Parameters
    ----------
    featureXY_series : pandas.Series
        One row with string index labels (as passed by
        ``DataFrame.apply(axis=1)``). Selected entries must be numeric or NaN.

    Returns
    -------
    list
        ``[featureY_mean, featureX_mean, featureY_min, featureX_min]``, each
        computed over the non-NaN values of its group. A statistic is NaN
        when its group has no matching label or only NaN values. Use
        ``result_type='expand'`` to spread the list over four columns.
    """
    featureY_list = ['feature28', 'feature36', 'feature44', 'feature52']
    featureX_list = ['feature61', 'feature69', 'feature77', 'feature85']
    labels = featureXY_series.index

    def _group_values(name_list):
        # Flat float array of every entry whose label contains one of the names.
        # (DataFrame.append, used previously, no longer exists in pandas >= 2.0.)
        parts = [
            featureXY_series.loc[labels.str.contains(name)].to_numpy(dtype=float)
            for name in name_list
        ]
        return np.concatenate(parts)

    def _nan_mean_min(values):
        # nanmin raises on a zero-size array, so an empty group is handled here.
        if values.size == 0:
            return np.nan, np.nan
        return np.nanmean(values), np.nanmin(values)

    with warnings.catch_warnings():
        # All-NaN groups make numpy emit RuntimeWarning; NaN is the wanted result.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        featureX_mean, featureX_min = _nan_mean_min(_group_values(featureX_list))
        featureY_mean, featureY_min = _nan_mean_min(_group_values(featureY_list))
    return [featureY_mean, featureX_mean, featureY_min, featureX_min]

def feature20XY_interference(feature20XY_series):
    """Compute the mean Y and X "interference" value of one row.

    The entries whose label contains 'feature20' give cell ids; each id ``i``
    is mapped to the coordinate ``(i - 8 * (i // 8), i // 8)``. For every
    feature group (Y: feature28/36/44/52, X: feature61/69/77/85) the group's
    values are paired positionally with the cell ids, and for each pair the
    values of all *other* pairs are summed, each divided by its Euclidean
    distance (a distance of 0 is replaced by 1). The sum is divided by the
    number of pairs and rounded to 2 decimals.

    Parameters
    ----------
    feature20XY_series : pandas.Series
        One row with string index labels (as passed by
        ``DataFrame.apply(axis=1)``).

    Returns
    -------
    list
        ``[featureY_if, featureX_if]``: the NaN-ignoring mean over the four
        groups of each axis (NaN when no group has a usable pair). Call via
        ``apply(..., result_type='expand')`` to get two columns.
    """
    # Same groups as featureXY_process. The former 'feature62' / 'feature81'
    # entries broke the stride-8 numbering and disagreed with that function.
    featureY_list = ['feature28', 'feature36', 'feature44', 'feature52']
    featureX_list = ['feature61', 'feature69', 'feature77', 'feature85']

    labels = feature20XY_series.index
    feature20 = feature20XY_series.loc[labels.str.contains('feature20')]

    feature_if_dict = {}  # per-group interference value, keyed by group name
    for featureV_name in featureY_list + featureX_list:
        featureV = feature20XY_series.loc[labels.str.contains(featureV_name)]
        # Build [(x, y, V value), ...]. A pair is usable only when BOTH the
        # cell id and the value are present; a half-missing pair would
        # otherwise turn the whole group into NaN.
        c_v_list = [
            (i - 8 * (i // 8), i // 8, value)
            for i, value in zip(feature20, featureV)
            if not (np.isnan(i) or np.isnan(value))
        ]
        if not c_v_list:
            feature_if_dict[featureV_name] = np.nan
            continue

        # Interference: sum over pairs of sum(s_n / d_n) over the other pairs.
        # NOTE(review): the original comment gave the formula as
        # "sum(S - sum(s_n / d_n))", but no S term is subtracted -- confirm.
        interference = 0
        for idx, item in enumerate(c_v_list):
            for other_idx, other_item in enumerate(c_v_list):
                # Exclude only the pair itself (by position), so distinct pairs
                # with identical coordinates and value still count.
                if other_idx == idx:
                    continue
                distance = ((item[0] - other_item[0]) ** 2
                            + (item[1] - other_item[1]) ** 2) ** 0.5
                if distance == 0:
                    distance = 1  # co-located pairs: avoid division by zero
                interference += other_item[2] / distance
        feature_if_dict[featureV_name] = round(interference / len(c_v_list), 2)

    with warnings.catch_warnings():
        # nanmean warns when every group is NaN; NaN is the wanted result.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        featureY_if = np.nanmean([feature_if_dict[k] for k in featureY_list])
        featureX_if = np.nanmean([feature_if_dict[k] for k in featureX_list])
    return [featureY_if, featureX_if]

In [3]:
# Label file: maps each labelled sample_index to its 'root-cause(s)' entry.
df_train_label = pd.read_csv(
    os.path.join(label_path, train_label_file),
    index_col='sample_index',
)
train_label_dict = df_train_label['root-cause(s)'].to_dict()

# Every index in range(2984) that has no label is tagged 'unknown'
# (2984 is presumably the total number of sample files -- TODO confirm).
unknown_label_dict = {
    sample_index: 'unknown'
    for sample_index in range(2984)
    if sample_index not in train_label_dict
}

In [4]:
# Feature selection.
# feature_all: every column kept per sample by the dataset loaders (raw columns
# plus the engineered feature60 / feature20_* / featureY_* / featureX_* ones).
feature_all = ['feature0', 'feature1', 'feature2', 
       'feature3_1','feature3_2', 'feature3_3', 'feature3_4', 
       'feature11', 'feature12', 'feature13','feature14', 'feature15', 
       'feature16', 'feature17', 'feature18','feature19', 'feature60',
       'feature20_distance','feature20_edge',
       'featureY_if','featureX_if','featureY_mean','featureX_mean',
       'featureY_min','featureX_min'
       ]
# feature_select: subset of feature_all without feature3_*, feature20_edge and
# the *_min columns. Not referenced in the cells shown here -- presumably the
# model inputs used further down (verify).
feature_select =  ['feature0', 'feature1', 'feature2',
       'feature11', 'feature12','feature13', 'feature14', 'feature15', 
       'feature16','feature17','feature18', 'feature19', 'feature60',
       'feature20_distance','featureY_if','featureX_if','featureY_mean','featureX_mean'] 

In [6]:
# preprocess dataset + feature engineering
def _engineer_sample_features(df):
    """Add the engineered columns to one sample frame and return its ``feature_all`` slice.

    ``df`` (the raw frame of a single sample CSV) is modified in place; the
    returned frame is an independent copy restricted to ``feature_all``.
    """
    # Resolve the raw feature20 columns BEFORE any derived 'feature20_*' column
    # is added. Re-evaluating the 'feature20' mask afterwards would also match
    # 'feature20_edge' and feed the edge count into the distance feature as if
    # it were a cell id.
    feature20_cols = df.columns[df.columns.str.contains('feature20')]

    df['feature60'] = df['feature60'].apply(feature60_process)
    df['feature20_edge'] = df[feature20_cols].apply(feature20_edge_count, axis=1)
    df['feature20_distance'] = df[feature20_cols].apply(feature20_distance_count, axis=1)
    df[['featureY_if', 'featureX_if']] = df.apply(
        feature20XY_interference, axis=1, result_type='expand')
    df[['featureY_mean', 'featureX_mean', 'featureY_min', 'featureX_min']] = df.apply(
        featureXY_process, axis=1, result_type='expand')
    # Explicit copy so the insert() calls of the loaders never act on a view.
    return df.loc[:, feature_all].copy()


def _stack_samples(frames, row_num):
    """Concatenate per-sample frames, drop all-NaN columns and report the shape."""
    # One concat at the end instead of growing the frame per file
    # (DataFrame.append is gone in pandas >= 2.0 and was quadratic anyway).
    df_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df_data = df_data.dropna(axis=1, how='all')  # drop columns that are entirely NaN
    print('验证数据集维度',df_data.shape, row_num)
    return df_data


def load_dataset_for_train(data_path,label_dict): 
    """Load and feature-engineer every labelled training sample.

    Parameters
    ----------
    data_path : str
        Folder containing one ``<sample_index>.csv`` file per sample.
    label_dict : dict
        Maps ``sample_index`` to its cause label; one file is read per key.

    Returns
    -------
    pandas.DataFrame
        All rows of all samples with columns ``causes_type``, ``sample_index``
        and the ``feature_all`` columns (columns that are entirely NaN are
        dropped). The resulting shape and the raw row count are printed.
    """
    frames = []
    row_num = 0
    for sample_index, cause in label_dict.items():
        df = pd.read_csv(os.path.join(data_path, str(sample_index) + '.csv'))
        row_num += len(df)
        tmp = _engineer_sample_features(df)
        tmp.insert(0, 'causes_type', cause)
        tmp.insert(1, 'sample_index', sample_index)
        frames.append(tmp)
    return _stack_samples(frames, row_num)


def load_dataset_for_test(data_path,file_num): 
    """Load and feature-engineer the unlabelled test samples ``0 .. file_num - 1``.

    Parameters
    ----------
    data_path : str
        Folder containing the files ``0.csv`` ... ``<file_num - 1>.csv``.
    file_num : int
        Number of sample files to read.

    Returns
    -------
    pandas.DataFrame
        All rows of all samples with the ``feature_all`` columns plus
        ``sample_index`` (columns that are entirely NaN are dropped). The
        resulting shape and the raw row count are printed.
    """
    frames = []
    row_num = 0
    for i in range(file_num):
        df = pd.read_csv(os.path.join(data_path, str(i) + '.csv'))
        row_num += len(df)
        tmp = _engineer_sample_features(df)
        # Position 1 (not 0) is kept so the column order of existing outputs
        # does not change.
        tmp.insert(1, 'sample_index', i)
        frames.append(tmp)
    return _stack_samples(frames, row_num)

df_train = load_dataset_for_train(os.path.join(data_path,train_path),train_label_dict)
df_test = load_dataset_for_test(os.path.join(data_path,test_path),600)

验证数据集维度 (14933, 27) 14933
验证数据集维度 (20910, 26) 20910


In [None]:
# load processed data

# df_train.to_csv('train_for_ml.csv')
# df_test.to_csv('test_for_ml.csv')