Tips

In [None]:
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from scipy.signal import resample
# 基于 tensorflow.keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau



In [None]:
# Working directory on the mounted Google Drive; every relative 'data/...' path
# below is resolved against it.
# NOTE(review): absolute Colab path is hard-coded - adjust when running elsewhere.
txdir='/content/drive/My Drive/Colab Notebooks'
os.chdir(txdir)
sub = pd.read_csv('data/submit.csv')
df_train = pd.read_csv('data/sensor_train.csv')
df_test  = pd.read_csv('data/sensor_test.csv')
# One label per training fragment (the minimum behavior_id found in the fragment).
y = df_train.groupby('fragment_id')['behavior_id'].min()

# Shift the test fragment ids so they cannot collide with training ids once
# both frames are stacked into a single table.
df_test['fragment_id'] += 10000
df_data = pd.concat([df_train, df_test],axis=0,ignore_index=True)


In [None]:
# Base table with one row per fragment (first row of each fragment_id), keeping
# only the id and the label; later cells merge per-fragment features into it.
df = df_data.drop_duplicates(subset=['fragment_id']).reset_index(drop=True)[['fragment_id', 'behavior_id']]
df.head()


Unnamed: 0,fragment_id,behavior_id
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0


In [None]:
# Euclidean norm of the three acc_* components, and of the three acc_*g components.
df_data['acc'] = (df_data['acc_x'] ** 2 + df_data['acc_y'] ** 2 + df_data['acc_z'] ** 2) ** 0.5
df_data['accg'] = (df_data['acc_xg'] ** 2 + df_data['acc_yg'] ** 2 + df_data['acc_zg'] ** 2) ** 0.5


In [None]:
label_feat = 'behavior_id'
# Rows that carry a label form the training part; rows without one form the test part.
# .copy() makes both frames independent of df_data, so the in-place id shift
# below no longer writes into a slice (which triggered SettingWithCopyWarning).
train = df_data[df_data[label_feat].notna()].copy()
test = df_data[df_data[label_feat].isna()].copy()
# Undo the +10000 offset that was added to the test ids before concatenation.
test['fragment_id'] -= 10000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
# Channels fed to the CNN, listed explicitly (in the original column order) so
# this cell still yields 8 channels when it is re-run after further derived
# columns such as 'xy' / 'xy_g' have been added to df_data.
SEQ_COLS = ['acc_x', 'acc_y', 'acc_z', 'acc_xg', 'acc_yg', 'acc_zg', 'acc', 'accg']
SEQ_LEN = 60


def _fragments_to_tensor(frame, n_fragments):
    """Stack the fragments of ``frame`` into an array of shape
    (n_fragments, SEQ_LEN, len(SEQ_COLS), 1).

    Fragment ``i`` is taken as the rows whose ``fragment_id`` equals ``i``
    (ids are assumed to run from 0 to n_fragments - 1); at most the first
    SEQ_LEN rows are kept and then resampled to exactly SEQ_LEN steps.
    """
    out = np.zeros((n_fragments, SEQ_LEN, len(SEQ_COLS), 1))
    for i in tqdm(range(n_fragments)):
        tmp = frame[frame.fragment_id == i][:SEQ_LEN]
        # resample returns (resampled values, resampled time axis); keep the values
        out[i, :, :, 0] = resample(tmp[SEQ_COLS], SEQ_LEN, np.array(tmp.time_point))[0]
    return out


x = _fragments_to_tensor(train, 7292)
t = _fragments_to_tensor(test, 7500)

100%|██████████| 7292/7292 [00:18<00:00, 391.99it/s]
100%|██████████| 7500/7500 [00:19<00:00, 392.92it/s]


# 提取简单统计特征

In [None]:
# Norm of the x/y components only (z left out), for both sensor groups.
df_data['xy'] = (df_data['acc_x'] ** 2 + df_data['acc_y'] ** 2) ** 0.5
df_data['xy_g'] = (df_data['acc_xg'] ** 2 + df_data['acc_yg'] ** 2) ** 0.5

In [None]:
def get_dic(df,  main_col, fea_col, agg):
    """Aggregate ``fea_col`` within each ``main_col`` group.

    :param df: source DataFrame
    :param main_col: name of the grouping column
    :param fea_col: name of the column to aggregate
    :param agg: aggregation name understood by pandas (e.g. 'min', 'std')
    :return: tuple ``(feature_name, mapping)`` where ``feature_name`` is
        ``"<main_col>_<fea_col>_<agg>"`` and ``mapping`` maps each group key
        to its aggregated value
    """
    per_group = df.groupby(main_col)[fea_col].agg(agg)
    return '_'.join((main_col, fea_col, agg)), per_group.to_dict()
    
def get_1st_order_xyz_features(df, fea_cols, main_col = 'fragment_id'):
    """Build per-group first-order statistics for every column in ``fea_cols``.

    The result has one row per distinct ``main_col`` value (in order of first
    appearance in ``df``) and these columns:

    * ``main_col``
    * ``cnt`` - count of ``fea_cols[0]`` values in the group
    * for each feature ``f``: ``<main_col>_<f>_{min,max,mean,std,skew,median}``,
      ``<main_col>_<f>_gap`` (max - min) and ``<main_col>_<f>_skew2``
      ((mean - median) / std)

    :param df: long-format DataFrame containing ``main_col`` and ``fea_cols``
    :param fea_cols: list of column names to summarise (must not be empty)
    :param main_col: grouping column
    :return: DataFrame of features
    """
    stat_names = ['min','max','mean','std','skew','median']

    df_fea = pd.DataFrame()
    df_fea[main_col] = df[main_col].unique()

    # count feature
    _, count_by_key = get_dic(df, main_col, fea_cols[0], 'count')
    df_fea['cnt'] = df_fea[main_col].map(count_by_key).values

    # numeric summary features
    for col in tqdm(fea_cols):
        prefix = main_col + '_' + col + '_'
        for stat in stat_names:
            name, lookup = get_dic(df, main_col, col, stat)
            df_fea[name] = df_fea[main_col].map(lookup).values

        df_fea[prefix + 'gap'] = df_fea[prefix + 'max'] - df_fea[prefix + 'min']
        df_fea[prefix + 'skew2'] = (df_fea[prefix + 'mean'] - df_fea[prefix + 'median']) / df_fea[prefix + 'std']

    return df_fea

def get_1st_order_xyz_features_self(df, fea_cols, main_col = 'fragment_id'):
    """Per-group first-order statistics for every column in ``fea_cols``,
    computed with one multi-aggregation per feature.

    Earlier revisions reassigned the result frame inside the loop, so only the
    statistics of the last feature were returned, and the key / count columns
    were attached in first-appearance order on top of rows that groupby had
    sorted by key. Both problems are fixed here: every feature is kept and all
    columns are aligned on the group key.

    :param df: long-format DataFrame containing ``main_col`` and ``fea_cols``
    :param fea_cols: list of column names to summarise (must not be empty)
    :param main_col: grouping column
    :return: DataFrame with one row per distinct ``main_col`` value (in order
        of first appearance) holding ``main_col``, then for each feature ``f``
        ``<main_col>_<f>_{min,max,mean,std,skew,median,gap,skew2}``, and
        finally ``cnt`` (count of ``fea_cols[0]`` values in the group)
    """
    stat_names = ['min', 'max', 'mean', 'std', 'skew', 'median']
    group_keys = df[main_col].unique()
    grouped = df.groupby(main_col)

    per_feature = []
    # numeric summary features
    for f in tqdm(fea_cols):
        prefix = '_'.join([main_col, f])
        stats = grouped[f].agg(stat_names)
        stats.columns = [prefix + '_' + s for s in stat_names]
        stats[prefix + '_gap'] = stats[prefix + '_max'] - stats[prefix + '_min']
        stats[prefix + '_skew2'] = (stats[prefix + '_mean'] - stats[prefix + '_median']) / stats[prefix + '_std']
        per_feature.append(stats)

    # All pieces are indexed by the group key, so concat / assignment align on it.
    df_fea = pd.concat(per_feature, axis=1)
    # count feature
    df_fea['cnt'] = grouped[fea_cols[0]].count()

    # Present rows in first-appearance order of the keys.
    df_fea = df_fea.reindex(group_keys)
    df_fea.index.name = main_col
    return df_fea.reset_index()
    

In [None]:
# Columns to summarise per fragment: raw axes, their norms and the x/y-plane norms.
origin_fea_cols = ['acc_x','acc_y','acc_z','acc','acc_xg','acc_yg','acc_zg','accg','xy','xy_g']
df_xyz_fea1 = get_1st_order_xyz_features(df_data,origin_fea_cols,main_col='fragment_id')


100%|██████████| 10/10 [00:50<00:00,  5.00s/it]


In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_data.head()

Unnamed: 0,fragment_id,time_point,acc_x,acc_y,acc_z,acc_xg,acc_yg,acc_zg,behavior_id,acc,accg,xy,xy_g
0,0,27,0.3,-0.3,0.1,0.6,4.5,8.8,0.0,0.435890,9.902020,0.424264,4.539824
1,0,108,0.1,-0.0,-0.4,0.4,4.7,8.4,0.0,0.412311,9.633795,0.100000,4.716991
2,0,198,0.1,0.0,0.3,0.9,4.6,9.0,0.0,0.316228,10.147413,0.100000,4.687217
3,0,297,0.1,-0.1,-0.5,0.8,4.7,7.2,0.0,0.519615,8.635392,0.141421,4.767599
4,0,388,0.1,0.2,0.6,0.9,4.7,8.9,0.0,0.640312,10.104949,0.223607,4.785394
...,...,...,...,...,...,...,...,...,...,...,...,...,...
855536,17499,4611,-0.2,-0.1,-0.2,-2.0,2.4,9.0,,0.300000,9.526804,0.223607,3.124100
855537,17499,4692,0.1,0.1,-1.1,-1.6,2.8,8.2,,1.109054,8.811356,0.141421,3.224903
855538,17499,4788,-0.5,-0.1,1.0,-1.5,2.1,10.5,,1.122497,10.812493,0.509902,2.580698
855539,17499,4873,-0.3,0.2,-0.3,-1.6,2.2,9.3,,0.469042,9.689685,0.360555,2.720294


# 提取傅里叶特征

In [None]:
from scipy.fftpack import fft
from scipy.signal import resample
import matplotlib.pyplot as plt
def get_fft_values(y_values, N, f_s):
    """One-sided FFT amplitude spectrum of ``y_values``.

    :param y_values: 1-D signal
    :param N: number of samples the spectrum is based on
    :param f_s: sampling frequency
    :return: tuple ``(f_values, fft_values)``, each of length ``N // 2``;
        ``f_values[k]`` is the frequency of bin ``k`` and ``fft_values[k]``
        is ``2/N * |FFT[k]|``

    The 2/N factor is applied to every returned bin, bin 0 included; that
    scaling is left untouched because the amplitude features built from it
    depend on it.
    """
    n_bins = N // 2
    # Bin k of an N-point FFT lies at k * f_s / N. The former
    # linspace(0, f_s/2, N//2) spread the bins over [0, f_s/2] inclusive and
    # therefore reported every non-zero frequency slightly too high.
    f_values = np.arange(n_bins) * f_s / N
    fft_values_ = fft(y_values)
    fft_values = 2.0/N * np.abs(fft_values_[0:n_bins])
    return f_values, fft_values


from scipy.signal import welch
def get_psd_values(y_values, N, f_s):
    """Power spectral density of ``y_values`` estimated with Welch's method.

    :param y_values: 1-D signal
    :param N: unused; kept so the signature mirrors ``get_fft_values``
    :param f_s: sampling frequency
    :return: tuple ``(f_values, psd_values)`` as returned by ``scipy.signal.welch``
    """
    # welch defaults to nperseg=256; for shorter inputs SciPy falls back to the
    # input length and emits a UserWarning on every call. Passing the clipped
    # segment length explicitly gives the same estimate without the warning.
    n_samples = np.shape(y_values)[-1]
    f_values, psd_values = welch(y_values, fs=f_s, nperseg=min(256, n_samples))
    return f_values, psd_values


def show_topK_peak(fft_values,top_peak_number=5):
    """Plot a spectrum and mark its strongest local maxima.

    A bin counts as a peak when it is strictly larger than both neighbours;
    the first and last bins are never peaks. The ``top_peak_number`` highest
    peaks are drawn as scatter points on top of the spectrum line.

    :param fft_values: sequence of spectrum amplitudes
    :param top_peak_number: how many peaks to highlight
    """
    n_bins = len(fft_values)
    last = n_bins - 1
    peaks = [
        (amp, k)
        for k, amp in enumerate(fft_values)
        if 0 < k < last
        and fft_values[k] > fft_values[k - 1]
        and fft_values[k] > fft_values[k + 1]
    ]
    # highest amplitude first
    strongest = sorted(peaks, reverse=True)[:top_peak_number]
    heights = [amp for amp, _ in strongest]
    positions = [pos for _, pos in strongest]

    plt.plot(range(n_bins), fft_values)
    plt.scatter(positions, heights)
    plt.show()
    
    
top_peak_number = 5
def get_fft_topK_AP(array_with_time,feat_name,K=top_peak_number):
    """Amplitudes and bin positions of the ``K`` largest FFT bins of one fragment.

    The signal ``array_with_time[feat_name]`` is resampled to 120 points
    (using ``array_with_time["time_point"]`` as its time axis), transformed
    with ``get_fft_values(..., N=120, f_s=5)`` and the ``K`` bins with the
    largest amplitude are selected (all bins compete, not only local maxima).

    :param array_with_time: table of one fragment with the columns
        ``feat_name`` and ``"time_point"``
    :param feat_name: name of the signal column
    :param K: number of bins to report (previously ignored in favour of the
        module-level ``top_peak_number``)
    :return: a one-element list wrapping ``[A_0..A_{K-1}, P_0..P_{K-1}]``,
        amplitudes in descending order followed by their bin indices; the
        extra wrapping list is the shape callers already index into
    """
    x, _ = resample(np.asarray(array_with_time[feat_name]), 120, np.array(array_with_time["time_point"]))
    _, fft_values = get_fft_values(x, N=120, f_s=5)
    # (amplitude, bin) pairs, largest amplitude first
    ranked = sorted(zip(fft_values, range(len(fft_values))))[::-1][:K]
    top_peak_A = [A for A, P in ranked]
    top_peak_P = [P for A, P in ranked]
    return [top_peak_A + top_peak_P]


top_peak_number = 5
def get_psd_topK_AP(array_with_time,feat_name,K=top_peak_number):
    """Values and bin positions of the ``K`` largest PSD bins of one fragment.

    The signal ``array_with_time[feat_name]`` is resampled to 120 points
    (using ``array_with_time["time_point"]`` as its time axis), its power
    spectral density is taken from ``get_psd_values(..., N=120, f_s=5)`` and
    the ``K`` bins with the largest value are selected (all bins compete, not
    only local maxima).

    Earlier revisions also sketched statistics over successive peaks (RMS of
    differences, spread of intervals); those were never part of the return
    value and have been dropped.

    :param array_with_time: table of one fragment with the columns
        ``feat_name`` and ``"time_point"``
    :param feat_name: name of the signal column
    :param K: number of bins to report (previously ignored in favour of the
        module-level ``top_peak_number``)
    :return: a one-element list wrapping ``[A_0..A_{K-1}, P_0..P_{K-1}]``,
        PSD values in descending order followed by their bin indices
    """
    x, _ = resample(np.asarray(array_with_time[feat_name]), 120, np.array(array_with_time["time_point"]))
    _, psd_values = get_psd_values(x, N=120, f_s=5)
    # (value, bin) pairs, largest value first
    ranked = sorted(zip(psd_values, range(len(psd_values))))[::-1][:K]
    top_peak_A = [A for A, P in ranked]
    top_peak_P = [P for A, P in ranked]
    return [top_peak_A + top_peak_P]


In [None]:
oral_item = ['acc_x','acc_y','acc_z','acc','acc_xg','acc_yg','acc_zg','accg','xy','xy_g']

# Each extractor needs the signal column *and* time_point of a fragment, so the
# fragment tables are passed to them directly. (Selecting a single column
# before .agg() only reached the extractors with the full table through a
# pandas-version-specific fallback.)
fragments = df_data.groupby("fragment_id", sort=False)

for item in tqdm(oral_item):
    # Column order matches the extractor output: amplitudes, then positions,
    # FFT first and PSD second.
    feat_names = (
        [item + "_fftA_" + str(k) for k in range(top_peak_number)]
        + [item + "_fftP_" + str(k) for k in range(top_peak_number)]
        + [item + "_psdA_" + str(k) for k in range(top_peak_number)]
        + [item + "_psdP_" + str(k) for k in range(top_peak_number)]
    )

    rows = []
    for fragment_id, fragment in fragments:
        fft_feats = get_fft_topK_AP(fragment, item)[0]
        psd_feats = get_psd_topK_AP(fragment, item)[0]
        rows.append([fragment_id] + fft_feats + psd_feats)
    item_feats = pd.DataFrame(rows, columns=["fragment_id"] + feat_names)

    # One merge per item; stale copies are dropped first so that re-running
    # the cell does not produce duplicated *_x / *_y columns.
    df = df.drop(columns=feat_names, errors="ignore").merge(item_feats, on="fragment_id", how="left")




  0%|          | 0/10 [00:00<?, ?it/s]

0
1
2
3
4
0
1
2
3
4


  .format(nperseg, input_length))
 10%|█         | 1/10 [00:31<04:39, 31.07s/it]

0
1
2
3
4
0
1
2
3
4


 20%|██        | 2/10 [01:02<04:08, 31.04s/it]

0
1
2
3
4
0
1
2
3
4


 30%|███       | 3/10 [01:33<03:37, 31.07s/it]

0
1
2
3
4
0
1
2
3
4


 40%|████      | 4/10 [02:04<03:06, 31.13s/it]

0
1
2
3
4
0
1
2
3
4


 50%|█████     | 5/10 [02:36<02:36, 31.27s/it]

0
1
2
3
4
0
1
2
3
4


 60%|██████    | 6/10 [03:07<02:05, 31.36s/it]

0
1
2
3
4
0
1
2
3
4


 70%|███████   | 7/10 [03:39<01:34, 31.54s/it]

0
1
2
3
4
0
1
2
3
4


 80%|████████  | 8/10 [04:11<01:03, 31.62s/it]

0
1
2
3
4
0
1
2
3
4


 90%|█████████ | 9/10 [04:43<00:31, 31.78s/it]

0
1
2
3
4
0
1
2
3
4


100%|██████████| 10/10 [05:15<00:00, 31.55s/it]


# 将简单特征与傅里叶特征合并

In [None]:
# Attach the per-fragment statistics to the table that already holds the spectral features.
df_tr_te = df.merge(df_xyz_fea1, on ='fragment_id', how = 'left')


# 保存统计特征

In [None]:
# File names are appended to data_path by plain string concatenation (here and
# in the cell that reads the pickle back), so the directory must end with a
# separator; without it the file is created next to the folder as
# '.../Colab Notebooksdf_fea1.pkl'.
data_path = '/content/drive/My Drive/Colab Notebooks/'
df_tr_te.to_pickle(data_path + "df_fea1.pkl")  # save the first-order statistical features and outlier counts

In [None]:
# 后续可以直接读取统计特征

In [None]:
# Reload the cached feature table; the path is data_path concatenated with the
# file name, exactly as in the cell that wrote it.
df_tr_te = pd.read_pickle(data_path+"df_fea1.pkl")
df_tr_te

Unnamed: 0,fragment_id,behavior_id,acc_x_fftA_0,acc_x_fftA_1,acc_x_fftA_2,acc_x_fftA_3,acc_x_fftA_4,acc_x_fftP_0,acc_x_fftP_1,acc_x_fftP_2,acc_x_fftP_3,acc_x_fftP_4,acc_x_psdA_0,acc_x_psdA_1,acc_x_psdA_2,acc_x_psdA_3,acc_x_psdA_4,acc_x_psdP_0,acc_x_psdP_1,acc_x_psdP_2,acc_x_psdP_3,acc_x_psdP_4,acc_y_fftA_0,acc_y_fftA_1,acc_y_fftA_2,acc_y_fftA_3,acc_y_fftA_4,acc_y_fftP_0,acc_y_fftP_1,acc_y_fftP_2,acc_y_fftP_3,acc_y_fftP_4,acc_y_psdA_0,acc_y_psdA_1,acc_y_psdA_2,acc_y_psdA_3,acc_y_psdA_4,acc_y_psdP_0,acc_y_psdP_1,acc_y_psdP_2,...,fragment_id_acc_yg_min,fragment_id_acc_yg_max,fragment_id_acc_yg_mean,fragment_id_acc_yg_std,fragment_id_acc_yg_skew,fragment_id_acc_yg_median,fragment_id_acc_yg_gap,fragment_id_acc_yg_skew2,fragment_id_acc_zg_min,fragment_id_acc_zg_max,fragment_id_acc_zg_mean,fragment_id_acc_zg_std,fragment_id_acc_zg_skew,fragment_id_acc_zg_median,fragment_id_acc_zg_gap,fragment_id_acc_zg_skew2,fragment_id_accg_min,fragment_id_accg_max,fragment_id_accg_mean,fragment_id_accg_std,fragment_id_accg_skew,fragment_id_accg_median,fragment_id_accg_gap,fragment_id_accg_skew2,fragment_id_xy_min,fragment_id_xy_max,fragment_id_xy_mean,fragment_id_xy_std,fragment_id_xy_skew,fragment_id_xy_median,fragment_id_xy_gap,fragment_id_xy_skew2,fragment_id_xy_g_min,fragment_id_xy_g_max,fragment_id_xy_g_mean,fragment_id_xy_g_std,fragment_id_xy_g_skew,fragment_id_xy_g_median,fragment_id_xy_g_gap,fragment_id_xy_g_skew2
0,0,0.0,0.095500,0.092020,0.088037,0.071845,0.068201,24,2,13,26,8,0.079752,0.075956,0.053650,0.048345,0.046628,24,2,23,27,13,0.041967,0.035681,0.033780,0.032334,0.030917,21,4,27,23,25,0.014054,0.013951,0.009968,0.009779,0.008149,24,21,4,...,4.4,5.2,4.773684,0.158707,0.284234,4.80,0.8,-0.165813,7.2,9.3,8.508772,0.338738,-0.746650,8.5,2.1,0.025896,8.635392,10.560776,9.784581,0.293469,-0.677138,9.786726,1.925384,-0.007309,0.0,0.905539,0.144087,0.154493,2.543933,0.100000,0.905539,0.285368,4.455334,5.215362,4.825658,0.155398,0.372397,4.825971,0.760028,-0.002016
1,1,0.0,0.050000,0.047902,0.044440,0.042883,0.040711,0,24,4,3,1,0.026726,0.016944,0.016112,0.015899,0.015326,21,22,15,24,1,0.053571,0.033169,0.028012,0.026694,0.023137,0,22,20,17,21,0.015934,0.015810,0.015056,0.014714,0.007909,20,21,19,...,4.6,5.1,4.830357,0.123465,0.411559,4.80,0.5,0.245877,8.1,9.4,8.519643,0.241525,0.917398,8.5,1.3,0.081328,9.441398,10.521407,9.819616,0.202188,1.053017,9.791067,1.080009,0.141198,0.0,0.509902,0.093610,0.089370,1.834842,0.100000,0.509902,-0.071503,4.617359,5.162364,4.879333,0.125653,0.417325,4.850773,0.545005,0.227288
2,2,0.0,0.111380,0.101354,0.095443,0.081610,0.080519,13,12,28,21,11,0.223735,0.204908,0.107606,0.065627,0.064981,13,12,11,16,14,0.077716,0.070431,0.060925,0.048163,0.046791,16,5,4,15,18,0.042555,0.041412,0.039207,0.026196,0.018034,15,16,5,...,4.4,5.4,4.952632,0.234601,-0.081638,4.90,1.0,0.224345,7.2,9.8,8.394737,0.433186,-0.103604,8.4,2.6,-0.012150,8.823831,10.813418,9.790638,0.355424,-0.205059,9.827004,1.989586,-0.102316,0.0,0.921954,0.205445,0.184398,1.539513,0.141421,0.921954,0.347205,4.570558,5.597321,5.026851,0.237700,0.436153,4.936598,1.026763,0.379691
3,3,0.0,0.088078,0.083340,0.082012,0.077895,0.065502,27,6,11,21,8,0.113268,0.071477,0.066208,0.060511,0.057133,11,12,21,27,22,0.068327,0.051335,0.051253,0.050926,0.047608,21,27,6,8,22,0.045489,0.038310,0.028655,0.019457,0.014073,21,22,8,...,4.5,5.5,5.049091,0.196141,-0.054651,5.00,1.0,0.250283,7.5,9.5,8.340000,0.414371,0.472631,8.3,2.0,0.096532,8.828363,10.585367,9.767121,0.360655,0.193839,9.780082,1.757004,-0.035936,0.0,0.565685,0.160956,0.144985,1.208859,0.100000,0.565685,0.420432,4.657252,5.514526,5.076391,0.176062,0.277034,5.048762,0.857274,0.156928
4,4,0.0,0.172832,0.169693,0.166202,0.118818,0.114920,18,26,25,24,14,0.700952,0.533470,0.235892,0.214878,0.209049,25,26,24,18,21,0.098433,0.088219,0.073351,0.068003,0.064772,14,26,13,24,12,0.164950,0.147200,0.112870,0.085499,0.083687,14,13,25,...,3.7,5.3,4.645455,0.296784,-0.382833,4.70,1.6,-0.183788,5.8,10.4,8.558182,0.753434,-0.615809,8.6,4.6,-0.055503,7.856844,11.500435,9.778645,0.628718,-0.043196,9.819878,3.643591,-0.065582,0.0,1.392839,0.299192,0.245199,2.026445,0.282843,1.392839,0.066680,3.992493,5.303772,4.705197,0.269491,-0.293768,4.751842,1.311279,-0.173086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14787,17495,,0.045147,0.040977,0.040422,0.040261,0.039312,7,20,6,18,15,0.048378,0.046549,0.038826,0.036409,0.031132,7,6,19,20,5,0.015939,0.013628,0.013101,0.011839,0.011687,7,12,14,20,19,0.004613,0.003674,0.003630,0.003618,0.003534,7,20,19,...,5.4,5.8,5.584483,0.110504,-0.005140,5.60,0.4,-0.140422,7.4,8.2,7.756897,0.183640,0.194264,7.7,0.8,0.309827,9.276314,9.912618,9.586268,0.151631,0.119090,9.596874,0.636304,-0.069945,0.0,0.300000,0.052131,0.087095,1.632115,0.000000,0.300000,0.598555,5.423099,5.854912,5.630619,0.113603,-0.093399,5.643580,0.431814,-0.114095
14788,17496,,0.106112,0.080633,0.066384,0.066344,0.064810,17,18,28,14,6,0.060048,0.055085,0.053703,0.050741,0.039529,17,18,22,28,21,0.054676,0.052369,0.048163,0.038902,0.036345,20,24,27,26,17,0.024449,0.023786,0.007449,0.007021,0.006696,20,24,25,...,5.4,6.0,5.693220,0.122986,0.075420,5.70,0.6,-0.055125,7.1,8.4,7.830508,0.242298,-0.054358,7.8,1.3,0.125913,9.285473,10.261579,9.775604,0.171509,0.064170,9.761660,0.976106,0.081305,0.0,0.509902,0.147790,0.135909,0.553309,0.100000,0.509902,0.351630,5.554278,6.118823,5.848321,0.119011,-0.083291,5.846366,0.564546,0.016422
14789,17497,,0.197510,0.182615,0.160584,0.148469,0.134668,16,15,13,17,12,0.336796,0.267417,0.235563,0.160510,0.129293,15,13,12,18,17,0.246572,0.192428,0.140895,0.135017,0.117888,13,22,14,17,9,0.512874,0.382228,0.221015,0.173679,0.106141,13,14,7,...,5.3,6.9,6.093103,0.426681,0.029642,6.10,1.6,-0.016163,5.6,9.1,7.427586,0.683334,0.212985,7.4,3.5,0.040370,8.173738,10.745697,9.636704,0.527621,-0.108426,9.670832,2.571958,-0.064683,0.0,1.303840,0.449790,0.253604,1.123254,0.412311,1.303840,0.147787,5.303772,6.987131,6.110052,0.426768,0.042748,6.105325,1.683359,0.011075
14790,17498,,0.302035,0.207664,0.180798,0.121870,0.112926,2,4,6,3,9,0.861793,0.640440,0.519377,0.420560,0.329365,2,3,4,5,1,0.403140,0.097188,0.092168,0.091762,0.084397,4,6,3,19,11,1.517809,0.495129,0.191386,0.076149,0.058186,4,3,5,...,0.4,2.2,1.275000,0.446094,-0.031063,1.25,1.8,0.056042,8.6,10.2,9.492857,0.351028,-0.249675,9.5,1.6,-0.020348,8.858894,10.321822,9.653737,0.326990,-0.289059,9.671866,1.462928,-0.055445,0.0,1.077033,0.425702,0.270405,0.585549,0.400000,1.077033,0.095051,0.632456,2.662705,1.662364,0.553444,-0.089851,1.752857,2.030250,-0.163509


# 数据预处理

In [None]:
label_feat = 'behavior_id'
_label_values = df_tr_te[label_feat]
# Labeled rows (non-missing, non-negative label) are training fragments;
# everything else is treated as test.
_is_labeled = _label_values.notna() & (_label_values >= 0)
_is_unlabeled = _label_values.isna() | (_label_values < 0)
train_df = df_tr_te[_is_labeled].reset_index(drop=True)
test_df  = df_tr_te[_is_unlabeled].reset_index(drop=True)


# Features selected according to a LightGBM tree model
selected_feat = [
    'fragment_id_acc_yg_max',
    'fragment_id_xy_median',
    'fragment_id_acc_yg_min',
    'acc_xg_fftA_0',
    'acc_yg_fftA_0',
    'accg_fftA_0',
    'fragment_id_acc_yg_gap',
    'fragment_id_xy_g_max',
    'fragment_id_acc_xg_mean',
    'fragment_id_acc_xg_max',
    'cnt',
    'acc_y_fftA_0',
    'fragment_id_acc_yg_mean',
    'fragment_id_acc_yg_std',
    'fragment_id_acc_yg_median',
    'acc_zg_fftA_0',
    'fragment_id_acc_zg_min',
    'fragment_id_accg_median',
    'fragment_id_acc_xg_min',
    'fragment_id_acc_xg_median',
]

# Plain arrays of the selected columns, one row per fragment.
train_stat = np.array(train_df[selected_feat].values)
test_stat = np.array(test_df[selected_feat].values)


# 标准化

In [None]:
def autos(X):
    """Column-wise z-score of a 2-D array.

    Every column is centred on its mean and divided by its sample standard
    deviation (``ddof=1``), both computed from ``X`` itself.

    :param X: array of shape (n_rows, n_columns)
    :return: standardized array with the same shape as ``X``
    """
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0, ddof=1)
    # A constant column has sigma == 0 and would turn into NaN/inf when
    # divided; dividing by 1 instead leaves it as all zeros.
    sigma = np.where(sigma == 0, 1.0, sigma)
    X_ = ((X - mu) / (sigma))
    return X_
# NOTE(review): train and test are each standardized with their own mean/std,
# so the same raw value maps to different inputs in the two sets - confirm this
# is intended rather than reusing the training statistics for the test set.
train_stat = autos(train_stat)
test_stat = autos(test_stat)

In [None]:
def standardization(X):
    """Standardize ``X`` per channel and print the statistics used.

    ``X`` is flattened to rows of ``X.shape[-2]`` values; each of those
    columns is centred on its mean and divided by its population standard
    deviation (``ddof=0``), and the result is reshaped back to ``X.shape``.
    Treating the second-to-last axis as the channel axis after the flatten
    presumably relies on the last axis having length 1, as in the
    (n, 60, 8, 1) tensors built in this notebook.

    :param X: array whose second-to-last axis indexes the channels
    :return: standardized array with the same shape as ``X``
    """
    n_channels = X.shape[-2]
    flat = X.reshape(-1, n_channels)
    # Fixed statistics tried in an earlier revision instead of the data-derived ones:
    # mean = [8.03889039e-03, -6.41381949e-02, 2.37856977e-02, 8.64949391e-01,
    #         2.80964889e+00, 7.83041714e+00, 6.44853358e-01, 9.78580749e+00]
    # std = [0.6120893, 0.53693888, 0.7116134, 3.22046385, 3.01195336, 2.61300056, 0.87194132, 0.68427254]
    mu = np.mean(flat, axis=0)
    sigma = np.std(flat, axis=0)
    print(mu,sigma)
    scaled = ((flat - mu) / (sigma))
    return scaled.reshape(X.shape)

# Overwrites x and t with their standardized versions; re-running this cell
# standardizes the already standardized arrays again.
# NOTE(review): x and t each use their own per-channel statistics (see the two
# printed mean/std sets) - confirm that is intended.
x = standardization(x)
t = standardization(t)

[ 9.75304151e-03 -6.36132955e-02  2.48834835e-02  9.64539739e-01
  2.73127545e+00  7.82182825e+00  6.50165171e-01  9.79294912e+00] [0.62101194 0.53955464 0.71851487 3.29887193 2.98092609 2.65323979
 0.88162122 0.69216192]
[ 6.37227840e-03 -6.46485372e-02  2.27183572e-02  7.68121016e-01
  2.88584878e+00  7.83876784e+00  6.39688859e-01  9.77886392e+00] [0.60328295 0.53438288 0.7048369  3.13932736 3.03988    2.57324659
 0.86239434 0.67644147]


# 数据增强

In [None]:
def jitter(x, snr_db):
    """Add zero-mean Gaussian noise at a randomly drawn signal-to-noise ratio.

    :param x: array to perturb; the signal power is averaged over axis 0
    :param snr_db: two-element list ``[low, high]``; an integer SNR in dB is
        drawn from ``[low, high)`` with ``np.random.randint``
    :return: ``x`` plus noise, same shape as ``x``
    """
    assert isinstance(snr_db, list)
    low, high = snr_db[0], snr_db[1]
    # randomly pick the signal-to-noise ratio (dB)
    chosen_db = np.random.randint(low, high, (1,))[0]

    snr_linear = 10 ** (chosen_db / 10)
    signal_power = np.sum(x ** 2, axis=0, keepdims=True) / x.shape[0]
    noise_power = signal_power / snr_linear
    # loc is the mean; scale is the standard deviation, hence the square root
    noise = np.random.normal(size=x.shape, scale=np.sqrt(noise_power), loc=0.0)
    return x + noise

In [None]:
# Double the training set: a noisy copy of every sequence (SNR drawn from
# 5..14 dB) is appended, with labels and statistical features simply repeated.
# Re-running this cell doubles x, y and train_stat again.
# NOTE(review): a sample and its noisy copy can land in different CV folds
# later on, which would make validation scores optimistic - confirm.
x1 = jitter(x,[5,15])
x = np.concatenate([x, x1], axis=0)
y = np.concatenate([y, y], axis=0)
train_stat = np.concatenate([train_stat,train_stat],axis=0)

In [None]:
# Imported from tensorflow.keras like every other Keras symbol in this
# notebook; mixing in the standalone `keras` package can pull in a different
# Keras implementation than the one the model is built with.
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Only a random shift of up to 20% along the height axis is enabled
# (presumably the 60-step time axis of the (60, 8, 1) inputs); zoom, width
# shift and rotation are switched off.
datagen = ImageDataGenerator(zoom_range = 0,
                            height_shift_range = 0.2,
                            width_shift_range = 0,
                            rotation_range = 0)

# Here is the function that merges our two generators.
# Both flows use the same generator and the same random seed, so the labels
# (first flow) and the statistical features (second flow) are meant to be
# drawn in the same order.
def gen_flow_for_two_inputs(X1, X2, y, batch_size):
    """Endless generator of ``([X1_batch, X2_batch], y_batch)`` for a two-input model.

    ``X1`` is flowed twice through ``datagen`` with seed 2020: once paired
    with ``y`` and once paired with ``X2`` (passed in the label slot so that
    it follows the same batching as ``X1``).

    :param X1: image-like input passed through ``datagen``
    :param X2: second model input, aligned row-by-row with ``X1``
    :param y: targets, aligned row-by-row with ``X1``
    :param batch_size: batch size of both flows
    """
    genX1 = datagen.flow(X1, y, batch_size=batch_size, seed=2020)
    genX2 = datagen.flow(X1, X2, batch_size=batch_size, seed=2020)
    while True:
        # builtin next() works on any iterator, unlike the .next() method
        X1i = next(genX1)
        X2i = next(genX2)
        # Assert arrays are equal - this was for peace of mind, but slows down training
        # np.testing.assert_array_equal(X1i[0], X2i[0])
        yield [X1i[0], X2i[1]], X1i[1]



# 根据官方评分函数改写的tf评价函数

In [None]:
import tensorflow as tf
def get_acc_combo():
    """Build the combo-score metric (adapted from the official scoring
    function) as a function of one-hot / probability tensors.

    A 19x19 score table is precomputed once; the returned ``acc_combo(y, y_pred)``
    takes the argmax of both arguments along axis 1, looks up the score of
    every (true, predicted) pair and returns the mean.

    :return: metric function named ``acc_combo``
    """
    # numeric behaviour id -> "<letter>_<digit>" behaviour code
    mapping = {0: 'A_0', 1: 'A_1', 2: 'A_2', 3: 'A_3',
        4: 'D_4', 5: 'A_5', 6: 'B_1',7: 'B_5',
        8: 'B_2', 9: 'B_3', 10: 'B_0', 11: 'A_6',
        12: 'C_1', 13: 'C_3', 14: 'C_0', 15: 'B_6',
        16: 'C_2', 17: 'C_5', 18: 'C_6'}

    def combo(y, y_pred):
        """Score of predicting id ``y_pred`` when the true id is ``y``."""
        code_y, code_y_pred = mapping[int(y)], mapping[int(y_pred)]
        if code_y == code_y_pred:
            # identical code
            return 1.0
        letter_y, digit_y = code_y.split("_")[0], code_y.split("_")[1]
        letter_p, digit_p = code_y_pred.split("_")[0], code_y_pred.split("_")[1]
        if letter_y == letter_p:
            # only the letter part matches
            return 1.0/7
        if digit_y == digit_p:
            # only the digit part matches
            return 1.0/3
        return 0.0

    score_table = np.array([[combo(i, j) for j in range(19)] for i in range(19)])
    confusionMatrix = tf.convert_to_tensor(score_table)

    def acc_combo(y, y_pred):
        true_ids = tf.argmax(y, axis=1)
        pred_ids = tf.argmax(y_pred, axis=1)
        # one (true, predicted) index pair per sample
        pairs = tf.cast(tf.stack([true_ids, pred_ids], axis=1), tf.int32)
        return tf.reduce_mean(tf.gather_nd(confusionMatrix, pairs))

    return acc_combo

In [None]:
# 5-fold stratified split with a fixed seed, shared by the training loop below.
kfold = StratifiedKFold(5, shuffle=True, random_state=2020)
def Net():
    """Two-input classifier: a small CNN over the (60, 8, 1) sensor tensor
    whose globally max-pooled features are concatenated with a 20-dimensional
    vector of statistical features, followed by a dense head with a
    19-way softmax.

    :return: uncompiled Keras ``Model`` taking ``[sequence, stats]`` inputs
    """
    def conv_bn(tensor, n_filters):
        """3x3 same-padded ReLU convolution followed by batch normalization."""
        tensor = Conv2D(filters=n_filters,
                        kernel_size=(3, 3),
                        activation='relu',
                        padding='same')(tensor)
        return BatchNormalization()(tensor)

    seq_in = Input(shape=(60, 8, 1))
    hin = Input(shape=(20, ))

    X = conv_bn(seq_in, 64)
    X = conv_bn(X, 128)
    X = MaxPooling2D()(X)
    X = conv_bn(X, 256)
    X = conv_bn(X, 512)
    X = GlobalMaxPooling2D()(X)

    # join CNN features with the hand-crafted statistics
    merge = concatenate([X, hin])
    merge = BatchNormalization()(merge)
    merge = Dropout(0.3)(merge)
    X = Dense(64,activation='relu')(merge)
    X = BatchNormalization()(X)

    y = Dense(19, activation='softmax')(X)
    return Model(inputs=[seq_in, hin], outputs=y)


proba_t = np.zeros((7500, 19))
val_loss = []
val_acc = []
# NOTE(review): batch_size is only referenced by the disabled generator path;
# model.fit below runs with its default batch size.
batch_size = 64
# One-hot labels are the same for every fold, so encode them once.
y_ = to_categorical(y, num_classes=19)
n_folds = kfold.get_n_splits()
for fold, (xx, yy) in enumerate(kfold.split(x, y)):
    model = Net()
    model.summary()  # prints the table itself; wrapping it in print() only added "None"
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(5e-3),
                  metrics=['acc',get_acc_combo()])
    # All three callbacks track the validation combo score (higher is better).
    plateau = ReduceLROnPlateau(monitor="val_acc_combo",
                                verbose=1,
                                mode='max',
                                factor=0.5,
                                patience=4)
    early_stopping = EarlyStopping(monitor='val_acc_combo',
                                   verbose=1,
                                   mode='max',
                                   patience=20)
    checkpoint = ModelCheckpoint(f'data/fold{fold}.h5',
                                 monitor='val_acc_combo',
                                 verbose=1,
                                 mode='max',
                                 save_best_only=True)
    #gen_flow = gen_flow_for_two_inputs(x[xx], train_stat[xx], y_[xx], batch_size)
    hist =  model.fit([x[xx],train_stat[xx]], y_[xx],
              #gen_flow,steps_per_epoch=x[xx].shape[0] / batch_size,
              epochs=100,
              verbose=1,
              shuffle=True,
              validation_data=([x[yy],train_stat[yy]], y_[yy]),
              callbacks=[plateau, early_stopping, checkpoint])
    val_loss.append(np.min(hist.history['val_loss']))
    val_acc.append(np.max(hist.history['val_acc']))
    # With the next line disabled, predictions come from the weights of the
    # last epoch, not from the best checkpoint saved above.
    #model.load_weights(f'data/fold{fold}.h5')
    # Average the test probabilities over the folds.
    proba_t += model.predict([t,test_stat], verbose=0, batch_size=1024) / n_folds
print('log loss:', np.mean(val_loss))
print('val_acc:', np.mean(val_acc))


Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, 60, 8, 1)]   0                                            
__________________________________________________________________________________________________
conv2d_48 (Conv2D)              (None, 60, 8, 64)    640         input_25[0][0]                   
__________________________________________________________________________________________________
batch_normalization_72 (BatchNo (None, 60, 8, 64)    256         conv2d_48[0][0]                  
__________________________________________________________________________________________________
conv2d_49 (Conv2D)              (None, 60, 8, 128)   73856       batch_normalization_72[0][0]     
___________________________________________________________________________________________

KeyboardInterrupt: ignored

In [None]:
# Class predictions of the model left over from the training loop (last fold)
# on the training arrays; the former np.zeros pre-allocation was overwritten
# immediately and has been removed.
proba_y=model.predict([x,train_stat], verbose=1, batch_size=1024)
proba_y=np.argmax(proba_y,axis=1)
#print(proba_y)
def acc_combo(y, y_pred):
    """Score one prediction against its true label.

    Both ids are translated to ``"<letter>_<digit>"`` behaviour codes.

    :param y: true behaviour id (0..18)
    :param y_pred: predicted behaviour id (0..18)
    :return: 1.0 for an identical code, 1/7 when only the letter matches,
        1/3 when only the digit matches, otherwise 0.0
    """
    # numeric behaviour id -> behaviour code
    mapping = {0: 'A_0', 1: 'A_1', 2: 'A_2', 3: 'A_3',
        4: 'D_4', 5: 'A_5', 6: 'B_1',7: 'B_5',
        8: 'B_2', 9: 'B_3', 10: 'B_0', 11: 'A_6',
        12: 'C_1', 13: 'C_3', 14: 'C_0', 15: 'B_6',
        16: 'C_2', 17: 'C_5', 18: 'C_6'}
    true_code = mapping[y]
    pred_code = mapping[y_pred]
    if true_code == pred_code:
        return 1.0
    true_parts = true_code.split("_")
    pred_parts = pred_code.split("_")
    if true_parts[0] == pred_parts[0]:
        return 1.0/7
    if true_parts[1] == pred_parts[1]:
        return 1.0/3
    return 0.0
score = sum(acc_combo(y_true, y_pred) for y_true, y_pred in zip(y, proba_y)) / proba_y.shape[0]
print(round(score, 5))

sub.behavior_id=np.argmax(proba_t, axis=1)
print(sub)
from datetime import *
current = datetime.now()
current=current.strftime('%m-%d-%H-%M')
sub.to_csv('data/%s_sub%.5f.csv' %(current,score), index=False)
#生成半监督训练数据
sub["proba"]=np.max(proba_t,axis=1)
sub.to_csv("data/semi_test_8_nobest.csv",index=False)
sub.info()