# 导入包

In [130]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy.stats import skew
from scipy.stats import kurtosis
from scipy.stats import mode
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import warnings 
warnings.filterwarnings("ignore")

# 导入数据

In [2]:
nrows = None

df_train = pd.read_csv('../data/sensor_train.csv',sep=',',nrows=nrows)
df_test = pd.read_csv('../data/sensor_test.csv',sep=',',nrows=nrows)
df_submit = pd.read_csv('../data/提交结果示例.csv',sep=',',nrows=nrows)

# 合并数据

In [3]:
df_train['flag'] = 'train'
df_test['flag'] = 'test'
df_test['behavior_id'] = -1

In [131]:
df_train_test = pd.concat([df_train, df_test])

In [5]:
df_train_test['acc_all'] = (df_train_test['acc_x'] ** 2 + df_train_test['acc_y'] ** 2 + df_train_test['acc_z'] ** 2) ** 0.5
df_train_test['acc_allg'] = (df_train_test['acc_xg'] ** 2 + df_train_test['acc_yg'] ** 2 + df_train_test['acc_zg'] ** 2) ** 0.5

In [6]:
df_train_test = df_train_test.sort_values(['flag','fragment_id','time_point'])

In [8]:
# 统计长度
# Counter(list(df_train_test.groupby(['flag','fragment_id','behavior_id']).size()))

# 数据聚合

In [9]:
seq_len = 61
def agg_func(x):
    list_x = list(x)
    len_x = len(list_x)
    if len_x <= seq_len:
        list_x = [0] * (seq_len-len_x) + list_x
    else:
        list_x = list_x[:seq_len]
    return list_x

map_agg_func = {    
    'time_point' : agg_func,
    
    'acc_all' : agg_func,
    'acc_allg' : agg_func,
    
    'acc_x' : agg_func,
    'acc_y' : agg_func,
    'acc_z' : agg_func,
    
    'acc_xg' : agg_func,
    'acc_yg' : agg_func,
    'acc_zg' : agg_func
}

In [10]:
df_train_test_list = df_train_test.groupby(['flag','fragment_id','behavior_id']).agg(map_agg_func).reset_index()

# 查看聚合后的数据

In [11]:
df_train_test_list.head()

Unnamed: 0,flag,fragment_id,behavior_id,time_point,acc_all,acc_allg,acc_x,acc_y,acc_z,acc_xg,acc_yg,acc_zg
0,test,0,-1,"[0, 0, 0, 71, 150, 244, 326, 409, 495, 579, 66...","[0, 0, 0, 1.1832159566199232, 1.22065556157337...","[0, 0, 0, 10.619322012256713, 10.16267681273, ...","[0, 0, 0, 0.2, 0.0, -0.2, 2.3, -0.3, 0.3, -0.8...","[0, 0, 0, 1.0, 1.0, 0.8, -0.4, -1.4, -1.1, 0.0...","[0, 0, 0, 0.6, -0.7, -2.4, -1.6, 3.3, 3.2, -0....","[0, 0, 0, 0.2, 0.2, -0.4, 2.5, 0.2, 0.5, -0.3,...","[0, 0, 0, 5.3, 6.0, 5.3, 4.5, 3.5, 3.1, 4.1, 4...","[0, 0, 0, 9.2, 8.2, 7.6, 3.4, 12.4, 9.8, 8.5, ..."
1,test,1,-1,"[0, 0, 0, 0, 0, 0, 151, 232, 318, 406, 493, 58...","[0, 0, 0, 0, 0, 0, 0.1, 0.1, 0.0, 0.2236067977...","[0, 0, 0, 0, 0, 0, 9.629122493768579, 9.525229...","[0, 0, 0, 0, 0, 0, 0.1, 0.0, 0.0, 0.2, 0.1, 0....","[0, 0, 0, 0, 0, 0, 0.0, 0.0, 0.0, -0.1, 0.0, 0...","[0, 0, 0, 0, 0, 0, 0.0, 0.1, 0.0, 0.0, -0.3, 0...","[0, 0, 0, 0, 0, 0, -1.0, -0.9, -1.2, -0.8, -0....","[0, 0, 0, 0, 0, 0, 4.6, 4.4, 4.6, 4.4, 4.4, 4....","[0, 0, 0, 0, 0, 0, 8.4, 8.4, 8.3, 8.2, 8.2, 8...."
2,test,2,-1,"[0, 0, 0, 0, 0, 46, 135, 233, 315, 397, 483, 5...","[0, 0, 0, 0, 0, 0.1, 0.31622776601683794, 0.5,...","[0, 0, 0, 0, 0, 10.00299955013495, 9.720596689...","[0, 0, 0, 0, 0, 0.0, 0.3, 0.3, 0.3, 0.4, -0.1,...","[0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.3, 0.0, -0.1,...","[0, 0, 0, 0, 0, 0.1, -0.1, -0.4, 0.4, 0.5, 0.0...","[0, 0, 0, 0, 0, 0.9, 1.2, 1.2, 1.3, 1.2, 0.9, ...","[0, 0, 0, 0, 0, 3.3, 3.2, 3.2, 3.2, 2.9, 2.8, ...","[0, 0, 0, 0, 0, 9.4, 9.1, 8.8, 9.6, 9.7, 9.4, ..."
3,test,3,-1,"[0, 0, 0, 0, 91, 172, 264, 345, 436, 516, 618,...","[0, 0, 0, 0, 0.28284271247461906, 0.2, 0.0, 0....","[0, 0, 0, 0, 9.642095207992918, 9.390953093270...","[0, 0, 0, 0, 0.0, 0.0, 0.0, -0.2, 0.0, 0.0, 0....","[0, 0, 0, 0, -0.2, -0.2, 0.0, 0.0, 0.0, -0.1, ...","[0, 0, 0, 0, 0.2, 0.0, 0.0, -0.2, -0.2, -0.3, ...","[0, 0, 0, 0, 0.2, 0.3, 0.2, 0.2, 0.2, 0.1, 0.2...","[0, 0, 0, 0, 5.8, 5.9, 5.8, 5.9, 5.9, 5.8, 5.9...","[0, 0, 0, 0, 7.7, 7.3, 7.4, 7.5, 7.3, 7.1, 7.5..."
4,test,4,-1,"[0, 38, 112, 205, 282, 364, 451, 530, 617, 700...","[0, 0.7348469228349533, 0.31622776601683794, 1...","[0, 10.431203190428226, 9.633794683301073, 8.9...","[0, 0.2, 0.1, -0.1, -0.4, 2.3, 0.7, -0.9, 0.4,...","[0, -0.1, 0.0, 0.5, 1.0, -0.3, -0.5, -0.5, -1....","[0, 0.7, -0.3, -0.9, 0.3, 1.7, 0.3, -1.4, 1.4,...","[0, 3.7, 3.4, 3.0, 2.3, 5.6, 3.3, 2.3, 3.2, 1....","[0, 4.6, 5.0, 5.1, 6.1, 5.0, 4.9, 5.4, 4.7, 5....","[0, 8.6, 7.5, 6.7, 8.3, 9.1, 8.4, 5.9, 8.8, 6...."


# 特征处理

In [12]:
list_features = []
for index, row in tqdm(df_train_test_list.iterrows()):
    acc_all = np.array(row['acc_all'])
    acc_allg = np.array(row['acc_allg'])
    acc_x = np.array(row['acc_x'])
    acc_y = np.array(row['acc_y'])
    acc_z = np.array(row['acc_z'])
    acc_xg = np.array(row['acc_xg'])
    acc_yg = np.array(row['acc_yg'])
    acc_zg = np.array(row['acc_zg'])
    
    features = np.stack([acc_all,acc_allg,acc_x,acc_y,acc_z,acc_xg,acc_yg,acc_zg]).T
    list_features.append(features)

14792it [00:05, 2702.28it/s]


In [13]:
df_train_test_list['features'] = list_features

# 模型训练 

In [133]:
import tensorflow as tf

In [16]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers.core import Dense, Dropout

Using TensorFlow backend.


In [17]:
df_train_test_features = df_train_test_list
X = df_train_test_features[df_train_test_features['flag']=='train']['features'].values
y = df_train_test_features[df_train_test_features['flag']=='train']['behavior_id'].values
X_test = df_train_test_features[df_train_test_features['flag']=='test']['features'].values

# CNN

In [21]:
def accuracy(y_true, y_pred):
    return tf.keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1)

In [24]:
df_train_stacking = pd.DataFrame(np.zeros((X.shape[0],19)))
df_test_stacking = pd.DataFrame(np.zeros((X_test.shape[0],19)))

In [25]:
seed = 2020
folds = 5
kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

# 训练模型

In [105]:
for train_index, val_index in tqdm(kfold.split(X, y)):
    
    inputs = tf.keras.Input(shape=(61,8))
    
    
    layer_cnn = tf.keras.layers.Conv1D(256, 10, padding='same')(inputs)
    layer_maxpool = tf.keras.layers.MaxPool1D(3, padding='same')(layer_cnn)
    
    for i in [5,3]:
        layer_cnn = tf.keras.layers.Conv1D(256, i, padding='same')(layer_maxpool)
        layer_maxpool = tf.keras.layers.MaxPool1D(2, padding='same')(layer_cnn)
    
    layer_flatten = tf.keras.layers.Flatten()(layer_maxpool)

    layer_dense = tf.keras.layers.Dense(units=64, activation=tf.nn.relu)(layer_flatten)
    outputs = tf.keras.layers.Dense(units=19, activation=tf.nn.softmax)(layer_dense)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=[tf.keras.metrics.sparse_categorical_crossentropy,
                      accuracy])

    print('--------------- begin ---------------')
    X_train, X_val = np.array(list(X[train_index])), np.array(list(X[val_index]))
    y_train, y_val = np.array(list(y[train_index])), np.array(list(y[val_index]))
    
    model.fit(X_train,y_train,
          batch_size=2048,
          epochs=5,
          validation_data=(X_val,y_val)
         )
    X_val_predict = model.predict(X_val)
    X_test_predict = model.predict(np.array(list(X_test)))
    
    df_train_stacking.loc[val_index,:] = X_val_predict
    df_test_stacking[:] += X_test_predict / folds
    print('--------------- end ---------------')

0it [00:00, ?it/s]

--------------- begin ---------------
Train on 5833 samples, validate on 1459 samples
Epoch 1/5

Epoch 2/5

Epoch 3/5

Epoch 4/5

Epoch 5/5



1it [00:06,  6.61s/it]

--------------- end ---------------
--------------- begin ---------------
Train on 5833 samples, validate on 1459 samples
Epoch 1/5

Epoch 2/5

Epoch 3/5

Epoch 4/5

Epoch 5/5



2it [00:13,  6.63s/it]

--------------- end ---------------
--------------- begin ---------------
Train on 5834 samples, validate on 1458 samples
Epoch 1/5

Epoch 2/5

Epoch 3/5

Epoch 4/5

Epoch 5/5



3it [00:20,  6.72s/it]

--------------- end ---------------
--------------- begin ---------------
Train on 5834 samples, validate on 1458 samples
Epoch 1/5

Epoch 2/5

Epoch 3/5

Epoch 4/5

Epoch 5/5



4it [00:27,  7.03s/it]

--------------- end ---------------
--------------- begin ---------------
Train on 5834 samples, validate on 1458 samples
Epoch 1/5

Epoch 2/5

Epoch 3/5

Epoch 4/5

Epoch 5/5



5it [00:34,  6.97s/it]

--------------- end ---------------





In [106]:
df_test_stacking.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.243812,0.064706,0.029902,0.047865,0.989932,0.039458,0.056583,0.049457,0.044811,0.059043,0.08398,0.14238,7.102009,2.740461,0.351695,0.536511,0.147312,0.15635,1.313733
1,3.619321,2.087809,0.084942,3.298951,0.203715,0.162211,1.595368,0.10977,0.091975,0.55709,0.584444,0.413253,0.478515,0.198514,0.142229,0.264602,0.076586,0.142535,0.088169
2,0.325228,2.108909,0.15232,1.788739,0.172716,0.444777,2.385206,0.434482,0.292874,1.470469,2.028093,0.76191,0.343931,0.286607,0.17472,0.487102,0.142693,0.273382,0.125843
3,2.327215,3.1253,0.055698,1.412805,0.084388,0.148327,4.144405,0.117813,0.069589,0.825792,0.221483,0.491864,0.334511,0.15919,0.09673,0.312032,0.059305,0.128634,0.084916
4,0.112531,0.137456,0.052214,0.113882,0.546122,0.098055,0.092291,0.094492,0.06141,0.086177,0.106932,0.281676,1.327897,4.019764,0.159728,0.564558,0.149304,0.949602,5.245909


# 验证和输出结果

In [107]:
def acc_combo(y, y_pred):
    # 数值ID与行为编码的对应关系
    mapping = {0: 'A_0', 1: 'A_1', 2: 'A_2', 3: 'A_3', 
        4: 'D_4', 5: 'A_5', 6: 'B_1',7: 'B_5', 
        8: 'B_2', 9: 'B_3', 10: 'B_0', 11: 'A_6', 
        12: 'C_1', 13: 'C_3', 14: 'C_0', 15: 'B_6', 
        16: 'C_2', 17: 'C_5', 18: 'C_6'}
    # 将行为ID转为编码
    code_y, code_y_pred = mapping[y], mapping[y_pred]
    if code_y == code_y_pred: #编码完全相同得分1.0
        return 1.0
    elif code_y.split("_")[0] == code_y_pred.split("_")[0]: #编码仅字母部分相同得分1.0/7
        return 1.0/7
    elif code_y.split("_")[1] == code_y_pred.split("_")[1]: #编码仅数字部分相同得分1.0/3
        return 1.0/3
    else:
        return 0.0

In [108]:
labels = np.argmax(df_test_stacking.values, axis=1)
pred_y = np.argmax(df_train_stacking.values, axis=1)


acc_scores = round(accuracy_score(y, pred_y), 5)
acc_combo_scores = round(sum(acc_combo(y_true, y_pred) for y_true, y_pred in zip(y, pred_y)) / len(list(y)),5)

print('--------')
print(' acc : ', acc_scores, 'acc_combo : ', acc_combo_scores)

df_out = df_train_test_features[df_train_test_features['flag']=='test'][['fragment_id']]
df_out['behavior_id'] = labels
df_out.to_csv('./submit_cnn_%.5f_%.5f.csv' % (acc_scores, acc_combo_scores), index=False)

--------
 acc :  0.22943 acc_combo :  0.3257


# LSTM

In [109]:
df_train_stacking = pd.DataFrame(np.zeros((X.shape[0],19)))
df_test_stacking = pd.DataFrame(np.zeros((X_test.shape[0],19)))

In [110]:
seed = 2020
folds = 5
kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

# 训练模型

In [128]:
for train_index, val_index in tqdm(kfold.split(X, y)):
    
    inputs = tf.keras.Input(shape=(61,8))
    
    layer_lstm = tf.keras.layers.LSTM(128,input_shape=(61,8),return_sequences=True)(inputs)
    
    layer_lstm = tf.keras.layers.LSTM(128)(layer_lstm)

    layer_flatten = tf.keras.layers.Flatten()(layer_lstm)

    layer_dense = tf.keras.layers.Dense(units=64, activation=tf.nn.tanh)(layer_flatten)
    outputs = tf.keras.layers.Dense(units=19, activation=tf.nn.softmax)(layer_dense)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=[tf.keras.metrics.sparse_categorical_crossentropy,
                      accuracy])

    print('--------------- begin ---------------')
    X_train, X_val = np.array(list(X[train_index])), np.array(list(X[val_index]))
    y_train, y_val = np.array(list(y[train_index])), np.array(list(y[val_index]))
    
    model.fit(X_train,y_train,
          batch_size=1024*5,
          epochs=10,
          validation_data=(X_val,y_val)
         )
    X_val_predict = model.predict(X_val)
    X_test_predict = model.predict(np.array(list(X_test)))
    
    df_train_stacking.loc[val_index,:] = X_val_predict
    df_test_stacking[:] += X_test_predict / folds
    print('--------------- end ---------------')

0it [00:00, ?it/s]

--------------- begin ---------------
Train on 5833 samples, validate on 1459 samples
Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10



1it [00:26, 26.35s/it]

--------------- end ---------------
--------------- begin ---------------
Train on 5833 samples, validate on 1459 samples
Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10



2it [00:53, 26.61s/it]

--------------- end ---------------
--------------- begin ---------------
Train on 5834 samples, validate on 1458 samples
Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10



3it [01:19, 26.30s/it]

--------------- end ---------------
--------------- begin ---------------
Train on 5834 samples, validate on 1458 samples
Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10



4it [01:46, 26.50s/it]

--------------- end ---------------
--------------- begin ---------------
Train on 5834 samples, validate on 1458 samples
Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10



5it [02:12, 26.57s/it]

--------------- end ---------------





# 验证和输出结果

In [129]:
labels = np.argmax(df_test_stacking.values, axis=1)
pred_y = np.argmax(df_train_stacking.values, axis=1)


acc_scores = round(accuracy_score(y, pred_y), 5)
acc_combo_scores = round(sum(acc_combo(y_true, y_pred) for y_true, y_pred in zip(y, pred_y)) / len(list(y)),5)

print('--------')
print(' acc : ', acc_scores, 'acc_combo : ', acc_combo_scores)

df_out = df_train_test_features[df_train_test_features['flag']=='test'][['fragment_id']]
df_out['behavior_id'] = labels
df_out.to_csv('./submit_lstm_%.5f_%.5f.csv' % (acc_scores, acc_combo_scores), index=False)

--------
 acc :  0.31363 acc_combo :  0.40915
