In [1]:
#人类行为识别
#陈奕阳、田泽予

In [2]:
#导入所需要的数据包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, plot_confusion_matrix, classification_report, precision_score, recall_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from IPython.display import display
import copy
import time
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import keras
from keras import regularizers

In [3]:
# Activity lookup table: position i holds the name used for activity_id i.
# Empty strings are placeholders for ids that have no name in this table.
mode_map = [
    "transient", "lying", "sitting", "standing", "walking",           # 0-4
    "running", "cycling", "Nordic_walking", "", "watching_TV",        # 5-9
    "computer_work", "car driving", "acending_stairs",                # 10-12
    "descending_stairs", "", "", "vacuum_cleaning", "ironing",        # 13-17
    "folding_laundry", "house_cleaning", "playing_soccer",            # 18-20
    "", "", "", "rope_jumping",                                       # 21-24
]

# Column names for the raw data files: three leading columns followed by the
# same 17 channels for each of the hand, chest and ankle sensors
# (temperature, four 3-axis signals, one 4-component orientation).
cols = ['time_stamp', 'activity_id', 'heart_rate'] + [
    f'{sensor}_{channel}'
    for sensor in ('hand', 'chest', 'ankle')
    for channel in (
        ['temperature']
        + [f'3D_{signal}_{axis}'
           for signal in ('acceleration_16', 'acceleration_6', 'gyroscope', 'magnetometer')
           for axis in 'xyz']
        + [f'4D_orientation_{axis}' for axis in 'xyzw']
    )
]

In [4]:
# Load the dataset, one DataFrame per subject.
path = "./PAMAP2_Dataset/Protocol/subject"
def load_subjects(path, subject_ids=range(101, 110)):
    """Read one whitespace-separated ``.dat`` file per subject.

    Parameters
    ----------
    path : str
        File-name prefix; each file is opened as ``path + str(subject_id) + '.dat'``.
    subject_ids : iterable of int, optional
        Subject numbers to load. Defaults to 101..109, the range the notebook
        originally hard-coded.

    Returns
    -------
    list of pandas.DataFrame
        One frame per subject, in ``subject_ids`` order. Columns are named
        from the module-level ``cols`` list, plus an ``id`` column holding the
        subject number.
    """
    subjects = []

    for subject_id in subject_ids:
        data_path = f"{path}{subject_id}.dat"
        # Raw string: '\s' in a plain string literal is an invalid escape
        # sequence and is only accepted with a warning.
        subject = pd.read_table(data_path, header=None, sep=r'\s+', names=cols)
        subject['id'] = subject_id
        subjects.append(subject)

    return subjects

data_subject = load_subjects(path)

In [5]:
def fix_data(data):
    """Clean every subject frame and return the cleaned frames as a new list.

    For each frame, missing values are filled with pandas' default
    ``interpolate()`` and then all rows whose ``activity_id`` is 0 are dropped.
    Interpolation runs first, so the rows that are dropped afterwards still
    take part in it. The input frames are not modified.
    """
    def _clean(frame):
        filled = frame.interpolate()
        unwanted = filled.loc[filled['activity_id'] == 0].index
        return filled.drop(unwanted)

    return [_clean(subject) for subject in data]

# Clean every subject frame; data_subject is rebound to the cleaned list.
data_subject = fix_data(data_subject)

In [6]:
# Stack all subject frames into one frame with a fresh 0..n-1 index.
data = pd.concat(data_subject).reset_index(drop=True)
# Last expression of the cell: renders the combined frame.
data

Unnamed: 0,time_stamp,activity_id,heart_rate,hand_temperature,hand_3D_acceleration_16_x,hand_3D_acceleration_16_y,hand_3D_acceleration_16_z,hand_3D_acceleration_6_x,hand_3D_acceleration_6_y,hand_3D_acceleration_6_z,...,ankle_3D_gyroscope_y,ankle_3D_gyroscope_z,ankle_3D_magnetometer_x,ankle_3D_magnetometer_y,ankle_3D_magnetometer_z,ankle_4D_orientation_x,ankle_4D_orientation_y,ankle_4D_orientation_z,ankle_4D_orientation_w,id
0,37.66,1,100.0,30.375,2.21530,8.27915,5.58753,2.24689,8.55387,5.77143,...,-0.027714,0.001752,-61.1081,-36.863600,-58.369600,1.000000,0.000000,0.000000,0.000000,101
1,37.67,1,100.0,30.375,2.29196,7.67288,5.74467,2.27373,8.14592,5.78739,...,0.000945,0.006007,-60.8916,-36.319700,-58.365600,1.000000,0.000000,0.000000,0.000000,101
2,37.68,1,100.0,30.375,2.29090,7.14240,5.82342,2.26966,7.66268,5.78846,...,-0.052422,-0.004882,-60.3407,-35.784200,-58.611900,1.000000,0.000000,0.000000,0.000000,101
3,37.69,1,100.0,30.375,2.21800,7.14365,5.89930,2.22177,7.25535,5.88000,...,-0.018844,0.026950,-60.7646,-37.102800,-57.879900,1.000000,0.000000,0.000000,0.000000,101
4,37.70,1,100.0,30.375,2.30106,7.25857,6.09259,2.20720,7.24042,5.95555,...,-0.048878,-0.006328,-60.2040,-37.122500,-57.884700,1.000000,0.000000,0.000000,0.000000,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1942867,95.06,24,162.0,25.125,4.99466,6.01881,5.59830,4.90787,6.05780,5.68357,...,-0.012885,0.005878,-45.7855,-0.831734,-0.170139,0.522929,-0.291612,0.705786,-0.378648,109
1942868,95.07,24,162.0,25.125,5.02764,5.90369,5.48372,4.89090,5.95209,5.56301,...,0.003629,-0.004235,-46.0331,-0.817288,0.538134,0.522880,-0.291694,0.705895,-0.378450,109
1942869,95.08,24,162.0,25.125,5.06409,5.71370,5.48491,4.97981,5.87584,5.45738,...,-0.035176,-0.002309,-45.5140,-1.229410,0.540438,0.522625,-0.291978,0.706161,-0.378084,109
1942870,95.09,24,162.0,25.125,5.13914,5.63724,5.48629,4.97690,5.69448,5.29167,...,-0.036457,-0.007076,-45.9093,-0.565555,0.680109,0.522536,-0.291955,0.706426,-0.377733,109


In [7]:
# Segmentation
def moving_window(df, length=512, shift=0):
    """Iterate over consecutive row windows of ``df``.

    Parameters
    ----------
    df : sliceable with a ``shape`` attribute (e.g. pandas.DataFrame)
        Data to segment; rows are sliced with ``df[start:stop]``.
    length : int, optional
        Number of rows per window.
    shift : int, optional
        Number of rows by which consecutive windows overlap; the window start
        advances by ``length - shift`` rows each time.

    Returns
    -------
    generator
        Windows in order. The trailing window(s) may contain fewer than
        ``length`` rows when the data does not divide evenly.

    Raises
    ------
    ValueError
        If ``length`` is not positive or ``shift >= length``. The step would
        then be zero or negative, which previously either failed inside
        ``range`` or silently produced no windows at all.
    """
    step = length - shift
    if length <= 0 or step <= 0:
        raise ValueError(
            f"need length > 0 and shift < length, got length={length}, shift={shift}"
        )
    size = df.shape[0]
    # Slicing past the end is clamped, so the final partial window needs no
    # special case.
    return (df[start:start + length] for start in range(0, size, step))

In [8]:
def get_peaks_DFT(t, f, top=5, dt=0.01):
    """Return the strongest spectral peaks of a uniformly sampled signal.

    Parameters
    ----------
    t : sequence
        Sample times; only its length is used (as the FFT size).
    f : array-like
        Signal values.
    top : int, optional
        Maximum number of peaks to return.
    dt : float, optional
        Sampling interval used to convert FFT bins to frequencies.

    Returns
    -------
    (list of float, list of float)
        Power values and the matching frequencies (in units of ``1/dt``),
        strongest first. Fewer than ``top`` entries are returned when the
        signal has fewer than ``top`` usable bins.

    Notes
    -----
    Only bins ``1 .. floor(n/2) - 1`` are considered. That skips the DC term
    and the mirrored upper half of the spectrum, which for real input would
    otherwise report every peak twice. The power spectrum is computed as a
    real array so that sorting and the float conversion are well defined.
    """
    n = len(t)
    if n == 0:
        return [], []
    fhat = np.fft.fft(f, n)
    # |fhat|^2 / n as a real-valued array.
    PSD = (fhat.real ** 2 + fhat.imag ** 2) / n
    freq = (1 / (dt * n)) * np.arange(n)
    L = np.arange(1, int(np.floor(n / 2)), dtype='int')

    strongest_first = np.argsort(PSD[L])[::-1][:top]
    top_index = L[strongest_first]
    return list(PSD[top_index].astype(np.float64)), list(freq[top_index])

In [10]:
# Number of spectral peaks per signal for the FFT features (currently disabled).
N_FFT_PEAKS = 5

# The first subject serves as the template for the column layout.
subject = data_subject[0]

# Summary statistics computed for every signal column of a window.
new_feats = ['max', 'min', 'mean', 'var', 'skew', 'kurtosis']
fft_feats = [f'top{i}_{k}' for k in ['PSD', 'freq'] for i in range(1, N_FFT_PEAKS+1)]
#new_feats += fft_feats

# One '<feature>_<signal>' name per statistic, grouped by signal. The slice
# skips the first two columns and the trailing 'id' column.
new_cols = [
    '_'.join((feat, col))
    for col in subject.columns[2:-1]
    for feat in new_feats
]

# Final layout of a feature row: label, statistics, subject id.
columns = ['activity_id'] + new_cols + ['id']

In [12]:
# Build one feature row per window: the most frequent activity id in the
# window, six summary statistics for every signal column, then the subject id.
rows = []
for subject in data_subject:
    for segment in moving_window(subject):
        # Only needed by the disabled FFT features below.
        t = segment['time_stamp']
        row = [segment['activity_id'].mode()[0]]
        # Columns 0-1 and the last column are skipped; the statistic order
        # below has to match the order of the generated column names.
        for i in range(2, len(segment.columns)-1):
            series = segment.iloc[:, i]
            row.extend([
                series.max(),
                series.min(),
                series.mean(),
                series.var(),
                series.skew(),
                series.kurtosis(),
            ])
            #top_PSDs, top_freqs = get_peaks_DFT(t, series, top=N_FFT_PEAKS)
            #row += top_PSDs + top_freqs
        row.append(segment['id'].iloc[0])
        rows.append(row)
segmented = pd.DataFrame(rows, columns=columns)
segmented

Unnamed: 0,activity_id,max_heart_rate,min_heart_rate,mean_heart_rate,var_heart_rate,skew_heart_rate,kurtosis_heart_rate,max_hand_temperature,min_hand_temperature,mean_hand_temperature,...,var_ankle_4D_orientation_z,skew_ankle_4D_orientation_z,kurtosis_ankle_4D_orientation_z,max_ankle_4D_orientation_w,min_ankle_4D_orientation_w,mean_ankle_4D_orientation_w,var_ankle_4D_orientation_w,skew_ankle_4D_orientation_w,kurtosis_ankle_4D_orientation_w,id
0,1,102.000000,100.0,101.490234,0.411575,-0.882699,-0.326263,30.4375,30.3750,30.417480,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,101
1,1,103.000000,102.0,102.207031,0.160932,1.450112,0.130486,30.5000,30.4375,30.486084,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,101
2,1,105.909091,103.0,104.054688,0.675175,-0.024326,-1.385904,30.5625,30.5000,30.526611,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,101
3,1,107.000000,106.0,106.705078,0.201234,-0.901677,-1.157321,30.6250,30.5625,30.573975,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,101
4,1,106.000000,103.0,104.560547,0.827140,-0.047368,-0.822068,30.6250,30.6250,30.625000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3794,24,154.000000,153.0,153.171875,0.135496,1.742804,1.115500,25.1250,25.0625,25.119385,...,0.001545,-0.669159,0.359304,-0.070364,-0.441434,-0.297615,0.007945,0.496857,-0.658364,109
3795,24,154.000000,153.0,153.050781,0.044739,4.094710,15.151994,25.1250,25.1250,25.125000,...,0.001359,-0.374331,-0.397474,-0.196159,-0.484406,-0.352572,0.004334,0.296299,-0.784905,109
3796,24,159.000000,154.0,156.237305,2.008618,0.378598,-1.044418,25.1250,25.1250,25.125000,...,0.003328,-0.297803,0.132597,-0.159602,-0.551772,-0.379662,0.007334,0.526879,-0.368876,109
3797,24,161.000000,159.0,160.111328,0.639957,-0.203103,-1.423994,25.1250,25.1250,25.125000,...,0.003067,-0.656631,-0.174480,-0.009793,-0.405541,-0.246068,0.011363,0.356198,-1.025469,109


In [13]:
def split_train_test(data, scaler, test_ids=(107, 108)):
    """Split the feature table by subject and optionally scale the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table with an ``id`` column (subject), an ``activity_id``
        column (label) and feature columns.
    scaler : str or None
        ``'StandardScaler'``, ``'MinMaxScaler'``, ``'PowerTransformer'`` or
        ``'none'`` (``None`` is treated like ``'none'``).
    test_ids : sequence of int, optional
        Subject ids held out as the test set; all other subjects form the
        training set. Defaults to subjects 107 and 108.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Feature matrices and label vectors. Test rows are ordered by
        ``test_ids``. The label arrays are independent, writable copies.

    Raises
    ------
    ValueError
        If ``test_ids`` is empty or ``scaler`` is not a recognised name.
    """
    test_ids = list(test_ids)
    if not test_ids:
        raise ValueError("test_ids must contain at least one subject id")

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
    test = pd.concat([data[data['id'] == subject_id] for subject_id in test_ids])
    train = data[~data['id'].isin(test_ids)]

    X_train = train.drop(['id', 'activity_id'], axis=1)
    X_test = test.drop(['id', 'activity_id'], axis=1)

    if scaler is None or scaler == 'none':
        X_train = X_train.values
        X_test = X_test.values
    else:
        if scaler == 'StandardScaler':
            transformer = StandardScaler()
        elif scaler == 'MinMaxScaler':
            transformer = MinMaxScaler()
        elif scaler == 'PowerTransformer':
            transformer = PowerTransformer()
        else:
            raise ValueError(
                f"unknown scaler {scaler!r}; expected 'StandardScaler', "
                "'MinMaxScaler', 'PowerTransformer' or 'none'"
            )
        # Fit on the training features only and reuse those parameters for
        # the test features. Refitting on the test set would replace the
        # training statistics and leak test information into the scaling.
        X_train = transformer.fit_transform(X_train)
        X_test = transformer.transform(X_test)

    # Explicit copies so callers may relabel the arrays in place.
    y_train = train['activity_id'].to_numpy(copy=True)
    y_test = test['activity_id'].to_numpy(copy=True)
    return X_train, X_test, y_train, y_test

In [14]:
def convert_labels(data):
    """Map raw activity ids to consecutive class indices, in place.

    The ids 1, 2, 3, 4, 5, 6, 7, 12, 13, 16, 17, 24 become 0..11 in that
    order; any other value is left untouched. ``data`` is modified element by
    element and the same object is returned.
    """
    raw_ids = (1, 2, 3, 4, 5, 6, 7, 12, 13, 16, 17, 24)
    class_of = {raw: index for index, raw in enumerate(raw_ids)}
    for position in range(len(data)):
        target = class_of.get(data[position])
        if target is not None:
            data[position] = target
    return data

In [16]:
# Split by subject without feature scaling, then one-hot encode the labels
# into 12 classes.
# convert_labels rewrites y_train / y_test element by element and returns the
# same arrays, so after this cell they hold class indices rather than the raw
# activity ids.
X_train, X_test, y_train, y_test = split_train_test(segmented, 'none')
y_train_convert = to_categorical(convert_labels(y_train), 12)
y_test_convert = to_categorical(convert_labels(y_test), 12)

In [17]:
def MLP():
    """Build and compile a fresh, untrained fully connected classifier.

    Two ReLU hidden layers (4096 and 2048 units), each followed by dropout
    with rate 0.5, feed a 12-unit softmax output layer. No input shape is
    declared here. The model is compiled with categorical cross-entropy, the
    Adam optimizer and accuracy as the reported metric.
    """
    network = Sequential([
        Dense(4096, activation='relu'),
        Dropout(0.5),
        Dense(2048, activation='relu'),
        Dropout(0.5),
        Dense(12, activation='softmax'),
    ])

    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [18]:
# Cross-validation with a shuffled 10-fold split (fixed seed) of the training set.
k_fold_cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold validation accuracy (in percent) and loss.
accuracy_list = []
loss_list = []
# Highest single-fold validation accuracy seen so far.
best_mean_acc = 0
start_time = time.time()

# NOTE(review): the label says "Repeated" but the splitter above is a plain
# KFold — confirm which one is intended before relying on the reported label.
cv_type = 'Repeated K-Fold'
for train_ids, val_ids in k_fold_cv.split(X_train, y_train_convert):
    #print(len(train_ids),len(val_ids),'\n')
    # A new, untrained network for every fold.
    kfold_model = MLP()

    # Train on the fold's training rows, then score on its held-out rows.
    kfold_model.fit(X_train[train_ids], y_train_convert[train_ids], batch_size=64, epochs=15, verbose=0)
    scores = kfold_model.evaluate(X_train[val_ids], y_train_convert[val_ids], verbose=0)

    # scores[0] is used as the loss and scores[1] as the accuracy.
    accuracy_list.append(scores[1] * 100)
    loss_list.append(scores[0])
    
    # Keep the test-set predictions of the fold with the best validation
    # accuracy so far.
    if scores[1] > best_mean_acc:
        pred_test = kfold_model.predict(X_test)
        best_mean_acc = scores[1]
# Wall-clock time for the whole loop, including the predict() calls above.
training_time = time.time() - start_time



In [19]:
# Results table, built column by column; every call to MLP_performance()
# appends one entry to each list.
result_info = {
    column: []
    for column in ('Model', 'Validation-Accuracy', 'Accuracy', 'Precision',
                   'Recall', 'F1-score', 'Training Time', 'Cross-validation')
}
def MLP_performance():
    """Append one result row for the most recent cross-validation run.

    Reads the notebook-level variables ``accuracy_list``, ``y_test``,
    ``pred_test_convert``, ``training_time`` and ``cv_type`` and appends to
    every list in ``result_info``. Test-set metrics are rounded to six
    decimals and expressed in percent; precision, recall and F1 are
    macro-averaged. Returns nothing.
    """
    def as_percent(score):
        return score.round(6)*100

    validation_summary = str(np.mean(accuracy_list)) + '+-' + str(np.std(accuracy_list))

    result_info['Model'].append('MLPClassifier')
    result_info['Validation-Accuracy'].append(validation_summary)
    result_info['Accuracy'].append(as_percent(accuracy_score(y_test, pred_test_convert)))
    macro_metrics = (('Precision', precision_score),
                     ('Recall', recall_score),
                     ('F1-score', f1_score))
    for column, metric in macro_metrics:
        result_info[column].append(as_percent(metric(y_test, pred_test_convert, average = 'macro')))
    result_info['Training Time'].append(str(training_time) + 's')
    result_info['Cross-validation'].append(cv_type)

In [20]:
# Turn each row of predicted class scores into the index of its largest
# entry, then record the metrics for this run.
pred_test_convert = [class_scores.argmax() for class_scores in pred_test]
MLP_performance() 

In [21]:
# Write the results collected so far to CSV, then display the same table.
pd.DataFrame(result_info).to_csv("result_NN_2.csv")
pd.DataFrame(result_info)

Unnamed: 0,Model,Validation-Accuracy,Accuracy,Precision,Recall,F1-score,Training Time,Cross-validation
0,MLPClassifier,91.70245826244354+-1.8486560050966454,70.5274,74.1423,71.2964,69.72,376.0956127643585s,Repeated K-Fold


In [22]:
# Cross-validation with 10 stratified random splits (25% held out per split,
# fixed seed) of the training set.
sss_cv = StratifiedShuffleSplit(n_splits = 10, test_size = 0.25, random_state = 42)

# Reset the per-run accumulators; they are rebound to fresh values here, so
# this run's numbers replace those of the earlier run.
accuracy_list = []
loss_list = []
best_mean_acc = 0
start_time = time.time()
cv_type = 'Stratified Random Sampling'
for train_ids, val_ids in sss_cv.split(X_train, y_train_convert):
    #print(len(train_ids),len(val_ids),'\n')
    # A new, untrained network for every split.
    sss_model = MLP()
    # Train on the split's training rows, then score on its held-out rows.
    sss_model.fit(X_train[train_ids], y_train_convert[train_ids], batch_size=64, epochs=15, verbose=0)
    scores = sss_model.evaluate(X_train[val_ids], y_train_convert[val_ids], verbose=0)

    # scores[0] is used as the loss and scores[1] as the accuracy.
    accuracy_list.append(scores[1] * 100)
    loss_list.append(scores[0])
    
    # Keep the test-set predictions of the split with the best validation
    # accuracy so far.
    if scores[1] > best_mean_acc:
        pred_test = sss_model.predict(X_test)
        best_mean_acc = scores[1]
# Wall-clock time for the whole loop, including the predict() calls above.
training_time = time.time() - start_time



In [23]:
# Turn each row of predicted class scores into the index of its largest
# entry, then record the metrics for this run.
pred_test_convert = [class_scores.argmax() for class_scores in pred_test]
MLP_performance()

In [25]:
# Write the results collected so far to CSV, then display the same table.
pd.DataFrame(result_info).to_csv("result_NN_init.csv")
pd.DataFrame(result_info)

Unnamed: 0,Model,Validation-Accuracy,Accuracy,Precision,Recall,F1-score,Training Time,Cross-validation
0,MLPClassifier,91.70245826244354+-1.8486560050966454,70.5274,74.1423,71.2964,69.72,376.0956127643585s,Repeated K-Fold
1,MLPClassifier,89.81638431549072+-1.4561730106976027,76.0083,75.9768,73.8249,73.4177,372.0806519985199s,Stratified Random Sampling
