In [1]:
import pandas as pd
from sklearn.neural_network import MLPRegressor

# Data Processing & Visualisation

In [2]:
# Load the output of the chapter 2 code; the unnamed first CSV column is
# relabelled as the timestamp.
df = (
    pd.read_csv("chapter2_result.csv")
      .rename(columns={'Unnamed: 0': 'timestamp'})
)

# Add the heart rate of the previous row as a lag-1 column at position 13.
df.insert(loc=13,
          column='hr_watch_rate_lag1',
          value=df['hr_watch_rate'].shift(periods=1))

In [3]:
def count_na(df):
    """Summarise how much of each column is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by column name) with two
        columns: ``column_name`` and ``percent_missing``, the percentage
        of rows in which that column is NaN.
    """
    na_per_column = df.isna().sum()
    share_in_percent = na_per_column * 100 / len(df)
    report = pd.DataFrame({'column_name': df.columns,
                           'percent_missing': share_in_percent})
    return report

# Quick overview: list the column names, then the percentage of missing values per column
print(df.columns.values)
print(count_na(df))

['timestamp' 'acc_phone_x' 'acc_phone_y' 'acc_phone_z' 'acc_watch_x'
 'acc_watch_y' 'acc_watch_z' 'gyr_phone_x' 'gyr_phone_y' 'gyr_phone_z'
 'gyr_watch_x' 'gyr_watch_y' 'gyr_watch_z' 'hr_watch_rate_lag1'
 'hr_watch_rate' 'labelOnTable' 'labelSitting' 'labelWashingHands'
 'labelWalking' 'labelStanding' 'labelDriving' 'labelEating'
 'labelRunning' 'light_phone_lux' 'mag_phone_x' 'mag_phone_y'
 'mag_phone_z' 'mag_watch_x' 'mag_watch_y' 'mag_watch_z'
 'press_phone_pressure']
                               column_name  percent_missing
timestamp                        timestamp         0.000000
acc_phone_x                    acc_phone_x         0.000000
acc_phone_y                    acc_phone_y         0.000000
acc_phone_z                    acc_phone_z         0.000000
acc_watch_x                    acc_watch_x         8.778818
acc_watch_y                    acc_watch_y         8.778818
acc_watch_z                    acc_watch_z         8.778818
gyr_phone_x                    gyr_phone_x  

# Model Training

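The model is trained on all features in the dataset except the timestamp, including the lag-1 heart-rate column. Rows with an observed heart rate form the training set; rows where the heart rate is missing are the ones to be predicted. Missing values in the predictors are imputed with the column means of the training set.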

In [4]:
# Train/test split: rows with an observed heart rate are used for training,
# rows where the heart rate is missing are the ones to be predicted.
test = df[df['hr_watch_rate'].isna()]
train = df[df['hr_watch_rate'].notna()]  # vectorised mask instead of a Python-level loop

# Predictors are every column except the timestamp and the target.
# Gaps in the training predictors are filled with the per-column mean.
X_train = train.drop(['timestamp', 'hr_watch_rate'], axis=1)
X_train = X_train.fillna(X_train.mean())

y_train = train.loc[:, 'hr_watch_rate']

# Gaps in the test predictors are filled with the training-set column means
# as well (filling X_train with its own means leaves those means unchanged).
X_test = test.drop(['timestamp', 'hr_watch_rate'], axis=1)
X_test = X_test.fillna(X_train.mean())

In [5]:
# scikit-learn multi-layer perceptron regressor: two hidden layers with
# 5 and 2 units, fitted with the L-BFGS solver and a fixed random_state.
# NOTE(review): the predictors appear to be passed in unscaled; scikit-learn
# recommends standardising inputs for MLPs - confirm whether scaling is
# already done upstream.
clf = MLPRegressor(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
)

clf.fit(X_train, y_train)

MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(5, 2), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

# Prediction

In [6]:
# Predict the heart rate for the rows where it is missing, keeping the
# original row labels so the predictions can be merged back in order.
predictions = pd.Series(clf.predict(X_test), index=test.index)

# Observed values plus predictions, restored to the original row order.
imputed_hr = pd.concat([y_train, predictions]).sort_index()

In [7]:
# Write the imputed heart-rate series to disk. pandas has no module-level
# `pd.to_csv`; `to_csv` is a method of the Series itself.
# NOTE(review): the target path "C" looks truncated - confirm the intended output file.
imputed_hr.to_csv("C")


0        138.588686
1        138.839373
2        139.382790
3        139.152991
4        138.952642
5        159.500000
6        172.525618
7        158.000000
8        156.000000
9        154.000000
10       167.556136
11       138.946241
12       139.006769
13       138.943338
14       138.944720
15       155.000000
16       168.084745
17       138.875072
18       138.955888
19       154.000000
20       155.000000
21       168.247498
22       156.000000
23       157.000000
24       158.000000
25       159.000000
26       171.804365
27       160.000000
28       161.000000
29       173.804304
            ...    
31808    124.304783
31809    124.279818
31810    124.256107
31811    124.294356
31812    124.228687
31813    124.374371
31814    124.293347
31815    124.286879
31816    124.356106
31817    124.276896
31818    124.251430
31819    124.310267
31820    124.349322
31821    124.353347
31822    124.265879
31823    124.283325
31824    124.251910
31825    124.368529
31826    124.260321
