In [1]:
import pandas as pd
from sklearn.neural_network import MLPRegressor

# Data Processing & Visualisation

In [15]:
# Read the results of the chapter 2 code; the column that pandas labelled
# 'Unnamed: 0' is renamed to 'timestamp'.
df = pd.read_csv("chapter2_result.csv").rename(columns={'Unnamed: 0': 'timestamp'})

# Lag-1 heart-rate feature: forward-fill the gaps, shift down by one row, then
# back-fill the leading rows that have no earlier observation.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
hr_lag1 = df['hr_watch_rate'].ffill().shift(1).bfill()

# Put the lag column directly in front of 'hr_watch_rate', located by name
# instead of by a hard-coded column position.
df.insert(df.columns.get_loc('hr_watch_rate'), 'hr_watch_rate_lag1', hr_lag1)

df.loc[:,['hr_watch_rate','hr_watch_rate_lag1']]

Unnamed: 0,hr_watch_rate,hr_watch_rate_lag1
0,,159.5
1,,159.5
2,,159.5
3,,159.5
4,,159.5
5,159.5,159.5
6,,159.5
7,158.0,159.5
8,156.0,158.0
9,154.0,156.0


In [16]:
def count_na(df):
    """Report, per column, the percentage of entries that are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column labels of ``df``, with the columns
        ``column_name`` and ``percent_missing`` (a percentage, 0-100).
    """
    n_rows = len(df)
    missing_counts = df.isna().sum()
    summary = pd.DataFrame({
        'column_name': df.columns,
        'percent_missing': missing_counts * 100 / n_rows,
    })
    return summary

# quick look at the data: the column names first, then the missing-value report
column_names = df.columns.values
print(column_names)

na_report = count_na(df)
print(na_report)

['timestamp' 'acc_phone_x' 'acc_phone_y' 'acc_phone_z' 'acc_watch_x'
 'acc_watch_y' 'acc_watch_z' 'gyr_phone_x' 'gyr_phone_y' 'gyr_phone_z'
 'gyr_watch_x' 'gyr_watch_y' 'gyr_watch_z' 'hr_watch_rate_lag1'
 'hr_watch_rate' 'labelOnTable' 'labelSitting' 'labelWashingHands'
 'labelWalking' 'labelStanding' 'labelDriving' 'labelEating'
 'labelRunning' 'light_phone_lux' 'mag_phone_x' 'mag_phone_y'
 'mag_phone_z' 'mag_watch_x' 'mag_watch_y' 'mag_watch_z'
 'press_phone_pressure']
                               column_name  percent_missing
timestamp                        timestamp         0.000000
acc_phone_x                    acc_phone_x         0.000000
acc_phone_y                    acc_phone_y         0.000000
acc_phone_z                    acc_phone_z         0.000000
acc_watch_x                    acc_watch_x         8.778818
acc_watch_y                    acc_watch_y         8.778818
acc_watch_z                    acc_watch_z         8.778818
gyr_phone_x                    gyr_phone_x  

# Model Training

The model is trained on all features in the dataset, including the lag-1 heart-rate column. Missing values in the predictors are imputed with the column means of the training set.

In [17]:
# Train/test split: rows with an observed heart rate are used for training,
# rows where it is missing are the ones the model has to fill in.
hr_observed = df['hr_watch_rate'].notna()
train = df[hr_observed]
test = df[~hr_observed]

# Predictors are every column except the timestamp and the target itself.
non_feature_cols = ['timestamp', 'hr_watch_rate']

# Mean-impute the predictors. The means are computed once, from the training
# rows only, and reused for the test rows.
X_train = train.drop(non_feature_cols, axis=1)
train_means = X_train.mean()
X_train = X_train.fillna(train_means)

y_train = train.loc[:, 'hr_watch_rate']

X_test = test.drop(non_feature_cols, axis=1).fillna(train_means)

In [18]:
# Multi-layer perceptron regressor from scikit-learn
mlp_settings = dict(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(100, 10),
    random_state=1,
)
clf = MLPRegressor(**mlp_settings)

# debugging aid, kept disabled: view the lag feature next to the target
# pd.concat([X_train['hr_watch_rate_lag1'],y_train],axis=1)

# fit on the training predictors; kept as the last expression of the cell so
# the fitted estimator is still what the cell displays
clf.fit(X_train, y_train)

MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

# Prediction

In [19]:
# Predict the heart rate for the rows where it was missing, keeping their
# original row labels so they line up with the observed values.
predicted_values = clf.predict(X_test)
predictions = pd.Series(predicted_values, index=test.index)

# Observed targets plus model predictions, put back into row-label order.
imputed_hr = pd.concat([y_train, predictions]).sort_index()

imputed_hr

0        159.185892
1        160.045611
2        160.049573
3        160.093050
4        160.111520
5        159.500000
6        160.133688
7        158.000000
8        156.000000
9        154.000000
10       154.624430
11       154.685358
12       154.697814
13       153.640003
14       154.677950
15       155.000000
16       155.678424
17       155.689049
18       155.665470
19       154.000000
20       155.000000
21       155.657178
22       156.000000
23       157.000000
24       158.000000
25       159.000000
26       159.550020
27       160.000000
28       161.000000
29       161.554769
            ...    
31808    101.690037
31809    101.691340
31810    101.691820
31811    101.690372
31812    101.692307
31813    102.324665
31814    101.693543
31815    101.690823
31816    101.690611
31817    101.693112
31818    101.696980
31819    101.696723
31820    101.693815
31821    102.325825
31822    101.694729
31823    101.694561
31824    101.695793
31825    102.324262
31826    101.695280


In [7]:
# Write relative to the working directory (the same way chapter2_result.csv is
# read) instead of a machine-specific absolute path under one user's home.
imputed_hr.to_csv("output_model.csv")
