In [34]:
import os
import pandas as pd
from scipy.stats import zscore
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
import pandas as pd
import io
import requests
import numpy as np
from sklearn import metrics
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from scipy.stats import zscore


# Begin assignment
df = pd.read_csv("https://data.heatonresearch.com/data/t81-558/datasets/reg-33-data.csv")

# Encode the feature vector
ids = df['id']
df.drop('id',1,inplace=True)

# fill missing value
df['height'] = df['height'].fillna(df['height'].median())
df['length'] = df['length'].fillna(df['length'].median())

#Dummies
df = pd.get_dummies(data=df, columns=['convention','cat2','usage','region','code','item','country'])

#zscore
df[['height','max','number','length','power','weight']]= df[['height','max','number','length','power','weight']].apply(zscore)
df.head()

# set x & y
x_columns = df.columns.drop('target')
x = df[x_columns].values
y = df['target'].values

# Cross-Validate
kf = KFold(5, shuffle=True, random_state=42)
oos_y = []
oos_pred = []

fold = 0
for train, test in kf.split(x):
  fold+=1
  print(f"Fold #{fold}")
  
  x_train = x[train]
  y_train = y[train]
  x_test = x[test]
  y_test = y[test]
  
  model = Sequential()
  model.add(Dropout(0.2))
  model.add(Dense(60, input_dim=x.shape[1], activation='relu'))
  model.add(Dense(30, activation='relu'))
  model.add(Dense(1))
  model.compile(loss='mean_squared_error', optimizer='adam')
  
  monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto',
        restore_best_weights=True)
  model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
  
  pred = model.predict(x_test)
  
  oos_y.append(y_test)
  oos_pred.append(pred)
  
  # Measure each folder's RMSE
  score = np.sqrt(metrics.mean_squared_error(pred,y_test))
  print(f"Fold score (RMSE): {score}")
  
# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y) #concatenate默认按列叠加
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print(f"Final, out of sample score (RMSE): {score}")    

Fold #1
Restoring model weights from the end of the best epoch.
Epoch 00031: early stopping
Fold score (RMSE): 7569.1890592845775
Fold #2
Restoring model weights from the end of the best epoch.
Epoch 00037: early stopping
Fold score (RMSE): 7464.661575783819
Fold #3
Restoring model weights from the end of the best epoch.
Epoch 00028: early stopping
Fold score (RMSE): 7655.867265254203
Fold #4
Restoring model weights from the end of the best epoch.
Epoch 00026: early stopping
Fold score (RMSE): 7560.951308710839
Fold #5
Restoring model weights from the end of the best epoch.
Epoch 00027: early stopping
Fold score (RMSE): 8029.040551306764
Final, out of sample score (RMSE): 7658.418722906013


In [44]:
oos_y = pd.DataFrame(oos_y)
oos_y.rename(columns={0:'y'}, inplace=True)
oos_pred = pd.DataFrame(oos_pred) 
oos_pred.rename(columns={0:'pred'}, inplace=True)
ids = pd.DataFrame(ids)
concatenated = pd.concat([ids, oos_y, oos_pred],axis=1)
concatenated['diff'] = concatenated['pred'] - concatenated['y']
concatenated['diff'] = concatenated['diff'].abs()
concatenated

Unnamed: 0,id,y,pred,diff
0,1,44098.106769,51726.402344,7628.295575
1,2,130572.202064,123238.640625,7333.561439
2,3,67926.242813,72554.304688,4628.061874
3,4,64558.309366,69469.195312,4910.885946
4,5,48666.213736,54935.640625,6269.426889
5,6,54713.558618,61392.417969,6678.859350
6,7,57085.221438,61632.496094,4547.274655
7,8,96855.256660,95376.187500,1479.069160
8,9,104936.042881,100134.687500,4801.355381
9,10,117063.293999,110636.351562,6426.942436
