# Neural Network for Regression 
This code blends the best of 2 tutorials: https://towardsdatascience.com/deep-neural-networks-for-regression-problems-81321897ca33 and https://www.tensorflow.org/tutorials/keras/regression

In [19]:
# If error [No module named 'sklearn'], in terminal: conda install -c conda-forge scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error 
%matplotlib notebook
from matplotlib import pyplot as plt
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
# pip install git+https://github.com/tensorflow/docs
import tensorflow as tf
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
# Libraries and options
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten

# 1. Data Preprocessing
Data: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

In [2]:
# Load the training data (train.csv) and show summary statistics of its columns
gt = pd.read_csv('train.csv')
gt.describe()

#!# If opening the funda file fails because of special characters:
# Open the file with Sublime. Open the menu Find > Replace.
# On the bottom left of the screen, click on the button for regular expressions (it's a dot and a star .* )
# Run this regular expression, and replace every match with a space
# [^A-Za-z0-9-,;()_ "'\.\-\n\r\t]

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [18]:
# Number of rows & columns
gt.shape

(1460, 216)

### Exclude columns (features) with missing values

In [3]:
# Exclude columns containing missing values
def get_cols_with_no_nans(df):
    """Return the labels of the columns of ``df`` that hold no missing value.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        list: the column labels, in their original order, whose values are
        all non-null.
    """
    return [label for label in df.columns if df[label].notna().all()]

# Keep only the fully populated columns, then summarise what is left
cols_no_nans = get_cols_with_no_nans(gt)
gt = gt.loc[:, cols_no_nans]
gt.describe()

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,10516.828082,6.099315,5.575342,1971.267808,1984.865753,443.639726,46.549315,567.240411,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,9981.264932,1.382997,1.112799,30.202904,20.645407,456.098091,161.319273,441.866955,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,223.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,9478.5,6.0,5.0,1973.0,1994.0,383.5,0.0,477.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,11601.5,7.0,6.0,2000.0,2004.0,712.25,0.0,808.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,215245.0,10.0,9.0,2010.0,2010.0,5644.0,1474.0,2336.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


### Clean data frame

In [4]:
# Rename & extract the variable to predict: 'SalePrice' becomes the last
# column under the name 'Target', and the column 'Id' is dropped.
# A new frame is built instead of calling drop(..., inplace=True) / setting a
# column on gt, because gt is itself the result of an indexing operation and
# in-place edits on such a frame trigger pandas' SettingWithCopy ambiguity.
target = gt['SalePrice']
gt = gt.drop(['SalePrice', 'Id'], axis=1).assign(Target=target)

# Overview of feature distribution
gt.hist(figsize=(10, 8))
plt.show()

<IPython.core.display.Javascript object>

### Explore correlations

In [5]:
# Compute pairwise correlations between the numeric columns only.
# The categorical (object) columns have not been encoded yet at this point,
# and recent pandas versions raise on them instead of silently skipping them.
correlation_matrix = gt.select_dtypes(include='number').corr()

# Open figure container
fig = plt.figure(figsize=(8, 8))
# Make color heatmap (library seaborn); vmin/vmax are symmetric around 0
sb.heatmap(correlation_matrix, vmax=0.8, vmin=-0.8, square=True, cmap='RdBu_r')
# Display plot
plt.show()

<IPython.core.display.Javascript object>

### One-hot-encoding of categorical variables

In [6]:
# Function for the one-hot encoding
def oneHotEncode(df, colNames):
    """One-hot encode the object-dtype columns of ``df`` listed in ``colNames``.

    Args:
        df: pandas DataFrame to encode. It is never modified.
        colNames: iterable of column labels to consider. Labels whose column
            is not of dtype ``object`` are left untouched; repeated labels
            are encoded once.

    Returns:
        A DataFrame in which every selected object column is replaced by its
        indicator columns (named ``<column>_<value>``). The indicator columns
        are appended after the remaining columns, in the order in which the
        labels appear in ``colNames``. When no column qualifies, ``df`` itself
        is returned.
    """
    # dict.fromkeys drops repeated labels while preserving their order
    object_cols = [
        col for col in dict.fromkeys(colNames)
        if df[col].dtype == np.dtype('object')
    ]
    if not object_cols:
        return df

    dummies = [pd.get_dummies(df[col], prefix=col) for col in object_cols]
    # A single concat avoids copying the whole frame once per encoded column
    return pd.concat([df.drop(object_cols, axis=1)] + dummies, axis=1)

# Labels of the categorical (object-dtype) columns
cat_cols = gt.select_dtypes(include=['object']).columns

# Encode them, reporting the number of columns before and after
print('There were {} columns before encoding categorical features'.format(gt.shape[1]))
gt = oneHotEncode(gt, cat_cols)
print('There are {} columns after encoding categorical features'.format(gt.shape[1]))


There were 61 columns before encoding categorical features
There are 216 columns after encoding categorical features


### Split data into train & test

In [7]:
# The first 60% of the rows are used for training, the remaining 40% for testing
n_train = round(gt.shape[0] * 0.6)
train = gt.iloc[:n_train]
test = gt.iloc[n_train:]

# Separate the variable to predict from the features.
# drop() without inplace returns new frames, so the row slices of gt taken
# above are never mutated (in-place edits on a slice are ambiguous in pandas).
target = train['Target']
train = train.drop(['Target'], axis=1)

truth = test['Target']
test = test.drop(['Target'], axis=1)

# 2. Build Neural Network

In [8]:
# Sequential model: a plain stack of layers, each one feeding the next
NN_model = Sequential([
    # First layer: one input per feature column of the training frame
    Dense(128, kernel_initializer='normal', input_dim=train.shape[1], activation='relu'),
    # Hidden layers
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    # Output layer: a single linear unit producing the predicted value
    Dense(1, kernel_initializer='normal', activation='linear'),
])

# Compile the network: squared error is minimised, absolute error is reported
NN_model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mean_absolute_error'],
)
NN_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               27648     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 126,721
Trainable params: 126,721
Non-trainable params: 0
_________________________________________________________________


### Checkpoint callback
A backup system that checks the model at the end of each epoch.
It saves the model only when its validation loss is better than that of every previous epoch.

A file is made for each new model, containing the weights & biases of each neuron. Any of these files can be loaded to instantiate the corresponding network.

In [9]:
# File name template; the epoch number and validation loss are filled in at save time
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
# Checkpoint system: monitors val_loss and keeps only the models that improve on it
checkpoint = ModelCheckpoint(
    checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

### Train the neural network

In [10]:
history = NN_model.fit(
    train,
    target,
    epochs=100,
    # Number of samples used for each gradient update (mini-batch size)
    batch_size=32,
    # Fraction of the training data held out to compute the validation loss
    validation_split=0.2,
    # Link to the checkpoint system, which saves the model when val_loss improves
    callbacks=callbacks_list,
)

Train on 700 samples, validate on 176 samples
Epoch 1/100
 32/700 [>.............................] - ETA: 5s - loss: 35472637952.0000 - mean_absolute_error: 179352.1875
Epoch 00001: val_loss improved from inf to 31822081861.81818, saving model to Weights-001--31822081861.81818.hdf5
Epoch 2/100
 32/700 [>.............................] - ETA: 0s - loss: 49331843072.0000 - mean_absolute_error: 182491.9688
Epoch 00002: val_loss improved from 31822081861.81818 to 7853151837.09091, saving model to Weights-002--7853151837.09091.hdf5
Epoch 3/100
 32/700 [>.............................] - ETA: 0s - loss: 5871048704.0000 - mean_absolute_error: 50696.1094
Epoch 00003: val_loss improved from 7853151837.09091 to 4872776704.00000, saving model to Weights-003--4872776704.00000.hdf5
Epoch 4/100
Epoch 00004: val_loss improved from 4872776704.00000 to 4563818007.27273, saving model to Weights-004--4563818007.27273.hdf5
Epoch 5/100
 32/700 [>.............................] - ETA: 0s - loss: 2504974080.000

Epoch 00041: val_loss did not improve from 1364200203.63636
Epoch 42/100
 32/700 [>.............................] - ETA: 0s - loss: 1664751104.0000 - mean_absolute_error: 28013.9980
Epoch 00042: val_loss improved from 1364200203.63636 to 1349876427.63636, saving model to Weights-042--1349876427.63636.hdf5
Epoch 43/100
 32/700 [>.............................] - ETA: 0s - loss: 2370180352.0000 - mean_absolute_error: 35112.0000
Epoch 00043: val_loss did not improve from 1349876427.63636
Epoch 44/100
 32/700 [>.............................] - ETA: 0s - loss: 1794754432.0000 - mean_absolute_error: 25597.9199
Epoch 00044: val_loss did not improve from 1349876427.63636
Epoch 45/100
 32/700 [>.............................] - ETA: 0s - loss: 1041768320.0000 - mean_absolute_error: 24063.2539
Epoch 00045: val_loss did not improve from 1349876427.63636
Epoch 46/100
 32/700 [>.............................] - ETA: 0s - loss: 2553967104.0000 - mean_absolute_error: 32834.9453
Epoch 00046: val_loss imp

 32/700 [>.............................] - ETA: 0s - loss: 6491301888.0000 - mean_absolute_error: 40606.3477
Epoch 00063: val_loss did not improve from 1331360640.00000
Epoch 64/100
 32/700 [>.............................] - ETA: 0s - loss: 2222128128.0000 - mean_absolute_error: 24900.9492
Epoch 00064: val_loss improved from 1331360640.00000 to 1316913163.63636, saving model to Weights-064--1316913163.63636.hdf5
Epoch 65/100
 32/700 [>.............................] - ETA: 0s - loss: 1102179328.0000 - mean_absolute_error: 23871.4727
Epoch 00065: val_loss did not improve from 1316913163.63636
Epoch 66/100
 32/700 [>.............................] - ETA: 0s - loss: 1750078976.0000 - mean_absolute_error: 28422.4805
Epoch 00066: val_loss did not improve from 1316913163.63636
Epoch 67/100
 32/700 [>.............................] - ETA: 0s - loss: 994994880.0000 - mean_absolute_error: 23052.2148
Epoch 00067: val_loss did not improve from 1316913163.63636
Epoch 68/100
 32/700 [>................

### Inspect errors, epoch after epoch

In [11]:
# Per-epoch metrics recorded during training, plus the epoch index as a column
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

Unnamed: 0,loss,mean_absolute_error,val_loss,val_mean_absolute_error,epoch
95,1461661000.0,25051.431641,1312171000.0,24675.289062,95
96,1536749000.0,26085.853516,1356326000.0,25519.607422,96
97,1491566000.0,25233.65625,1402628000.0,25898.054688,97
98,1482349000.0,25377.494141,1318064000.0,24637.498047,98
99,1529475000.0,25373.476562,1300698000.0,24431.054688,99


In [13]:
# Plot the evolution of the mean absolute error, epoch after epoch
# NOTE(review): smoothing_std presumably sets how strongly the curves are smoothed - confirm in tensorflow_docs
plotter = tfdocs.plots.HistoryPlotter(smoothing_std=2)
# 'Basic' is the label given to this training run
plotter.plot({'Basic': history}, metric = "mean_absolute_error")
plt.ylabel('MAE')
plt.show()

<IPython.core.display.Javascript object>

# 3. Test the Neural Network
First we instantiate the model using the best parameters from the training phase.

In [14]:
# Locate the best checkpoint automatically instead of pasting a file name by hand.
# The checkpoint system only writes a file when val_loss improves, so the most
# recently written 'Weights-*.hdf5' file holds the best parameters of the latest
# training run. (A pasted name embeds the val_loss of one specific run and no
# longer matches any file once the network is retrained.)
import glob
import os

checkpoint_files = glob.glob('Weights-*.hdf5')
if not checkpoint_files:
    raise FileNotFoundError(
        "No 'Weights-*.hdf5' checkpoint found in the working folder; "
        "train the network first"
    )
weights_file = max(checkpoint_files, key=os.path.getmtime)
print('Loading weights from {}'.format(weights_file))

# Load the parameters of the model (weights & biases)
NN_model.load_weights(weights_file)
NN_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

In [15]:
# Predict the target for the held-out test rows
# NOTE(review): with the single-unit output layer the result should be a 2-D array of shape (n_rows, 1) - confirm
prediction = NN_model.predict(test)

In [16]:
# Predictions against true values; both axes share the same range and the
# straight line marks where prediction and truth are equal
lims = [0, 800000]
plt.figure()
plt.scatter(prediction, truth, alpha=0.2)
plt.xlabel('Predictions')
plt.ylabel('True Values')
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)
plt.show()

<IPython.core.display.Javascript object>

In [17]:
# Per-sample prediction error.
# The network has a single output unit, so predict() yields a 2-D array with
# one column; subtracting the 1-D array of true values from it directly would
# broadcast to an (n, n) matrix, and its first row would only compare the
# first prediction against every true value. Flatten the predictions first.
error = prediction.flatten() - truth.to_numpy()
plt.figure()
plt.hist(error, bins=30)
plt.xlabel("Prediction Error")
plt.ylabel("Count")
plt.show()

<IPython.core.display.Javascript object>