# Episode 19. Supervised Learning For Cross-Sectional Analysis

## Data: Boston Housing Prices
### Keras Built-in function - https://keras.io/api/datasets/boston_housing/
### Excel file from Kaggle - https://www.kaggle.com/fedesoriano/the-boston-houseprice-data


In [None]:
### Original Data - http://lib.stat.cmu.edu/datasets/boston
#### The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
#### prices and the demand for clean air', J. Environ. Economics & Management,
#### vol.5, 81-102, 1978.

# There are 14 attributes in each case of the dataset. They are:
# CRIM - per capita crime rate by town
# ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
# INDUS - proportion of non-retail business acres per town.
# CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# NOX - nitric oxides concentration (parts per 10 million)
# RM - average number of rooms per dwelling
# AGE - proportion of owner-occupied units built prior to 1940
# DIS - weighted distances to five Boston employment centres
# RAD - index of accessibility to radial highways
# TAX - full-value property-tax rate per $10,000
# PTRATIO - pupil-teacher ratio by town
# B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# LSTAT - % lower status of the population
# MEDV - Median value of owner-occupied homes in $1000's

## Step 1. Import Libraries

In [None]:
import io

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

## Step 2. Load Data

In [None]:
# Load Boston Housing Data
# Original Data Source: https://keras.io/api/datasets/boston_housing
# Opens the Colab file-upload dialog and keeps the call's return value in `uploaded`.
# NOTE: when a file with the same name already exists, Colab saves the new upload
# under a different name (e.g. "boston (2).csv").
from google.colab import files
uploaded = files.upload()

Saving boston.csv to boston (2).csv


In [None]:
# Store data in DataFrame
# Read the bytes of the file that was just uploaded instead of a fixed file name:
# when a file called boston.csv already exists, Colab stores the new upload under
# another name (e.g. "boston (2).csv"), so reading "boston.csv" would silently
# load the older copy.
uploaded_files = globals().get('uploaded') or {}
if uploaded_files:
    # files.upload() maps each file name to its raw bytes; use the first entry
    latest_bytes = next(iter(uploaded_files.values()))
    df = pd.read_csv(io.BytesIO(latest_bytes))
else:
    # nothing was uploaded in this session: fall back to the file on disk
    df = pd.read_csv("boston.csv")

## Step 3. Check & Cleanup Data

In [None]:
# check data: show the column names exactly as they were read from the file
print(df.keys())
# rename columns: trim surrounding whitespace, convert to small letters, and
# remove every character that is not a letter, digit or underscore
# (spaces and other special characters), so later look-ups such as 'medv'
# do not depend on how the header was typed in the file
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(r'[^0-9a-z_]+', '', regex=True)
)
# check modified column names
print(df.keys())

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')
Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat', 'medv'],
      dtype='object')


In [None]:
# number of missing values in each column
print(df.isna().sum())
# drop every row that contains at least one missing value
drop_df = df.dropna(how='any')
print(drop_df.shape)
# the following cells read `df`, so rebind it to the cleaned frame;
# otherwise the rows removed above would still be used downstream
df = drop_df

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64
(506, 14)


In [None]:
# Find Duplicate Data: rows that repeat an earlier row in every column
duplicate_rows_df = df[df.duplicated()]
print(duplicate_rows_df.shape)
# drop duplicate data and keep the result, so the de-duplicated frame is the
# one used by the following cells (previously the result was only printed)
df = df.drop_duplicates()
print(df.shape)

(0, 14)
(506, 14)


In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for every column;
# kept as the last bare expression so the notebook renders it as a table
df.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


## Step 4. Set Training Variables

In [None]:
# split train & test data
from sklearn.model_selection import train_test_split

# boolean mask over the columns: True only for the target column 'medv'
is_target = df.columns == 'medv'
# features = every non-target column; target stays a one-column DataFrame
x = df.loc[:, ~is_target]
y = df.loc[:, is_target]
# hold out 30% of the rows for testing; fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=123,
)

In [None]:
# check train & test data: one (rows, columns) shape per line, in the order
# x_train, y_train, x_test, y_test
print(
    x_train.shape,
    y_train.shape,
    x_test.shape,
    y_test.shape,
    sep='\n',
)

(354, 13)
(354, 1)
(152, 13)
(152, 1)


In [None]:
# summary stats of the training features (shown before they are standardized);
# kept as the last bare expression so the notebook renders it as a table
x_train.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
count,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0
mean,3.515843,11.432203,10.889379,0.059322,0.550042,6.314153,67.660734,3.85366,9.214689,401.550847,18.441525,360.450537,12.592599
std,9.203,22.989767,6.839935,0.236561,0.113865,0.708672,28.354127,2.127925,8.451106,166.332065,2.133292,87.90948,7.222552
min,0.01301,0.0,0.46,0.0,0.392,3.561,6.0,1.1691,1.0,187.0,12.6,0.32,1.73
25%,0.080573,0.0,5.145,0.0,0.44825,5.9265,42.325,2.1137,4.0,277.0,17.4,377.255,6.8775
50%,0.22325,0.0,8.56,0.0,0.524,6.214,76.5,3.2948,5.0,329.0,18.95,392.215,10.805
75%,2.66286,19.375,18.1,0.0,0.624,6.634,92.975,5.226975,8.0,666.0,20.2,396.295,17.1175
max,88.9762,100.0,27.74,1.0,0.871,8.725,100.0,12.1265,24.0,711.0,22.0,396.9,36.98


In [None]:
# standardize data
# The mean and standard deviation are computed from the training split only and
# then reused for the test split, so the test data does not influence the scaling.
mean = x_train.mean(axis=0)
std = x_train.std(axis=0)
# A column that is constant in the training split has std == 0 and would turn
# into NaN/inf after the division; dividing by 1 instead leaves it at 0 after
# centering.
std = std.replace(0, 1)
x_train = (x_train - mean) / std
x_test = (x_test - mean) / std

In [None]:
# check mean: per-column means of the train split first, then of the test split
print(
    x_train.mean(axis=0),
    x_test.mean(axis=0),
    sep='\n',
)

crim      -5.139486e-17
zn        -6.742880e-17
indus      3.329415e-15
chas      -3.362031e-16
nox        1.761303e-15
rm         2.008437e-15
age        4.522434e-16
dis       -5.582477e-16
rad       -5.394304e-17
tax       -3.261672e-17
ptratio    1.712378e-14
b          7.638232e-15
lstat      7.849967e-16
dtype: float64
crim       0.035334
zn        -0.009929
indus      0.120408
chas       0.138583
nox        0.136033
rm        -0.138660
age        0.107329
dis       -0.091702
rad        0.131848
tax        0.133819
ptratio    0.021859
b         -0.143008
lstat      0.027869
dtype: float64


In [None]:
# check std: per-column standard deviations of the train split first, then of
# the test split
print(
    x_train.std(axis=0),
    x_test.std(axis=0),
    sep='\n',
)

crim       1.0
zn         1.0
indus      1.0
chas       1.0
nox        1.0
rm         1.0
age        1.0
dis        1.0
rad        1.0
tax        1.0
ptratio    1.0
b          1.0
lstat      1.0
dtype: float64
crim       0.763473
zn         1.050736
indus      1.008171
chas       1.226452
nox        1.054913
rm         0.967565
age        0.974814
dis        0.965007
rad        1.095531
tax        1.040803
ptratio    1.051794
b          1.120162
lstat      0.964901
dtype: float64


In [None]:
# check size: (rows, columns) of the training features
print(x_train.shape)

(354, 13)


## Step 5. Neural Network - Sequential

### (1) Define Neural Network

In [None]:
# Seed the random number generators before the layers are created so that a
# fresh "Restart & Run All" is repeatable. 123 is the same value that is used
# as random_state for the train/test split.
SEED = 123
if hasattr(tf.keras.utils, 'set_random_seed'):
    # newer TensorFlow: seeds Python, NumPy and TensorFlow in one call
    tf.keras.utils.set_random_seed(SEED)
else:
    # older TensorFlow 2.x: only the TensorFlow global seed can be set here
    tf.random.set_seed(SEED)

# Take the input width from the data instead of hard-coding 13, so the network
# still matches x_train if a feature column is added or removed.
n_features = x_train.shape[1]

# Fully connected regression network: two ReLU hidden layers (128 and 64 units)
# followed by a single linear output unit.
model = Sequential()
model.add(tf.keras.Input(shape=(n_features,)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='linear'))

### (2) Compile the keras model

In [None]:
# configure training: Adam optimizer, mean squared error as the loss and
# mean absolute error reported as an additional metric
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['mae'],
)
# print the layer-by-layer overview with parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               1792      
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 10,113
Trainable params: 10,113
Non-trainable params: 0
_________________________________________________________________


### (3) Fit the keras model on the dataset

In [None]:
# train for 100 epochs; validation_split=0.05 asks Keras to set aside 5% of the
# training data for validation. The returned History object is kept in
# `history` for the plots below.
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    validation_split=0.05,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### (4) Plot Training Performance (Loss, MAE)

In [None]:
# plot training loss
import plotly.graph_objects as go

# one line per curve: loss on the training data and on the validation data,
# both indexed by epoch
loss_traces = [
    go.Scattergl(y=history.history['loss'], name='Train'),
    go.Scattergl(y=history.history['val_loss'], name='Valid'),
]
fig = go.Figure(data=loss_traces)
fig.update_layout(
    height=500,
    width=700,
    xaxis_title='Epoch',
    yaxis_title='Loss',
)
fig.show()

In [None]:
# plot mean absolute error
# training curve first, validation curve second, both indexed by epoch
fig = go.Figure()
fig.add_traces([
    go.Scattergl(y=history.history[key], name=label)
    for key, label in (('mae', 'Train'), ('val_mae', 'Valid'))
])
fig.update_layout(
    height=500,
    width=700,
    xaxis_title='Epoch',
    yaxis_title='Mean Absolute Error',
)
fig.show()

### (5) Evaluate Model using Test Data

In [None]:
# score the trained model on the held-out test split; the result is unpacked
# as (loss, metric), i.e. MSE then MAE for a model compiled with loss='mse'
# and metrics=['mae']
test_scores = model.evaluate(x_test, y_test)
mse_testdata, mae_testdata = test_scores
print('Mean squared error on test data: ', mse_testdata)
print('Mean absolute error on test data: ', mae_testdata)

Mean squared error on test data:  18.892393112182617
Mean absolute error on test data:  2.783689498901367
