In [1]:
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### The Auto MPG Dataset

In [2]:
# Fetch the Auto MPG data file from the UCI repository; get_file returns the
# local path of the downloaded copy.
# NOTE(review): the URL is plain HTTP -- consider HTTPS if the host supports it.
dataset_path = keras.utils.get_file(
    fname="auto-mpg.data",
    origin="http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
)

Downloading data from http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data


In [3]:
# Display the local filesystem path that get_file returned for the data file.
dataset_path

'/Users/tarik.setia/.keras/datasets/auto-mpg.data'

#### Import data into Pandas

In [5]:
# Labels for the eight columns that are read; they are passed via `names=`,
# so pandas does not take them from the file.
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]

# "?" is parsed as a missing value; comment='\t' makes the parser ignore the
# remainder of each line after a tab; fields are separated by runs of spaces.
raw_dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values="?",
    comment='\t',
    sep=" ",
    skipinitialspace=True,
)

# Work on a copy so the frame as loaded stays available unchanged.
dataset = raw_dataset.copy()
dataset.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


#### Clean The Dataset

In [6]:
# Count the missing values in each column to see what needs cleaning.
dataset.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [7]:
# Discard every row that contains at least one missing value.
dataset = dataset.dropna(axis=0, how="any")

The "Origin" column is really categorical, not numeric, so convert it to a one-hot encoding:

In [8]:
# pop() returns the 'Origin' column and removes it from `dataset` in place, so
# re-running this cell without rebuilding `dataset` raises KeyError.
origin = dataset.pop('Origin')

In [9]:
# One indicator column per origin code: 1.0 where the row has that code, else 0.0.
for origin_code, country in ((1, 'USA'), (2, 'Europe'), (3, 'Japan')):
    dataset[country] = (origin == origin_code).astype(float)
dataset.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
393,27.0,4,140.0,86.0,2790.0,15.6,82,1.0,0.0,0.0
394,44.0,4,97.0,52.0,2130.0,24.6,82,0.0,1.0,0.0
395,32.0,4,135.0,84.0,2295.0,11.6,82,1.0,0.0,0.0
396,28.0,4,120.0,79.0,2625.0,18.6,82,1.0,0.0,0.0
397,31.0,4,119.0,82.0,2720.0,19.4,82,1.0,0.0,0.0


In [10]:
# Randomly sample 80% of the rows for training (random_state fixed so the split
# is repeatable); the rows that were not sampled form the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)

In [11]:
# Summary statistics of the training split with the MPG column left out,
# transposed so that each row describes one feature.
train_stats = train_dataset.describe().drop(columns="MPG").T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cylinders,314.0,5.477707,1.699788,3.0,4.0,4.0,8.0,8.0
Displacement,314.0,195.318471,104.331589,68.0,105.5,151.0,265.75,455.0
Horsepower,314.0,104.869427,38.096214,46.0,76.25,94.5,128.0,225.0
Weight,314.0,2990.251592,843.898596,1649.0,2256.5,2822.5,3608.0,5140.0
Acceleration,314.0,15.559236,2.78923,8.0,13.8,15.5,17.2,24.8
Model Year,314.0,75.898089,3.675642,70.0,73.0,76.0,79.0,82.0
USA,314.0,0.624204,0.485101,0.0,0.0,1.0,1.0,1.0
Europe,314.0,0.178344,0.383413,0.0,0.0,0.0,0.0,1.0
Japan,314.0,0.197452,0.398712,0.0,0.0,0.0,0.0,1.0


In [12]:
# Split the 'MPG' column off both frames. pop() removes it in place, so after
# this cell train_dataset / test_dataset no longer contain 'MPG'.
train_labels, test_labels = (
    split.pop('MPG') for split in (train_dataset, test_dataset)
)

#### Normalization or Data Scaling

It is good practice to normalize features that use different scales and ranges. Although the model might converge without feature normalization, skipping it makes training more difficult and makes the resulting model dependent on the choice of units used in the input.

In [14]:
def norm(x, stats=None):
  """Standardize `x` by subtracting a per-feature mean and dividing by a per-feature std.

  Args:
    x: pandas object whose column labels match the index labels of `stats`
      (the subtraction and division align on those labels).
    stats: optional table with 'mean' and 'std' columns, indexed by feature
      name. When omitted, the notebook-level `train_stats` is used, which
      keeps existing `norm(x)` calls working unchanged.

  Returns:
    The standardized values, `(x - mean) / std`, with the same layout as `x`.
  """
  if stats is None:
    # Fall back to the statistics computed from the training split.
    stats = train_stats
  return (x - stats['mean']) / stats['std']

In [15]:
# Apply norm() to both the training and the test features.
normed_train_data, normed_test_data = map(norm, (train_dataset, test_dataset))

#### Modelling

##### Build The Model

In [18]:
# One input per column of the training feature frame.
input_shape = [train_dataset.shape[1]]

# Two 64-unit ReLU layers followed by a single-unit output layer with no activation.
model = keras.Sequential([
    layers.Dense(64, activation=tf.nn.relu, input_shape=input_shape),
    layers.Dense(64, activation=tf.nn.relu),
    layers.Dense(1),
])

In [19]:
# Print the layers, their output shapes and their parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                640       
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 4,865
Trainable params: 4,865
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Inspect the weight variables of the first Dense layer.
model.layers[0].weights

[<tf.Variable 'dense_3/kernel:0' shape=(9, 64) dtype=float32, numpy=
 array([[-0.0101608 ,  0.20827788, -0.07146697, -0.08586022,  0.08948386,
          0.21811244,  0.00618821, -0.24579984, -0.18639244, -0.14776604,
          0.15807515,  0.09399867, -0.16379012,  0.09845096, -0.19883108,
          0.02958992,  0.24268755, -0.00034997, -0.17855774,  0.03927991,
         -0.07999367,  0.09025058, -0.12088725,  0.10983363, -0.03656524,
         -0.15782835,  0.01943055,  0.10108909,  0.15910572, -0.13888712,
         -0.14388922, -0.08230145,  0.0758273 , -0.22839019,  0.01388642,
          0.20711076, -0.20306183, -0.25396878,  0.28655282,  0.14185151,
          0.14742228, -0.15139234,  0.16450435,  0.0958384 , -0.12173906,
          0.23111346, -0.06351894, -0.07005392, -0.02161387,  0.23396268,
          0.2364383 ,  0.2540057 , -0.27323952, -0.24618173,  0.18980289,
         -0.26513538, -0.09569389,  0.12643692, -0.1301795 , -0.2757776 ,
          0.016478  , -0.21669652, -0.11370

##### Let us try making predictions with this untrained model
Take a batch of 10 examples from the training data and call model.predict on it.

In [40]:
# Run the model on the first ten rows of the normalized training features.
example_batch = normed_train_data.iloc[:10]
example_result = model.predict(example_batch)
example_result

array([[ 0.12262779],
       [ 0.15393837],
       [-0.1827798 ],
       [ 0.36731938],
       [-0.35578564],
       [ 0.09139582],
       [-0.30913576],
       [-0.15643771],
       [ 0.09058893],
       [-0.28850204]], dtype=float32)