In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import collections
import itertools

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [4]:
from six.moves import urllib

In [5]:
# Record the library versions this notebook was executed with, one per line.
print(np.__version__, pd.__version__, tf.__version__, sep="\n")

1.13.1
0.22.0
1.3.0


In [6]:
# Remote location of the raw dataset and the local file name it is saved under.
URL_PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

DOWNLOADED_FILENAME = "automobiles.csv"

def download_data():
    """Fetch the dataset into the working directory if it is not there yet.

    URL_PATH is downloaded to DOWNLOADED_FILENAME only when no file with that
    name already exists. The source URL and the local file name are printed
    afterwards in either case. The file contents are not inspected.

    Returns:
        None.
    """
    if not os.path.exists(DOWNLOADED_FILENAME):
        # The (filename, headers) pair returned by urlretrieve is not needed.
        urllib.request.urlretrieve(URL_PATH, DOWNLOADED_FILENAME)

    print('Found and verified file from this path: ', URL_PATH)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [7]:
download_data()

Found and verified file from this path:  https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
Downloaded file:  automobiles.csv


In [8]:
# Column name -> dtype for every column of the data file, in file order.
# The ordering matters: the keys are used as the column names when the CSV
# is read, and the mapping itself is passed as the dtype specification.
COLUMN_TYPES = collections.OrderedDict((
    ("symboling", int), ("normalized-losses", float),
    ("make", str), ("fuel-type", str), ("aspiration", str),
    ("num-of-doors", str), ("body-style", str), ("drive-wheels", str),
    ("engine-location", str),
    ("wheel-base", float), ("length", float), ("width", float),
    ("height", float), ("curb-weight", float),
    ("engine-type", str), ("num-of-cylinders", str),
    ("engine-size", float),
    ("fuel-system", str),
    ("bore", float), ("stroke", float), ("compression-ratio", float),
    ("horsepower", float), ("peak-rpm", float),
    ("city-mpg", float), ("highway-mpg", float),
    ("price", float),
))

In [9]:
# Load the downloaded file, taking column names and dtypes from COLUMN_TYPES;
# "?" entries are read as missing values.
# list(COLUMN_TYPES) hands pandas a plain ordered list of names instead of a
# set-like dict keys view, which some pandas releases reject for `names`.
df = pd.read_csv(DOWNLOADED_FILENAME, names=list(COLUMN_TYPES),
                 dtype=COLUMN_TYPES, na_values="?")

In [10]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152.0,mpfi,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109.0,mpfi,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136.0,mpfi,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0


In [11]:
df.count()

symboling            205
normalized-losses    164
make                 205
fuel-type            205
aspiration           205
num-of-doors         203
body-style           205
drive-wheels         205
engine-location      205
wheel-base           205
length               205
width                205
height               205
curb-weight          205
engine-type          205
num-of-cylinders     205
engine-size          205
fuel-system          205
bore                 201
stroke               201
compression-ratio    205
horsepower           203
peak-rpm             203
city-mpg             205
highway-mpg          205
price                201
dtype: int64

In [12]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [13]:
df.count()

symboling            159
normalized-losses    159
make                 159
fuel-type            159
aspiration           159
num-of-doors         159
body-style           159
drive-wheels         159
engine-location      159
wheel-base           159
length               159
width                159
height               159
curb-weight          159
engine-type          159
num-of-cylinders     159
engine-size          159
fuel-system          159
bore                 159
stroke               159
compression-ratio    159
horsepower           159
peak-rpm             159
city-mpg             159
highway-mpg          159
price                159
dtype: int64

In [14]:
# Subset of columns kept for modelling, in the order they should appear.
# "price" stays in the list so it remains available in the trimmed frame.
TRIMMED_CSV_COLUMNS = [
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]

In [15]:
# Keep only the columns named in TRIMMED_CSV_COLUMNS.
df = df[TRIMMED_CSV_COLUMNS]

In [16]:
df.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,audi,gas,std,four,sedan,fwd,2337.0,ohc,four,109.0,mpfi,102.0,5500.0,24.0,30.0,13950.0
4,audi,gas,std,four,sedan,4wd,2824.0,ohc,five,136.0,mpfi,115.0,5500.0,18.0,22.0,17450.0
6,audi,gas,std,four,sedan,fwd,2844.0,ohc,five,136.0,mpfi,110.0,5500.0,19.0,25.0,17710.0
8,audi,gas,turbo,four,sedan,fwd,3086.0,ohc,five,131.0,mpfi,140.0,5500.0,17.0,20.0,23875.0
10,bmw,gas,std,two,sedan,rwd,2395.0,ohc,four,108.0,mpfi,101.0,5800.0,23.0,29.0,16430.0


In [17]:
# Name of the label column that is separated from the features.
Y_NAME = "price"

def get_training_test_prediction_data(df, seed=None, train_frac=0.8, predict_frac=0.2):
    """Split `df` into training, test and prediction subsets.

    A random `train_frac` share of the rows becomes the training set and the
    remaining rows become the test set. A random `predict_frac` share of the
    test rows is additionally returned as a small prediction sample (those
    rows are also still part of the test set). The Y_NAME column is removed
    from each feature frame and returned separately as the label.

    Sampling uses a local random generator, so numpy's global random state
    is left untouched.

    Args:
        df: DataFrame containing the feature columns and the Y_NAME column.
            It is not modified.
        seed: Optional seed for the local random generator. With the default
            None a different split is produced on every call; pass an int to
            obtain a reproducible split.
        train_frac: Fraction of `df` rows sampled into the training set.
        predict_frac: Fraction of the test rows sampled for prediction.

    Returns:
        Three (features, labels) pairs, in the order
        (x_train, y_train), (x_test, y_test), (x_predict, y_predict).

    Raises:
        KeyError: if `df` has no Y_NAME column.
    """
    # One generator drives both draws; seed=None seeds it from fresh entropy.
    rng = np.random.RandomState(seed)

    # Draw the training rows.
    x_train = df.sample(frac=train_frac, random_state=rng)

    # Everything that was not drawn for training is the test set.
    x_test = df.drop(x_train.index)

    # Sample the prediction rows before the label is popped from x_test, so
    # the sample still carries its own label column.
    x_predict = x_test.sample(frac=predict_frac, random_state=rng)

    # Separate the label from each feature frame.
    y_train = x_train.pop(Y_NAME)
    y_test = x_test.pop(Y_NAME)
    y_predict = x_predict.pop(Y_NAME)

    return (x_train, y_train), (x_test, y_test), (x_predict, y_predict)

In [18]:
(x_train, y_train), (x_test, y_test), (x_predict, y_predict) = \
    get_training_test_prediction_data(df)

In [19]:
x_train.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg
180,toyota,gas,std,four,sedan,rwd,3131.0,dohc,six,171.0,mpfi,156.0,5200.0,20.0,24.0
173,toyota,gas,std,four,sedan,fwd,2326.0,ohc,four,122.0,mpfi,92.0,4200.0,29.0,34.0
79,mitsubishi,gas,turbo,two,hatchback,fwd,2145.0,ohc,four,98.0,spdi,102.0,5500.0,24.0,30.0
188,volkswagen,gas,std,four,sedan,fwd,2300.0,ohc,four,109.0,mpfi,100.0,5500.0,26.0,32.0
168,toyota,gas,std,two,hardtop,rwd,2536.0,ohc,four,146.0,mpfi,116.0,4800.0,24.0,30.0


In [20]:
y_train.head()

180    15690.0
173     8948.0
79      7689.0
188     9995.0
168     9639.0
Name: price, dtype: float64

In [21]:
# Divisor applied to the training and test labels before they reach the model.
PRICE_SCALING_FACTOR = 10000

# NOTE(review): these are in-place divisions, so running this cell a second
# time without re-creating y_train / y_test divides the labels again.
# y_predict is not scaled here and stays in its original units.
y_train /= PRICE_SCALING_FACTOR
y_test /= PRICE_SCALING_FACTOR

In [22]:
y_train.head()

180    1.5690
173    0.8948
79     0.7689
188    0.9995
168    0.9639
Name: price, dtype: float64

In [23]:
df['make'].unique()

array(['audi', 'bmw', 'chevrolet', 'dodge', 'honda', 'jaguar', 'mazda',
       'mercedes-benz', 'mitsubishi', 'nissan', 'peugot', 'plymouth',
       'porsche', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object)

In [24]:
df['fuel-type'].unique()

array(['gas', 'diesel'], dtype=object)

In [25]:
df['aspiration'].unique()

array(['std', 'turbo'], dtype=object)

In [26]:
df['num-of-doors'].unique()

array(['four', 'two'], dtype=object)

In [27]:
df['body-style'].unique()

array(['sedan', 'hatchback', 'wagon', 'hardtop', 'convertible'], dtype=object)

In [28]:
df['drive-wheels'].unique()

array(['fwd', '4wd', 'rwd'], dtype=object)

In [29]:
df['engine-type'].unique()

array(['ohc', 'l', 'dohc', 'ohcv', 'ohcf'], dtype=object)

In [30]:
df['num-of-cylinders'].unique()

array(['four', 'five', 'six', 'three', 'eight'], dtype=object)

In [31]:
df['fuel-system'].unique()

array(['mpfi', '2bbl', 'mfi', '1bbl', 'idi', 'spdi'], dtype=object)

In [32]:
# One numeric feature column per continuous input, created in the order the
# target names are listed.
(curb_weight, engine_size, horsepower,
 peak_rpm, city_mpg, highway_mpg) = (
    tf.feature_column.numeric_column(column_key)
    for column_key in (
        "curb-weight", "engine-size", "horsepower",
        "peak-rpm", "city-mpg", "highway-mpg",
    )
)

In [33]:
# One vocabulary-list categorical column per string feature. Each vocabulary
# is the set of distinct values found in the matching column of `df`.
(body_style, fuel_type, aspiration, num_of_doors,
 drive_wheels, engine_type, num_of_cylinders, fuel_system) = (
    tf.feature_column.categorical_column_with_vocabulary_list(
        key=column_key, vocabulary_list=df[column_key].unique())
    for column_key in (
        "body-style", "fuel-type", "aspiration", "num-of-doors",
        "drive-wheels", "engine-type", "num-of-cylinders", "fuel-system",
    )
)

In [34]:
# Unlike the other string features, "make" is not given an explicit
# vocabulary list; it is declared as a hash-bucket column with 50 buckets.
make = tf.feature_column.categorical_column_with_hash_bucket(
      key="make", hash_bucket_size=50)

In [35]:
# Assemble the model inputs: the numeric columns first, then the wrapped
# categorical columns.
feature_columns = [
    curb_weight, engine_size, horsepower, peak_rpm, city_mpg, highway_mpg,
]

# body-style is wrapped as an indicator column.
feature_columns.append(tf.feature_column.indicator_column(body_style))

# The remaining vocabulary-based categoricals each get a 3-dimensional embedding.
feature_columns.extend(
    tf.feature_column.embedding_column(categorical, dimension=3)
    for categorical in (
        fuel_type, aspiration, num_of_doors, drive_wheels,
        engine_type, num_of_cylinders, fuel_system,
    )
)

# The hashed "make" column gets a 4-dimensional embedding.
feature_columns.append(tf.feature_column.embedding_column(make, dimension=4))

In [36]:
def input_fn(x_data, y_data, num_epochs, shuffle, batch_size=64):
    """Build an estimator input function from pandas features and labels.

    Thin wrapper around tf.estimator.inputs.pandas_input_fn that forwards
    all arguments unchanged.

    Args:
        x_data: Feature DataFrame, forwarded as `x`.
        y_data: Label Series, forwarded as `y`.
        num_epochs: Forwarded as `num_epochs`; callers in this notebook pass
            None for training and 1 for evaluation and prediction.
        shuffle: Forwarded as `shuffle`.
        batch_size: Forwarded as `batch_size`. Defaults to 64, the value
            that was previously hard-coded.

    Returns:
        Whatever pandas_input_fn returns, suitable for the `input_fn`
        argument of the estimator's train / evaluate / predict methods.
    """
    return tf.estimator.inputs.pandas_input_fn(
        x=x_data,
        y=y_data,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle)

In [37]:
# Regressor over the feature columns assembled above, with hidden_units
# [24, 16, 24]. No model_dir is passed, so the checkpoint location is
# whatever the estimator picks by default.
model = tf.estimator.DNNRegressor(
      hidden_units=[24, 16, 24], feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\tara\\AppData\\Local\\Temp\\tmpwpeggrbw', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [38]:
# Train on the shuffled training data. num_epochs is None, so `steps=20000`
# is the only explicit bound given for the length of training.
model.train(input_fn=input_fn(x_train, y_train, num_epochs=None, shuffle=True), steps=20000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\tara\AppData\Local\Temp\tmpwpeggrbw\model.ckpt.
INFO:tensorflow:loss = 2.87233e+07, step = 1
INFO:tensorflow:global_step/sec: 85.191
INFO:tensorflow:loss = 8.03739, step = 101 (1.219 sec)
INFO:tensorflow:global_step/sec: 108.108
INFO:tensorflow:loss = 12.8202, step = 201 (0.880 sec)
INFO:tensorflow:global_step/sec: 128.111
INFO:tensorflow:loss = 6.07589, step = 301 (0.781 sec)
INFO:tensorflow:global_step/sec: 125.07
INFO:tensorflow:loss = 9.50914, step = 401 (0.800 sec)
INFO:tensorflow:global_step/sec: 127.786
INFO:tensorflow:loss = 6.60544, step = 501 (0.784 sec)
INFO:tensorflow:global_step/sec: 126.17
INFO:tensorflow:loss = 8.61999, step = 601 (0.793 sec)
INFO:tensorflow:global_step/sec: 114.077
INFO:tensorflow:loss = 9.31159, step = 701 (0.879 sec)
INFO:tensorflow:global_step/sec: 126.81
INFO:tensorflow:loss = 5.80941, step = 801 (0.784 sec)
INFO:tensorflow:global_step/sec: 164.091
INF

INFO:tensorflow:global_step/sec: 235.682
INFO:tensorflow:loss = 6.78874, step = 8401 (0.424 sec)
INFO:tensorflow:global_step/sec: 243.129
INFO:tensorflow:loss = 3.65842, step = 8501 (0.412 sec)
INFO:tensorflow:global_step/sec: 237.366
INFO:tensorflow:loss = 6.18679, step = 8601 (0.421 sec)
INFO:tensorflow:global_step/sec: 243.136
INFO:tensorflow:loss = 5.2371, step = 8701 (0.411 sec)
INFO:tensorflow:global_step/sec: 241.954
INFO:tensorflow:loss = 6.89925, step = 8801 (0.415 sec)
INFO:tensorflow:global_step/sec: 249.096
INFO:tensorflow:loss = 6.75883, step = 8901 (0.398 sec)
INFO:tensorflow:global_step/sec: 237.926
INFO:tensorflow:loss = 6.99544, step = 9001 (0.421 sec)
INFO:tensorflow:global_step/sec: 243.729
INFO:tensorflow:loss = 5.94034, step = 9101 (0.410 sec)
INFO:tensorflow:global_step/sec: 241.373
INFO:tensorflow:loss = 5.49686, step = 9201 (0.413 sec)
INFO:tensorflow:global_step/sec: 242.546
INFO:tensorflow:loss = 7.72204, step = 9301 (0.412 sec)
INFO:tensorflow:global_step/sec

INFO:tensorflow:global_step/sec: 266.477
INFO:tensorflow:loss = 6.52868, step = 16801 (0.376 sec)
INFO:tensorflow:global_step/sec: 261.929
INFO:tensorflow:loss = 9.22004, step = 16901 (0.382 sec)
INFO:tensorflow:global_step/sec: 293.909
INFO:tensorflow:loss = 6.03705, step = 17001 (0.340 sec)
INFO:tensorflow:global_step/sec: 290.491
INFO:tensorflow:loss = 6.71624, step = 17101 (0.344 sec)
INFO:tensorflow:global_step/sec: 297.407
INFO:tensorflow:loss = 9.7001, step = 17201 (0.336 sec)
INFO:tensorflow:global_step/sec: 294.775
INFO:tensorflow:loss = 6.6895, step = 17301 (0.338 sec)
INFO:tensorflow:global_step/sec: 272.285
INFO:tensorflow:loss = 6.27706, step = 17401 (0.368 sec)
INFO:tensorflow:global_step/sec: 287.152
INFO:tensorflow:loss = 6.76437, step = 17501 (0.348 sec)
INFO:tensorflow:global_step/sec: 279.131
INFO:tensorflow:loss = 5.38392, step = 17601 (0.357 sec)
INFO:tensorflow:global_step/sec: 291.338
INFO:tensorflow:loss = 4.75781, step = 17701 (0.343 sec)
INFO:tensorflow:global

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x8d3eeac160>

In [39]:
# Evaluate on the test set: a single epoch, unshuffled. `results` maps metric
# names to their values.
results = model.evaluate(input_fn=input_fn(x_test, y_test, num_epochs=1, shuffle=False))

INFO:tensorflow:Starting evaluation at 2018-06-04-11:23:38
INFO:tensorflow:Restoring parameters from C:\Users\tara\AppData\Local\Temp\tmpwpeggrbw\model.ckpt-20000
INFO:tensorflow:Finished evaluation at 2018-06-04-11:23:43
INFO:tensorflow:Saving dict for global step 20000: average_loss = 0.223781, global_step = 20000, loss = 7.161


In [40]:
# Print every evaluation metric as "name: value", ordered by metric name.
for metric_name, metric_value in sorted(results.items()):
    print("%s: %s" % (metric_name, metric_value))


average_loss: 0.223781
global_step: 20000
loss: 7.161


In [41]:
average_loss = results["average_loss"]

In [42]:
# Report the test error in original price units: the square root of
# average_loss, multiplied by the factor the labels were divided by.
# NOTE(review): assumes average_loss is a mean squared error - confirm
# against the estimator documentation.
print("\nRMS error for the test set: ${:.0f}"
        .format(PRICE_SCALING_FACTOR * average_loss**0.5))


RMS error for the test set: $4731


In [43]:
len(x_predict), len(y_predict)

(6, 6)

In [44]:
# Request predictions for the small prediction sample: one epoch, unshuffled.
# NOTE(review): y_predict is passed as labels here even though it was never
# divided by PRICE_SCALING_FACTOR - presumably the labels are not used when
# predicting; confirm.
predict_results = model.predict(input_fn=input_fn(x_predict, y_predict, num_epochs=1, shuffle=False))

In [45]:
# Materialise at most len(x_predict) items from predict_results into a list.
predictions = list(itertools.islice(predict_results, len(x_predict)))

INFO:tensorflow:Restoring parameters from C:\Users\tara\AppData\Local\Temp\tmpwpeggrbw\model.ckpt-20000


In [46]:
predictions

[{'predictions': array([ 1.22546482], dtype=float32)},
 {'predictions': array([ 0.95887655], dtype=float32)},
 {'predictions': array([ 0.93870825], dtype=float32)},
 {'predictions': array([ 1.39684391], dtype=float32)},
 {'predictions': array([ 0.88005537], dtype=float32)},
 {'predictions': array([ 1.00222111], dtype=float32)}]

In [47]:
# Take the first element of each 'predictions' array and multiply it by
# PRICE_SCALING_FACTOR to get back to the original price units.
predicted_prices = [obj['predictions'][0] * PRICE_SCALING_FACTOR for obj in predictions]

In [48]:
predicted_prices

[12254.648208618164,
 9588.7655019760132,
 9387.0824575424194,
 13968.439102172852,
 8800.5536794662476,
 10022.211074829102]

In [49]:
compare_df = x_predict.copy()

In [50]:
compare_df

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg
137,saab,gas,turbo,four,sedan,fwd,2847.0,dohc,four,121.0,mpfi,160.0,5500.0,19.0,26.0
158,toyota,diesel,std,four,sedan,fwd,2275.0,ohc,four,110.0,idi,56.0,4500.0,34.0,36.0
26,dodge,gas,std,four,sedan,fwd,1989.0,ohc,four,90.0,2bbl,68.0,5500.0,31.0,38.0
106,nissan,gas,std,two,hatchback,rwd,3139.0,ohcv,six,181.0,mpfi,160.0,5200.0,19.0,25.0
151,toyota,gas,std,two,hatchback,fwd,2040.0,ohc,four,92.0,2bbl,62.0,4800.0,31.0,38.0
122,plymouth,gas,std,four,sedan,fwd,2191.0,ohc,four,98.0,2bbl,68.0,5500.0,31.0,38.0


In [51]:
# Attach the true and predicted prices next to the features for comparison.
# y_predict is a Series; predicted_prices is a plain list and is assigned in
# list order, which relies on the predictions having been produced in the
# same row order as x_predict (prediction was requested with shuffle=False).
compare_df['actual-price'] = y_predict
compare_df['predicted-price'] = predicted_prices

In [52]:
compare_df

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg,actual-price,predicted-price
137,saab,gas,turbo,four,sedan,fwd,2847.0,dohc,four,121.0,mpfi,160.0,5500.0,19.0,26.0,18620.0,12254.648209
158,toyota,diesel,std,four,sedan,fwd,2275.0,ohc,four,110.0,idi,56.0,4500.0,34.0,36.0,7898.0,9588.765502
26,dodge,gas,std,four,sedan,fwd,1989.0,ohc,four,90.0,2bbl,68.0,5500.0,31.0,38.0,7609.0,9387.082458
106,nissan,gas,std,two,hatchback,rwd,3139.0,ohcv,six,181.0,mpfi,160.0,5200.0,19.0,25.0,18399.0,13968.439102
151,toyota,gas,std,two,hatchback,fwd,2040.0,ohc,four,92.0,2bbl,62.0,4800.0,31.0,38.0,6338.0,8800.553679
122,plymouth,gas,std,four,sedan,fwd,2191.0,ohc,four,98.0,2bbl,68.0,5500.0,31.0,38.0,7609.0,10022.211075
