In [None]:
!pip install --upgrade numpy pandas matplotlib seaborn tensorflow pydot

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
import seaborn as sns

# Keep DataFrame output compact: show at most 10 rows and render floats
# with a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

In [None]:
# Fixed seed so the shuffle -- and therefore which rows end up in the
# validation split during training -- is the same on every run.
SHUFFLE_SEED = 42

train_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv")
# Shuffle the training rows. sample(frac=1.0) returns every row in a random
# order while keeping the original index labels, just like reindexing by a
# random permutation of the index, but reproducibly.
train_df = train_df.sample(frac=1.0, random_state=SHUFFLE_SEED)

test_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_test.csv")

## Represent data

The following code cell creates preprocessing layers outputting three features:

* `latitude` X `longitude` (a feature cross)
* `median_income`
* `population`

This code cell specifies the features that you'll ultimately train the model on and how each of those features will be represented. The transformations (collected in `preprocessing_layers`) don't actually get applied until you pass a DataFrame to them, which will happen when we train the model.

We'll use `preprocessing_layers` for both our linear regression model and our neural network model.

(The [`keras.FeatureSpace`](https://keras.io/examples/structured_data/structured_data_classification_with_feature_space) utility offers an alternative to building individual Keras preprocessing layers -- give it a try, if you're feeling adventurous!)

In [None]:
# One scalar float32 Keras Input per raw feature. Each Input is stored under,
# and named after, the feature it represents.
inputs = {
    feature_name: tf.keras.layers.Input(
        shape=(1,),
        dtype=tf.float32,
        name=feature_name)
    for feature_name in (
        'latitude', 'longitude', 'median_income', 'population')
}

# Normalization Layers

This layer will shift and scale inputs into a distribution centered around 0 with standard deviation 1. It accomplishes this by precomputing the mean and variance of the data, and calling `(input - mean) / sqrt(var)` at runtime.

The mean and variance values for the layer must be either supplied on construction or learned via `adapt()`. `adapt()` will compute the mean and variance of the data and store them as the layer's weights. `adapt()` should be called before `fit()`, `evaluate()`, or `predict()`.

In [None]:
def _normalized_input(feature_name):
    """Adapt a Normalization layer on one training column and apply it.

    Builds a layer named ``normalization_<feature_name>``, adapts it on
    ``train_df[feature_name]``, and returns the result of calling that layer
    on the matching entry of ``inputs``.
    """
    layer = tf.keras.layers.Normalization(
        name=f'normalization_{feature_name}',
        axis=None)
    layer.adapt(train_df[feature_name])
    return layer(inputs.get(feature_name))


# Normalized tensors for the two numeric features used directly by the model.
median_income = _normalized_input('median_income')
population = _normalized_input('population')

# Normalized tensors for the two location features (bucketized and crossed
# further down).
latitude = _normalized_input('latitude')
longitude = _normalized_input('longitude')

# Normalization layers for the median_house_value label: one adapted on the
# training labels and one adapted on the test labels. median_house_value is
# the target being predicted, so these layers are applied to the labels
# directly and are not added to the model.
# NOTE(review): the second layer learns its mean/variance from the *test*
# labels, so the values it produces are on a (slightly) different scale than
# labels normalized with the training statistics.
train_label_values = np.array(train_df['median_house_value'])
train_median_house_value_normalized = tf.keras.layers.Normalization(axis=None)
train_median_house_value_normalized.adapt(train_label_values)

test_label_values = np.array(test_df['median_house_value'])
test_median_house_value_normalized = tf.keras.layers.Normalization(axis=None)
test_median_house_value_normalized.adapt(test_label_values)

# Bucketizing Values

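Create a list of numbers representing the bucket boundaries for latitude. Because we're using a Normalization layer, values for latitude and longitude
will be in the range of approximately -3 to 3 (representing the Z score). We'll create 20 buckets across that range, which requires 21 bucket boundaries (hence, 20+1). (Keras's `Discretization()` layer also adds one open-ended bucket below the first boundary and one above the last, so the layer can output 22 distinct bucket indices in total.)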

To do this, we'll use a `Discretization()` layer, which will place each element of its input data into one of several contiguous ranges and output an integer index indicating which range each element was placed in.

In [None]:
# 20 evenly sized intervals between Z scores of -3 and 3 need 20+1 edges.
NUM_BUCKETS = 20
latitude_boundaries = np.linspace(-3, 3, NUM_BUCKETS + 1)
longitude_boundaries = np.linspace(-3, 3, NUM_BUCKETS + 1)

# Replace each normalized latitude with the index of the bucket it falls in.
latitude_bucketizer = tf.keras.layers.Discretization(
    bin_boundaries=latitude_boundaries,
    name='discretization_latitude')
latitude = latitude_bucketizer(latitude)

# Replace each normalized longitude with the index of the bucket it falls in.
longitude_bucketizer = tf.keras.layers.Discretization(
    bin_boundaries=longitude_boundaries,
    name='discretization_longitude')
longitude = longitude_bucketizer(longitude)

# Feature Crosses

This layer performs crosses of categorical features using the "hashing trick". Conceptually, the transformation can be thought of as: `hash(concatenate(features)) % num_bins`.

In [None]:
# Size of the hash space: one bin per (latitude, longitude) boundary pair.
# num_bins can be adjusted: Higher values improve accuracy, lower values
# improve performance.
num_cross_bins = len(latitude_boundaries) * len(longitude_boundaries)

# Cross the bucketized latitude and longitude into a single one-hot vector.
cross_layer = tf.keras.layers.HashedCrossing(
    num_bins=num_cross_bins,
    output_mode='one_hot',
    name='cross_latitude_longitude')
feature_cross = cross_layer([latitude, longitude])

# Concatenation

Finally, we will combine all three of our inputs into a single tensor using a `Concatenate()` layer.

In [None]:
# Join the crossed location feature with the two normalized numeric features
# into one tensor.
concat_layer = tf.keras.layers.Concatenate()
preprocessing_layers = concat_layer(
    [feature_cross, median_income, population])

# A single Dense unit with no activation on top of the preprocessed features.
linear_head = tf.keras.layers.Dense(units=1, name='dense_output')
dense_output = linear_head(preprocessing_layers)

# Output dictionary handed to the model constructor.
outputs = {'dense_output': dense_output}

-----

# Baseline Linear Regression

Let's first train a multiple linear regression on the preprocessing layers we just created. This will give us a good baseline model to compare our neural network against.

In [None]:
# Name of the DataFrame column that holds the target value.
label_name = "median_house_value"

# Optimizer and training-loop settings.
learning_rate = 0.01
epochs = 15
batch_size = 1000

# Passed to fit() as `validation_split`.
validation_split = 0.2

In [None]:
# Wire the input dictionary and the output dictionary into a Keras model.
lin_model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Adam + MSE loss. MSE is also tracked as a metric so that it appears in the
# training history under "mean_squared_error" / "val_mean_squared_error".
lin_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
lin_model.compile(
    optimizer=lin_optimizer,
    loss="mean_squared_error",
    metrics=[tf.keras.metrics.MeanSquaredError()],
)
lin_model.summary()

In [None]:
# Turn every DataFrame column into a NumPy array keyed by its column name.
features = {}
for column_name, column_values in train_df.items():
    features[column_name] = np.array(column_values)

# Pull the label column out of the feature dict and normalize it with the
# layer that was adapted on the training labels.
raw_label = np.array(features.pop(label_name))
label = train_median_house_value_normalized(raw_label)

In [None]:
# Train the linear model on the feature dict and the normalized label.
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    validation_split=validation_split,
)
history = lin_model.fit(x=features, y=label, **fit_options)

# Collect what the loss-curve plot needs.
# NOTE: `epochs` is rebound here from the requested epoch count to
# `history.epoch`, which the plotting cell uses as its x-axis values. It must
# be reset to an int before calling fit() again.
hist = pd.DataFrame(history.history)
mse = hist["mean_squared_error"]
epochs = history.epoch

In [None]:
# `mse` is a pandas Series, so convert it to a list before concatenating it
# with the validation values (already a plain list).
training_mse = mse.tolist()
validation_mse = history.history["val_mean_squared_error"]

plt.figure()
plt.xlabel("Epoch")
plt.ylabel("Mean Squared Error")

plt.plot(epochs, mse, label="Training Loss")
plt.plot(epochs, validation_mse, label="Validation Loss")

# Fit the y-axis to the plotted values, leaving a 3% margin below the
# smallest and above the largest one.
all_mse = training_mse + validation_mse
bottom_of_y_axis = min(all_mse) * 0.97
top_of_y_axis = max(all_mse) * 1.03

plt.ylim([bottom_of_y_axis, top_of_y_axis])
plt.legend()
plt.show()

-----

# Train a Neural Network

In [None]:
# First hidden layer: 20 ReLU nodes fed by the shared preprocessing tensor.
hidden_1 = tf.keras.layers.Dense(
    units=20,
    activation='relu',
    name='hidden_dense_layer_1')(preprocessing_layers)

# Second hidden layer: 12 ReLU nodes.
hidden_2 = tf.keras.layers.Dense(
    units=12,
    activation='relu',
    name='hidden_dense_layer_2')(hidden_1)

# Output layer: a single node with no activation.
dense_output = tf.keras.layers.Dense(
    units=1,
    name='dense_output')(hidden_2)

# Output dictionary handed to the model constructor. This rebinds `outputs`
# (and `dense_output`) to the neural network's output.
outputs = {'dense_output': dense_output}

In [None]:
# Build the neural network from the same inputs and the new output dictionary.
dnn_model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Same optimizer, loss, and metric configuration as the linear model.
dnn_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
dnn_model.compile(
    optimizer=dnn_optimizer,
    loss="mean_squared_error",
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

In [None]:
# Draw a diagram of the network's layers (pydot is installed by the first
# cell of this notebook).
tf.keras.utils.plot_model(model=dnn_model)

In [None]:
# Re-declare the training settings: the linear-regression section rebinds
# `epochs` to a list of epoch indices, so it has to be an int again before
# the next fit() call.
label_name = "median_house_value"

# NOTE: dnn_model has already been compiled with the `learning_rate` that was
# in scope at that point; changing the value here has no effect unless the
# compile cell is re-run.
learning_rate = 0.01
epochs = 15
batch_size = 1000

# Passed to fit() as `validation_split`.
validation_split = 0.2

In [None]:
# Train the neural network on the same feature dict and normalized label.
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    validation_split=validation_split,
)
history = dnn_model.fit(x=features, y=label, **fit_options)

# Collect what the loss-curve plot needs.
# NOTE: `epochs` is rebound here from the requested epoch count to
# `history.epoch`, which the plotting cell uses as its x-axis values.
hist = pd.DataFrame(history.history)
mse = hist["mean_squared_error"]
epochs = history.epoch

In [None]:
# `mse` is a pandas Series, so convert it to a list before concatenating it
# with the validation values (already a plain list).
training_mse = mse.tolist()
validation_mse = history.history["val_mean_squared_error"]

plt.figure()
plt.xlabel("Epoch")
plt.ylabel("Mean Squared Error")

plt.plot(epochs, mse, label="Training Loss")
plt.plot(epochs, validation_mse, label="Validation Loss")

# Fit the y-axis to the plotted values, leaving a 3% margin below the
# smallest and above the largest one.
all_mse = training_mse + validation_mse
bottom_of_y_axis = min(all_mse) * 0.97
top_of_y_axis = max(all_mse) * 1.03

plt.ylim([bottom_of_y_axis, top_of_y_axis])
plt.legend()
plt.show()

In [None]:
# After building a model against the training set, test that model
# against the test set.
test_features = {name:np.array(value) for name, value in test_df.items()}

# Scale the test labels with the statistics learned from the TRAINING labels.
# The model was fit on labels produced by train_median_house_value_normalized,
# so its predictions are on that scale. Normalizing the test labels with a
# layer adapted on the test labels themselves would score those predictions
# against a differently shifted/scaled target and would use test-set
# statistics during evaluation.
test_label = train_median_house_value_normalized(
    np.array(test_features.pop(label_name)))

print("\n Evaluate the new model against the test set:")
dnn_model.evaluate(
    x=test_features,
    y=test_label,
    batch_size=batch_size,
    return_dict=True)

# Lab Exercise

Craigslist is the world's largest collection of used vehicles for sale. GitHub user [@AustinReese](https://github.com/AustinReese/UsedVehicleSearch) scraped Craigslist used car ads to create a dataset of 4.25 million listings. We will be using a subset of around 2 million of these listings.

For this lab:

  1. Clean and preprocess your data:
      - Create a training dataset that includes `price`, `ageOfCar` (in years), `manufacturer` (dummy-encoded), `condition` (dummy-encoded), `odometer`, and `isCarvana` (0 or 1)
      - Be sure to remove any outliers from your dataset as you go
  2. Define a feed-forward neural network
      - It should take `ageOfCar`, `manufacturer`, `condition`, `odometer`, and `isCarvana` as inputs
      - It should have 2 hidden layers
          - How many neurons do you think each layer should include? Why?
          - What activation function(s) do you want to use in each layer?
      - It should use MSE as its loss function
      - The final output layer should be a single neuron, and should use either a ReLU or linear activation function
  3. Train your model and evaluate its performance between the training and validation samples
  4. Evaluate your model against the holdout testing dataset

In [None]:
# Load the used-vehicle listings for the lab and preview the first rows.
VEHICLES_CSV_URL = "https://cdn.c18l.org/vehicles_lab.csv"
data = pd.read_csv(VEHICLES_CSV_URL)
data.head()