## California Housing Price
- predict median price per district
- model: regression/labeled supervised learning
- dataset: https://github.com/ageron/handson-ml2/tree/master/datasets/housing

### 1. Read Data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers

In [2]:
# Load the housing CSV from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="california_housing.csv")
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(20640, 10)

In [5]:
# Count the missing values in each column.
df.isna().sum()
# total_bedrooms is missing in 207 of the 20640 rows

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
# Discard every row that has at least one missing value, then re-count the
# missing values per column to confirm none remain.
df = df.dropna(axis=0, how="any")
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

### 2. Data Exploration

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

#### Take away:
- The maximum values of `total_rooms`, `population`, and `households` look large relative to their 75th percentiles and need a further check.
- The target column `median_house_value` contains abnormal values.

In [None]:
# Distribution of the target column, drawn with 100 bins.
df.median_house_value.hist(bins=100)
# The target shows abnormal values (outliers) that are filtered out in a later cell.

In [None]:
# Distribution of households per district, drawn with 100 bins.
df.households.hist(bins=100)

In [None]:
# Keep only the districts that fall below the outlier cut-offs picked from
# df.describe() and the histograms above.
# NOTE: median_house_value is stored in dollars (e.g. 452600.0 in df.head()),
# so the cut-off has to be 500000. The earlier "< 500" condition matched no
# rows at all and left an empty frame for the train/test split.
within_limits = (
    (df.total_rooms <= 5000)
    & (df.total_bedrooms <= 1000)
    & (df.population <= 2500)
    & (df.households <= 1000)
    & (df.median_income <= 8)
    & (df.median_house_value < 500000)
)
df = df[within_limits]

# One histogram per numeric column to inspect the filtered distributions.
df.hist(bins=100, figsize=(15, 10))
plt.show()

In [None]:
# (rows, columns) remaining after the outlier filter.
df.shape

In [None]:
# Preview the first rows of the filtered frame.
df.head()

### 3. Feature Engineering

### 3.0 Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 22% of the rows for testing; the fixed random_state keeps the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.22, random_state=123)

# Report the row counts of the full frame and of both partitions.
split_sizes = (len(df), len(train_df), len(test_df))
print("Total df size: %i\n train_df size: %i \n test_df size: %i" % split_sizes)

In [None]:
def _degree_bin_edges(values, step):
    """Return bucket boundaries from int(min(values)) up to (not including) int(max(values)).

    int() truncates toward zero, so both ends land on whole degrees.
    """
    return list(np.arange(int(min(values)), int(max(values)), step))


# Width of one latitude/longitude bucket.
resolution_in_degrees = 0.4

# Boundaries are derived from the training split only.
latitude_bins = _degree_bin_edges(train_df['latitude'], resolution_in_degrees)
longitude_bins = _degree_bin_edges(train_df['longitude'], resolution_in_degrees)

# Raw numeric columns and their bucketized counterparts.
latitude_num = tf.feature_column.numeric_column("latitude")
longitude_num = tf.feature_column.numeric_column("longitude")
latitude = tf.feature_column.bucketized_column(latitude_num, latitude_bins)
longitude = tf.feature_column.bucketized_column(longitude_num, longitude_bins)

# Cross the two bucketized columns into 100 hash buckets and wrap the cross in
# an indicator column so it can be fed to a dense layer.
lat_x_lon = tf.feature_column.crossed_column([latitude, longitude], hash_bucket_size=100)
crossed_feature = tf.feature_column.indicator_column(lat_x_lon)

# The crossed location feature is the model's only input feature.
feature_columns = [crossed_feature]
cross_feature_layer = layers.DenseFeatures(feature_columns)

### 4. Modeling

### 4.1 Define functions that build and train a model
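- build_model(my_learning_rate, feature_layer), which builds a randomly-initialized model on top of the given feature layer.
- train_model(model, df, epochs, batch_size, label_name), which trains the model on the rows of `df`, using the `label_name` column as the label and the remaining columns as features.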

In [None]:
# Define the functions that build and train a model.
def build_model(my_learning_rate, feature_layer):
  """Create and compile a single-unit linear regression model.

  Args:
    my_learning_rate: Learning rate handed to the RMSprop optimizer.
    feature_layer: Keras layer added first; it turns the input feature
      dictionary into the dense tensor consumed by the output unit.

  Returns:
    A compiled tf.keras Sequential model with mean-squared-error loss and a
    RootMeanSquaredError metric.
  """
  model = tf.keras.models.Sequential()

  # Topography: the feature layer followed by one linear output node.
  model.add(feature_layer)
  # No input_shape here: this Dense layer is not the first layer, and its
  # input width is determined by the feature layer's output, not by (1,).
  model.add(tf.keras.layers.Dense(units=1))

  # `learning_rate` is the supported keyword; the old `lr` alias is
  # deprecated and rejected by newer Keras releases.
  model.compile(
      optimizer=tf.keras.optimizers.RMSprop(learning_rate=my_learning_rate),
      loss="mean_squared_error",
      metrics=[tf.keras.metrics.RootMeanSquaredError()])

  return model

# Train the model by feeding it features and a label taken from one frame.
def train_model(model, df, epochs, batch_size, label_name):
  """Fit `model` on the columns of `df` and return the per-epoch RMSE.

  Args:
    model: Object exposing a Keras-style `fit(x=, y=, batch_size=, epochs=, shuffle=)`.
    df: DataFrame holding the label column and the feature columns.
    epochs: Number of epochs passed to `fit`.
    batch_size: Batch size passed to `fit`.
    label_name: Name of the column in `df` used as the label; every other
      column is passed to the model as a feature.

  Returns:
    A tuple `(epochs, rmse)`: the epoch list recorded by `fit` and a pandas
    Series with the "root_mean_squared_error" value for each epoch.

  Raises:
    KeyError: If `label_name` is not a column of `df`.
  """
  # Separate the label from the remaining columns; `df` itself is left untouched.
  columns = dict(df.items())
  target = np.array(columns.pop(label_name))
  inputs = {name: np.array(series) for name, series in columns.items()}

  history = model.fit(x=inputs, y=target,
                      batch_size=batch_size, epochs=epochs,
                      shuffle=True)

  # The epoch list lives on the history object itself, while the per-epoch
  # metrics are stored in `history.history`.
  epoch_index = history.epoch
  rmse = pd.DataFrame(history.history)["root_mean_squared_error"]

  return epoch_index, rmse

### 4.2 Define plotting functions
- a loss curve

In [None]:
# Plot a curve of loss vs. epoch.
def plot_the_loss_curve(epochs, rmse):
  """Draw the per-epoch root mean squared error as a line chart in a new figure.

  Args:
    epochs: Sequence of epoch numbers for the x axis.
    rmse: Per-epoch RMSE values; must provide `.min()` and `.max()`.
  """
  # Pad the y range slightly so the curve does not touch the frame.
  lower_bound = rmse.min()*0.94
  upper_bound = rmse.max()* 1.05

  plt.figure()
  plt.plot(epochs, rmse, label="Loss")
  plt.xlabel("Epoch")
  plt.ylabel("Root Mean Squared Error")
  plt.ylim([lower_bound, upper_bound])
  plt.legend()
  plt.show()

### 4.3 Call the model functions

In [None]:
# Hyperparameters:
learning_rate = 0.05
epochs = 30
batch_size = 100

# Column of the frame that the model learns to predict.
label_name = "median_house_value"

# Build the model on top of the crossed latitude/longitude feature layer.
my_model = build_model(my_learning_rate=learning_rate,
                       feature_layer=cross_feature_layer)

# train_model returns the list of epochs, which replaces the integer
# `epochs` set above, together with the per-epoch RMSE.
epochs, rmse = train_model(model=my_model, df=train_df,
                           epochs=epochs, batch_size=batch_size,
                           label_name=label_name)

plot_the_loss_curve(epochs=epochs, rmse=rmse)

In [None]:
print("\n: Evaluate the new model on the test set:")
# Split the held-out frame into a label array and a dict of feature arrays,
# mirroring what train_model does with the training frame.
test_columns = dict(test_df.items())
test_label = np.array(test_columns.pop(label_name))
test_features = {name: np.array(series) for name, series in test_columns.items()}
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)