<B><H2> HOUSE PRICE ESTIMATOR </H2></B><H5>TENSORFLOW REGRESSION SEQUENTIAL DNN MODEL</H5>


In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import docker
import requests
import json
import numpy as np
import pandas as pd



#TENSORFLOW API
import tensorflow as tf
import tensorflow_data_validation as tfdv
# Feature Engineering
from tensorflow import feature_column as fc
from keras.models import Sequential
from keras import layers
from keras import utils
# TF Dataset for input pipeline
import tensorflow_datasets as tfds
# Enable on-demand GPU memory allocation. Looping over the detected devices keeps
# the notebook runnable on CPU-only machines, where the list is empty and
# indexing physical_devices[0] would raise IndexError.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    # Memory growth has to be configured the same way on every visible GPU.
    tf.config.experimental.set_memory_growth(gpu, True)

# VISUALISATION API
import matplotlib.pyplot as plt
import seaborn as sns # Seaborn is a Python data visualization library based on matplotlib.

# Import train_test_split function from sklearn.model_selection
from sklearn.model_selection import train_test_split

# Used to calculate stats such as Z score, standard deviation etc.
from scipy import stats
# Used to calculate MSE
from sklearn.metrics import mean_squared_error

city_list = ["Melbourne", "Sydney","Brisbane","Perth","Adelaide","Hobart"]
city = city_list[0]

<H5> READ THE DATA INTO PANDAS DATAFRAME.</H5> Remove any NULL values

In [None]:
# Import data from CSV into pandas DF
df = pd.read_csv(f"data/{city}/{city}_area.csv")
# Show per-column null counts before cleaning. Only the last expression of a
# cell is rendered automatically, so intermediate results need display().
display(df.isnull().sum())
# Drop every row that has at least one missing value
df = df.dropna(how='any', axis=0)
# Confirm that no missing values remain
display(df.isnull().sum())
# Look at DF to get a feel of data
df.head(10)

<H3> <B> EXPLORATORY DATA ANALYSIS (EDA) </B></H3>

Explore Data statistics using Pandas

In [None]:
df.describe()

Explore Data Types using Pandas

In [None]:
df.info()

Explore data statistics, schema & types using TensorFlow Data Validation (TFDV)

In [None]:
# Compute per-feature statistics directly from the raw CSV (not from the cleaned pandas DF).
data = tfdv.generate_statistics_from_csv(f"data/{city}/{city}_area.csv")
tfdv.visualize_statistics(data)
# Infer a schema from those statistics and display it.
schema = tfdv.infer_schema(statistics=data)
tfdv.display_schema(schema=schema)
# Validate the statistics against the inferred schema.
# NOTE(review): the statistics validated here are the same ones the schema was
# inferred from, so this is only a self-consistency check. To validate eval data,
# generate statistics from a separate split and pass those instead.
anomalies = tfdv.validate_statistics(statistics=data, schema=schema)
tfdv.display_anomalies(anomalies)

Categorise fields into CATEGORICAL,NUMERICAL & to be DROPPED

In [None]:
# The data mixes categorical and numerical features, with 'Price' being the label.
# The features are classified into the following groups:
num_feat = ['Bedrooms','Bathrooms','Cars','Area','Latitude','Longitude','Distance','Price','Date']
cat_feat = ['Suburb','Type','Method']
drop_feat = ['Street','Address','State','Postcode','Agent']

# Drop features that are not required. errors='ignore' drops whichever of the
# listed columns are present: an all-or-nothing subset check would silently keep
# every column if a single one were missing. It also keeps the cell re-runnable.
df = df.drop(columns=drop_feat, errors='ignore')

Explore Relationships between Numerical Features & Price (label)

In [None]:
sns.pairplot(data = df[num_feat], height=3,diag_kind='kde')

Feature correlation

In [None]:
# Correlation heatmap over the numerical columns, target 'Price' included.
# Floats are rendered with thousands separators and two decimals.
pd.options.display.float_format = "{:,.2f}".format

# Pairwise correlations, with weak ones (|r| < 0.3) zeroed for readability
corr_matrix = df[num_feat].corr()
corr_matrix = corr_matrix.mask(corr_matrix.abs() < 0.3, 0)

# Draw the annotated heatmap on a fixed [-1, 1] colour scale
sns.heatmap(
    corr_matrix,
    vmin=-1.0,
    vmax=1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 9, "color": "black"},
)
plt.title("Price Correlation")

# Rank the columns by their correlation with 'Price'; position 0 is skipped
# because it is 'Price' correlated with itself.
price_corr = df.corr()["Price"].sort_values(ascending=False)
corr = price_corr.iloc[1:len(num_feat)]
corr

Check Price Distribution

In [None]:
sns.displot(df['Price'])

Check Skew

In [None]:
num_feat = df.dtypes[df.dtypes != 'object'].index
skew_feats = df[num_feat].skew().sort_values(ascending=False)
skewness = pd.DataFrame({'skew': skew_feats})
skewness

<H3> <B>FEATURE ENGINEERING & TRANSFORMATION </B></H3>

1. Crafting new feature 'Year' from 'Date'
2. Crafting new feature 'LogPrice' from 'Price' to reduce the significant skew present in 'Price'
3. Transforming feature 'Area' with log10 to reduce the significant skew present in 'Area'


In [None]:
# Parse 'Date' and derive the sale year as a numeric feature.
# NOTE(review): the date format of the CSV is not visible here; confirm that
# day-first dates are parsed as intended.
df['Date']= df['Date'].astype('datetime64[ns]')
df['Year'] = df['Date'].dt.year
# df['PricePerSqm'] = df['Price']/df['Area']
# The label used for training is log10(Price).
df["LogPrice"] = np.log10(df["Price"])
# 'Area' is heavily skewed, so log10 is applied to bring it closer to a Gaussian/normal
# distribution; the Z-score outlier filter used later works best on normally distributed data.
# NOTE(review): 'Area' is overwritten in place, so re-running this cell applies log10 twice.
# NOTE(review): an Area of 0 would become -inf here; confirm the data has no zero areas.
df['Area'] = np.log10(df['Area'])
# Numerical columns from this point on: 'Year' replaces 'Date' and 'LogPrice' replaces 'Price'.
num_feat = ['Bedrooms','Bathrooms','Cars','Area','Latitude','Longitude','Distance','Year','LogPrice']


In [None]:
num_feat = ['Bedrooms','Bathrooms','Cars','Area','Latitude','Longitude','Distance','Year','LogPrice']
#num_feat = df.dtypes[df.dtypes != 'object'].index
skew_feats = df[num_feat].skew().sort_values(ascending=False)
skewness = pd.DataFrame({'skew': skew_feats})
skewness

<H3> <B> ANOMALY DETECTION & REMOVAL </B></H3>

Filter out outliers using either the Z-score method (normal distribution) or the IQR method.

In [None]:
method = 'Zscore'
# Interquartile range (IQR) method should be used for NON normal distribution

def remove_outlier(df_in, col_name):
    """Drop rows that fall outside the 1.5 * IQR fences.

    Args:
        df_in: DataFrame to filter.
        col_name: a single column label, or a list-like of column labels.
            With several columns a row is kept only when it lies strictly
            inside the fences of every listed column.

    Returns:
        The rows of ``df_in`` that pass the filter (all original columns kept).
    """
    # Accept one label or many; a plain string is not list-like for pandas.
    cols = list(col_name) if pd.api.types.is_list_like(col_name) else [col_name]
    values = df_in[cols]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # Interquartile range, one entry per column
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # Compare each column against its own fences, then require every column to pass.
    inside = values.gt(fence_low, axis='columns') & values.lt(fence_high, axis='columns')
    return df_in.loc[inside.all(axis=1)]

# Apply the outlier filter selected by `method`.
if method == 'Zscore':
  # Earlier variant that filtered on 'Price' only:
  #df = df[(np.abs(stats.zscore(df['Price'])) < 3)]
  # Keep a row only when every column in num_feat has |z-score| < 3.
  df = df[(np.abs(stats.zscore(df[num_feat])) < 3).all(axis=1)]
  
else:
  # IQR fences; note that the whole num_feat list is passed as `col_name`.
  df = remove_outlier(df, num_feat)

Explore Data after removing anomalies & using Log10 on Price to reduce skew

In [None]:
# NOTE(review): this span looks like several EDA cells merged together; when run as a
# single cell only the final expression (`corr`) is rendered as the cell output.

# Pairwise relationships between the numerical features after outlier removal.
sns.pairplot(data = df[num_feat], height=3,diag_kind='kde')







# Distribution of the raw price next to the log10-transformed price.
sns.displot(df['Price'])
sns.displot(df['LogPrice'])

df.describe()

# Recompute skew over every non-object column (this rebinds num_feat to a column Index).
num_feat = df.dtypes[df.dtypes != 'object'].index
skew_feats = df[num_feat].skew().sort_values(ascending=False)
skewness = pd.DataFrame({'skew': skew_feats})
skewness

# Heatmap for all the remaining numerical data including the target 'Price'
# Define the heatmap parameters
pd.options.display.float_format = "{:,.2f}".format
# Define correlation matrix
corr_matrix = df[num_feat].corr()

# Replace correlations with |r| < 0.3 by 0 for better visibility
corr_matrix[(corr_matrix < 0.3) & (corr_matrix > -0.3)] = 0

# plot the heatmap
sns.heatmap(corr_matrix, vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot_kws={"size": 9, "color": "black"},annot=True)
plt.title("Price Correlation")

## Rank columns by their correlation with 'Price'

corr =df.corr()["Price"].sort_values(ascending = False)[1:len(num_feat)] ## skips position 0, the top-ranked entry
corr

In [None]:
# Define correlation matrix
corr_matrix = df[num_feat].corr()

# Replace correlations with |r| < 0.3 by 0 for better visibility
corr_matrix[(corr_matrix < 0.3) & (corr_matrix > -0.3)] = 0

# plot the heatmap
sns.heatmap(corr_matrix, vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot_kws={"size": 9, "color": "black"},annot=True)
plt.title("Price Correlation")

## Rank columns by their correlation with 'LogPrice' (the training label)

corr =df.corr()["LogPrice"].sort_values(ascending = False)[1:len(num_feat)] ## skips position 0, the top-ranked entry
corr


Check Skew of Area since it was really high

In [None]:
sns.displot(df['Area'])

Confirm that Outliers have been removed

Check Data After reducing Skew of Area

In [None]:
# NOTE(review): same heatmap and 'Price' ranking code as the earlier correlation cells;
# a shared helper function would avoid the copy-paste.
# Define correlation matrix
corr_matrix = df[num_feat].corr()

# Replace correlations with |r| < 0.3 by 0 for better visibility
corr_matrix[(corr_matrix < 0.3) & (corr_matrix > -0.3)] = 0

# plot the heatmap
sns.heatmap(corr_matrix, vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot_kws={"size": 9, "color": "black"},annot=True)
plt.title("Price Correlation")

## Rank columns by their correlation with 'Price'

corr =df.corr()["Price"].sort_values(ascending = False)[1:len(num_feat)] ## skips position 0, the top-ranked entry
corr

In [None]:
num_feat = df.dtypes[df.dtypes != 'object'].index
skew_feats = df[num_feat].skew().sort_values(ascending=False)
skewness = pd.DataFrame({'skew': skew_feats})
skewness

<H4> Plot of Lattitude & longitude showing pricing in each location</H4>
Observe that suburbs closer to the city centre generally are more expensive

In [None]:
# Marker area `s` is measured in points^2, so passing raw prices would draw markers far
# larger than the axes. Scale prices into a small, readable size range instead.
marker_size = 5 + 95 * df['Price'] / df['Price'].max()
plt.scatter(x = df['Longitude'], y = df['Latitude'], c = df['Price'], alpha=0.8, s=marker_size, cmap='nipy_spectral')
plt.colorbar(label='Price')
plt.xlabel('Longitude')
plt.ylabel('Latitude');

In [None]:
# 'Price' is superseded by the 'LogPrice' label and 'Date' by the derived 'Year' feature.
# errors='ignore' keeps the cell re-runnable once the columns have already been removed.
df = df.drop(columns=['Price','Date'], errors='ignore')
# Model input features (the 'LogPrice' label is intentionally excluded)
num_feat = ['Bedrooms','Bathrooms','Cars','Area','Latitude','Longitude','Distance','Year']
df.info()

Data Splitting into TRAIN,VALIDATION & TEST DF using Scikit Learn function

In [None]:
# Split the dataset into train, validation, and test sets (all pandas DataFrames).
# First hold out 20% as the test set, then take 20% of the remainder as validation.
# random_state is fixed so the split is reproducible.
train, test = train_test_split(df, test_size=0.2,random_state=42)
train, val = train_test_split(train, test_size=0.2,random_state=42)

# Test-set labels (log10 of price)
Y_test = test['LogPrice']

print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

In [None]:
# 'get_scal' takes the name of one numerical feature and returns a 'minmax' function.
# 'minmax' maps a raw value (or tensor) of that feature onto the [0, 1] range spanned
# by the feature in the training data.
def get_scal(feature, data=None):
    """Build a min-max scaler for a single feature.

    Args:
        feature: column name of the numerical feature.
        data: optional DataFrame supplying the statistics; defaults to the
            module-level ``train`` frame so existing ``get_scal(name)`` calls
            keep working.

    Returns:
        A function ``minmax(x)`` returning ``(x - min) / (max - min)``.
    """
    source = train if data is None else data
    # Compute the statistics once, as plain Python floats, rather than on every call.
    mini = float(source[feature].min())
    maxi = float(source[feature].max())
    span = maxi - mini
    if span == 0:
        # Constant feature: avoid dividing by zero, every value then scales to 0.
        span = 1.0

    def minmax(x):
        return (x - mini) / span

    # The scaler must be returned from get_scal itself; returning it from inside
    # minmax (after minmax's own return) made get_scal yield None.
    return minmax

In [None]:
# Collect every feature column that feeds the model's DenseFeatures layer.
feature_columns = []

# Numeric columns, each with the normalizer returned by get_scal for that feature
for header in num_feat:
    scal_input_fn = get_scal(header)
    feature_columns.append(fc.numeric_column(header,normalizer_fn=scal_input_fn))

# Categorical features with One Hot Encoding.
# The vocabularies are taken from the full `df`, not only from the training split.
Type = fc.categorical_column_with_vocabulary_list('Type', df.Type.unique())
Type_ohe = fc.indicator_column(Type)
feature_columns.append(Type_ohe)

Method = fc.categorical_column_with_vocabulary_list('Method', df.Method.unique())
Method_ohe = fc.indicator_column(Method)
feature_columns.append(Method_ohe)


# 'Suburb' has MANY unique values. An embedding was tried first (kept below for
# reference); the column is currently one-hot encoded like the others.
Suburb = fc.categorical_column_with_vocabulary_list('Suburb', df.Suburb.unique())
# Commented out below & replaced with Suburb_ohe instead
# Suburb_embedded = fc.embedding_column(Suburb, dimension=len(df.Suburb.unique()))
Suburb_ohe = fc.indicator_column(Suburb)
feature_columns.append(Suburb_ohe)

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Turn a DataFrame into a batched tf.data.Dataset of (features, label) pairs.

  The 'LogPrice' column is split off as the label and every remaining column
  becomes an entry of the feature dict. The caller's frame is left untouched
  because the work is done on a copy.
  """
  features = dataframe.copy()
  target = features.pop('LogPrice')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
  if not shuffle:
    return dataset.batch(batch_size)
  # Buffer as large as the data set, i.e. a shuffle over all rows
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)


batch_size = 32 # A small batch sized is used for demonstration purposes
# Only the training data is shuffled; validation and test keep their row order
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

Defining a normalization/scaling function to be used for numerical features

Normalizing the Training Data,Valuation & test Numerical Features

<h3> Defining & Training a Tensorflow Sequential Regression DNN model.</h3>

In [None]:
# Model create
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
# `tf.keras.Sequential()` groups a linear stack of layers into a tf.keras.Model.
# Every layer is built from tf.keras: mixing layers from the standalone `keras`
# package into a tf.keras model can fail when the two packages are different builds.
model = tf.keras.Sequential([
  feature_layer,
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(64, activation='relu'),
  # Single linear unit: the regression output, i.e. the predicted log10(price)
  tf.keras.layers.Dense(1, activation='linear',  name='Price')
])

# Model compile: mean squared error is both the loss and the reported metric
model.compile(optimizer='adam',
              loss='mse',
              metrics=['mse'])

# Model Fit
history = model.fit(train_ds, validation_data = val_ds, epochs=60)



<h4> Evaluating trained model performance using EVALUATION data </h4>

In [None]:
# The label is log10(price), so the loss/MSE are in squared log10 units. They cannot be
# turned into a dollar error by exponentiating (10**mse is not a dollar amount).
loss, mse = model.evaluate(val_ds)
rmse_log = np.sqrt(mse)
print("Mean Squared Error (log10 price):", mse)
print("Root Mean Squared Error (log10 price):", rmse_log)
# An RMSE of r in log10 space corresponds to predictions being off by a factor of about 10**r.
print("Typical multiplicative price error: x", 10 ** rmse_log)

<h4> PLOTTING OF LOSS/MSE for TRAINING & EVALUATION DATA  </h4>
Useful for spotting underfitting/overfitting and for choosing the optimal number of training epochs

In [None]:
# Draw the model's training and validation curves with matplotlib
def plot_curves(history, metrics):
    """Plot each metric in `metrics` for the training and validation runs.

    One subplot per metric is placed on a 1 x 2 grid; `history` is the object
    returned by `model.fit`, whose `.history` dict holds `<metric>` and
    `val_<metric>` series.
    """
    figure = plt.figure(figsize=(10, 5))

    for position, metric in enumerate(metrics, start=1):
        axis = figure.add_subplot(1, 2, position)
        axis.plot(history.history[metric])
        axis.plot(history.history['val_{}'.format(metric)])
        axis.set_title('model {}'.format(metric))
        axis.set_ylabel(metric)
        axis.set_xlabel('epoch')
        axis.legend(['train', 'validation'], loc='upper left')

# Plotting
plot_curves(history, ['loss', 'mse'])

<H3> PREDICTION </H3>
<h4>Using TEST data to make PRICE prediction & compare ACTUAL vs PREDICTED</h4>
 

In [None]:
# One hand-written listing to price.
# NOTE(review): the suburb must exist in the vocabulary of the selected `city`;
# confirm this example matches the city the model was trained on.
inp = {'Suburb':'BAULKHAM HILLS','Type': 'House','Method': 'auction','Bedrooms':  4, 'Bathrooms':  2, 'Cars':  1, 'Area':  450,'Latitude':  -33.77157324083317 ,'Longitude':  150.98026592490677,'Distance':  26.5, 'Year':  2022}
# Wrap the record in a list to get a one-row frame: DataFrame.from_dict() on a dict
# made only of scalars raises "If using all scalar values, you must pass an index".
input_df = pd.DataFrame([inp])
# The model was trained on log10(Area), so the same transform is applied at inference time.
input_df['Area'] = np.log10(input_df['Area'])
inp_ds = tf.data.Dataset.from_tensor_slices((dict(input_df)))
inp_ds = inp_ds.batch(1)

# The prediction is log10(price); 10 ** pred converts it back to dollars.
pred = model.predict(inp_ds)
print(pred)

In [None]:
# Predict with the model trained above. `model10` is only created by a later cell,
# so referring to it here fails on a fresh Restart & Run All.
predictions = model.predict(test_ds)
print(predictions[1])

# Convert log10 prices back to dollars. assign() builds a new frame rather than
# writing a column into the slice returned by train_test_split.
test = test.assign(Price=(10 ** test['LogPrice']).round())
predictions = np.round(10 ** predictions)

# Print up to the first 60 comparisons (fewer if the test set is smaller)
for i in range(min(60, len(predictions))):
    # Difference expressed in thousands of dollars
    diff = round((predictions[i][0] - test['Price'].iloc[i])/1000)
    print('PREDICTION: ${0}'.format(predictions[i][0]) + '   ACTUAL: '+format(test['Price'].iloc[i]) + f' DIFFERENCE: ${diff}k')


# Identity line (perfect prediction). Two end points are enough for a straight line.
x = [0, 4200000]
y = x
plt.scatter(test['Price'].iloc[0:len(predictions)],predictions[0:len(predictions)])
plt.title("ACTUAL VS PREDICTED PRICE")
plt.xlabel("ACTUAL PRICE: $")
plt.ylabel("PREDICATED PRICE: $")
plt.plot(x,y,'red')

print("ROOT MEAN SQUARE ERROR ON TEST DATA: $",np.sqrt(mean_squared_error(test["Price"],predictions)))

<h5> SAVE MODEL </h5>

In [None]:
# Save the trained model under a numbered version directory.
# Alternative kept for reference: derive the version from the city's position in city_list.
# vers = city_list.index(city)+1
# NOTE(review): the version is hard-coded; saving again writes to the same path.
vers = 11
model.save(f"saved_models/{vers}")

In [None]:
# Reload previously saved model versions from disk.
# NOTE(review): only version 11 is written by this notebook; versions 8-10 are assumed
# to exist from earlier runs -- loading fails if those directories are missing.
model8 = tf.keras.models.load_model('saved_models/8')
model9 = tf.keras.models.load_model('saved_models/9')
model10 = tf.keras.models.load_model('saved_models/10')
model11 = tf.keras.models.load_model('saved_models/11')


# Check its architecture
# model1.summary()

<h4> SERVE MODEL</h4>
<h5> RUN TF SERVING DOCKER CONTAINER </h5>

In [None]:
# Equivalent manual commands, kept for reference:
#docker run -it -v /home/khaled/AUTOMATION-EXCEL\:/tf_serving -p 8601:8601 --entrypoint /bin/bash tensorflow/serving
#tensorflow_model_server --rest_api_port=8601 --model_name=melb_price --model_base_path=/tf_serving/saved_models/

# Start a detached tensorflow/serving container with port 8601 published and the
# project directory mounted at /tf_serving.
# NOTE(review): the host path is an absolute, machine-specific path; it has to be
# changed when running on another machine.
client = docker.from_env()
container = client.containers.run(image = "tensorflow/serving", ports = {8601:8601},volumes = ['/home/khaled/MLrealestate:/tf_serving'], detach=True)
# NOTE(review): this starts a shell without giving it a command -- presumably a no-op; confirm it is needed.
container.exec_run('/bin/bash')
print(container.exec_run('ls'))
# Launch the model server inside the container, serving REST on port 8601 under the
# model name 'real_estate_price_est' from the mounted saved_models directory.
container.exec_run('tensorflow_model_server --rest_api_port=8601 --model_name=real_estate_price_est --model_base_path=/tf_serving/saved_models/',detach = True)

In [None]:
# Peek at the first element of a dataset and print its two components.
# NOTE(review): `examples` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh run -- it looks like a leftover from another notebook.
iterator = examples['train_ds'].__iter__()
next_element = iterator.get_next()
pt = next_element[0]
en = next_element[1]
print(pt.numpy())
print(en.numpy())

In [None]:
# Build the TF Serving request body from the hand-written listing `inp`.
# 'Area' is sent as log10(Area) because that is the representation the model was trained on.
# (The former `input = list(inp.values())` was unused and shadowed the builtin input().)
data = {"instances": [{**inp, 'Area': float(np.log10(inp['Area']))}]}
test.head(5)


<h4> API CALL FOR PREDICTION</h4>
<h5>  REST API POST for LOCAL TF SERVING CONTAINER - MODEL NAME: real_estate_est</h5>

In [None]:
# Equivalent curl call, kept for reference (note that it targets the older model name 'melb_price'):
#%%bash
#curl -d '{"instances": [[0.25,0.07553956834532373,0.0,0.25,0.0,0.1937046004842615,0.9103448275862069,0.516068393160683,0.4633053471477789,1.0]]}' \
#-X POST http://localhost:8601/v1/models/melb_price:predict

# POST the JSON payload in `data` to the predict endpoint of the locally served
# 'real_estate_price_est' model and print the decoded JSON response.
# NOTE(review): no timeout is set, so the call can block indefinitely if the server is not up.
r = requests.post(url="http://localhost:8601/v1/models/real_estate_price_est:predict", data=json.dumps(data))
print(r.json())

In [None]:
# Build a template CSV that keeps one row (the last seen) per (Suburb, Type, Method) combination.
# The paths have to be f-strings: without the `f` prefix the literal text
# 'data/{city}/{city}_area.csv' is used and {city} is never substituted.
df = pd.read_csv(f'data/{city}/{city}_area.csv')
df = df.drop_duplicates(subset=['Suburb','Type','Method'], keep='last')
df.to_csv(f'data/{city}/{city}_template.csv', index=False)
df.info()

In [None]:
%%bash
# Deploy the serving container to the Heroku app 'tf-serve-model'.
# NOTE(review): `heroku auth:login` normally needs interactive input, which a %%bash cell
# cannot provide -- confirm this works non-interactively (e.g. via an API key in the environment).
heroku auth:login
# Log in to the Heroku container registry
heroku container:login
# Build and push the image for the `web` process type, then release it
heroku container:push web -a tf-serve-model
heroku container:release web -a tf-serve-model
# NOTE(review): `--tail` keeps streaming logs, so this cell does not finish on its own.
heroku logs -a tf-serve-model --tail