
# Predicting House Prices
* Chris Arnold (Cardiff University)
* DS3 Data Science Summer School
* August 25th, 2023

In [None]:
# Housekeeping
from IPython.display import Image
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import glob
import cv2
import os
import locale
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import concatenate



### Paper Source
This is [the link](https://arxiv.org/pdf/1609.08399.pdf) to Eman H. Ahmed, Mohamed N. Moustafa (2016) "House price estimation from visual and textual features"

If you want to download the data yourself, clone the original data set using:


In [None]:
!git clone https://github.com/emanhamed/Houses-dataset

They predict house prices on the basis of
* images from the bedroom, bathroom, kitchen and the front of the house
* And information about the number of bedrooms, bathrooms, the area, the zipcode (FE)

In [None]:
# You might have to mount your drive and then cd into it
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [None]:
# Where are we?
%pwd

In [None]:
%ls

In [None]:
%cd Houses-dataset/Houses\ Dataset

In [None]:
%ls

In [None]:
# Let's take a look at a house
Image(filename='220_frontal.jpg')

In [None]:
Image(filename='220_kitchen.jpg')

In [None]:
Image(filename='220_bedroom.jpg')

In [None]:
Image(filename='220_bathroom.jpg')

### Load the Attributes

In [None]:
def load_house_attributes(inputPath, min_count=25):
	"""Load the house attribute table and drop houses in rare zip codes.

	Args:
		inputPath: Path (or file-like object) of the space-separated
			attributes file. The file has no header row; its columns are
			read as bedrooms, bathrooms, area, zipcode and price.
		min_count: Minimum number of houses a zip code must have for its
			houses to be kept. Defaults to 25.

	Returns:
		The attributes as a pandas DataFrame without the houses whose zip
		code occurs fewer than ``min_count`` times. The original row
		index is kept (it is not reset), because ``load_house_images``
		derives the image file names from the index values.
	"""
	cols = ["bedrooms", "bathrooms", "area", "zipcode", "price"]
	df = pd.read_csv(inputPath, sep=" ", header=None, names=cols)
	# the zip code counts for the housing dataset are *extremely*
	# unbalanced (some zip codes have only 1 or 2 houses), so remove
	# every house whose zip code is too rare to learn anything from
	counts = df["zipcode"].value_counts()
	rareZipcodes = counts[counts < min_count].index
	# drop all rare rows in one pass instead of once per zip code
	df.drop(df.index[df["zipcode"].isin(rareZipcodes)], inplace=True)
	return df

In [None]:
def process_house_attributes(df, train, test):
	"""Convert the attribute frames into numeric feature matrices.

	Args:
		df: Full attribute DataFrame; only its "zipcode" column is used,
			to fit the zip code encoder on every zip code in the data.
		train: Training split of the attributes.
		test: Testing split of the attributes.

	Returns:
		A tuple ``(trainX, testX)`` of arrays in which the encoded zip
		code columns come first, followed by the scaled bedrooms,
		bathrooms and area columns.
	"""
	numericCols = ["bedrooms", "bathrooms", "area"]
	# the min-max scaler is fitted on the training split only and then
	# re-used unchanged for the testing split
	scaler = MinMaxScaler()
	scaler.fit(train[numericCols])
	# the zip code binarizer is fitted on the full frame so that both
	# splits are encoded with the same set of columns
	encoder = LabelBinarizer()
	encoder.fit(df["zipcode"])

	def featurize(part):
		# categorical (zip code) block first, continuous block second
		zipBlock = encoder.transform(part["zipcode"])
		numericBlock = scaler.transform(part[numericCols])
		return np.hstack([zipBlock, numericBlock])

	return (featurize(train), featurize(test))

### Load the Images


In [None]:
def load_house_images(df, inputPath):
	"""Load the photos of every house and tile four of them into one image.

	For each index value ``i`` of ``df`` the files matching
	``<inputPath>/<i + 1>_*`` are collected and sorted by path, so the
	tiles are always in the same order. The first four are each resized
	to 32x32 and written into a 64x64x3 montage.

	Args:
		df: DataFrame whose index values identify the houses; only the
			index is read.
		inputPath: Directory that contains the house images.

	Returns:
		A NumPy array with one 64x64x3 ``uint8`` montage per row of
		``df``, in the order of ``df.index`` (an empty array when ``df``
		has no rows).

	Raises:
		FileNotFoundError: If fewer than four images are found for a house.
		OSError: If one of the image files cannot be read.
	"""
	tile = 32
	# (row, column) of the upper-left pixel of each tile, in sorted-path
	# order: top-left, top-right, bottom-right, bottom-left
	corners = [(0, 0), (0, tile), (tile, tile), (tile, 0)]
	images = []
	# loop over the indexes of the houses
	for i in df.index.values:
		basePath = os.path.sep.join([inputPath, "{}_*".format(i + 1)])
		housePaths = sorted(glob.glob(basePath))
		# fail with a clear message instead of an IndexError further down
		if len(housePaths) < len(corners):
			raise FileNotFoundError(
				"expected {} images matching {!r}, found {}".format(
					len(corners), basePath, len(housePaths)))
		outputImage = np.zeros((2 * tile, 2 * tile, 3), dtype="uint8")
		# zip() stops after the four tile positions, so any additional
		# matches are ignored
		for (row, col), housePath in zip(corners, housePaths):
			image = cv2.imread(housePath)
			# cv2.imread signals an unreadable file by returning None
			if image is None:
				raise OSError("could not read image {!r}".format(housePath))
			outputImage[row:row + tile, col:col + tile] = cv2.resize(
				image, (tile, tile))
		# add the tiled image to the set of images the network is trained on
		images.append(outputImage)
	return np.array(images)

### Building the Model

In [None]:
def create_mlp(dim, regress=False):
	"""Build the small fully-connected network for the house attributes.

	Args:
		dim: Number of input features per house.
		regress: When True, a single linear output unit is appended so
			the network can be used as a stand-alone regressor.

	Returns:
		A Keras ``Sequential`` model with a Dense(8, relu) and a
		Dense(4, relu) layer, plus the optional Dense(1, linear) head.
	"""
	stack = [
		Dense(8, input_dim=dim, activation="relu"),
		Dense(4, activation="relu"),
	]
	# the regression head is optional so the 4-unit layer can instead be
	# used as a feature output
	if regress:
		stack.append(Dense(1, activation="linear"))
	return Sequential(stack)

In [None]:
def create_cnn(width, height, depth, filters=(16, 32, 64), regress=False):
  """Build the convolutional network for the tiled house images.

  Args:
    width: Width of the input images in pixels.
    height: Height of the input images in pixels.
    depth: Number of channels of the input images.
    filters: Number of convolution filters of each
      CONV => RELU => BN => POOL stage; one stage is created per entry.
    regress: When True, a single linear output unit is appended so the
      network can be used as a stand-alone regressor.

  Returns:
    A Keras ``Model`` mapping a (height, width, depth) image to the
    output of a Dense(4, relu) layer, or to the Dense(1, linear) head
    when ``regress`` is True.
  """
  # initialize the input shape and channel dimension, assuming
  # TensorFlow/channels-last ordering
  inputShape = (height, width, depth)
  chanDim = -1
  # define the model input
  inputs = Input(shape=inputShape)

  # one CONV => RELU => BN => POOL stage per requested filter count
  x = inputs
  for numFilters in filters:
    x = Conv2D(numFilters, (3, 3), padding="same")(x)
    x = Activation("relu")(x)
    x = BatchNormalization(axis=chanDim)(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

  # flatten the volume, then FC => RELU => BN => DROPOUT
  x = Flatten()(x)
  x = Dense(16)(x)
  x = Activation("relu")(x)
  x = BatchNormalization(axis=chanDim)(x)
  x = Dropout(0.5)(x)
  # apply another FC layer, this one to match the number of nodes
  # coming out of the MLP
  x = Dense(4)(x)
  x = Activation("relu")(x)
  # check to see if the regression node should be added
  if regress:
    x = Dense(1, activation="linear")(x)
  # construct and return the CNN
  return Model(inputs, x)


### Construct the Data Pipeline


In [None]:
# Load the attributes table; houses in rare zip codes are already
# removed by load_house_attributes. The path is relative, so this relies
# on the working directory being the dataset folder (see the %cd cell).
inputPath_house_data = "HousesInfo.txt"
df_house_att = load_house_attributes(inputPath_house_data)

In [None]:
# Load one tiled image per house that is left in the attributes table.
# The image folder of the cloned repository is "Houses Dataset" (the
# folder the notebook changes into above); adjust the prefix if your
# copy of the repository lives somewhere else on your drive.
df_house_img = load_house_images(df_house_att, "/content/drive/MyDrive/Colab Notebooks/Houses-dataset/Houses Dataset")

In [None]:
# Rescale the pixel values: the images are loaded as uint8 (0-255), so
# dividing by 255.0 maps them to floats between 0 and 1
df_house_img = df_house_img / 255.0

### Preparing the Data

In [None]:
# partition the data into training and testing splits using 75% of
# the data for training and the remaining 25% for testing; attributes
# and images are passed to the same call so both are split identically,
# and the fixed random_state makes the split reproducible
split = train_test_split(df_house_att, df_house_img, test_size=0.25, random_state=42)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split


In [None]:
# NB: Size of the data here!
print(trainAttrX.shape)
print(testAttrX.shape)

In [None]:
# find the largest house price in the training set and use it to
# scale our house prices (will lead to better training and
# convergence). Training targets end up in the range [0, 1]; test
# targets are divided by the *training* maximum, so they can exceed 1
# if the test split contains a more expensive house.
maxPrice = trainAttrX["price"].max()
trainY = trainAttrX["price"] / maxPrice
testY = testAttrX["price"] / maxPrice


In [None]:
# process the house attributes data by performing min-max scaling
# on continuous features, one-hot encoding on categorical features,
# and then finally concatenating them together.
# NOTE: this rebinds trainAttrX/testAttrX from data frames to the
# arrays returned by process_house_attributes, so the "price" column
# is no longer reachable through them -- run the price-scaling cell
# before this one.
(trainAttrX, testAttrX) = process_house_attributes(df_house_att,
	trainAttrX, testAttrX)


### Building the Model

In [None]:
# create the MLP and CNN models; regress=False leaves off the
# single-unit regression head of each branch, so both end in their
# 4-unit ReLU layer, which is what gets concatenated below
mlp = create_mlp(trainAttrX.shape[1], regress=False)
cnn = create_cnn(64, 64, 3, regress=False)


In [None]:
mlp.summary()

In [None]:
cnn.summary()

In [None]:
# create the input to our final set of layers as the *output* of both
# the MLP and CNN (4 units each, because both were built with
# regress=False)
combinedInput = concatenate([mlp.output, cnn.output])
# our final FC layer head will have two dense layers, the final one
# being the regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(1, activation="linear")(x)
# our final model will accept categorical/numerical data on the MLP
# input and images on the CNN input, outputting a single value (the
# predicted price of the house). The order of the inputs here fixes
# the order in which the data must be passed to fit() and predict().
model = Model(inputs=[mlp.input, cnn.input], outputs=x)

In [None]:
model.summary()

### Train

In [None]:
# compile the model using mean absolute percentage error as our loss,
# implying that we seek to minimize the absolute percentage difference
# between our price *predictions* and the *actual prices*; the loss
# values Keras reports during training are therefore percentages
opt = Adam(learning_rate=1e-3)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)


In [None]:
# train the model
# for a good fit, run 200 epochs
# NOTE: the test split doubles as validation data here, so the
# validation loss shown during training is computed on the same houses
# that are used for the final evaluation below
model.fit(
	x=[trainAttrX, trainImagesX], y=trainY,
	validation_data=([testAttrX, testImagesX], testY),
	epochs=200, batch_size=8)


In [None]:
# make predictions on the testing data; the model was trained on
# prices divided by maxPrice, so the predictions are on that scaled
# axis as well (same input order as in fit(): attributes, then images)
preds = model.predict([testAttrX, testImagesX])

In [None]:
# compute the difference between the *predicted* house prices and the
# *actual* house prices, then compute the percentage difference and
# the absolute percentage difference. Both preds and testY are scaled
# by maxPrice; that common factor cancels in diff / testY.
diff = preds.flatten() - testY
percentDiff = (diff / testY) * 100
absPercentDiff = np.abs(percentDiff)
# compute the mean and standard deviation of the absolute percentage
# difference
mean = np.mean(absPercentDiff)
std = np.std(absPercentDiff)
# finally, show some statistics on our model. The price statistics are
# taken over all houses in df_house_att (training and testing houses
# together) in the original, unscaled price units.
# NOTE(review): setlocale presumably raises locale.Error when the
# en_US.UTF-8 locale is not installed on the machine -- confirm.
locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
print("Avg. house price: {}, std house price: {}".format(
	locale.currency(df_house_att["price"].mean(), grouping=True),
	locale.currency(df_house_att["price"].std(), grouping=True)))
print("Mean: {:.2f}%, std: {:.2f}%".format(mean, std))

### Some Thoughts on What We Did

#### Training with substreams only
For comparison:
* Fusion: MAPE 18% - 24%
* DNN alone: MAPE 22.71%
* CNN alone: MAPE 56.91%

#### Data Set Size
* Number of observations
* Number of variables

#### Information Flow
* What is the dominant data stream?
* Regress your results from one stream on the results of the other to see whether there is information left that can be modeled

#### Optimizers
* What is your favourite optimizer for CNN?
* What is your favourite optimizer for text?
* What is your favourite optimizer for micro data?

#### How would you fine-tune your model?
* Transfer learning
* Pre-training


Sources:
* https://github.com/emanhamed/Houses-dataset
* https://pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/