In [49]:
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
from os.path import expanduser
from collections import Counter
import string
import re

In [90]:
filename = expanduser("~/data/houses_for_sale.pickle")
with open(filename,'rb') as f:
    data = pickle.load(f, encoding='utf-8')

This dataset was acquired by scraping a website that compiles property listings available for sale in Spain. Our objective is to leverage this data to gain insights into properties that offer a favorable value proposition for potential buyers.

In [91]:
len(data)

913

In [92]:
# data is a list of dictionaries
data[0]

{'price': '320.000 €',
 'title': 'Piso Tallers. Piso con 2 habitaciones con ascensor',
 'loc_string': 'Barcelona - Sant Antoni',
 'loc': None,
 'features': ['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 'type': 'FLAT',
 'subtype': 'FLAT',
 'selltype': 'SECOND_HAND',
 'desc': 'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'}

## Removing ads from multiple units
If features starts with "desde" it means that the ad is advertising multiple units. See for example this add in which the price is None and features start with "desde".

In [93]:
#data[10]

In [94]:
data[10]['price']

In [95]:
data[10]['features']

['desde 70 m2', '15.785,71 €/m2']

In [96]:
 bool(re.match("^desde", 'desde 70 m2')),  bool(re.match("^desde", '70 m2'))

(True, False)

In [97]:
data_v1 = [] 
for i, d in enumerate(data):
    features = d["features"]
    if bool(re.match("^desde", features[0])) == False:
        if bool(re.search("\d+ m2", features[0])) == True:
            data_v1.append(d)

In [98]:
def saving_versions(data, version):
    filename = expanduser("~/data")
    filename +=  "/houses_for_sale_train_{}.pickle".format(version)
    file = open(filename, 'wb')
    pickle.dump(data, file)
    file.close()
    print(filename)
    
def load_versions(version):
    filename = expanduser("~/data")
    filename += "/houses_for_sale_train_{}.pickle".format(version)
    with open(filename,'rb') as f:
        data = pickle.load(f, encoding='utf-8')
    return data

In [99]:
# saving the new version of the data
saving_versions(data_v1, "v1")
len(data_v1)

/Users/yinterian/data/houses_for_sale_train_v1.pickle


866

### Checking number of rooms feature
and fixing issues

In [100]:
data_v1 = load_versions("v1")
len(data_v1)

866

In [101]:
[d["features"] for d in data_v1[:10]]

[['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 ['65 m2', '2 hab.', '1 baño', '5.000 €/m2'],
 ['77 m2', '2 hab.', '1 baño', '4.286 €/m2'],
 ['96 m2', '3 hab.', '2 baños', '4.531 €/m2'],
 ['84 m2', '2 hab.', '1 baño', '4.881 €/m2'],
 ['91 m2', '4 hab.', '2 baños', '4.780 €/m2'],
 ['96 m2', '2 hab.', '2 baños', '4.271 €/m2'],
 ['76 m2', '2 hab.', '1 baño', '4.605 €/m2'],
 ['103 m2', '3 hab.', '2 baños', '4.223 €/m2'],
 ['82 m2', '3 hab.', '1 baño', '2.988 €/m2']]

In [102]:
bool(re.match("\d hab", '3 hab.'))

True

In [103]:
# listing without number of rooms
inds = []
for i, d in enumerate(data_v1):
    features = d["features"]
    if not bool(re.match("\d hab", features[1])):
        print(i, features)
        inds.append(i)

137 ['45 m2', '1 baño', '5.689 €/m2']
186 ['113 m2', '4 baños', '2.389 €/m2']
192 ['93 m2', '1 baño', '3.215 €/m2']
503 ['54 m2', '1 baño', '6.944 €/m2']
537 ['93 m2', '1 baño', '2.151 €/m2']
574 ['102 m2', '1 baño', '3.431 €/m2']
729 ['82 m2', '22 hab.', '1 baño', '4.268 €/m2']
766 ['52 m2', '3.442 €/m2']


In [104]:
inds

[137, 186, 192, 503, 537, 574, 729, 766]

In [105]:
def get_new_feature(fea):
    return [fea[0], "0 hab.", fea[1], fea[-1]]

In [106]:
# let's add "0 hab." This is most likely a studio
get_new_feature(['45 m2', '1 baño', '5.689 €/m2'])

['45 m2', '0 hab.', '1 baño', '5.689 €/m2']

In [107]:
data_v2 = []
for i, d in enumerate(data_v1):
    features = d["features"]
    if not bool(re.match("\d hab", features[1])):
        if len(features) == 3:
            features = get_new_feature(features)
            d["features"] = features
    if len(features) == 4:
        data_v2.append(d)

In [108]:
saving_versions(data_v2, "v2")
len(data_v2)

/Users/yinterian/data/houses_for_sale_train_v2.pickle


862

In [109]:
bool(re.match("\d baño", '1 baño'))

True

In [110]:
# checking for bathrooms
for i, d in enumerate(data_v2):
    features = d["features"]
    if not bool(re.match("\d baño", features[2])):
        print(i, features)

In [111]:
# checking the price
for i, d in enumerate(data_v2):
    if not bool(re.match("\d+\.\d+ €", d["price"])):
        print(i, features)

### Task 1
1. Tokenize descriptions (with SpaCy)
2. Exclude stopwords (with SpaCy)
3. Create a centroid vector in a similar fashion as the hw. Use Spacy word vector.
4. Create a baseline *regression* model using just the centroid. Use R^2 as the metric. Use a linear model as well as a Random Forest model.

In [112]:
import spacy
nlp = spacy.load("es_core_news_sm")

In [113]:
text = data_v2[1]["desc"]
text

'Ubicado en la zona del Camp de l’Arpa, cerca de la Sagrada Familia y de zona dels Encants, encontramos este magnífico piso reformado de 65m2 con doble orientación y mucha luminosidad.\n\nEl piso se sitúa en la tercera planta real de esta finca clásica del año 1900.\n\nSu distribución actual se compone de un salón comedor exterior con salida a balcón, una cocina equipada y semi abierta, un dormitorio individual interior, un cuarto de baño completo con plato de ducha, un dormitorio doble amplio exterior a galería y patio de manzana. En la galería, se encuentra la zona de aguas y un trastero.\n\nLa vivienda goza de mucha luz natural y ventilación cruzada, el balcón orienta a noroeste.\n\nLa propiedad cuenta con suelos de parquet laminado de la marca Quickstep en la mayoría de las estancias y suelos de gres porcelánico en el baño.\n\nEl piso dispone de carpinterías de aluminio con doble acristalamiento, persianas eléctricas, calefacción y aire acondicionado por bomba de calor.\n\nLa finca

In [210]:
# use this for empty descriptions
doc = nlp(" ")
empy_vector = np.array([tok.vector for tok in doc]).mean(axis=0)

In [115]:
def get_centroid(text, empty_vector=empy_vector):
    # Write your code here
    return vector

In [116]:
get_centroid(text).shape

(96,)

In [117]:
for d in data_v2:
    d["centroid"] = get_centroid(d["desc"])

### Splitting into training and testing
Training data is used to train the model Testing data is used to measure the preformance of the model.

In [121]:
import random
random.seed(4)
random.shuffle(data_v2)

N = int(len(data_v2)*.8)
print(N)

train = data_v2[:N] 
test = data_v2[N:]

len(train), len(test)

689


(689, 173)

### Model with centroids

In [122]:
# before runing this you need to fix the price

X_train = [d["centroid"] for d in train]
X_test = [d["centroid"] for d in test]
y_train = [d["price"] for d in train]
y_test = [d["price"] for d in test]

In [125]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [141]:
regr = RandomForestRegressor(n_estimators=50, max_depth=15, random_state=0)
regr.fit(X_train, y_train)

In [142]:
## here is prediction on train
y_pred_train = regr.predict(X_train)
r2_score(y_train, y_pred_train)

0.8767592192269306

In [143]:
## here is prediction on test (not seen by the model)
y_pred_test = regr.predict(X_test)
r2_score(y_test, y_pred_test)

0.15435532501046723

This is not a very good model, can we improve it by including other features?

### Task 2: Add location features
1. Fix issues with location strings.
2. Encode the location as a categorical variable with an index.
3. Concatenate the index to the centroid vector and retrain the model.
4. Make a new key "x" by concatenting the centroid and loc
5. Train a new model

hint: np.concatenate can concatenate two numpy arrays

In [211]:
# your code here

In [169]:
X_train = [d["x"] for d in train]
X_test = [d["x"] for d in test]

In [212]:
# your code here to train the model

In [171]:
y_pred_train = regr.predict(X_train)
r2_score(y_train, y_pred_train)

0.8760008578030183

In [172]:
y_pred_test = regr.predict(X_test)
r2_score(y_test, y_pred_test)

0.1336770379720581

### Task 3: add bathrooms, size and other features

1. Add size, bedrooms and bathrooms
2. Retrain the model

In [213]:
# your code here

In [180]:
X_train = [d["x"] for d in train]
X_test = [d["x"] for d in test]

In [214]:
# your code here

### Task 4:
In this task we will experiment with individual words that might be useful. For example the word balcony (balcon, balcones) could be very useful.

Provide a set of words, such as ["balcón", "balcon", "balcones"], introduce a feature that calculates the presence or absence of these words in the description.

Write functions that will make it easier to add more words. Here are other potentially useful set of words: `["doble", "dobles"], ["exterior", "exteriores"], ["interior", "interiores"]`

In [215]:
key_words = ["balcón", "balcon", "balcones"]

In [216]:
# Your code here

### Other models
Can you experiment with Gradient Boosting? Do you get better results?
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

## Finding good properties to buy.
How can I use this model?

In [184]:
data = data_v2
len(data)

862

In [185]:
# divide data in 3 folds
np.random.shuffle(data)

# Calculate the size of each fold
fold_size = len(data) // 3

# Split the data into 3 folds
fold1 = data[:fold_size]
fold2 = data[fold_size:2 * fold_size]
fold3 = data[2 * fold_size:]

In [189]:
def train_model(foldA, foldB):
    """ train model in two parts of the data"""
    train = foldA + foldB
    X_train = [d["x"] for d in train]
    y_train = [d["price"] for d in train]
    regr = RandomForestRegressor(n_estimators=50, max_depth=15, random_state=0)
    regr.fit(X_train, y_train)
    return regr

In [192]:
def make_predictions(foldC, regr):
    """ Predict the price return the K listings that are the best values
    
    (according to the model)
    """
    for d in foldC:
        d["pred"] = regr.predict([d["x"]])[0]  

In [193]:
# train on 1 and 2 predict on 3
regr = train_model(fold1, fold2)
make_predictions(fold3, regr)

In [194]:
# train on 2 and 3 predict on 1
regr = train_model(fold2, fold3)
make_predictions(fold1, regr)

In [195]:
# train on 1 and 3 predict on 2
regr = train_model(fold1, fold3)
make_predictions(fold2, regr)

In [197]:
## find outliers
# get a list of price - pred
diff = [d["price"] - d["pred"] for d in data]

In [201]:
# maybe better as ratio
diff = [(d["price"] - d["pred"])/d["price"] for d in data]

In [202]:
sorted(diff)[:10]

[-1.4714333333333331,
 -0.979595387840671,
 -0.9631646060606062,
 -0.9265386243386242,
 -0.913641176470588,
 -0.8428592222222221,
 -0.8262333333333333,
 -0.8035875598086124,
 -0.7901197969543147,
 -0.7899468772433598]

In [204]:
top_10 = []
for d in data:
    if (d["price"] - d["pred"])/d["price"] < -0.79:
        top_10.append(d)

In [209]:
# print a summary
keys_to_include = ["price", "pred", "title", "loc_string", "features"]

for d in top_10:
    d_sum = {}
    for k in keys_to_include:
        d_sum[k] = d[k] 
    d_sum["pred"] = int(d_sum["pred"])
    print(d_sum)

{'price': 197.0, 'pred': 352, 'title': 'Piso en Carrer de lepant 323. Piso con 2 habitaciones con ascensor', 'loc_string': 'Barcelona - Sagrada Família', 'features': ['58 m2', '2 hab.', '1 baño', '3.267 €/m2']}
{'price': 120.0, 'pred': 219, 'title': 'Estudio Carrer del comte borrell. Estudio con ascensor', 'loc_string': 'Barcelona - Sant Antoni', 'features': ['25 m2', '1 hab.', '1 baño', '4.800 €/m2']}
{'price': 159.0, 'pred': 314, 'title': "Piso Passatge pla. Piso en venta en la dreta de l'eixample", 'loc_string': 'Barcelona - Dreta de l´Eixample', 'features': ['44 m2', '1 hab.', '1 baño', '3.614 €/m2']}
{'price': 209.0, 'pred': 376, 'title': 'Piso Carrer de múrcia. Piso con 3 habitaciones con ascensor', 'loc_string': 'Barcelona - Navas', 'features': ['93 m2', '3 hab.', '1 baño', '2.247 €/m2']}
{'price': 90.0, 'pred': 222, 'title': 'Piso Carrer de viladomat. Piso amueblado con ascensor', 'loc_string': 'Barcelona - Sant Antoni', 'features': ['13 m2', '1 hab.', '1 baño', '6.923 €/m2']}


### Task 5:
Find most overvalued properties according to this model.