In [12]:
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
from os.path import expanduser

In [13]:
import re

In [5]:
filename = expanduser("~/data/houses_for_sale.pickle")
with open(filename,'rb') as f:
    data = pickle.load(f, encoding='utf-8')

In [19]:
len(data)

913

In [20]:
data[0]

{'price': '320.000 €',
 'title': 'Piso Tallers. Piso con 2 habitaciones con ascensor',
 'loc_string': 'Barcelona - Sant Antoni',
 'loc': None,
 'features': ['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 'type': 'FLAT',
 'subtype': 'FLAT',
 'selltype': 'SECOND_HAND',
 'desc': 'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'}

This dataset was acquired by scraping a website that compiles property listings available for sale in Spain. Our objective is to leverage this data to gain insights into properties that offer a favorable value proposition for potential buyers.

One approach to achieve this is by training a model that can predict the asking price based on various "features" associated with each property listing. We intend to employ models that take a numerical feature vector as input to estimate the price. However, there are several data quality issues that need to be addressed before we can even begin creating features for this dataset:

Remove listings that advertise multiple units for sale.
Rectify discrepancies in the reported "number of bedrooms."
Generate relevant features from textual information."

This dataset was obatines from scrapping a website that lists properties available for sale in Spain. Our aim is to utilize this data to gain insights into homes that appear to offer a favorable value for prospective buyers.

One way to do this is to train a model that given "features" from each house for sales predicts the asking price. We will be using models that take as imputs a vector of numerical features to predict a price. There are a number of issues with real data. Here are a set of issues we have to fixed before trying to even create features for this data:
1. Delete adds that are selling multiple units.
2. Fix issues with "number of bedrooms"
3. Create features from text

In [6]:
len(data)

913

In [9]:
# data is a list of dictionaries
data[0]

{'price': '320.000 €',
 'title': 'Piso Tallers. Piso con 2 habitaciones con ascensor',
 'loc_string': 'Barcelona - Sant Antoni',
 'loc': None,
 'features': ['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 'type': 'FLAT',
 'subtype': 'FLAT',
 'selltype': 'SECOND_HAND',
 'desc': 'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'}

## Removing ads from multiple units
If features[0] starts with "desde"

In [10]:
data[10]['price'], data[10]['features']

(None, ['desde 70 m2', '15.785,71 €/m2'])

In [14]:
 bool(re.match("^desde", 'desde 70 m2')),  bool(re.match("^desde", '70 m2'))

(True, False)

In [15]:
data_v1 = [] 
for i, d in enumerate(data):
    features = d["features"]
    if bool(re.match("^desde", features[0])) == False:
        if bool(re.search("\d+ m2", features[0])) == True:
            data_v1.append(d)

In [17]:
def saving_versions(data, version):
    filename = expanduser("~/data")
    filename +=  "houses_for_sale_train_{}.pickle".format(version)
    file = open(filename, 'wb')
    pickle.dump(data, file)
    file.close()
    print(filename)

In [18]:
saving_versions(data_v1, "v1")
len(data_v1)

/Users/yinterian/datahouses_for_sale_train_v1.pickle


866

### Checking number of rooms feature
and fixing issues

In [57]:
data_v1 = load_versions("v1")
len(data_v1)

866

In [44]:
bool(re.match("\d hab", '3 hab.'))

True

In [59]:
inds = []
for i, d in enumerate(data_v1):
    features = d["features"]
    if not bool(re.match("\d hab", features[1])):
        print(i, features)
        inds.append(i)

137 ['45 m2', '1 baño', '5.689 €/m2']
186 ['113 m2', '4 baños', '2.389 €/m2']
192 ['93 m2', '1 baño', '3.215 €/m2']
503 ['54 m2', '1 baño', '6.944 €/m2']
537 ['93 m2', '1 baño', '2.151 €/m2']
574 ['102 m2', '1 baño', '3.431 €/m2']
729 ['82 m2', '22 hab.', '1 baño', '4.268 €/m2']
766 ['52 m2', '3.442 €/m2']


In [62]:
inds

[137, 186, 192, 503, 537, 574, 729, 766]

In [65]:
def get_new_feature(fea):
    return [fea[0], "0 hab.", fea[1], fea[-1]]

In [66]:
get_new_feature(['45 m2', '1 baño', '5.689 €/m2'])

['45 m2', '0 hab.', '1 baño', '5.689 €/m2']

In [69]:
data_v2 = []
for i, d in enumerate(data_v1):
    features = d["features"]
    if not bool(re.match("\d hab", features[1])):
        if len(features) == 3:
            features = get_new_feature(features)
            d["features"] = features
    if len(features) == 4:
        data_v2.append(d)

In [73]:
saving_versions(data_v2, "v2")
len(data_v2)

train_data_ml/houses_for_sale_train_v2.pickle


866

In [75]:
bool(re.match("\d baño", '1 baño'))

True

In [79]:
for i, d in enumerate(data_v2):
    features = d["features"]
    if not bool(re.match("\d baño", features[2])):
        print(i, features)

In [90]:
for i, d in enumerate(data_v2):
    if not bool(re.match("\d+\.\d+ €", d["price"])):
        print(i, features)

### Finding features in descriptions
Let's tokenize the descriptions and compute in how many documents each word apeards.

`pip install -U pip setuptools wheel` <br>
`pip install -U spacy` <br>
`python -m spacy download en_core_web_sm` <br>
`python -m spacy download es_core_news_sm`<br>

In [93]:
import spacy
nlp = spacy.load("es_core_news_sm")

In [109]:
text = data_v2[0]["desc"]
text

'HANNAN-PIPER Real Estate les presenta, en exclusiva esta propiedad amplia y soleada vivienda de 115 metros cuadrados,\nTransporte y comercios variados a pocos metros de la finca.\nEstá en un edificio construido en los años 50, con ITE favorable, en perfecto estado de conservación, con acceso habilitado para minusválidos y ascensor.\nEs una vivienda en planta alta, completamente exterior y con sol todo el día.\nEn muy buen estado de conservación. Baño y Cocina reformados.\nTiene una perfecta distribución, con estancias cuadradas o rectangulares, sin metros desaprovechados, pero con muchas posibilidades para adecuarla a las necesidades de sus nuevos propietarios,\nDispone de amplio hall de acceso, salón comedor con salida a balcón, tres dormitorios dobles, dos de ellos muy amplios, cocina con cómoda y soleada zona de lavandería y baño completo con plato de ducha. Armario empotrado en el pasillo.\nSuelos de parquet, salvo cerámica en cocina, lavandería y baño. Cerramientos de aluminio y 

In [110]:
doc = nlp(text)
for token in doc[60:80]:
    print(token.text, token.pos_)


 SPACE
Es AUX
una DET
vivienda NOUN
en ADP
planta NOUN
alta ADJ
, PUNCT
completamente ADV
exterior ADJ
y CCONJ
con ADP
sol NOUN
todo DET
el DET
día NOUN
. PUNCT

 SPACE
En ADP
muy ADV


In [126]:
def clean_text(text):
    doc = nlp(text)
    tokens = [token.text.lower() for token in doc if token.pos_ in ["VERB", "NOUN", "ADJ", "ADV"]]
    return tokens

In [125]:
clean_text(text)[:10]

['presenta',
 'exclusiva',
 'propiedad',
 'amplia',
 'soleada',
 'vivienda',
 'metros',
 'cuadrados',
 'comercios',
 'variados']

In [127]:
### Cleaning text
for d in data_v2:
    main_tokens = clean_text(d["desc"])
    d["tokens"] = main_tokens

In [128]:
saving_versions(data_v2, "v2")
len(data_v2)

train_data_ml/houses_for_sale_train_v2.pickle


862

In [8]:
data_v2 = load_versions("v2")
len(data_v2)

862

### Splitting into training and testing
Training data is used to train the model Testing data is used to measure the preformance of the model.

In [9]:
import random
random.shuffle(data_v2)

N = int(len(data_v2)*.8)
print(N)

train = data_v2[:N] 
test = data_v2[N:]

len(train), len(test)

689


(689, 173)

### Features from descriptions
Let's compute the number of documents each word appears in. We do this just with the training data.

In [10]:
from collections import defaultdict

doc_freq = defaultdict(list) 
for i, d in enumerate(train):
    tokens = set(d["tokens"])
    for token in tokens:
        doc_freq[token].append(i)

In [11]:
tokens = train[0]["tokens"]
tokens[:5]

['presenta', 'piso', 'finca', 'ascensor', 'vivienda']

In [19]:
prices = [float(d["price"].replace(" €", "")) for d in train]

In [20]:
prices[:10]

[381.0, 299.0, 495.0, 345.0, 395.0, 295.0, 486.81, 349.0, 350.0, 240.0]

In [22]:
np.mean(prices), np.median(prices)

(346.00156603773587, 350.0)

In [30]:
median_prices = {} 
for word in doc_freq:
    L = len(doc_freq[word])
    if L >= 20 and L < 400:
        price_word = [prices[i] for i in doc_freq[word]]
        median_prices[word] = np.median(price_word)

In [33]:
keywords = []
High = 350 + 35
Low = 350 - 35

for word in median_prices:
    if median_prices[word] > High or median_prices[word] < Low:
        print(word, median_prices[word])
        keywords.append(word)

grandes 393.5
permite 386.027
paredes 394.5
gimnasios 390.0
empotrado 299.9
actual 411.5
estrenar 390.0
ascensores 413.5
suite 410.0
hipotecario 299.9
valoración 299.9
línea 307.0
deseas 307.495
mediana 390.0
parking 400.0
baños 414.0
listo 307.45
interiores 410.0
buenas 410.0
alta 399.0
residencial 395.0
obra 410.0
alto 405.0
conductos 405.0
carácter 388.027
principales 386.0
contactarnos 399.0
proyecto 386.0
múltiples 400.0
puertas 386.5
completos 419.5
obtener 399.5
directo 390.5
central 399.0
office 410.0
registrales 304.995
calidad 399.0
materiales 404.5
vez 395.0
comunitario 310.0
noche 410.0
pequeño 299.9
sur 299.9
elementos 386.027
cerrado 299.9
balcones 390.0
ventanales 406.5
paseo 390.0
despejadas 398.0
parte 395.0
cerramientos 390.0
maravilloso 397.0
vecinos 395.0
comunitaria 299.9
estilo 390.0
cristal 395.0
comercio 395.0
amplios 410.0
vista 399.0
equipado 392.0
privilegiada 387.5
privado 400.0
persianas 412.5
farmacias 310.0
perder 304.5
almacenaje 386.0
multitud 399.0
dan

### Keywords to ids

In [39]:
key_map = {k: i for i, k in enumerate(keywords)}
key_map

{'grandes': 0,
 'permite': 1,
 'paredes': 2,
 'gimnasios': 3,
 'empotrado': 4,
 'actual': 5,
 'estrenar': 6,
 'ascensores': 7,
 'suite': 8,
 'hipotecario': 9,
 'valoración': 10,
 'línea': 11,
 'deseas': 12,
 'mediana': 13,
 'parking': 14,
 'baños': 15,
 'listo': 16,
 'interiores': 17,
 'buenas': 18,
 'alta': 19,
 'residencial': 20,
 'obra': 21,
 'alto': 22,
 'conductos': 23,
 'carácter': 24,
 'principales': 25,
 'contactarnos': 26,
 'proyecto': 27,
 'múltiples': 28,
 'puertas': 29,
 'completos': 30,
 'obtener': 31,
 'directo': 32,
 'central': 33,
 'office': 34,
 'registrales': 35,
 'calidad': 36,
 'materiales': 37,
 'vez': 38,
 'comunitario': 39,
 'noche': 40,
 'pequeño': 41,
 'sur': 42,
 'elementos': 43,
 'cerrado': 44,
 'balcones': 45,
 'ventanales': 46,
 'paseo': 47,
 'despejadas': 48,
 'parte': 49,
 'cerramientos': 50,
 'maravilloso': 51,
 'vecinos': 52,
 'comunitaria': 53,
 'estilo': 54,
 'cristal': 55,
 'comercio': 56,
 'amplios': 57,
 'vista': 58,
 'equipado': 59,
 'privilegiada

### Creating features

In [46]:
def tokens_to_features(tokens, key_map):
    feature = np.zeros(len(key_map))
    for k in tokens:
        if k in key_map:
            feature[key_map[k]] = 1
    return feature

In [None]:
tokens = train[1]["tokens"]

tokens_to_features(tokens, key_map)

#### location features

In [50]:
np.unique([d["loc_string"] for d in train])

array(['Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - El Camp de l´Arpa del Clot', 'Barcelona - El Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample\nVer mapa',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Poblenou\nVer mapa',
       'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

In [54]:
## clean loc
locations = np.unique([d["loc_string"].split("\n")[0] for d in train])
locations

array(['Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - El Camp de l´Arpa del Clot', 'Barcelona - El Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

In [62]:
loc_map = {v:k for k, v in enumerate(locations)}
loc_map

{'Barcelona - Diagonal Mar i el Front Marítim del Poblenou': 0,
 'Barcelona - Dreta de l´Eixample': 1,
 'Barcelona - El Camp de l´Arpa del Clot': 2,
 'Barcelona - El Clot': 3,
 'Barcelona - El Parc i la Llacuna del Poblenou': 4,
 'Barcelona - Fort Pienc': 5,
 'Barcelona - La Nova Esquerra de l´Eixample': 6,
 'Barcelona - La Vila Olímpica del Poblenou': 7,
 'Barcelona - L´Antiga Esquerra de l´Eixample': 8,
 'Barcelona - Navas': 9,
 'Barcelona - Poblenou': 10,
 'Barcelona - Provençals del Poblenou': 11,
 'Barcelona - Sagrada Família': 12,
 'Barcelona - Sant Antoni': 13}

In [63]:
def loc_to_feature(loc, loc_map):
    loc = loc.split("\n")[0]
    return loc_map.get(loc, -1)

In [64]:
loc_to_feature("Other", loc_map)

-1

In [65]:
loc_to_feature('Barcelona - La Nova Esquerra de l´Eixample\nVer mapa', loc_map)

6

In [69]:
train[0]["price"]

'381.000 €'

In [88]:
def features_to_features(features, price):
    m2 = int(features[0].replace(" m2", ""))
    bedrooms = int(features[1].replace(" hab.", ""))
    bathrooms = int(features[2].replace(" baños", "").replace(" baño", ""))
    price = float(price.replace(" €", ""))
    return np.array([m2, bedrooms, bathrooms, price])

In [71]:
features_to_features(train[0]["features"], train[0]["price"])

array([ 85.,   4.,   2., 381.])

In [75]:
# not very useful
from collections import Counter
Counter([d['selltype'] for d in train])

Counter({'SECOND_HAND': 689})

In [89]:
def all_features(d, key_map, loc_map):
    fea = []
    fea.append(tokens_to_features(d["tokens"], key_map))
    fea.append([loc_to_feature(d["loc_string"], loc_map)])
    fea.append(features_to_features(d["features"], d["price"]))
    return np.concatenate(fea)

In [90]:
header = np.array(keywords + ["loc", "size", "bedrooms", "bathrooms", "price"])

In [91]:
fea = all_features(train[0], key_map, loc_map)

In [96]:
train_features = np.stack([all_features(d, key_map, loc_map) for d in train])

In [97]:
test_features = np.stack([all_features(d, key_map, loc_map) for d in test])

In [98]:
len(header)

75

In [99]:
df_train = pd.DataFrame(train_features, columns=header)
df_train.head()

Unnamed: 0,grandes,permite,paredes,gimnasios,empotrado,actual,estrenar,ascensores,suite,hipotecario,...,almacenaje,multitud,dan,originales,aguas,loc,size,bedrooms,bathrooms,price
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,85.0,4.0,2.0,381.0
1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,69.0,3.0,1.0,299.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,117.0,4.0,2.0,495.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,5.0,89.0,4.0,1.0,345.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,13.0,70.0,2.0,2.0,395.0


In [100]:
df_test = pd.DataFrame(test_features, columns=header)

In [101]:
df_train.to_csv("houses_for_sale_train.csv",index=False)
df_test.to_csv("houses_for_sale_test.csv",index=False)