In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
from os.path import expanduser
from collections import Counter

In [2]:
import re

Install the following packages: <br>
`pip install -U pip setuptools wheel` <br>
`pip install -U spacy` <br>
`python -m spacy download en_core_web_sm` <br>
`python -m spacy download es_core_news_sm`<br>

In [3]:
# Load the raw listings from the pickle stored in the user's ~/data directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open this file if it comes from a trusted source.
filename = expanduser("~/data/houses_for_sale.pickle")
with open(filename,'rb') as f:
    data = pickle.load(f, encoding='utf-8')

This dataset was acquired by scraping a website that compiles property listings available for sale in Spain. Our objective is to leverage this data to gain insights into properties that offer a favorable value proposition for potential buyers.

One approach to achieve this is by training a model that can predict the asking price based on various "features" associated with each property listing. We intend to employ models that take a numerical feature vector as input to estimate the price. However, there are several data quality issues that need to be addressed before we can even begin creating features for this dataset:

1. Remove listings that advertise multiple units for sale.
2. Fix the "number of bedrooms" in some listings.
3. Fix the location feature, encoding it as a categorical variable.
4. Generate relevant features from textual information.

In [4]:
len(data)

913

In [5]:
# data is a list of dictionaries
data[0]

{'price': '320.000 €',
 'title': 'Piso Tallers. Piso con 2 habitaciones con ascensor',
 'loc_string': 'Barcelona - Sant Antoni',
 'loc': None,
 'features': ['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 'type': 'FLAT',
 'subtype': 'FLAT',
 'selltype': 'SECOND_HAND',
 'desc': 'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'}

## Removing ads from multiple units
If the first entry of `features` starts with "desde" ("from"), the ad is advertising multiple units. See, for example, this ad, in which the price is None and the features start with "desde".

In [5]:
#data[10]

In [6]:
data[10]['price']

In [7]:
data[10]['features']

['desde 70 m2', '15.785,71 €/m2']

In [9]:
 bool(re.match("^desde", 'desde 70 m2')),  bool(re.match("^desde", '70 m2'))

(True, False)

In [10]:
# Keep only single-unit listings whose first feature is a plain surface area.
# Ads for multiple units start their first feature with "desde".
data_v1 = []
for d in data:
    features = d["features"]
    # A listing without any feature cannot be checked (features[0] would raise); drop it.
    if not features:
        continue
    # Raw strings keep "\d" as a regex token instead of an invalid string escape.
    if not re.match(r"^desde", features[0]) and re.search(r"\d+ m2", features[0]):
        data_v1.append(d)

In [56]:
def saving_versions(data, version):
    """Pickle ``data`` to ``~/data/houses_for_sale_train_<version>.pickle``.

    Args:
        data: Any picklable object (the notebook passes a list of listing dicts).
        version: Label inserted into the file name, e.g. ``"v1"``.

    Prints the path that was written. The ``~/data`` directory must already
    exist; ``open`` raises otherwise.
    """
    filename = expanduser("~/data")
    filename += "/houses_for_sale_train_{}.pickle".format(version)
    # The context manager closes the handle even if pickling fails part-way.
    with open(filename, 'wb') as file:
        pickle.dump(data, file)
    print(filename)
    
def load_versions(version):
    """Read back the object stored in ``~/data/houses_for_sale_train_<version>.pickle``.

    Args:
        version: Label that is part of the file name, e.g. ``"v1"``.

    Returns:
        The unpickled object.
    """
    path = "{}/houses_for_sale_train_{}.pickle".format(expanduser("~/data"), version)
    with open(path, 'rb') as handle:
        return pickle.load(handle, encoding='utf-8')

In [12]:
# saving the new version of the data
saving_versions(data_v1, "v1")
len(data_v1)

/Users/yinterian/datahouses_for_sale_train_v1.pickle


866

### Checking number of rooms feature
and fixing issues

In [13]:
data_v1 = load_versions("v1")
len(data_v1)

866

In [14]:
[d["features"] for d in data_v1[:10]]

[['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 ['65 m2', '2 hab.', '1 baño', '5.000 €/m2'],
 ['77 m2', '2 hab.', '1 baño', '4.286 €/m2'],
 ['96 m2', '3 hab.', '2 baños', '4.531 €/m2'],
 ['84 m2', '2 hab.', '1 baño', '4.881 €/m2'],
 ['91 m2', '4 hab.', '2 baños', '4.780 €/m2'],
 ['96 m2', '2 hab.', '2 baños', '4.271 €/m2'],
 ['76 m2', '2 hab.', '1 baño', '4.605 €/m2'],
 ['103 m2', '3 hab.', '2 baños', '4.223 €/m2'],
 ['82 m2', '3 hab.', '1 baño', '2.988 €/m2']]

In [15]:
# Raw string: "\d" must reach the regex engine, not be parsed as a string escape.
bool(re.match(r"\d hab", '3 hab.'))

True

In [16]:
# listing without number of rooms
# re.match anchors at the start of the string and the pattern allows a single
# digit, so a two-digit count such as "22 hab." is reported here as well.
inds = []
for i, d in enumerate(data_v1):
    features = d["features"]
    # A listing with fewer than two features has no room entry at all.
    if len(features) < 2 or not re.match(r"\d hab", features[1]):
        print(i, features)
        inds.append(i)

137 ['45 m2', '1 baño', '5.689 €/m2']
186 ['113 m2', '4 baños', '2.389 €/m2']
192 ['93 m2', '1 baño', '3.215 €/m2']
503 ['54 m2', '1 baño', '6.944 €/m2']
537 ['93 m2', '1 baño', '2.151 €/m2']
574 ['102 m2', '1 baño', '3.431 €/m2']
729 ['82 m2', '22 hab.', '1 baño', '4.268 €/m2']
766 ['52 m2', '3.442 €/m2']


In [26]:
inds

[137, 186, 192, 503, 537, 574, 729, 766]

In [17]:
def get_new_feature(fea):
    """Return a four-item feature list with a "0 hab." placeholder in second position.

    The result is made of the first item of ``fea``, the placeholder, the second
    item of ``fea`` and the last item of ``fea``.
    """
    patched = [fea[0], "0 hab."]
    patched.extend((fea[1], fea[-1]))
    return patched

In [18]:
# let's add "0 hab." This is most likely a studio
get_new_feature(['45 m2', '1 baño', '5.689 €/m2'])

['45 m2', '0 hab.', '1 baño', '5.689 €/m2']

In [19]:
# Build v2: three-item listings that lack a bedroom count get a "0 hab."
# placeholder, and only listings with exactly four features are kept.
data_v2 = []
for i, d in enumerate(data_v1):
    features = d["features"]
    # Check the length first so features[1] is only read when it exists.
    if len(features) == 3 and not re.match(r"\d hab", features[1]):
        # Work on a copy so the dicts held by data_v1 are not modified.
        d = dict(d, features=get_new_feature(features))
        features = d["features"]
    if len(features) == 4:
        data_v2.append(d)

In [20]:
saving_versions(data_v2, "v2")
len(data_v2)

/Users/yinterian/datahouses_for_sale_train_v2.pickle


862

In [19]:
# Raw string: "\d" must reach the regex engine, not be parsed as a string escape.
bool(re.match(r"\d baño", '1 baño'))

True

In [20]:
# checking for bathrooms: print every listing whose third feature does not
# start with "<digit> baño"
for i, d in enumerate(data_v2):
    features = d["features"]
    if not re.match(r"\d baño", features[2]):
        print(i, features)

In [21]:
# checking the price: print every listing whose price does not start with
# "<digits>.<digits> €"
for i, d in enumerate(data_v2):
    price = d["price"]
    # A missing (non-string) price is reported instead of crashing re.match.
    if not isinstance(price, str) or not re.match(r"\d+\.\d+ €", price):
        # Report the current listing, not the `features` variable left over
        # from an earlier loop.
        print(i, price, d["features"])

### Exercise: 
Explore features, 'subtype', 'selltype'. Do you think they would be useful to our model?  

### Finding features in descriptions
Let's tokenize the descriptions and compute in how many documents each word appears.

In [21]:
import spacy
nlp = spacy.load("es_core_news_sm")

In [22]:
text = data_v2[0]["desc"]
text

'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'

In [23]:
# tokenizing with spacy, you get part of speech (POS) for free
doc = nlp(text)
for token in doc[60:80]:
    print(token.text, token.pos_)

. PUNCT
Balcones NOUN
a ADP
ambos NUM
lados NOUN
de ADP
la DET
vivienda NOUN
, PUNCT
uno PRON
a ADP
patio NOUN
de ADP
manzana NOUN
y CCONJ
los DET
otros DET
dos NUM
a ADP
carrer VERB


In [24]:
# Part-of-speech tags that are kept when cleaning a description.
pos_list = ["VERB", "NOUN", "ADJ", "ADV"]
def clean_text(text):
    """Run ``text`` through ``nlp`` and return the lower-cased kept tokens.

    A token is kept when its ``pos_`` tag is listed in ``pos_list``; the
    original token order is preserved.
    """
    kept = []
    for tok in nlp(text):
        if tok.pos_ in pos_list:
            kept.append(tok.text.lower())
    return kept

In [25]:
clean_text(text)[:10]

['piso',
 'última',
 'planta',
 'reformar',
 'calle',
 'junto',
 'propiedad',
 'ofrece',
 'posibilidades',
 'personalizarla']

In [26]:
### Cleaning text
# Store the POS-filtered, lower-cased tokens of every description under a new
# "tokens" key. This modifies the dicts inside data_v2 in place.
for d in data_v2:
    main_tokens = clean_text(d["desc"])
    d["tokens"] = main_tokens

In [27]:
saving_versions(data_v2, "v2")
len(data_v2)

/Users/yinterian/datahouses_for_sale_train_v2.pickle


862

In [28]:
data_v2 = load_versions("v2")
len(data_v2)

862

### Splitting into training and testing
Training data is used to train the model. Testing data is used to measure the performance of the model.

In [29]:
import random

# Shuffle a copy with a dedicated generator seeded with 4. The permutation is
# the same one random.seed(4) + random.shuffle would produce, but data_v2 keeps
# its order, so re-running this cell always yields the same split.
shuffled = list(data_v2)
random.Random(4).shuffle(shuffled)

# 80% of the listings go to training, the rest to testing.
N = int(len(shuffled)*.8)
print(N)

train = shuffled[:N]
test = shuffled[N:]

len(train), len(test)

689


(689, 173)

### Features from descriptions
Let's compute the number of documents each word appears in. We do this just with the training data.

In [30]:
from collections import defaultdict

# Map each token to the indices of the training listings that contain it.
# Going through a set counts a listing once per token, however often it repeats.
doc_freq = defaultdict(list)
for i, d in enumerate(train):
    for token in set(d["tokens"]):
        doc_freq[token].append(i)

In [31]:
tokens = train[0]["tokens"]
tokens[:5]

['nfo', 'presenta', 'precioso', 'piso', 'muy']

In [32]:
# Prices look like "450.000 €", where "." is a thousands separator. Remove the
# separators and rescale so values stay in thousands of euros (450.0), and a
# price of a million or more ("1.250.000 €") parses instead of raising ValueError.
prices = [int(d["price"].replace(" €", "").replace(".", "")) / 1000 for d in train]

In [33]:
prices[:10]

[450.0, 230.0, 340.0, 213.0, 420.0, 299.0, 460.0, 340.0, 280.0, 399.0]

In [34]:
np.mean(prices), np.median(prices)

(343.678388969521, 349.0)

In [34]:
# Median training price of the listings containing each word, restricted to
# words that appear in at least 20 and fewer than 400 training listings.
median_prices = {}
for word, doc_ids in doc_freq.items():
    if 20 <= len(doc_ids) < 400:
        median_prices[word] = np.median([prices[i] for i in doc_ids])

In [35]:
# Band of +/- 35 around 350, a value close to the median training price.
High = 350 + 35
Low = 350 - 35

# A word becomes a keyword when the median price of the listings that contain
# it falls outside the [Low, High] band.
keywords = []
for word, word_median in median_prices.items():
    if word_median < Low or word_median > High:
        print(word, word_median)
        keywords.append(word)

profesional 299.9
central 399.5
parking 395.0
precioso 398.0
gratuita 299.9
baños 415.0
departamento 299.9
deseas 299.9
hipotecario 299.9
directo 392.0
completos 417.0
mediana 386.0
línea 305.0
empotrados 392.5
bañera 312.5
empotrado 310.0
valoración 299.9
desea 299.9
estrenar 395.0
suite 425.0
abierta 390.0
así 399.0
grandes 388.5
trastero 390.0
vistas 387.5
ascensores 409.95
pequeño 299.9
comunitaria 299.9
puertas 389.0
armarios 399.0
office 399.0
grava 299.9
materiales 392.5
hipotecaria 309.95
registrales 299.99
notariales 312.5
calidad 400.0
tampoco 299.9
estar 390.0
segundo 390.0
perder 305.0
barrios 395.0
estancias 392.0
gas 390.0
construido 299.945
presentamos 300.0
cristal 395.0
obtener 399.5
actual 424.0
€ 305.0
radiadores 390.0
bonito 394.5
fachada 390.0
estilo 390.0
portero 398.0
variedad 390.0
espacios 395.0
nuevo 395.0
maravilloso 399.99
alto 405.0
múltiples 395.0
amplitud 399.5
noche 412.0
ventanales 410.0
proyecto 397.0
amplios 415.0
privado 412.0
alta 399.0
acabados 390

### Keywords to ids

In [36]:
# Give every keyword a column index that follows its position in `keywords`.
key_map = dict(zip(keywords, range(len(keywords))))
key_map

{'profesional': 0,
 'central': 1,
 'parking': 2,
 'precioso': 3,
 'gratuita': 4,
 'baños': 5,
 'departamento': 6,
 'deseas': 7,
 'hipotecario': 8,
 'directo': 9,
 'completos': 10,
 'mediana': 11,
 'línea': 12,
 'empotrados': 13,
 'bañera': 14,
 'empotrado': 15,
 'valoración': 16,
 'desea': 17,
 'estrenar': 18,
 'suite': 19,
 'abierta': 20,
 'así': 21,
 'grandes': 22,
 'trastero': 23,
 'vistas': 24,
 'ascensores': 25,
 'pequeño': 26,
 'comunitaria': 27,
 'puertas': 28,
 'armarios': 29,
 'office': 30,
 'grava': 31,
 'materiales': 32,
 'hipotecaria': 33,
 'registrales': 34,
 'notariales': 35,
 'calidad': 36,
 'tampoco': 37,
 'estar': 38,
 'segundo': 39,
 'perder': 40,
 'barrios': 41,
 'estancias': 42,
 'gas': 43,
 'construido': 44,
 'presentamos': 45,
 'cristal': 46,
 'obtener': 47,
 'actual': 48,
 '€': 49,
 'radiadores': 50,
 'bonito': 51,
 'fachada': 52,
 'estilo': 53,
 'portero': 54,
 'variedad': 55,
 'espacios': 56,
 'nuevo': 57,
 'maravilloso': 58,
 'alto': 59,
 'múltiples': 60,
 'am

### Creating features

In [37]:
def tokens_to_features(tokens, key_map):
    """Encode ``tokens`` as a 0/1 vector with one slot per entry of ``key_map``.

    Args:
        tokens: Iterable of tokens.
        key_map: Dict mapping a keyword to its position in the vector.

    Returns:
        A float array of length ``len(key_map)`` holding 1 at the position of
        every keyword present in ``tokens`` and 0 elsewhere.
    """
    indicator = np.zeros(len(key_map))
    hits = [key_map[tok] for tok in tokens if tok in key_map]
    if hits:
        indicator[hits] = 1
    return indicator

In [38]:
tokens = train[1]["tokens"]

tokens_to_features(tokens, key_map)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

### location features
Note that some of the location strings have issues.

In [39]:
np.unique([d["loc_string"] for d in train])

array(['Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - Dreta de l´Eixample\nVer mapa',
       'Barcelona - El Camp de l´Arpa del Clot', 'Barcelona - El Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample\nVer mapa',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Poblenou\nVer mapa',
       'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

In [40]:
## clean loc
# Keep only the text before the first newline, which drops the "\nVer mapa"
# suffix seen on some location strings, then collect the distinct names from
# the training set.
locations = np.unique([d["loc_string"].split("\n")[0] for d in train])
locations

array(['Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - El Camp de l´Arpa del Clot', 'Barcelona - El Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

In [41]:
# Give every cleaned location name an integer id equal to its position in `locations`.
loc_map = {name: idx for idx, name in enumerate(locations)}
loc_map

{'Barcelona - Besòs - Maresme': 0,
 'Barcelona - Diagonal Mar i el Front Marítim del Poblenou': 1,
 'Barcelona - Dreta de l´Eixample': 2,
 'Barcelona - El Camp de l´Arpa del Clot': 3,
 'Barcelona - El Clot': 4,
 'Barcelona - El Parc i la Llacuna del Poblenou': 5,
 'Barcelona - Fort Pienc': 6,
 'Barcelona - La Nova Esquerra de l´Eixample': 7,
 'Barcelona - La Vila Olímpica del Poblenou': 8,
 'Barcelona - L´Antiga Esquerra de l´Eixample': 9,
 'Barcelona - Navas': 10,
 'Barcelona - Poblenou': 11,
 'Barcelona - Provençals del Poblenou': 12,
 'Barcelona - Sagrada Família': 13,
 'Barcelona - Sant Antoni': 14}

In [42]:
def loc_to_feature(loc, loc_map):
    """Return the id of a location string, or -1 when it is not in ``loc_map``.

    Only the part of ``loc`` before the first newline is looked up.
    """
    first_line, _, _ = loc.partition("\n")
    if first_line in loc_map:
        return loc_map[first_line]
    return -1

In [44]:
loc_to_feature("Other", loc_map)

-1

In [43]:
loc_to_feature('Barcelona - La Nova Esquerra de l´Eixample\nVer mapa', loc_map)

7

In [44]:
train[0]["price"]

'450.000 €'

In [45]:
def features_to_features(features, price):
    """Convert a listing's feature strings and asking price to a numeric vector.

    Args:
        features: Strings such as ``['85 m2', '2 hab.', '1 baño', '3.647 €/m2']``;
            only the first three entries are read.
        price: Asking price such as ``'450.000 €'``, with "." as thousands separator.

    Returns:
        ``np.array([m2, bedrooms, bathrooms, price])`` where the price is expressed
        in thousands of euros (``'450.000 €'`` -> ``450.0``).

    Raises:
        ValueError: if a field is not a whole number once its unit is stripped.
    """
    # "." is a thousands separator in these strings, so it is removed before
    # parsing; this lets values such as "1.200 m2" parse instead of raising.
    m2 = int(features[0].replace(" m2", "").replace(".", ""))
    bedrooms = int(features[1].replace(" hab.", ""))
    bathrooms = int(features[2].replace(" baños", "").replace(" baño", ""))
    # Parse the full euro amount, then rescale to thousands of euros. This gives
    # the same value as before for "ddd.ddd €" and also handles "1.250.000 €".
    price = int(price.replace(" €", "").replace(".", "")) / 1000
    return np.array([m2, bedrooms, bathrooms, price])

In [46]:
features_to_features(train[0]["features"], train[0]["price"])

array([106.,   4.,   2., 450.])

In [47]:
# not very useful
from collections import Counter
Counter([d['selltype'] for d in train])

Counter({'SECOND_HAND': 689})

In [48]:
def all_features(d, key_map, loc_map):
    """Build the full numeric feature vector of one listing.

    The vector is the concatenation of the keyword indicators computed from
    ``d["tokens"]``, the location id computed from ``d["loc_string"]``, and the
    values returned by ``features_to_features`` for ``d["features"]`` and
    ``d["price"]``.
    """
    keyword_part = tokens_to_features(d["tokens"], key_map)
    loc_part = [loc_to_feature(d["loc_string"], loc_map)]
    numeric_part = features_to_features(d["features"], d["price"])
    return np.concatenate([keyword_part, loc_part, numeric_part])

In [49]:
# Column names, in the order all_features concatenates its parts: one column
# per keyword, then the location id, then size, bedrooms, bathrooms and price.
header = np.array(keywords + ["loc", "size", "bedrooms", "bathrooms", "price"])

In [50]:
fea = all_features(train[0], key_map, loc_map)

In [51]:
# One row per training listing, one column per feature.
train_features = np.stack([all_features(d, key_map, loc_map) for d in train])

In [52]:
# The test rows reuse key_map and loc_map, which were built from the training
# data only; a test location missing from loc_map is encoded as -1.
test_features = np.stack([all_features(d, key_map, loc_map) for d in test])

In [53]:
len(header)

97

In [54]:
# Wrap the training matrix in a DataFrame labelled with the feature names.
df_train = pd.DataFrame(train_features, columns=header)
df_train.head()

Unnamed: 0,profesional,central,parking,precioso,gratuita,baños,departamento,deseas,hipotecario,directo,...,llave,blanco,comercio,vas,dan,loc,size,bedrooms,bathrooms,price
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,10.0,106.0,4.0,2.0,450.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,13.0,75.0,2.0,2.0,230.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,52.0,2.0,1.0,340.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,60.0,1.0,1.0,213.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,75.0,2.0,2.0,420.0


In [82]:
# Same column layout as df_train, built from the test matrix.
df_test = pd.DataFrame(test_features, columns=header)

In [83]:
# Write both splits to the current working directory; index=False leaves the
# row index out of the CSV files.
df_train.to_csv("houses_for_sale_train.csv",index=False)
df_test.to_csv("houses_for_sale_test.csv",index=False)

### Exercise: 
There are many ways to select the words to include in your model (feature selection). Implement the following strategies. Create a new data frame based on these new text features.
1. Term Frequency (TF): Calculate the frequency of each word in the corpus and select the top-N words with the highest TF values. This can be a simple way to start feature selection.
2. Inverse Document Frequency (IDF): Calculate the IDF for each word to measure how unique and informative it is across the entire corpus. Select words with high IDF scores.
3. Correlation Coefficient: Compute the correlation coefficient between the presence/absence of each word and the continuous labels. Words with high absolute correlation values may be good features.

If you have time, you can use the tfidf score instead of 0,1 for each feature.

In [88]:
# use the train set to compute your list of features
np.array(train[0]["tokens"])

array(['oportunidad', 'piso', 'situado', 'muy', 'cercano', 'tipo',
       'comercios', 'hospital', 'colegios', 'muy', 'bien', 'comunicado',
       'línea', 'línea', 'parada', 'líneas', 'autobuses', 'vivienda',
       'consta', 'distribuidos', 'baño', 'ducha', 'cocina',
       'independiente', 'comedor', 'piso', 'perfecto', 'actualizar',
       'suelos', 'gres', 'finca', 'dispone', 'ascensor', 'tercero',
       'real', 'altura', 'año', 'gastos', 'honorarios', 'agencia', 'no',
       'incluidos', 'precio'], dtype='<U13')