In [38]:
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
from os.path import expanduser
from collections import Counter

In [2]:
import re

In [3]:
filename = expanduser("~/data/houses_for_sale.pickle")
with open(filename,'rb') as f:
    data = pickle.load(f, encoding='utf-8')

This dataset was acquired by scraping a website that compiles property listings available for sale in Spain. Our objective is to leverage this data to gain insights into properties that offer a favorable value proposition for potential buyers.

One approach to achieve this is by training a model that can predict the asking price based on various "features" associated with each property listing. We intend to employ models that take a numerical feature vector as input to estimate the price. However, there are several data quality issues that need to be addressed before we can even begin creating features for this dataset:

1. Remove listings that advertise multiple units for sale.
2. Fix the "number of bedrooms" in some listings.
3. Generate relevant features from textual information.

In [6]:
len(data)

913

In [7]:
# data is a list of dictionaries
data[0]

{'price': '320.000 €',
 'title': 'Piso Tallers. Piso con 2 habitaciones con ascensor',
 'loc_string': 'Barcelona - Sant Antoni',
 'loc': None,
 'features': ['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 'type': 'FLAT',
 'subtype': 'FLAT',
 'selltype': 'SECOND_HAND',
 'desc': 'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'}

## Removing ads from multiple units
If features[0] starts with "desde" it means that the ad is advertising multiple units. See for example this add in which the price is None and features start with "desde".

In [12]:
#data[10]

In [13]:
data[10]['price']

In [14]:
data[10]['features']

['desde 70 m2', '15.785,71 €/m2']

In [15]:
 bool(re.match("^desde", 'desde 70 m2')),  bool(re.match("^desde", '70 m2'))

(True, False)

In [16]:
data_v1 = [] 
for i, d in enumerate(data):
    features = d["features"]
    if bool(re.match("^desde", features[0])) == False:
        if bool(re.search("\d+ m2", features[0])) == True:
            data_v1.append(d)

In [21]:
def saving_versions(data, version):
    filename = expanduser("~/data")
    filename +=  "houses_for_sale_train_{}.pickle".format(version)
    file = open(filename, 'wb')
    pickle.dump(data, file)
    file.close()
    print(filename)
    
def load_versions(version):
    filename = expanduser("~/data")
    filename += "houses_for_sale_train_{}.pickle".format(version)
    with open(filename,'rb') as f:
        data = pickle.load(f, encoding='utf-8')
    return data

In [19]:
# saving the new version of the data
saving_versions(data_v1, "v1")
len(data_v1)

/Users/yinterian/datahouses_for_sale_train_v1.pickle


866

### Checking number of rooms feature
and fixing issues

In [22]:
data_v1 = load_versions("v1")
len(data_v1)

866

In [24]:
[d["features"] for d in data_v1[:10]]

[['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 ['65 m2', '2 hab.', '1 baño', '5.000 €/m2'],
 ['77 m2', '2 hab.', '1 baño', '4.286 €/m2'],
 ['96 m2', '3 hab.', '2 baños', '4.531 €/m2'],
 ['84 m2', '2 hab.', '1 baño', '4.881 €/m2'],
 ['91 m2', '4 hab.', '2 baños', '4.780 €/m2'],
 ['96 m2', '2 hab.', '2 baños', '4.271 €/m2'],
 ['76 m2', '2 hab.', '1 baño', '4.605 €/m2'],
 ['103 m2', '3 hab.', '2 baños', '4.223 €/m2'],
 ['82 m2', '3 hab.', '1 baño', '2.988 €/m2']]

In [23]:
bool(re.match("\d hab", '3 hab.'))

True

In [25]:
# listing without number of rooms
inds = []
for i, d in enumerate(data_v1):
    features = d["features"]
    if not bool(re.match("\d hab", features[1])):
        print(i, features)
        inds.append(i)

137 ['45 m2', '1 baño', '5.689 €/m2']
186 ['113 m2', '4 baños', '2.389 €/m2']
192 ['93 m2', '1 baño', '3.215 €/m2']
503 ['54 m2', '1 baño', '6.944 €/m2']
537 ['93 m2', '1 baño', '2.151 €/m2']
574 ['102 m2', '1 baño', '3.431 €/m2']
729 ['82 m2', '22 hab.', '1 baño', '4.268 €/m2']
766 ['52 m2', '3.442 €/m2']


In [26]:
inds

[137, 186, 192, 503, 537, 574, 729, 766]

In [27]:
def get_new_feature(fea):
    return [fea[0], "0 hab.", fea[1], fea[-1]]

In [29]:
# let's add "0 hab." This is most likely a studio
get_new_feature(['45 m2', '1 baño', '5.689 €/m2'])

['45 m2', '0 hab.', '1 baño', '5.689 €/m2']

In [30]:
data_v2 = []
for i, d in enumerate(data_v1):
    features = d["features"]
    if not bool(re.match("\d hab", features[1])):
        if len(features) == 3:
            features = get_new_feature(features)
            d["features"] = features
    if len(features) == 4:
        data_v2.append(d)

In [31]:
saving_versions(data_v2, "v2")
len(data_v2)

/Users/yinterian/datahouses_for_sale_train_v2.pickle


862

In [32]:
bool(re.match("\d baño", '1 baño'))

True

In [35]:
# checking for bathrooms
for i, d in enumerate(data_v2):
    features = d["features"]
    if not bool(re.match("\d baño", features[2])):
        print(i, features)

In [36]:
# checking the price
for i, d in enumerate(data_v2):
    if not bool(re.match("\d+\.\d+ €", d["price"])):
        print(i, features)

### Exercise: 
Explore features, 'subtype', 'selltype'. Do you think they would be usedful to our model?  

### Finding features in descriptions
Let's tokenize the descriptions and compute in how many documents each word apeards.

`pip install -U pip setuptools wheel` <br>
`pip install -U spacy` <br>
`python -m spacy download en_core_web_sm` <br>
`python -m spacy download es_core_news_sm`<br>

In [42]:
import spacy
nlp = spacy.load("es_core_news_sm")

In [43]:
text = data_v2[0]["desc"]
text

'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'

In [45]:
# tokenizing with spacy, you get part of speech (POS) for free
doc = nlp(text)
for token in doc[60:80]:
    print(token.text, token.pos_)

. PUNCT
Balcones NOUN
a ADP
ambos NUM
lados NOUN
de ADP
la DET
vivienda NOUN
, PUNCT
uno PRON
a ADP
patio NOUN
de ADP
manzana NOUN
y CCONJ
los DET
otros DET
dos NUM
a ADP
carrer VERB


In [47]:
# you can filter using POS
def clean_text(text):
    doc = nlp(text)
    tokens = [token.text.lower() for token in doc if token.pos_ in ["VERB", "NOUN", "ADJ", "ADV"]]
    return tokens

In [48]:
clean_text(text)[:10]

['piso',
 'última',
 'planta',
 'reformar',
 'calle',
 'junto',
 'propiedad',
 'ofrece',
 'posibilidades',
 'personalizarla']

In [49]:
### Cleaning text
for d in data_v2:
    main_tokens = clean_text(d["desc"])
    d["tokens"] = main_tokens

In [50]:
saving_versions(data_v2, "v2")
len(data_v2)

/Users/yinterian/datahouses_for_sale_train_v2.pickle


862

In [51]:
data_v2 = load_versions("v2")
len(data_v2)

862

### Splitting into training and testing
Training data is used to train the model Testing data is used to measure the preformance of the model.

In [52]:
import random
random.shuffle(data_v2)

N = int(len(data_v2)*.8)
print(N)

train = data_v2[:N] 
test = data_v2[N:]

len(train), len(test)

689


(689, 173)

### Features from descriptions
Let's compute the number of documents each word appears in. We do this just with the training data.

In [53]:
from collections import defaultdict

doc_freq = defaultdict(list) 
for i, d in enumerate(train):
    tokens = set(d["tokens"])
    for token in tokens:
        doc_freq[token].append(i)

In [54]:
tokens = train[0]["tokens"]
tokens[:5]

['oportunidad', 'piso', 'situado', 'muy', 'cercano']

In [55]:
prices = [float(d["price"].replace(" €", "")) for d in train]

In [56]:
prices[:10]

[259.0, 445.0, 250.0, 330.0, 255.0, 400.0, 350.0, 330.0, 480.0, 300.735]

In [57]:
np.mean(prices), np.median(prices)

(343.760793904209, 349.0)

In [58]:
median_prices = {} 
for word in doc_freq:
    L = len(doc_freq[word])
    if L >= 20 and L < 400:
        price_word = [prices[i] for i in doc_freq[word]]
        median_prices[word] = np.median(price_word)

In [59]:
keywords = []
High = 350 + 35
Low = 350 - 35

for word in median_prices:
    if median_prices[word] > High or median_prices[word] < Low:
        print(word, median_prices[word])
        keywords.append(word)

línea 299.0
mano 395.0
suite 409.0
obra 410.0
altos 400.0
interiores 412.0
materiales 385.5
comunitaria 299.9
alta 399.0
principales 386.0
pequeño 299.9
portero 398.0
noche 412.0
privado 405.0
perder 304.5
inmobiliario 284.0
amplitud 399.5
vas 275.0
hipotecario 299.9
gratuita 299.9
valoración 299.9
deseas 299.9
departamento 307.45
grandes 397.0
así 397.0
parte 395.0
ventanales 406.5
paredes 404.5
conductos 405.0
suelo 390.5
buenas 409.0
parking 405.0
frío 388.0
regia 391.0
elementos 395.0
vista 397.0
cristal 398.0
amplios 410.0
espaciosa 400.0
puertas 389.0
sur 299.9
estilo 390.0
balcones 390.0
despejadas 398.0
baños 400.0
completos 405.0
office 410.0
alto 404.5
directo 392.0
calidades 386.027
central 399.5
ascensores 398.0
actual 424.5
almacenaje 411.0
originales 386.054
construido 314.5
oro 299.9
cerrado 299.9
carácter 386.027
registrales 310.0
grava 299.9


### Keywords to ids

In [60]:
key_map = {k: i for i, k in enumerate(keywords)}
key_map

{'línea': 0,
 'mano': 1,
 'suite': 2,
 'obra': 3,
 'altos': 4,
 'interiores': 5,
 'materiales': 6,
 'comunitaria': 7,
 'alta': 8,
 'principales': 9,
 'pequeño': 10,
 'portero': 11,
 'noche': 12,
 'privado': 13,
 'perder': 14,
 'inmobiliario': 15,
 'amplitud': 16,
 'vas': 17,
 'hipotecario': 18,
 'gratuita': 19,
 'valoración': 20,
 'deseas': 21,
 'departamento': 22,
 'grandes': 23,
 'así': 24,
 'parte': 25,
 'ventanales': 26,
 'paredes': 27,
 'conductos': 28,
 'suelo': 29,
 'buenas': 30,
 'parking': 31,
 'frío': 32,
 'regia': 33,
 'elementos': 34,
 'vista': 35,
 'cristal': 36,
 'amplios': 37,
 'espaciosa': 38,
 'puertas': 39,
 'sur': 40,
 'estilo': 41,
 'balcones': 42,
 'despejadas': 43,
 'baños': 44,
 'completos': 45,
 'office': 46,
 'alto': 47,
 'directo': 48,
 'calidades': 49,
 'central': 50,
 'ascensores': 51,
 'actual': 52,
 'almacenaje': 53,
 'originales': 54,
 'construido': 55,
 'oro': 56,
 'cerrado': 57,
 'carácter': 58,
 'registrales': 59,
 'grava': 60}

### Creating features

In [61]:
def tokens_to_features(tokens, key_map):
    feature = np.zeros(len(key_map))
    for k in tokens:
        if k in key_map:
            feature[key_map[k]] = 1
    return feature

In [62]:
tokens = train[1]["tokens"]

tokens_to_features(tokens, key_map)

array([0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

#### location features

In [63]:
np.unique([d["loc_string"] for d in train])

array(['Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - Dreta de l´Eixample\nVer mapa',
       'Barcelona - El Camp de l´Arpa del Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample\nVer mapa',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Poblenou\nVer mapa',
       'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

In [64]:
## clean loc
locations = np.unique([d["loc_string"].split("\n")[0] for d in train])
locations

array(['Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - El Camp de l´Arpa del Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

In [65]:
loc_map = {v:k for k, v in enumerate(locations)}
loc_map

{'Barcelona - Besòs - Maresme': 0,
 'Barcelona - Diagonal Mar i el Front Marítim del Poblenou': 1,
 'Barcelona - Dreta de l´Eixample': 2,
 'Barcelona - El Camp de l´Arpa del Clot': 3,
 'Barcelona - El Parc i la Llacuna del Poblenou': 4,
 'Barcelona - Fort Pienc': 5,
 'Barcelona - La Nova Esquerra de l´Eixample': 6,
 'Barcelona - La Vila Olímpica del Poblenou': 7,
 'Barcelona - L´Antiga Esquerra de l´Eixample': 8,
 'Barcelona - Navas': 9,
 'Barcelona - Poblenou': 10,
 'Barcelona - Provençals del Poblenou': 11,
 'Barcelona - Sagrada Família': 12,
 'Barcelona - Sant Antoni': 13}

In [66]:
def loc_to_feature(loc, loc_map):
    loc = loc.split("\n")[0]
    return loc_map.get(loc, -1)

In [67]:
loc_to_feature("Other", loc_map)

-1

In [68]:
loc_to_feature('Barcelona - La Nova Esquerra de l´Eixample\nVer mapa', loc_map)

6

In [69]:
train[0]["price"]

'259.000 €'

In [70]:
def features_to_features(features, price):
    m2 = int(features[0].replace(" m2", ""))
    bedrooms = int(features[1].replace(" hab.", ""))
    bathrooms = int(features[2].replace(" baños", "").replace(" baño", ""))
    price = float(price.replace(" €", ""))
    return np.array([m2, bedrooms, bathrooms, price])

In [71]:
features_to_features(train[0]["features"], train[0]["price"])

array([ 82.,   3.,   1., 259.])

In [72]:
# not very useful
from collections import Counter
Counter([d['selltype'] for d in train])

Counter({'SECOND_HAND': 689})

In [73]:
def all_features(d, key_map, loc_map):
    fea = []
    fea.append(tokens_to_features(d["tokens"], key_map))
    fea.append([loc_to_feature(d["loc_string"], loc_map)])
    fea.append(features_to_features(d["features"], d["price"]))
    return np.concatenate(fea)

In [74]:
header = np.array(keywords + ["loc", "size", "bedrooms", "bathrooms", "price"])

In [75]:
fea = all_features(train[0], key_map, loc_map)

In [76]:
train_features = np.stack([all_features(d, key_map, loc_map) for d in train])

In [77]:
test_features = np.stack([all_features(d, key_map, loc_map) for d in test])

In [78]:
len(header)

66

In [81]:
df_train = pd.DataFrame(train_features, columns=header)
df_train.head()

Unnamed: 0,línea,mano,suite,obra,altos,interiores,materiales,comunitaria,alta,principales,...,oro,cerrado,carácter,registrales,grava,loc,size,bedrooms,bathrooms,price
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,82.0,3.0,1.0,259.0
1,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,63.0,1.0,1.0,445.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,6.0,81.0,1.0,1.0,250.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,74.0,3.0,2.0,330.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,79.0,4.0,2.0,255.0


In [82]:
df_test = pd.DataFrame(test_features, columns=header)

In [83]:
df_train.to_csv("houses_for_sale_train.csv",index=False)
df_test.to_csv("houses_for_sale_test.csv",index=False)

### Exercise: 
There are many ways to do selecting the words to include in you model (feature selection). Implement the following strategies. Create new data frame based on these new text features.
1. Term Frequency (TF): Calculate the frequency of each word in the corpus and select the top-N words with the highest TF values. This can be a simple way to start feature selection.
2. Inverse Document Frequency (IDF): Calculate the IDF for each word to measure how unique and informative it is across the entire corpus. Select words with high IDF scores.
3. Correlation Coefficient: Compute the correlation coefficient between the presence/absence of each word and the continuous labels. Words with high absolute correlation values may be good features.

In [85]:
# use the train set to compute your list of features
train[0].token

AttributeError: 'dict' object has no attribute 'tokens'