In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
from os.path import expanduser
from collections import Counter

In [3]:
import re
import spacy

Install the following packages: <br>
`pip install -U pip setuptools wheel` <br>
`pip install -U spacy` <br>
`python -m spacy download en_core_web_sm` <br>
`python -m spacy download es_core_news_sm`<br>

### Using re in Python

In [22]:
# \d+ matches numbers
# Searches the entire string for the first occurrence of the pattern and
# returns a match object if found; otherwise, it returns None.
result = re.search(r"\d+", "There are 123 apples")
result.group()

'123'

In [11]:
# Returns a list of all non-overlapping matches of the pattern in the string.
result = re.findall(r"\d+", "There are 123 apples and 456 oranges")
result

['123', '456']

In [10]:
# Replaces all occurrences of the pattern in the string with the replacement text.
result = re.sub(r"\d+", "#", "There are 123 apples and 456 oranges")
result

'There are # apples and # oranges'

In [15]:
desc = """Beautiful 4-bedroom, 3-bathroom house with a large backyard. 
Features include a modern kitchen, hardwood floors, and a finished basement. 
Located in a quiet neighborhood with close access to schools and parks."""

In [17]:
re.search(r"\d-bedroom", desc).group()

'4-bedroom'

In [24]:
bathrooms = re.search(r"\d-bathroom", desc).group()
bathrooms, bathrooms.replace("-bathroom", "")

('3-bathroom', '3')

### Houses for sale

In [25]:
# Load the scraped listings from the user's ~/data directory.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open pickle
# files that come from a trusted source.
filename = expanduser("~/data/houses_for_sale.pickle")
with open(filename,'rb') as f:
    data = pickle.load(f, encoding='utf-8')

This dataset was acquired by scraping a website that compiles property listings available for sale in Spain. Our objective is to leverage this data to gain insights into properties that offer a favorable value proposition for potential buyers.

One approach to achieve this is by training a model that can predict the asking price based on various "features" associated with each property listing. We intend to employ models that take a numerical feature vector as input to estimate the price. However, there are several data quality issues that need to be addressed before we can even begin creating features for this dataset:

1. Remove listings that advertise multiple units for sale.
2. Fix the "number of bedrooms" in some listings.
3. Fix the location feature, encoding it as a categorical variable.
4. Generate relevant features from textual information.

In [26]:
len(data)

913

In [27]:
# data is a list of dictionaries
data[0]

{'price': '320.000 €',
 'title': 'Piso Tallers. Piso con 2 habitaciones con ascensor',
 'loc_string': 'Barcelona - Sant Antoni',
 'loc': None,
 'features': ['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 'type': 'FLAT',
 'subtype': 'FLAT',
 'selltype': 'SECOND_HAND',
 'desc': 'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'}

## Removing ads from multiple units
If the first entry of `features` starts with "desde" ("from"), the ad is advertising multiple units. See, for example, this ad, in which the price is None and the features start with "desde".

In [6]:
data[10]

{'price': None,
 'title': 'PASEO DE GRACIA 30',
 'loc_string': 'Barcelona - Dreta de l´Eixample',
 'loc': None,
 'features': ['desde 70 m2', '15.785,71 €/m2'],
 'type': 'NEW_CONSTRUCTION',
 'subtype': 'NEW_CONSTRUCTION',
 'selltype': 'NEW_CONSTRUCTION',
 'desc': 'Pisos con Piscina comunitaria y Terraza a estrenar en Paseo de Gracia, Barcelona.\n\nEn total, la finca presenta 21 viviendas únicas, una azotea con piscina y zona chill out, un patio vegetal interior, mirador y zonas comunes. Distribuidos en 5 plantas más ático, viviendas de dormitorios, algunos con balcón y otros con agradables terrazas. El exclusivo ático, de un dormitorio y salida a una impresionante terraza con vistas inmejorables, el mejor producto para nuestros clientes más exigentes.\n\nCada tipología de vivienda tiene su personalidad, diferente distribución y distintas superficies, todas construidas con materiales de alta calidad.\n\nTodas las estancias disponen de aire acondicionado y calefacción por conductos, domót

In [7]:
data[10]['features']

['desde 70 m2', '15.785,71 €/m2']

**Task 1**: Make a new list `data_v1` in which ads that have "desde" on the first feature are ignored.

In [30]:
assert len(data_v1) == 866

In [31]:
def saving_versions(data, version):
    """Pickle `data` to ~/data/houses_for_sale_train_<version>.pickle and print the path.

    data: any picklable object (the notebook passes the list of listing dicts).
    version: label inserted into the file name, e.g. "v1".

    Returns None. The path that was written is printed.
    Raises OSError if the ~/data directory does not exist or is not writable.
    """
    filename = expanduser("~/data")
    filename +=  "/houses_for_sale_train_{}.pickle".format(version)
    # The context manager closes the handle even when pickle.dump raises,
    # which the previous manual open()/close() pair did not guarantee.
    with open(filename, 'wb') as file:
        pickle.dump(data, file)
    print(filename)
    
def load_versions(version):
    """Read back the object pickled as ~/data/houses_for_sale_train_<version>.pickle.

    version: label that was used in the file name when the data was saved, e.g. "v1".

    Returns the unpickled object.
    NOTE: unpickling can execute arbitrary code; only load files you wrote yourself.
    """
    path = expanduser("~/data") + "/houses_for_sale_train_{}.pickle".format(version)
    with open(path, 'rb') as handle:
        return pickle.load(handle, encoding='utf-8')

In [32]:
# saving the new version of the data
saving_versions(data_v1, "v1")
len(data_v1)

/Users/yinterian/data/houses_for_sale_train_v1.pickle


866

### Checking number of rooms feature
and fixing issues

In [12]:
data_v1 = load_versions("v1")
len(data_v1)

866

In [13]:
[d["features"] for d in data_v1[:10]]

[['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 ['65 m2', '2 hab.', '1 baño', '5.000 €/m2'],
 ['77 m2', '2 hab.', '1 baño', '4.286 €/m2'],
 ['96 m2', '3 hab.', '2 baños', '4.531 €/m2'],
 ['84 m2', '2 hab.', '1 baño', '4.881 €/m2'],
 ['91 m2', '4 hab.', '2 baños', '4.780 €/m2'],
 ['96 m2', '2 hab.', '2 baños', '4.271 €/m2'],
 ['76 m2', '2 hab.', '1 baño', '4.605 €/m2'],
 ['103 m2', '3 hab.', '2 baños', '4.223 €/m2'],
 ['82 m2', '3 hab.', '1 baño', '2.988 €/m2']]

In [33]:
# see that this feature doesn't have bedrooms (hab.)
data_v1[137]["features"]

['45 m2', '1 baño', '5.689 €/m2']

**Task 2:** 
Fix the bedroom issue:

['45 m2', '1 baño', '5.689 €/m2'] --> ['45 m2', '0 hab.', '1 baño', '5.689 €/m2']

Fix the bathroom issue

['52 m2', '3.442 €/m2'] --> ['52 m2', '0 hab.', '1 baño', '3.442 €/m2']

In [35]:
assert len(data_v1[137]["features"]) == 4

In [54]:
saving_versions(data_v1, "v1")

### Finding features in descriptions
Let's tokenize the descriptions and compute how many documents each word appears in.

In [36]:
nlp = spacy.load("es_core_news_sm")

In [53]:
data_v1 = load_versions("v1")
text = data_v1[0]["desc"]
text

'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'

In [40]:
# tokenizing with spacy, you get part of speech (POS) for free
doc = nlp(text)
for token in doc[60:80]:
    print(token.text, token.pos_)

. PUNCT
Balcones NOUN
a ADP
ambos NUM
lados NOUN
de ADP
la DET
vivienda NOUN
, PUNCT
uno PRON
a ADP
patio NOUN
de ADP
manzana NOUN
y CCONJ
los DET
otros DET
dos NUM
a ADP
carrer VERB


**Task 3**: Write a function that, given some text, extracts the tokens belonging to specific parts of speech.

In [52]:

def clean_text(text, pos_list):
    """Return the lowercase tokens of `text` whose part of speech is in `pos_list`.

    Exercise stub: the body is still to be written. Called as-is, the function
    raises NameError because `tokens` is never assigned.

    Args:
        text: str, the input text to be processed.
        pos_list: collection of part-of-speech tags to keep, e.g.
            ["VERB", "NOUN", "ADJ", "ADV"] (presumably spaCy `token.pos_`
            values -- confirm when implementing).

    Returns:
        list of str: the lowercase tokens, in their original order, whose tag
        is in `pos_list`.
    """
    ## Your code here
    return tokens

In [51]:
pos_list = ["VERB", "NOUN", "ADJ", "ADV"]
text = "Piso en última planta a reformar en calle Tallers junto a plaza Universitat."
assert clean_text(text, pos_list) == ['piso', 'última', 'planta', 'reformar', 'calle', 'junto']

In [31]:
saving_versions(data_v2, "v2")
len(data_v2)

/Users/yinterian/data/houses_for_sale_train_v2.pickle


862

In [32]:
data_v2 = load_versions("v2")
len(data_v2)

862

### Splitting into training and testing
Training data is used to train the model. Testing data is used to measure the performance of the model.

In [33]:
import random

# Shuffle a copy instead of data_v2 itself. An in-place shuffle makes this cell
# non-idempotent: re-running it would reshuffle the already-shuffled list and
# silently produce a different train/test split. With the same seed and the same
# list length, the copy receives exactly the permutation the in-place shuffle gave
# on a first run.
random.seed(4)
shuffled = list(data_v2)
random.shuffle(shuffled)

# 80% of the listings go to training, the remainder to testing.
N = int(len(shuffled)*.8)
print(N)

train = shuffled[:N]
test = shuffled[N:]

len(train), len(test)

689


(689, 173)

### Features from descriptions
Let's compute the number of documents each word appears in. We do this just with the training data.

In [34]:
from collections import defaultdict

# doc_freq maps each token to the list of positions (indices into `train`) of the
# listings whose token list contains it; len(doc_freq[token]) is its document frequency.
doc_freq = defaultdict(list) 
for i, d in enumerate(train):
    # set() drops repeated tokens so each listing index is appended at most once per token
    tokens = set(d["tokens"])
    for token in tokens:
        doc_freq[token].append(i)

In [35]:
tokens = train[0]["tokens"]
tokens[:5]

['nfo', 'presenta', 'precioso', 'piso', 'muy']

In [36]:
# Strip the " €" suffix and parse the rest with float(): '450.000 €' becomes 450.0,
# i.e. the "." thousands separator is read as a decimal point, so the values are in
# thousands of euros.
# NOTE(review): a price with two separators (e.g. '1.200.000 €') would make float()
# raise ValueError, and a None price would fail on .replace -- confirm such listings
# were filtered out before this point.
prices = [float(d["price"].replace(" €", "")) for d in train]

In [37]:
prices[:10]

[450.0, 230.0, 340.0, 213.0, 420.0, 299.0, 460.0, 340.0, 280.0, 399.0]

In [38]:
np.mean(prices), np.median(prices)

(343.678388969521, 349.0)

In [39]:
# Median listing price per word, restricted to words that appear in at least 20 and
# fewer than 400 training listings.
median_prices = {} 
for word in doc_freq:
    # L is the number of training listings containing the word
    L = len(doc_freq[word])
    if L >= 20 and L < 400:
        # doc_freq[word] holds indices into `train`, and `prices` was built from
        # `train` in the same order, so the indices line up
        price_word = [prices[i] for i in doc_freq[word]]
        median_prices[word] = np.median(price_word)

In [40]:
# Select as keywords the words whose median price lies outside the band [Low, High].
keywords = []
# NOTE(review): 350 looks like the rounded training median (np.median(prices) printed
# 349.0) and 35 about 10% of it -- confirm. Both are hard-coded, so they will not
# follow a different train/test split.
High = 350 + 35
Low = 350 - 35

for word in median_prices:
    if median_prices[word] > High or median_prices[word] < Low:
        print(word, median_prices[word])
        keywords.append(word)

hipotecario 299.9
baños 415.0
directo 392.0
deseas 299.9
parking 395.0
precioso 398.0
mediana 386.0
central 399.5
departamento 299.9
profesional 299.9
completos 417.0
gratuita 299.9
línea 305.0
bañera 312.5
empotrados 392.5
empotrado 310.0
desea 299.9
valoración 299.9
suite 425.0
así 399.0
abierta 390.0
estrenar 395.0
grandes 388.5
comunitaria 299.9
pequeño 299.9
ascensores 409.95
trastero 390.0
vistas 387.5
puertas 389.0
office 399.0
armarios 399.0
hipotecaria 309.95
grava 299.9
notariales 312.5
materiales 392.5
tampoco 299.9
registrales 299.99
calidad 400.0
estar 390.0
perder 305.0
barrios 395.0
estancias 392.0
segundo 390.0
gas 390.0
construido 299.945
presentamos 300.0
cristal 395.0
obtener 399.5
radiadores 390.0
€ 305.0
actual 424.0
estilo 390.0
portero 398.0
fachada 390.0
bonito 394.5
maravilloso 399.99
espacios 395.0
alto 405.0
ventanales 410.0
múltiples 395.0
noche 412.0
proyecto 397.0
nuevo 395.0
amplitud 399.5
amplios 415.0
variedad 390.0
privado 412.0
acabados 390.0
alta 399

### Keywords to ids

In [41]:
key_map = {k: i for i, k in enumerate(keywords)}
key_map

{'hipotecario': 0,
 'baños': 1,
 'directo': 2,
 'deseas': 3,
 'parking': 4,
 'precioso': 5,
 'mediana': 6,
 'central': 7,
 'departamento': 8,
 'profesional': 9,
 'completos': 10,
 'gratuita': 11,
 'línea': 12,
 'bañera': 13,
 'empotrados': 14,
 'empotrado': 15,
 'desea': 16,
 'valoración': 17,
 'suite': 18,
 'así': 19,
 'abierta': 20,
 'estrenar': 21,
 'grandes': 22,
 'comunitaria': 23,
 'pequeño': 24,
 'ascensores': 25,
 'trastero': 26,
 'vistas': 27,
 'puertas': 28,
 'office': 29,
 'armarios': 30,
 'hipotecaria': 31,
 'grava': 32,
 'notariales': 33,
 'materiales': 34,
 'tampoco': 35,
 'registrales': 36,
 'calidad': 37,
 'estar': 38,
 'perder': 39,
 'barrios': 40,
 'estancias': 41,
 'segundo': 42,
 'gas': 43,
 'construido': 44,
 'presentamos': 45,
 'cristal': 46,
 'obtener': 47,
 'radiadores': 48,
 '€': 49,
 'actual': 50,
 'estilo': 51,
 'portero': 52,
 'fachada': 53,
 'bonito': 54,
 'maravilloso': 55,
 'espacios': 56,
 'alto': 57,
 'ventanales': 58,
 'múltiples': 59,
 'noche': 60,
 '

### Creating features

In [42]:
def tokens_to_features(tokens, key_map):
    """Encode `tokens` as a 0/1 indicator vector over the keywords in `key_map`.

    tokens: iterable of token strings for one listing.
    key_map: dict mapping keyword -> column index in the output vector.

    Returns a float numpy array of length len(key_map) with a 1 at the index of
    every keyword that occurs in `tokens`; tokens absent from `key_map` are ignored.
    """
    indicator = np.zeros(len(key_map))
    # Collect the column index of every token that is a known keyword
    hit_columns = [key_map[tok] for tok in tokens if tok in key_map]
    for column in hit_columns:
        indicator[column] = 1
    return indicator

In [43]:
tokens = train[1]["tokens"]

tokens_to_features(tokens, key_map)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

### location features
Note that some of the location strings have issues.

In [44]:
np.unique([d["loc_string"] for d in train])

array(['Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - Dreta de l´Eixample\nVer mapa',
       'Barcelona - El Camp de l´Arpa del Clot', 'Barcelona - El Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample\nVer mapa',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Poblenou\nVer mapa',
       'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

In [45]:
## clean loc
# Some loc_string values carry a trailing "\nVer mapa"; keeping only the text before
# the first newline merges them with their clean counterparts. np.unique returns the
# distinct values sorted.
locations = np.unique([d["loc_string"].split("\n")[0] for d in train])
locations

array(['Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - El Camp de l´Arpa del Clot', 'Barcelona - El Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

In [46]:
loc_map = {v:k for k, v in enumerate(locations)}
loc_map

{'Barcelona - Besòs - Maresme': 0,
 'Barcelona - Diagonal Mar i el Front Marítim del Poblenou': 1,
 'Barcelona - Dreta de l´Eixample': 2,
 'Barcelona - El Camp de l´Arpa del Clot': 3,
 'Barcelona - El Clot': 4,
 'Barcelona - El Parc i la Llacuna del Poblenou': 5,
 'Barcelona - Fort Pienc': 6,
 'Barcelona - La Nova Esquerra de l´Eixample': 7,
 'Barcelona - La Vila Olímpica del Poblenou': 8,
 'Barcelona - L´Antiga Esquerra de l´Eixample': 9,
 'Barcelona - Navas': 10,
 'Barcelona - Poblenou': 11,
 'Barcelona - Provençals del Poblenou': 12,
 'Barcelona - Sagrada Família': 13,
 'Barcelona - Sant Antoni': 14}

In [47]:
def loc_to_feature(loc, loc_map):
    """Map a raw location string to its integer id.

    loc: location string; anything from the first newline onwards is ignored.
    loc_map: dict mapping cleaned location name -> integer id.

    Returns the id from `loc_map`, or -1 when the location is not in the map.
    """
    first_line, _, _ = loc.partition("\n")
    return loc_map.get(first_line, -1)

In [48]:
loc_to_feature("Other", loc_map)

-1

In [49]:
loc_to_feature('Barcelona - La Nova Esquerra de l´Eixample\nVer mapa', loc_map)

7

In [50]:
train[0]["price"]

'450.000 €'

In [51]:
def features_to_features(features, price):
    """Turn a listing's feature strings and price string into a numeric vector.

    features: list whose first three entries are the size ("85 m2"), the bedroom
        count ("2 hab.") and the bathroom count ("1 baño" / "2 baños"), in that order.
    price: price string such as "450.000 €".

    Returns np.array([m2, bedrooms, bathrooms, price]). The price is whatever float()
    makes of the text left after removing " €", so "450.000 €" yields 450.0.
    """
    def _as_int(raw, *suffixes):
        # Remove each unit suffix in turn, then parse the remaining digits
        for suffix in suffixes:
            raw = raw.replace(suffix, "")
        return int(raw)

    size = _as_int(features[0], " m2")
    n_bedrooms = _as_int(features[1], " hab.")
    # Plural first, then singular, so both "2 baños" and "1 baño" reduce to the number
    n_bathrooms = _as_int(features[2], " baños", " baño")
    amount = float(price.replace(" €", ""))
    return np.array([size, n_bedrooms, n_bathrooms, amount])

In [52]:
features_to_features(train[0]["features"], train[0]["price"])

array([106.,   4.,   2., 450.])

In [53]:
# not very useful
from collections import Counter
Counter([d['selltype'] for d in train])

Counter({'SECOND_HAND': 689})

In [54]:
def all_features(d, key_map, loc_map):
    """Build the full feature vector for one listing.

    d: listing dict with the keys "tokens", "loc_string", "features" and "price".
    key_map: dict keyword -> column index, passed to tokens_to_features.
    loc_map: dict location name -> id, passed to loc_to_feature.

    Returns a 1-D numpy array: the keyword indicators, then the location id, then
    the values returned by features_to_features (size, bedrooms, bathrooms, price).
    """
    keyword_part = tokens_to_features(d["tokens"], key_map)
    # Wrapped in a list so the single id can be concatenated with the arrays
    location_part = [loc_to_feature(d["loc_string"], loc_map)]
    numeric_part = features_to_features(d["features"], d["price"])
    return np.concatenate([keyword_part, location_part, numeric_part])

In [55]:
# Column names for the feature matrix. The order has to match what all_features
# concatenates: one column per keyword (key_map assigns indices in `keywords` order),
# then the location id, then size, bedrooms, bathrooms and price.
header = np.array(keywords + ["loc", "size", "bedrooms", "bathrooms", "price"])

In [50]:
fea = all_features(train[0], key_map, loc_map)

In [51]:
train_features = np.stack([all_features(d, key_map, loc_map) for d in train])

In [52]:
test_features = np.stack([all_features(d, key_map, loc_map) for d in test])

In [53]:
len(header)

97

In [54]:
df_train = pd.DataFrame(train_features, columns=header)
df_train.head()

Unnamed: 0,profesional,central,parking,precioso,gratuita,baños,departamento,deseas,hipotecario,directo,...,llave,blanco,comercio,vas,dan,loc,size,bedrooms,bathrooms,price
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,10.0,106.0,4.0,2.0,450.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,13.0,75.0,2.0,2.0,230.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,52.0,2.0,1.0,340.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,60.0,1.0,1.0,213.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,75.0,2.0,2.0,420.0


In [82]:
df_test = pd.DataFrame(test_features, columns=header)

In [83]:
df_train.to_csv("houses_for_sale_train.csv",index=False)
df_test.to_csv("houses_for_sale_test.csv",index=False)

### Exercise: 
There are many ways to select the words to include in your model (feature selection). Implement the following strategies, and create a new data frame based on each resulting set of text features.
1. Term Frequency (TF): Calculate the frequency of each word in the corpus and select the top-N words with the highest TF values. This can be a simple way to start feature selection.
2. Inverse Document Frequency (IDF): Calculate the IDF for each word to measure how unique and informative it is across the entire corpus. Select words with high IDF scores.
3. Correlation Coefficient: Compute the correlation coefficient between the presence/absence of each word and the continuous labels. Words with high absolute correlation values may be good features.

If you have time, you can use the tfidf score instead of 0,1 for each feature.

In [88]:
# use the train set to compute your list of features
np.array(train[0]["tokens"])

array(['oportunidad', 'piso', 'situado', 'muy', 'cercano', 'tipo',
       'comercios', 'hospital', 'colegios', 'muy', 'bien', 'comunicado',
       'línea', 'línea', 'parada', 'líneas', 'autobuses', 'vivienda',
       'consta', 'distribuidos', 'baño', 'ducha', 'cocina',
       'independiente', 'comedor', 'piso', 'perfecto', 'actualizar',
       'suelos', 'gres', 'finca', 'dispone', 'ascensor', 'tercero',
       'real', 'altura', 'año', 'gastos', 'honorarios', 'agencia', 'no',
       'incluidos', 'precio'], dtype='<U13')