In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
from os.path import expanduser
from collections import Counter

In [3]:
import re
import spacy

Install the following packages: <br>
`pip install -U pip setuptools wheel` <br>
`pip install -U spacy` <br>
`python -m spacy download en_core_web_sm` <br>
`python -m spacy download es_core_news_sm`<br>

### Using re in Python

In [22]:
# \d+ matches numbers
# Searches the entire string for the first occurrence of the pattern and
# returns a match object if found; otherwise, it returns None.
result = re.search(r"\d+", "There are 123 apples")
result.group()

'123'

In [11]:
# Returns a list of all non-overlapping matches of the pattern in the string.
result = re.findall(r"\d+", "There are 123 apples and 456 oranges")
result

['123', '456']

In [10]:
# Replaces all occurrences of the pattern in the string with the replacement text.
result = re.sub(r"\d+", "#", "There are 123 apples and 456 oranges")
result

'There are # apples and # oranges'

In [15]:
desc = """Beautiful 4-bedroom, 3-bathroom house with a large backyard. 
Features include a modern kitchen, hardwood floors, and a finished basement. 
Located in a quiet neighborhood with close access to schools and parks."""

In [17]:
re.search(r"\d-bedroom", desc).group()

'4-bedroom'

In [24]:
bathrooms = re.search(r"\d-bathroom", desc).group()
bathrooms, bathrooms.replace("-bathroom", "")

('3-bathroom', '3')

### Houses for sale

In [25]:
# Load the scraped property listings from ~/data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open this file if it comes from a trusted source.
filename = expanduser("~/data/houses_for_sale.pickle")
with open(filename,'rb') as f:
    data = pickle.load(f, encoding='utf-8')

This dataset was acquired by scraping a website that compiles property listings available for sale in Spain. Our objective is to leverage this data to gain insights into properties that offer a favorable value proposition for potential buyers.

One approach to achieve this is by training a model that can predict the asking price based on various "features" associated with each property listing. We intend to employ models that take a numerical feature vector as input to estimate the price. However, there are several data quality issues that need to be addressed before we can even begin creating features for this dataset:

1. Remove listings that advertise multiple units for sale.
2. Fix the "number of bedrooms" in some listings.
3. Fix the location feature, encoding it as a categorical variable.
4. Generate relevant features from textual information.

In [26]:
len(data)

913

In [27]:
# data is a list of dictionaries
data[0]

{'price': '320.000 €',
 'title': 'Piso Tallers. Piso con 2 habitaciones con ascensor',
 'loc_string': 'Barcelona - Sant Antoni',
 'loc': None,
 'features': ['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 'type': 'FLAT',
 'subtype': 'FLAT',
 'selltype': 'SECOND_HAND',
 'desc': 'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'}

## Removing ads from multiple units
If the first entry of `features` starts with "desde", the ad is advertising multiple units. See, for example, this ad, in which the price is None and the features start with "desde".

In [6]:
data[10]

{'price': None,
 'title': 'PASEO DE GRACIA 30',
 'loc_string': 'Barcelona - Dreta de l´Eixample',
 'loc': None,
 'features': ['desde 70 m2', '15.785,71 €/m2'],
 'type': 'NEW_CONSTRUCTION',
 'subtype': 'NEW_CONSTRUCTION',
 'selltype': 'NEW_CONSTRUCTION',
 'desc': 'Pisos con Piscina comunitaria y Terraza a estrenar en Paseo de Gracia, Barcelona.\n\nEn total, la finca presenta 21 viviendas únicas, una azotea con piscina y zona chill out, un patio vegetal interior, mirador y zonas comunes. Distribuidos en 5 plantas más ático, viviendas de dormitorios, algunos con balcón y otros con agradables terrazas. El exclusivo ático, de un dormitorio y salida a una impresionante terraza con vistas inmejorables, el mejor producto para nuestros clientes más exigentes.\n\nCada tipología de vivienda tiene su personalidad, diferente distribución y distintas superficies, todas construidas con materiales de alta calidad.\n\nTodas las estancias disponen de aire acondicionado y calefacción por conductos, domót

In [7]:
data[10]['features']

['desde 70 m2', '15.785,71 €/m2']

### **Task 1**: 
Make a new list `data_v1` that excludes the ads whose first feature starts with "desde".

In [30]:
assert len(data_v1) == 866

In [31]:
def saving_versions(data, version):
    """Pickle `data` to ~/data/houses_for_sale_train_<version>.pickle.

    data: any picklable object to store.
    version: str Label inserted into the file name, e.g. "v1".

    Prints the path that was written and returns None. The ~/data directory
    must already exist; it is not created here.
    """
    filename = expanduser("~/data")
    filename += "/houses_for_sale_train_{}.pickle".format(version)
    # `with` closes the handle even if pickle.dump raises, so a failed dump
    # cannot leave the file open.
    with open(filename, 'wb') as f:
        pickle.dump(data, f)
    print(filename)
    
def load_versions(version):
    """Read back the pickle stored for `version` under ~/data.

    version: str Label that identifies the file, e.g. "v1".

    Returns:
    The object unpickled from ~/data/houses_for_sale_train_<version>.pickle.
    """
    path = "{}/houses_for_sale_train_{}.pickle".format(expanduser("~/data"), version)
    with open(path, 'rb') as handle:
        return pickle.load(handle, encoding='utf-8')

In [32]:
# saving the new version of the data
saving_versions(data_v1, "v1")
len(data_v1)

/Users/yinterian/data/houses_for_sale_train_v1.pickle


866

### Checking number of rooms feature
and fixing issues

In [12]:
data_v1 = load_versions("v1")
len(data_v1)

866

In [13]:
[d["features"] for d in data_v1[:10]]

[['85 m2', '2 hab.', '1 baño', '3.647 €/m2'],
 ['65 m2', '2 hab.', '1 baño', '5.000 €/m2'],
 ['77 m2', '2 hab.', '1 baño', '4.286 €/m2'],
 ['96 m2', '3 hab.', '2 baños', '4.531 €/m2'],
 ['84 m2', '2 hab.', '1 baño', '4.881 €/m2'],
 ['91 m2', '4 hab.', '2 baños', '4.780 €/m2'],
 ['96 m2', '2 hab.', '2 baños', '4.271 €/m2'],
 ['76 m2', '2 hab.', '1 baño', '4.605 €/m2'],
 ['103 m2', '3 hab.', '2 baños', '4.223 €/m2'],
 ['82 m2', '3 hab.', '1 baño', '2.988 €/m2']]

In [33]:
# see that this feature doesn't have bedrooms (hab.)
data_v1[137]["features"]

['45 m2', '1 baño', '5.689 €/m2']

### **Task 2:** 
Fix the bedroom issue:

['45 m2', '1 baño', '5.689 €/m2'] --> ['45 m2', '0 hab.', '1 baño', '5.689 €/m2']

Fix the bathroom issue:

['52 m2', '3.442 €/m2'] --> ['52 m2', '0 hab.', '1 baño', '3.442 €/m2']

In [35]:
assert len(data_v1[137]["features"]) == 4

In [54]:
saving_versions(data_v1, "v1")

### Finding features in descriptions
Let's tokenize the descriptions and compute in how many documents each word appears.

In [36]:
nlp = spacy.load("es_core_news_sm")

In [53]:
data_v1 = load_versions("v1")
text = data_v1[0]["desc"]
text

'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'

In [40]:
# tokenizing with spacy, you get part of speech (POS) for free
doc = nlp(text)
for token in doc[60:80]:
    print(token.text, token.pos_)

. PUNCT
Balcones NOUN
a ADP
ambos NUM
lados NOUN
de ADP
la DET
vivienda NOUN
, PUNCT
uno PRON
a ADP
patio NOUN
de ADP
manzana NOUN
y CCONJ
los DET
otros DET
dos NUM
a ADP
carrer VERB


### **Task 3**:
Write a function that, given some text, extracts the tokens that have specific parts of speech.

In [52]:
def clean_text(text, pos_list):
    """Return the lowercase tokens of `text` whose part of speech is in `pos_list`.

    The text is tokenized and only the tokens tagged with one of the requested
    parts of speech are kept, converted to lowercase.

    text: str The input text to be processed.
    pos_list: list of str Part-of-speech tags to keep, e.g.
        ["VERB", "NOUN", "ADJ", "ADV"].

    Returns:
    list of str The lowercase tokens whose part-of-speech tag is in `pos_list`.

    NOTE: the body is intentionally left as an exercise (Task 3); `tokens` is
    not assigned here until the placeholder below is filled in.
    """
    ## Your code here
    return tokens

In [51]:
pos_list = ["VERB", "NOUN", "ADJ", "ADV"]
text = "Piso en última planta a reformar en calle Tallers junto a plaza Universitat."
assert clean_text(text, pos_list) == ['piso', 'última', 'planta', 'reformar', 'calle', 'junto']

In [31]:
saving_versions(data_v2, "v2")
len(data_v2)

/Users/yinterian/data/houses_for_sale_train_v2.pickle


862

In [32]:
data_v2 = load_versions("v2")
len(data_v2)

862

### Splitting into training and testing
Training data is used to train the model. Testing data is used to measure the performance of the model.

In [55]:
import random

# Shuffle a copy rather than data_v2 itself: shuffling in place made this cell
# non-idempotent (re-running it reshuffled the already-shuffled list and gave
# a different train/test split). The seed is unchanged, so a first run yields
# the same split as before; data_v2 now keeps its original order.
random.seed(4)
data_shuffled = list(data_v2)
random.shuffle(data_shuffled)

# The first 80% of the shuffled listings are for training, the rest for testing.
N = int(len(data_shuffled)*.8)
print(N)

train = data_shuffled[:N]
test = data_shuffled[N:]

len(train), len(test)

689


(689, 173)

### Task 4: Features from descriptions

1. Create a list of unique tokens from all training data. (`train_tokens`)
2. Create a map from words in train_tokens in which each word gets a unique integer id.
3. For each document create a vector of size len(train_tokens) that counts how many times each token is in the specific document.


In [62]:
train[0]["tokens"][:5]

['nfo', 'presenta', 'precioso', 'piso', 'muy']

### Task 5: Fix Location features
1. Note that some of the location strings have issues; fix them. For example:
   'Barcelona - Dreta de l´Eixample\nVer mapa' --> 'Barcelona - Dreta de l´Eixample'
2. Create a unique numeric id for each location. 


In [44]:
np.unique([d["loc_string"] for d in train])

array(['Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - Dreta de l´Eixample\nVer mapa',
       'Barcelona - El Camp de l´Arpa del Clot', 'Barcelona - El Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample\nVer mapa',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Poblenou\nVer mapa',
       'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

### Task 6: 
Extract numeric values from price, bathrooms, bedrooms, and size (m²). The price is expressed in thousands of euros:
  '450.000 €' --> 450


In [50]:
train[0]["price"]

'450.000 €'

### Task 7:
Create a dataframe with all features like the one below.

In [55]:
header = np.array(keywords + ["loc", "size", "bedrooms", "bathrooms", "price"])

In [54]:
df_train = pd.DataFrame(train_features, columns=header)
df_train.head()

Unnamed: 0,profesional,central,parking,precioso,gratuita,baños,departamento,deseas,hipotecario,directo,...,llave,blanco,comercio,vas,dan,loc,size,bedrooms,bathrooms,price
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,10.0,106.0,4.0,2.0,450.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,13.0,75.0,2.0,2.0,230.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,52.0,2.0,1.0,340.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,60.0,1.0,1.0,213.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,75.0,2.0,2.0,420.0


In [83]:
df_train.to_csv("houses_for_sale_train.csv",index=False)
df_test.to_csv("houses_for_sale_test.csv",index=False)