In [86]:
import numpy as np
import pandas as pd
import pickle
from os.path import expanduser
from collections import Counter

In [87]:
import re
import spacy

In [88]:
def saving_versions(data, version):
    """Pickle ``data`` to ``~/data/houses_for_sale_train_<version>.pickle``.

    data: object Any picklable object to store.
    version: str Tag interpolated into the file name, e.g. "v1".

    Returns:
    None. The destination path is printed once the file has been written.

    The ``~/data`` directory must already exist; an existing file with the
    same version tag is overwritten.
    """
    filename = expanduser("~/data")
    filename += "/houses_for_sale_train_{}.pickle".format(version)
    # The context manager closes the handle even when pickle.dump raises,
    # which the previous manual open()/close() pair did not guarantee.
    with open(filename, 'wb') as file:
        pickle.dump(data, file)
    print(filename)
    
def load_versions(version):
    """Read back the object pickled under the given version tag.

    version: str Tag used in the file name, e.g. "v1".

    Returns:
    The object stored in ``~/data/houses_for_sale_train_<version>.pickle``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    path = expanduser("~/data") + "/houses_for_sale_train_{}.pickle".format(version)
    with open(path, 'rb') as handle:
        return pickle.load(handle, encoding='utf-8')

### Houses for sale

In [89]:
# Load the scraped ads from the pickle file stored in ~/data.
# NOTE(review): pickle.load can run arbitrary code; only open trusted files.
filename = expanduser("~/data/houses_for_sale.pickle")
with open(filename,'rb') as f:
    data = pickle.load(f, encoding='utf-8')

This dataset was acquired by scraping a website that compiles property listings available for sale in Spain. Our objective is to leverage this data to gain insights into properties that offer a favorable value proposition for potential buyers.

One approach to achieve this is by training a model that can predict the asking price based on various "features" associated with each property listing. We intend to employ models that take a numerical feature vector as input to estimate the price. However, there are several data quality issues that need to be addressed before we can even begin creating features for this dataset:

1. Remove listings that advertise multiple units for sale.
2. Fix the "number of bedrooms" in some listings.
3. Fix the location feature, encoding it as a categorical variable.
4. Generate relevant features from textual information.

## Removing ads from multiple units
If the first entry of features starts with "desde", the ad is advertising multiple units. See, for example, the ad below, in which the price is None and the features start with "desde".

In [90]:
# Features of ad 10, an example whose first feature starts with "desde"
data[10]['features']

['desde 70 m2', '15.785,71 €/m2']

In [91]:
# Keep only the ads whose first feature does not start with "desde"
# (those are the multi-unit listings), then show how many are left.
data_v1 = [
    listing for listing in data
    if re.search(r"^desde", listing["features"][0]) is None
]
len(data_v1)

867

In [92]:
# Persist the filtered ads under the "v1" tag and show how many there are
saving_versions(data_v1, "v1")
len(data_v1)

/Users/yinterian/data/houses_for_sale_train_v1.pickle


867

### Checking number of rooms feature
and fixing issues

In [93]:
# Ad 137 is an example whose features list has no bedrooms entry ("hab.")
data_v1[137]["features"]

['45 m2', '1 baño', '5.689 €/m2']

In [94]:
# Three-item feature lists whose second item is not a bedrooms entry get an
# explicit "0 hab." inserted in second position.
for listing in data_v1:
    parts = listing["features"]
    if len(parts) != 3 or re.search(r"hab", parts[1]):
        continue
    first, second, third = parts
    listing["features"] = [first, "0 hab.", second, third]

In [95]:
# Same ad after the fix: "0 hab." now sits in second position
data_v1[137]["features"]

['45 m2', '0 hab.', '1 baño', '5.689 €/m2']

In [12]:
# Save data_v1 again under the "v1" tag (same file name, so it is overwritten)
saving_versions(data_v1, "v1")

/Users/yinterian/data/houses_for_sale_train_v1.pickle


In [96]:
# Remaining issues: print every features list that is not made of four items
# or whose second item is not a bedrooms ("hab") entry.
for ad in data_v1:
    fea = ad["features"]
    # Test the length first so that a list with fewer than two items is
    # reported instead of raising IndexError on fea[1].
    if len(fea) != 4 or not re.search(r"hab", fea[1]):
        print(fea)

['47 m2', '1 hab.', '6.051 €/m2']
['2 hab.', '3 baños']
['63 m2', '2 hab.', '4.524 €/m2']
['52 m2', '3.442 €/m2']
['100 m2', '2 hab.', '5.000 €/m2']


In [14]:
# Number of ads currently held in data_v1
len(data_v1)

867

In [97]:
# Keep only the ads whose first feature mentions "m2", then count them.
data_v2 = [listing for listing in data_v1 if re.search(r"m2", listing["features"][0])]
len(data_v2)

866

In [98]:
# Three-item feature lists whose last item is not a bathrooms entry get a
# default "1 baño" inserted before it; each change is printed (old, new).
for listing in data_v2:
    before = listing["features"]
    if len(before) != 3 or re.search(r"baño", before[2]):
        continue
    after = [before[0], before[1], "1 baño", before[2]]
    listing["features"] = after
    print(before, after)

['47 m2', '1 hab.', '6.051 €/m2'] ['47 m2', '1 hab.', '1 baño', '6.051 €/m2']
['63 m2', '2 hab.', '4.524 €/m2'] ['63 m2', '2 hab.', '1 baño', '4.524 €/m2']
['100 m2', '2 hab.', '5.000 €/m2'] ['100 m2', '2 hab.', '1 baño', '5.000 €/m2']


In [99]:
## Same idea for the two-item case: default bedrooms and bathrooms entries are
## inserted between the two existing items.
for ad in data_v2:
    fea = ad["features"]
    # Match exactly two items. The rebuilt list only reads fea[0] and fea[1],
    # so the former `len(fea) < 4` test would silently drop the third item of
    # a 3-element list and raise IndexError on a 1-element one.
    if len(fea) == 2:
        features = [fea[0], "0 hab.", "1 baño", fea[1]]
        ad["features"] = features
        print(fea, features)

['52 m2', '3.442 €/m2'] ['52 m2', '0 hab.', '1 baño', '3.442 €/m2']


In [100]:
# Persist the cleaned ads under the "v2" tag
saving_versions(data_v2, "v2")

/Users/yinterian/data/houses_for_sale_train_v2.pickle


### Finding features in descriptions
Let's tokenize the descriptions and compute in how many documents each word appears.

In [101]:
# Load the spaCy pipeline named "es_core_news_sm"; clean_text reads this
# module-level `nlp` object.
nlp = spacy.load("es_core_news_sm")

In [102]:
# Reload the ads saved under "v1" (this rebinds data_v1) and take the
# description of the first ad as sample text.
data_v1 = load_versions("v1")
text = data_v1[0]["desc"]
text

'Piso en última planta a reformar en calle Tallers junto a plaza Universitat.\n\nLa propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nLa distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. Balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer Tallers. Suelos originales.\n\nFinca del 1882 con ascensor.\n\nCalle peatonal con encanto en el centro de Barcelona, rodeada de todo tipo de comercios y servicios. Buenas comunicaciones mediante transportes publicos, metro, autobuses y FFCC.'

In [103]:
# Lowercased version of the sample description
text.lower()

'piso en última planta a reformar en calle tallers junto a plaza universitat.\n\nla propiedad ofrece muchas posibilidades para personalizarla según sus necesidades, ya que todas las paredes son tabiques.\n\nla distribución actual cuenta con salón comedor amplio, cocina, dos habitaciones dobles exteriores, dos habitaciones interiores, baño y despensa/trastero. balcones a ambos lados de la vivienda, uno a patio de manzana y los otros dos a carrer tallers. suelos originales.\n\nfinca del 1882 con ascensor.\n\ncalle peatonal con encanto en el centro de barcelona, rodeada de todo tipo de comercios y servicios. buenas comunicaciones mediante transportes publicos, metro, autobuses y ffcc.'

In [104]:
def clean_text(text, pos_list):
    """ Lowercases the text and keeps only tokens with the requested POS tags.

    The lowercased text is parsed with the module-level spaCy pipeline `nlp`.

    text: str The input text to be processed.
    pos_list: collection of str Part-of-speech tags to keep, compared against
        each token's `pos_` (e.g. ["VERB", "NOUN", "ADJ", "ADV"]).

    Returns:
    list of str The text of every kept token, in document order.
    """
    parsed = nlp(text.lower())
    return [word.text for word in parsed if word.pos_ in pos_list]

In [105]:
# Try clean_text on a single sentence, keeping four part-of-speech tags
pos_list = ["VERB", "NOUN", "ADJ", "ADV"]
text = "Piso en última planta a reformar en calle Tallers junto a plaza Universitat."
clean_text(text, pos_list)

['piso', 'última', 'planta', 'reformar', 'calle', 'tallers', 'junto', 'plaza']

In [106]:
# Save data_v2 under the "v2" tag and show how many ads it holds
saving_versions(data_v2, "v2")
len(data_v2)

/Users/yinterian/data/houses_for_sale_train_v2.pickle


866

### Task 1
Using the clean_text function, add a new key tokens to all dictionaries, storing the clean tokens.

In [107]:
# First three tokens kept from the description of the first ad
pos_list = ["VERB", "NOUN", "ADJ", "ADV"]
clean_text(data_v2[0]["desc"], pos_list)[:3]

['piso', 'última', 'planta']

In [108]:
# your code here


In [109]:
# Save data_v2 again under "v2"; once Task 1 has been completed the ads
# carry the new "tokens" key.
saving_versions(data_v2, "v2")
len(data_v2)

/Users/yinterian/data/houses_for_sale_train_v2.pickle


866

In [110]:
# First five tokens stored for ad 10
data_v2[10]["tokens"][:5]

['exterior', 'ascensor', 'finca', 'año', 'consta']

### Splitting into training and testing
Training data is used to train the model. Testing data is used to measure the performance of the model.

In [111]:
import random

# Shuffle a copy rather than data_v2 itself. Shuffling in place made this cell
# non-idempotent: every re-run reshuffled the already shuffled list and so
# produced a different train/test split despite the fixed seed. With the same
# seed and list length the copy receives the same permutation as before.
random.seed(4)
shuffled = list(data_v2)
random.shuffle(shuffled)

# 80% of the ads go to training, the rest to testing
N = int(len(shuffled) * 0.8)
print(N)

train = shuffled[:N]
test = shuffled[N:]

len(train), len(test)

692


(692, 174)

### Task 2: Features from descriptions

1. Compute the frequency of tokens in the training data.
2. Create a list of unique tokens with a frequency of at least 5 from the training data (train_tokens).
3. Create a dictionary (token2id) from the words in train_tokens, where each word is assigned a unique integer ID.
4. Write a function (create_bag_of_words) that, given a list of tokens and the token2id map, creates a bag-of-words representation. The function should return a numpy array of size len(train_tokens), where the element at index i is the number of times the word with ID i appears in the token list.
5. Compute bag-of-words representation for every ad in train and test.

In [112]:
# Starts empty; the exercise code below is expected to fill it with the
# frequency of each token in the training data (Task 2, step 1).
freq_tokens = {}
# your code here


In [113]:
# filter tokens with freq >= 5
# your code here

1651

In [114]:
#tokens2id

In [115]:
def create_bag_of_words(tokens, tokens2id):
    """Count how often each known token occurs in ``tokens``.

    tokens: list of str Tokens of one document.
    tokens2id: dict Maps each vocabulary token to its integer index.

    Returns:
    np.ndarray of floats with length len(tokens2id); the element at index i
    is the number of times the token with id i appears in ``tokens``.
    Tokens that are not in ``tokens2id`` are ignored.
    """
    bag = np.zeros(len(tokens2id))
    for token in tokens:
        idx = tokens2id.get(token)
        # Out-of-vocabulary tokens have no slot in the vector
        if idx is not None:
            bag[idx] += 1
    return bag

In [116]:
# Testing create_bag_of_words on a toy vocabulary: the element-wise comparison
# should be True at every position.
# (toy_uniq_words is not used by the check itself.)
toy_uniq_words = ["apple", "banana", "cherry"]
toy_tok2id = {'apple': 0, 'banana': 1, 'cherry': 2}
toy_tokens = ["a", "apple", "banana", "apple", "bob"]
create_bag_of_words(toy_tokens, toy_tok2id) == np.array([2., 1., 0.])

array([ True,  True,  True])

In [117]:
# adding "bow" key to each ad
# your code here


In [118]:
# Vector stored under the "bow" key of the first test ad
test[0]["bow"]

array([0., 2., 0., ..., 0., 0., 0.])

### Fix Location features
1. Note that some of the location strings have issues that need to be fixed. For example:
   'Barcelona - Dreta de l´Eixample\nVer mapa' --> 'Barcelona - Dreta de l´Eixample'
2. Create a unique numeric id for each location. 


In [119]:
# Distinct raw location strings found in the training ads
np.unique([d["loc_string"] for d in train])

array(['Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - El Camp de l´Arpa del Clot', 'Barcelona - El Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Nova Esquerra de l´Eixample\nVer mapa',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Poblenou\nVer mapa',
       'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

### Task 3:
Use this trick to fix location:

In [120]:
# Keep only the text before the first line break of each location string
# (this drops the trailing "Ver mapa" line) and collect the distinct values.
locations = np.unique([d["loc_string"].split("\n")[0] for d in train])
locations

array(['Barcelona - Besòs - Maresme',
       'Barcelona - Diagonal Mar i el Front Marítim del Poblenou',
       'Barcelona - Dreta de l´Eixample',
       'Barcelona - El Camp de l´Arpa del Clot', 'Barcelona - El Clot',
       'Barcelona - El Parc i la Llacuna del Poblenou',
       'Barcelona - Fort Pienc',
       'Barcelona - La Nova Esquerra de l´Eixample',
       'Barcelona - La Vila Olímpica del Poblenou',
       'Barcelona - L´Antiga Esquerra de l´Eixample', 'Barcelona - Navas',
       'Barcelona - Poblenou', 'Barcelona - Provençals del Poblenou',
       'Barcelona - Sagrada Família', 'Barcelona - Sant Antoni'],
      dtype='<U56')

In [121]:
# Give every cleaned location its position in `locations` as numeric id
loc_map = dict(zip(locations, range(len(locations))))
loc_map

{'Barcelona - Besòs - Maresme': 0,
 'Barcelona - Diagonal Mar i el Front Marítim del Poblenou': 1,
 'Barcelona - Dreta de l´Eixample': 2,
 'Barcelona - El Camp de l´Arpa del Clot': 3,
 'Barcelona - El Clot': 4,
 'Barcelona - El Parc i la Llacuna del Poblenou': 5,
 'Barcelona - Fort Pienc': 6,
 'Barcelona - La Nova Esquerra de l´Eixample': 7,
 'Barcelona - La Vila Olímpica del Poblenou': 8,
 'Barcelona - L´Antiga Esquerra de l´Eixample': 9,
 'Barcelona - Navas': 10,
 'Barcelona - Poblenou': 11,
 'Barcelona - Provençals del Poblenou': 12,
 'Barcelona - Sagrada Família': 13,
 'Barcelona - Sant Antoni': 14}

In [122]:
def loc_to_feature(loc, loc_map):
    """Return the numeric id of a location string.

    loc: str Location string of an ad; anything after the first line break
        (such as a trailing "Ver mapa" line) is ignored.
    loc_map: dict Maps cleaned location strings to integer ids.

    Returns:
    The id stored in ``loc_map``, or -1 if the location is not found.
    """
    # loc_map is keyed by the text before the first line break, so apply the
    # same cleaning before the lookup.
    clean_loc = loc.split("\n")[0]
    return loc_map.get(clean_loc, -1)

    

### Task 4: 
Write a function named extract_from_features. The function returns a NumPy array with numeric values for m², bedrooms, bathrooms and price (in thousands of euros).
  '450.000 €' --> 450


In [123]:
def extract_from_features(features, price):
    """Turn the textual features and asking price of an ad into numbers.

    features: list of str Feature strings in the order size, bedrooms,
        bathrooms, e.g. ['68 m2', '3 hab.', '1 baño', '5.000 €/m2']. Only the
        first three items are read.
    price: str Asking price such as '340.000 €'.

    Returns:
    np.ndarray of four floats: m2, bedrooms, bathrooms and the price in
    thousands of euros, e.g. array([ 68., 3., 1., 340.]).

    Raises:
    ValueError if ``features`` has fewer than three items or one of the
    strings contains no number.
    """
    def leading_number(text):
        # Numbers use "." as thousands separator and "," for decimals
        # (e.g. '450.000 €', '15.785,71 €/m2').
        match = re.search(r"\d[\d.]*(?:,\d+)?", text)
        if match is None:
            raise ValueError("no number found in {!r}".format(text))
        return float(match.group().replace(".", "").replace(",", "."))

    size, bedrooms, bathrooms = (leading_number(item) for item in features[:3])
    # '450.000 €' --> 450: the price is expressed in thousands of euros
    return np.array([size, bedrooms, bathrooms, leading_number(price) / 1000])
    

In [124]:
# Here is an example of input and output.
# The call uses extract_from_features, the name under which the function is
# defined in Task 4 (the previous name, features_to_features, is not defined).
extract_from_features(['68 m2', '3 hab.', '1 baño', '5.000 €/m2'], '340.000 €')

array([ 68.,   3.,   1., 340.])

### Task 5:
Create a dataframe with all features like the one below.

Hint: Create a NumPy array for each observation (ad) by concatenating the bag-of-words vector with the vector of features for all other variables ('loc', 'size', 'bedrooms', 'bathrooms', 'price'). Then, stack all the vectors together to form a matrix. np.hstack will concatenate all arrays horizontally, while np.vstack can stack all observations vertically.

In [125]:
# Column names for the feature matrix: the tokens in train_tokens followed by
# the five non-text columns.
header = np.array(train_tokens + ["loc", "size", "bedrooms", "bathrooms", "price"])
header

array(['vende', 'piso', 'diseño', ..., 'bedrooms', 'bathrooms', 'price'],
      dtype='<U17')

In [126]:
def get_an_observation(ad):
    """Build the numeric feature vector of one ad.

    ad: dict One ad; the keys "bow", "loc_string", "features" and "price"
        are read.

    Returns:
    np.ndarray laid out like `header`: the bag-of-words counts, then the
    location id, size, bedrooms, bathrooms and price.

    Relies on the notebook-level `loc_map`, `loc_to_feature` and
    `extract_from_features`.
    """
    loc_id = loc_to_feature(ad["loc_string"], loc_map)
    # NOTE(review): the key that holds the asking price is not shown anywhere
    # in this notebook; "price" is assumed - confirm against the ad dicts.
    numeric = extract_from_features(ad["features"], ad["price"])
    return np.hstack([ad["bow"], [loc_id], numeric])

get_an_observation(train[0])

array([  1.,   1.,   1., ...,   3.,   1., 340.])

In [127]:
# Length of one observation vector; it has to match len(header) for the
# DataFrames built from these vectors.
len(get_an_observation(train[0]))

1656

In [133]:
# Stack one observation row per training ad into a 2-D array
train_features = np.vstack([get_an_observation(ad) for ad in train])

In [135]:
# Stack one observation row per test ad into a 2-D array
test_features = np.vstack([get_an_observation(ad) for ad in test])

In [134]:
# Wrap the training matrix in a DataFrame with named columns and preview it
df_train = pd.DataFrame(train_features, columns=header)
df_train.head()

Unnamed: 0,vende,piso,diseño,exclusivo,zona,poblenou,dormitorios,baño,tiene,aire,...,facilidad,hall,andreu,importe,interna,loc,size,bedrooms,bathrooms,price
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,11.0,68.0,3.0,1.0,340.0
1,1.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,90.0,2.0,1.0,310.0
2,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,60.0,2.0,1.0,225.0
3,0.0,2.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,63.0,2.0,2.0,460.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,60.0,1.0,1.0,200.0


In [136]:
# Same wrapping for the test matrix
df_test = pd.DataFrame(test_features, columns=header)

In [137]:
# Write both frames as CSV without the index column; the paths are relative,
# so the files land in the current working directory.
df_train.to_csv("houses_for_sale_train.csv",index=False)
df_test.to_csv("houses_for_sale_test.csv",index=False)

**Optional Task**: Create a Gradient Boosting or Random Forest model for the task of predicting price.

See examples here:

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html