In [22]:
import warnings
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from category_encoders import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

# Import Data

In [23]:
def wrangle(filepath):
    # Read CSV file
    df = pd.read_csv(filepath, encoding='latin-1')

    # Subset data: Apartments in "Capital Federal", less than 400,000
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal")
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area]

    # Split "lat-lon" column
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df.drop(columns="lat-lon", inplace=True)
    
#     extracting the neigbourhood
    df['neighbourhood'] = df['place_with_parent_names'].str.split('|', expand=True)[3]
    df.drop(columns= 'place_with_parent_names', inplace=True)
    

    return df

In [24]:
# putting the files to be used in a list so as to wrangle them all at once
files = glob("data/buenos-aires-real-estate-*.csv")
files
len(files)

5

In [25]:
# looping through the files to convert each file into a dataframe
df_list = []
for file in files:
#     print(file)
    df = wrangle(file)
    print(df.shape)
    df_list.append(df)

(1343, 17)
(1315, 17)
(1288, 17)
(1305, 17)
(1331, 17)


In [26]:
len(df_list)

5

In [27]:
combo_df = pd.concat(df_list, ignore_index=True)
print(combo_df.shape)
combo_df.head()

(6582, 17)


Unnamed: 0,operation,property_type,price,currency,price_aprox_local_currency,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,price_usd_per_m2,price_per_m2,floor,rooms,expenses,properati_url,lat,lon,neighbourhood
0,sell,apartment,129000.0,USD,1955949.6,129000.0,76.0,70.0,1697.368421,1842.857143,,,,http://chacarita.properati.com.ar/10qlv_venta_...,-34.584651,-58.454693,Chacarita
1,sell,apartment,87000.0,USD,1319128.8,87000.0,48.0,42.0,1812.5,2071.428571,,,,http://villa-luro.properati.com.ar/12m82_venta...,-34.638979,-58.500115,Villa Luro
2,sell,apartment,118000.0,USD,1789163.2,118000.0,,54.0,,2185.185185,,2.0,,http://caballito.properati.com.ar/11wqh_venta_...,-34.615847,-58.459957,Caballito
3,sell,apartment,57000.0,USD,864256.8,57000.0,42.0,42.0,1357.142857,1357.142857,5.0,2.0,364.0,http://constitucion.properati.com.ar/k2f0_vent...,-34.625222,-58.382382,Constitución
4,sell,apartment,90000.0,USD,1364616.0,90000.0,57.0,50.0,1578.947368,1800.0,,3.0,450.0,http://once.properati.com.ar/suwa_venta_depart...,-34.61061,-58.412511,Once


# Splitting the data

In [33]:
target = "price_aprox_usd"
features = ["neighbourhood"]
y_train = combo_df[target]
X_train = combo_df[features]

y_train.shape
X_train.shape

(6582, 1)

# Building The Model

In [42]:
y_mean = y_train.mean()
y_mean
y_prediction_baseline = [y_mean] * len(y_train) 
y_prediction_baseline[:7]

mean_abs_error = mean_absolute_error(y_train, y_prediction_baseline)
mean_abs_error

print("Mean apt price:", y_mean)

print("Baseline MAE:", mean_abs_error)

Mean apt price: 132383.83701458527
Baseline MAE: 44860.10834274134


# Iterating the model 

In [46]:
# use one hot encoding
# instantiate
ohe = OneHotEncoder(use_cat_names=True)
# fit
ohe.fit(X_train)
# transform
XT_train = ohe.transform(X_train)
print(XT_train.shape)
XT_train.head()


(6582, 57)


Unnamed: 0,neighbourhood_Chacarita,neighbourhood_Villa Luro,neighbourhood_Caballito,neighbourhood_Constitución,neighbourhood_Once,neighbourhood_Almagro,neighbourhood_Palermo,neighbourhood_Flores,neighbourhood_Belgrano,neighbourhood_Liniers,...,neighbourhood_Puerto Madero,neighbourhood_Agronomía,neighbourhood_Monte Castro,neighbourhood_Tribunales,neighbourhood_Villa Santa Rita,neighbourhood_Velez Sarsfield,neighbourhood_Villa Soldati,neighbourhood_Villa Real,neighbourhood_Pompeya,neighbourhood_Catalinas
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#  Build pipeline

In [48]:
# create pipeline

model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    LinearRegression()

)

# fitting model into the trainng data
model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['neighbourhood'], use_cat_names=True)),
                ('linearregression', LinearRegression())])

#  Evaluate the model

In [50]:
y_prediction_training = model.predict(X_train)


mean_abs_error_pred_training = mean_absolute_error(y_train, y_prediction_training)
mean_abs_error_pred_training

39342.09807201459

In [53]:
# checking the perfomrnace of the model
X_test = pd.read_csv("data/buenos-aires-test-features.csv", encoding='latin-1')[features]
y_pred_test = pd.Series(model.predict(X_test))
y_pred_test.head()

0    249056.0
1    160928.0
2     98048.0
3    110240.0
4    128320.0
dtype: float64

# Communicating the results

In [56]:
# getting the intercept and coefficient of the model
intercept = model.named_steps["linearregression"].intercept_.round(2)
print("Model Intercept:", intercept) 

coefficient = model.named_steps["linearregression"].coef_.round(2)
print('Model coefficient":', coefficient)
print("coefficients len:", len(coefficient))

Model Intercept: 1.6490389780269504e+17
Model coefficient": [-1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17 -1.64903898e+17
 -1.64903898e+17]
coefficients

In [59]:
# extract feature names from one hot encoder
feature_names = model.named_steps['onehotencoder'].get_feature_names_out()
print("features len:", len(feature_names))
print(feature_names[:5])  # First five feature names

features len: 57
['neighbourhood_Chacarita' 'neighbourhood_Villa Luro'
 'neighbourhood_Caballito' 'neighbourhood_Constitución'
 'neighbourhood_Once']


In [61]:
# create panda series
feat_imp = pd.Series(coefficient, index=feature_names )
feat_imp.head()

neighbourhood_Chacarita      -1.649039e+17
neighbourhood_Villa Luro     -1.649039e+17
neighbourhood_Caballito      -1.649039e+17
neighbourhood_Constitución   -1.649039e+17
neighbourhood_Once           -1.649039e+17
dtype: float64