In [10]:
from datetime import datetime

# Evaluation date: the calendar date on which this notebook is executed.
run_timestamp = datetime.today()
today = run_timestamp.date()

# Report the hard-coded training date next to the evaluation date of this run.
print(f"training date (2024-09-10) vs Evaluation date: ({today})")

training date (2024-09-10) vs Evaluation date: (2024-09-21)


I will extract data from the Mercado Libre API for properties that were listed today.

In [1]:
from utils.training.functions import mysql_conn
from utils.data_collection.functions import get_data
from utils.data_collection.config import ACCESS_TOKEN

# Open a database engine through the project helper.
# NOTE(review): `engine` is not used in this cell; a later cell calls mysql_conn() again.
engine = mysql_conn()

# Collect the listings for date='today' using the configured access token.
# (`acces_token` is the keyword name this call passes to get_data.)
all_results = get_data(
    acces_token=ACCESS_TOKEN,
    date='today',
)

Mataderos
Liniers
Villa Devoto
Villa Urquiza
Villa Lugano
Flores
Villa Luro
Villa Pueyrredón
Caballito
Barracas
Floresta
Villa del Parque
Saavedra
Palermo
Parque Avellaneda
Parque Chacabuco
Monte Castro
Almagro
Boedo
Parque Patricios
Villa Crespo
Núñez
San Telmo
Belgrano
San Cristóbal
Versalles
Villa Real
Velez Sarsfield
La Boca
Colegiales
Santa Rita
Nueva Pompeya
Balvanera
Palermo Soho
Paternal
Constitución
Villa Gral. Mitre
Coghlan
Villa Soldati
Villa Ortúzar
Chacarita
Parque Chas
Palermo Hollywood
Agronomía
Monserrat
Las Cañitas
Barrio Norte
Belgrano R
Recoleta
Villa Riachuelo
Botánico
San Nicolás
Congreso
Palermo Viejo
Palermo Chico
Once
Belgrano Chico
Belgrano C
Retiro
Palermo Nuevo


The data will be filtered to exclude any properties that were already present in the dataset used to train the model.

In [48]:
import pandas as pd
# Convert the collected results into a DataFrame (one row per key of `all_results`)
data = pd.DataFrame.from_dict(all_results, orient='index')

# Get engine connection
engine = mysql_conn()

# Get the ids already stored in the database
query = "SELECT id FROM properties"
id_stored = pd.read_sql(query, engine)

# Keep only the listings whose id is NOT already stored.
# The comparison must use the `id` column: passing the whole DataFrame to
# `isin` iterates over its column labels (just "id"), so no row would ever
# be excluded and training samples would leak into the evaluation set.
# NOTE(review): assumes the stored ids use the same type/format as data.index — confirm.
already_stored = data.index.isin(id_stored['id'])

# Delete duplicates (the same id can be extracted more than once, and the same
# sample can appear under different ids), stamp the evaluation date and reindex.
# The chain builds a fresh frame at each step, so no slice of `data` is mutated.
test = data[~already_stored]
test = (
    test[~test.index.duplicated(keep='first')]
    .drop_duplicates()
    .assign(date=today)
    .reset_index(drop=True)
)

# Sequential surrogate id for the evaluation samples
test['id'] = range(len(test))

The data will be processed through the pipeline to transform it into the format expected by the model, and predictions will be generated accordingly.

In [55]:
from utils.deployment.model_loader import LoadModel, LoadPipeline

# Load the persisted model and preprocessing pipeline
model = LoadModel()
pipeline = LoadPipeline()

# Switch the pipeline's 'density' step to its 'deploy' stage before transforming
density_step = pipeline.named_steps['density']
density_step.stage = 'deploy'

# Run the evaluation data through the pipeline and remove the target column
transformed_test = pipeline.transform(test).drop(columns='price')

# Predict, keeping the transformed frame's index on the predictions
raw_predictions = model.predict(transformed_test)
y_pred = pd.Series(raw_predictions, index=transformed_test.index)

# Real prices restricted to the rows still present after the transformation
# (presumably the pipeline can drop samples — confirm against its steps)
mask = test.index.isin(transformed_test.index)
y_test = test.loc[mask, 'price']

Let's have a look at the metrics.

In [59]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error

# Error metrics of the predictions against the real prices
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Scale to percent BEFORE rounding: rounding first and multiplying afterwards
# can print floating-point artifacts (e.g. round(0.57, 2) * 100 -> 56.99999999999999).
print(f'Mean Absolut  Error (MAE): {mae.round(0)}')
print(f'Mean Absolut Percentage Error (MAPE): {round(mape * 100, 0)}%')
print(f'R^2 Score: {round(r2 * 100, 0)}%')

Mean Absolut  Error (MAE): 26636.0
Mean Absolut Percentage Error (MAPE): 16.0%
R^2 Score: 97.0%


Now, let's load the MAE and MAPE values obtained during training so we can compare them with the evaluation metrics.

In [73]:
import joblib
import os

# Locate the stored training metrics relative to the current working directory
cur_dir = os.getcwd()
residuals_path = os.path.join(cur_dir, 'model', '1', 'model_residual.pkl')

# Load the training MAE and MAPE.
# joblib.load unpickles the file, so it must only be pointed at trusted artifacts.
model_train_residuals = joblib.load(residuals_path)

print("Training MAE and MAPE:")
print(model_train_residuals)

# Pull out the two reference metrics used for the comparison
train_mae = model_train_residuals['mae']
train_mape = model_train_residuals['mape']

Training MAE and MAPE:
{'mae': 40215.0, 'mape': 0.18012691662345948}


In [72]:
# The model passes only when BOTH evaluation metrics are strictly below
# their training counterparts; anything else is reported as degraded.
still_ok = mae < train_mae and mape < train_mape

if not still_ok:
    print(f"The model performance has degraded: test MAE ({mae:.2f}) >= train MAE ({train_mae:.2f}) or test MAPE ({mape:.2f}) >= train MAPE ({train_mape:.2f}).")
else:
    print(f"The model is still predicting correctly with test MAE ({mae:.2f}) < train MAE ({train_mae:.2f}) and test MAPE ({mape:.2f}) < train MAPE ({train_mape:.2f}).")


The model is still predicting correctly with test MAE (26635.55) < train MAE (40215.00) and test MAPE (0.16) < train MAPE (0.18).


The model continues to perform correctly, with a lower MAE than the one obtained during training, along with a lower MAPE and a significantly higher R² score of 97%.

I will also evaluate the MAPE for the same price ranges assessed during training to check if the model continues to follow the same performance trends. This will help us ensure consistency in prediction accuracy across different price segments.

In [85]:
def mape_function(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Computes |y_true - y_pred| / |y_true| element-wise and averages it.
    Both arguments must support element-wise arithmetic and the result must
    expose `.abs()` and `.mean()` (e.g. pandas Series).
    """
    relative_error = (y_true - y_pred) / y_true
    return relative_error.abs().mean()

# Put neighborhood, integer-cast prediction and real price side by side
evaluated_rows = test.loc[mask, ['neighborhood']]
pred_as_int = pd.Series(y_pred, name='pred').astype(int)
test_result = pd.concat([evaluated_rows, pred_as_int, y_test], axis=1)

# Bucket the real prices into fixed ranges
price_edges = [40000, 75000, 105000, 155000, 244407, 5000000]
test_result['price_bins'] = pd.cut(y_test, price_edges)


def _bin_mape(prices):
    """MAPE of one price bin, pairing each price with its prediction by index."""
    return mape_function(prices, test_result.loc[prices.index, 'pred'])


def _bin_share(prices):
    """Fraction of all evaluated samples that fall into one price bin."""
    return len(prices) / len(test_result)


# Per-bin MAPE and share of samples, ordered by price range
test_result.groupby(['price_bins'], observed=False).agg(
    mape=('price', _bin_mape),
    size=('price', _bin_share),
).sort_index()

Unnamed: 0_level_0,mape,size
price_bins,Unnamed: 1_level_1,Unnamed: 2_level_1
"(40000, 75000]",0.40352,0.132159
"(75000, 105000]",0.137918,0.127753
"(105000, 155000]",0.119911,0.255507
"(155000, 244407]",0.124877,0.237885
"(244407, 5000000]",0.129723,0.246696


The model continues to follow the same trend observed during training. For higher-priced properties, the MAPE remains low, indicating good prediction accuracy. However, for lower-priced properties, particularly those under $75,000, the MAPE is significantly higher, suggesting greater variance and less accurate predictions in this price range.






