In [None]:
import pandas as pd
import numpy as np
from joblib import dump, load
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor  # Exemplo de modelo selecionado


class LazyPredictModel:
    """Train, persist and query a price-regression pipeline on a DataFrame.

    The column configuration (target, categorical and numerical features) is
    fixed in ``__init__``. ``preprocess_and_train`` fits a scikit-learn
    ``Pipeline`` (scaling + one-hot encoding + ``RandomForestRegressor``) and
    stores it on ``self.pipeline`` / ``self.model``.
    """

    def __init__(self, df: pd.DataFrame):
        """Keep a private copy of ``df`` and set the column configuration.

        Args:
            df: Source data. It is copied, so the caller's frame is never
                modified through this object.
        """
        self.df = df.copy()
        self.model = None
        self.pipeline = None

        # Column configuration
        self.target_col = 'price'
        self.categorical_cols = ['neighbourhood_cleansed', 'room_type']
        self.numerical_cols = ['accommodates', 'bathrooms', 'bedrooms', 'beds']

    def preprocess_and_train(self):
        """Split the data, print a LazyRegressor comparison and fit the pipeline.

        Returns:
            dict with keys ``'r2'``, ``'rmse'`` and ``'mae'`` computed on the
            20% test split.

        Raises:
            KeyError: if any configured feature/target column is absent from
                ``self.df``.
        """
        feature_cols = self.categorical_cols + self.numerical_cols

        # Fail early with an explicit list of what is missing instead of the
        # generic pandas "not in index" message.
        required_cols = feature_cols + [self.target_col]
        missing_cols = [col for col in required_cols if col not in self.df.columns]
        if missing_cols:
            raise KeyError(
                f"Missing required column(s) in DataFrame: {missing_cols}"
            )

        # Rows without a target cannot be used for supervised training;
        # scikit-learn regressors reject NaN in y, so drop them up front.
        labelled = self.df.dropna(subset=[self.target_col])

        # Separate features and target
        X = labelled[feature_cols]
        y = labelled[self.target_col]

        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # LazyRegressor is run for comparison only: its result table is
        # printed and does not influence the pipeline fitted below.
        reg = LazyRegressor(verbose=0, ignore_warnings=True, random_state=42)
        models, _ = reg.fit(X_train, X_test, y_train, y_test)
        print(models.head())

        # Example: the model is chosen manually (here RandomForest)
        numeric_transformer = Pipeline([
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline([
            # handle_unknown='ignore' lets prediction accept categories that
            # were not present in the training split.
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

        preprocessor = ColumnTransformer([
            ('num', numeric_transformer, self.numerical_cols),
            ('cat', categorical_transformer, self.categorical_cols)
        ])

        self.pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', RandomForestRegressor(random_state=42))
        ])

        # Fit the pipeline
        self.pipeline.fit(X_train, y_train)
        self.model = self.pipeline  # Same object; this is what save() persists

        # Evaluate on the held-out split
        y_pred = self.pipeline.predict(X_test)
        return {
            'r2': r2_score(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'mae': mean_absolute_error(y_test, y_pred)
        }

    def save(self, path='lazy_model.pkl'):
        """Persist the trained model with joblib.

        If no model has been trained yet nothing is written and a
        ``RuntimeWarning`` is emitted, so the caller is not left believing a
        file exists. When ``path`` is a string/path-like with a directory
        component, that directory is created if needed.

        Args:
            path: Destination passed to ``joblib.dump``.
        """
        # Compare against None explicitly: truthiness of a Pipeline depends on
        # its length, which is not what "has a model" means.
        if self.model is None:
            import warnings
            warnings.warn(
                "No trained model to save; call preprocess_and_train() first.",
                RuntimeWarning,
                stacklevel=2,
            )
            return

        import os
        # Only path-like targets have a parent directory to create; anything
        # else (e.g. an open file object) is handed to joblib untouched.
        if isinstance(path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        dump(self.model, path)

    @staticmethod
    def load(path='lazy_model.pkl'):
        """Load a model previously written by ``save``.

        SECURITY: joblib files are pickle-based; loading one executes code
        embedded in it. Only load files from a trusted source.
        """
        return load(path)

    @staticmethod
    def predict(model, input_data, base_price=100):
        """Predict with ``model`` and compare against a constant reference.

        The returned metrics are NOT a measure of model quality: no true
        target is available here, so the reference vector is simply
        ``base_price`` repeated for every row.

        Args:
            model: Object exposing ``predict(DataFrame)``.
            input_data: A single record as ``dict`` or a DataFrame of records.
            base_price: Constant used as the reference value for the metrics.

        Returns:
            Tuple ``(first_prediction, r2, rmse, mae)``. ``r2`` is NaN when
            fewer than two rows are predicted, because R^2 is undefined there.

        Raises:
            ValueError: if the model returns no predictions.
        """
        if isinstance(input_data, dict):
            input_df = pd.DataFrame([input_data])
        else:
            input_df = input_data.copy()

        pred = model.predict(input_df)
        if len(pred) == 0:
            raise ValueError("input_data produced no predictions")

        # Synthetic reference used because no true value is available
        y_true = np.full(len(pred), base_price)
        y_pred = pred

        # R^2 needs at least two samples; scikit-learn would return NaN and
        # emit UndefinedMetricWarning on every single-record call, so return
        # NaN directly instead.
        if len(pred) < 2:
            r2 = float('nan')
        else:
            r2 = r2_score(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)

        return float(pred[0]), r2, rmse, mae


In [3]:
# Load the dataset from CSV
df = pd.read_csv('airbnb_rio_cleaned_latest.csv')

# Build the model wrapper, train/evaluate it, then persist the fitted pipeline
lazy_model = LazyPredictModel(df)
metrics = lazy_model.preprocess_and_train()
lazy_model.save()

# Report the test-split metrics (header line followed by the metrics dict)
print("Métricas no teste:", metrics, sep="\n")

KeyError: "['neighbourhood'] not in index"

In [None]:
# Import only the PyCaret functions this cell uses instead of a wildcard
# import, so it is clear where each name comes from and nothing else in the
# notebook namespace is silently shadowed.
from pycaret.regression import (
    compare_models,
    evaluate_model,
    predict_model,
    save_model,
    setup,
    tune_model,
)
import pandas as pd

df = pd.read_csv('airbnb_rio_cleaned_latest.csv')

# Columns to keep
categorical_cols = ['neighbourhood_cleansed', 'room_type']
numerical_cols = ['accommodates', 'bathrooms', 'bedrooms', 'beds']
target_col = 'price'

# Restrict the DataFrame to the selected features plus the target
cols_to_keep = categorical_cols + numerical_cols + [target_col]
df_filtered = df[cols_to_keep].copy()

# Keep only rows with a valid target (not null and > 0)
df_filtered = df_filtered[df_filtered[target_col].notna() & (df_filtered[target_col] > 0)]

# Create the experiment from the filtered DataFrame
reg_experiment = setup(
    data=df_filtered,
    target=target_col,
    categorical_features=categorical_cols,
    numeric_features=numerical_cols,
    session_id=42,
    normalize=True,
    transformation=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    verbose=False
)

# Compare models and keep the best one
best_model = compare_models(n_select=1)

# Tune the hyperparameters of the best model
tuned_model = tune_model(best_model)

# Interactive evaluation plots for the tuned model
evaluate_model(tuned_model)

# Score the tuned model. No `data` argument is passed, so PyCaret evaluates
# on the hold-out split created by setup(), not on the whole df_filtered.
predictions = predict_model(tuned_model)

# Save the model (PyCaret appends the .pkl extension)
save_model(tuned_model, 'melhor_modelo_pycaret')


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,138.8269,35206.5229,187.5383,0.3533,0.4702,0.4403,1.04
catboost,CatBoost Regressor,138.9541,35617.4336,188.6254,0.3457,0.4707,0.4372,7.93
lightgbm,Light Gradient Boosting Machine,138.1828,36664.0364,191.3698,0.3264,0.4784,0.4066,0.898
xgboost,Extreme Gradient Boosting,140.2876,36671.2809,191.4201,0.326,0.4772,0.4399,4.423
llar,Lasso Least Angle Regression,145.603,37354.6949,193.1813,0.3137,0.5082,0.4664,0.108
lasso,Lasso Regression,145.603,37354.6953,193.1813,0.3137,0.5082,0.4664,0.358
ridge,Ridge Regression,145.5737,37358.7105,193.192,0.3136,0.5095,0.4655,0.102
lar,Least Angle Regression,145.5724,37358.6003,193.1917,0.3136,0.5095,0.4655,0.112
br,Bayesian Ridge,145.579,37358.3529,193.1909,0.3136,0.5094,0.4655,0.123
en,Elastic Net,148.5867,38170.5964,195.2687,0.299,0.5007,0.4875,0.104


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,143.0316,37651.5062,194.04,0.3364,0.4702,0.432
1,139.9326,36712.1359,191.6041,0.364,0.4634,0.4191
2,141.0543,36734.2785,191.6619,0.375,0.4745,0.439
3,134.4252,32147.5472,179.2974,0.3673,0.4728,0.4612
4,136.739,33858.0497,184.0056,0.3143,0.4625,0.4358
5,141.5215,38175.1409,195.3846,0.3474,0.4751,0.4366
6,132.6061,31257.9885,176.7993,0.4046,0.4576,0.4375
7,137.6968,35069.555,187.2687,0.34,0.4616,0.4235
8,140.4352,35658.2352,188.8339,0.3462,0.4659,0.4356
9,135.6712,33665.5861,183.4818,0.358,0.4776,0.4509


Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,140.153,36768.2586,191.7505,0.3324,0.4728,0.4389


FileNotFoundError: [Errno 2] No such file or directory: 'streamlit/ml/models/melhor_modelo_pycaret.pkl'