# Desenvolvimento do modelo de predição de preço

Exploração inicial de modelos de base

In [5]:
import pandas as pd
from dagshub.data_engine import datasources
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
import mlflow.sklearn
import mlflow.catboost
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import mlflow.models.signature
from mlflow.models import infer_signature
from catboost import CatBoostRegressor

## Carregando Dataset

In [6]:
# Fetch the DagsHub Data Engine datasource identified by "processed" in the project repository.
ds = datasources.get_datasource(
    "wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance",
    "processed",
)

In [7]:
# Display every datapoint of the datasource as a DataFrame.
ds.all().dataframe

Output()

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,laptop-price-brl-processed.csv,103596624,https://dagshub.com/api/v1/repos/wagnerdataset...,text/plain,54135


In [8]:
# Resolve the download URL of the dataset from the datasource preview.
res = ds.head()

# When several datapoints are returned, the last one's URL is the one kept.
dataset_url = None
for dp in res:
    dataset_url = dp.download_url

# Fail early with a clear message instead of leaving the URL unresolved,
# which would only surface later when the CSV is read.
if dataset_url is None:
    raise ValueError("Datasource returned no datapoints; cannot resolve the dataset URL.")

Output()

In [9]:
# Show the resolved download URL of the processed CSV.
dataset_url

'https://dagshub.com/api/v1/repos/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance/raw/main/data/processed/laptop-price-brl-processed.csv'

In [10]:
# Read the processed dataset straight from its download URL and preview the first rows.
df = pd.read_csv(filepath_or_buffer=dataset_url)
df.head(n=5)

Unnamed: 0,ram_gb,ssd,hdd,graphic_card_gb,warranty,price,brand_asus,brand_dell,brand_hp,brand_lenovo,...,ram_type_other,os_other,os_windows,os_bit_32-bit,os_bit_64-bit,weight_casual,weight_gaming,weight_thinnlight,touchscreen_0,touchscreen_1
0,4,0,1024,0,0,2321,1,0,0,0,...,0,0,1,0,1,1,0,0,1,0
1,4,0,1024,0,0,2613,0,0,0,1,...,0,0,1,0,1,1,0,0,1,0
2,4,0,1024,0,0,2680,0,0,0,1,...,0,0,1,0,1,1,0,0,1,0
3,8,512,0,2,0,4689,1,0,0,0,...,0,0,1,1,0,1,0,0,1,0
4,4,0,512,0,0,1808,1,0,0,0,...,0,0,1,0,1,1,0,0,1,0


## Desenvolvimento e experimentos de modelos

In [11]:
# Initialise the DagsHub client for the project repository with MLflow integration enabled.
dagshub.init(
    repo_owner="wagnerdataset",
    repo_name="fiap-ds-mlops-10dtsr-quantum-finance",
    mlflow=True,
)

In [12]:
# Turn on MLflow autologging for the supported ML libraries available in this environment.
mlflow.autolog()

2025/08/01 11:07:42 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/08/01 11:07:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/08/01 11:07:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [13]:
# Model inputs: every column of the dataset except the "price" target.
features = df.columns.tolist()
features.remove("price")

features

['ram_gb',
 'ssd',
 'hdd',
 'graphic_card_gb',
 'warranty',
 'brand_asus',
 'brand_dell',
 'brand_hp',
 'brand_lenovo',
 'brand_other',
 'processor_brand_amd',
 'processor_brand_intel',
 'processor_brand_m1',
 'processor_name_core i3',
 'processor_name_core i5',
 'processor_name_core i7',
 'processor_name_other',
 'processor_name_ryzen 5',
 'processor_name_ryzen 7',
 'ram_type_ddr4',
 'ram_type_other',
 'os_other',
 'os_windows',
 'os_bit_32-bit',
 'os_bit_64-bit',
 'weight_casual',
 'weight_gaming',
 'weight_thinnlight',
 'touchscreen_0',
 'touchscreen_1']

In [14]:
# Feature matrix: all rows, restricted to the selected feature columns.
X = df.loc[:, features]

In [15]:
# Number of input features fed to the models.
len(features)

30

In [16]:
# Target vector: the "price" column of the dataset.
y = df.loc[:, "price"]
y

0       2321
1       2613
2       2680
3       4689
4       1808
       ...  
775     9111
776     9714
777    10049
778     9580
779     3852
Name: price, Length: 780, dtype: int64

In [17]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [18]:
def evaluate_and_log_model(kind, model_name, model, X_test, y_test):
    """Score a fitted regressor on the test set and record it with MLflow.

    Logs MSE, MAE, R2 and MAPE as MLflow metrics, logs the model itself with
    an inferred signature and a five-row input example, and prints a one-line
    summary of the metrics.

    Args:
        kind: MLflow flavor used to log the model. "catboost", "xgboost" and
            "lightgbm" select the matching flavor; any other value falls back
            to the sklearn flavor.
        model_name: Name passed to the flavor's ``log_model`` and shown in
            the printed summary.
        model: Fitted estimator exposing ``predict``.
        X_test: Test features; the first five rows become the input example.
        y_test: True target values matching ``X_test``.
    """
    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    mape = mean_absolute_percentage_error(y_test, predictions)

    # Metrics are logged one by one, in a fixed order.
    for metric_name, metric_value in (("MSE", mse), ("MAE", mae), ("R2", r2), ("MAPE", mape)):
        mlflow.log_metric(metric_name, metric_value)

    signature = infer_signature(X_test, predictions)

    # Resolve the flavor module by name so only the selected one is accessed;
    # unknown kinds are logged with the sklearn flavor.
    flavor_name = kind if kind in ("catboost", "xgboost", "lightgbm") else "sklearn"
    flavor = getattr(mlflow, flavor_name)
    flavor.log_model(model, model_name, signature=signature, input_example=X_test[:5])

    print(f"Model {model_name} logged with MSE: {mse}, MAE: {mae}, R2: {r2}, MAPE: {mape}")

### Experimento com Ridge Regression

In [19]:
with mlflow.start_run(run_name="Ridge Regression"):
    # Hyperparameter candidates explored by the grid search.
    param_grid = {
        'alpha': [0.1, 1.0, 10.0, 100.0],
        'fit_intercept': [True, False],
    }

    # MAPE is an error measure, so it is declared as lower-is-better for model selection.
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

    ridge = Ridge()
    grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring=mape_scorer)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Record the winning hyperparameters on the run.
    for logged_name, grid_key in (("best_alpha", 'alpha'), ("best_fit_intercept", 'fit_intercept')):
        mlflow.log_param(logged_name, grid_search.best_params_[grid_key])

    evaluate_and_log_model("sklearn", "ridge_regression", best_model, X_test, y_test)

2025/08/01 11:08:41 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


🏃 View run unruly-quail-463 at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/a8edace56eaa47d49a6f167ad4a9c41d
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0
🏃 View run funny-calf-388 at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/09d216f44b0b4adab90958ce33409a29
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0
🏃 View run serious-fish-426 at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/3a42e8da67e64c8aa02aee3adae56d56
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0
🏃 View run likeable-sheep-594 at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/ee3dca2e8a504d1d907bcf753bc81



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model ridge_regression logged with MSE: 3024089.113899016, MAE: 1137.7871730265826, R2: 0.6421034856259351, MAPE: 0.22065408855367982
🏃 View run Ridge Regression at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/4a210ea37c2e4f17b90fef90041f0cac
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0


🏃 View run monumental-wasp-635 at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/8c2021291b1144a394b9055f94f7ac50
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0
🏃 View run adorable-moose-619 at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/aaff88e5693c43c0a882e58eeff747e2
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0
🏃 View run marvelous-crow-936 at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/eee68ff4850a41448728ea8c70b68720
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0
🏃 View run valuable-gull-954 at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/4c91e76df87341bf94cff

### Decision Tree Regressor

In [20]:
with mlflow.start_run(run_name="Decision Tree Regression"):
    # Hyperparameter candidates controlling tree depth and minimum node sizes.
    param_grid = {
        'max_depth': [None, 3, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Seed the estimator so the grid search gives the same trees on every
    # re-run, matching the seeded split and the other seeded experiments.
    tree = DecisionTreeRegressor(random_state=42)

    # MAPE is an error measure, so it is declared as lower-is-better for model selection.
    grid_search = GridSearchCV(tree, param_grid, cv=5, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False))
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Record the winning hyperparameters on the run.
    mlflow.log_param("best_max_depth", grid_search.best_params_['max_depth'])
    mlflow.log_param("best_min_samples_split", grid_search.best_params_['min_samples_split'])
    mlflow.log_param("best_min_samples_leaf", grid_search.best_params_['min_samples_leaf'])

    # Log under a decision-tree-specific name so the artifact and the printed
    # summary are not mistaken for the Ridge model.
    evaluate_and_log_model("sklearn", "decision_tree_regression", best_model, X_test, y_test)

2025/08/01 11:10:23 INFO mlflow.sklearn.utils: Logging the 5 best runs, 40 runs will be omitted.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model ridge_regression logged with MSE: 3796751.874021301, MAE: 1067.3024379859905, R2: 0.5506599804185538, MAPE: 0.16683040944056576
🏃 View run Decision Tree Regression at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/5426d15b2a8e40d3a2cca2b1d1161164
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0


### XGBoost

In [21]:
with mlflow.start_run(run_name="XGBoost_Regressor_Advanced"):
    # Wide search space (768 combinations x 5 folds): ensemble size, tree
    # shape, learning rate, row/column sampling and regularisation.
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 1],
        'reg_alpha': [0, 0.1],
        'reg_lambda': [1, 5],
        'min_child_weight': [1, 3],
    }

    # MAPE is an error measure, so it is declared as lower-is-better for model selection.
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

    xgb = XGBRegressor(random_state=42, verbosity=0)
    grid_search = GridSearchCV(xgb, param_grid, scoring=mape_scorer, cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Record the headline hyperparameters of the selected model on the run.
    best_params_to_log = {
        "best_n_estimators": best_model.n_estimators,
        "best_max_depth": best_model.max_depth,
        "best_learning_rate": best_model.learning_rate,
    }
    for param_name, param_value in best_params_to_log.items():
        mlflow.log_param(param_name, param_value)

    evaluate_and_log_model("xgboost", "XGBoost Regressor", best_model, X_test, y_test)

2025/08/01 11:26:45 INFO mlflow.sklearn.utils: Logging the 5 best runs, 763 runs will be omitted.
  self.get_booster().save_model(fname)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model XGBoost Regressor logged with MSE: 2506782.75, MAE: 882.2275390625, R2: 0.7033259868621826, MAPE: 0.1387338936328888
🏃 View run XGBoost_Regressor_Advanced at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/f8fbb6c7a5a34cff86517c85e612aa6c
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0


### Nova abordagem XGBoost

In [22]:
with mlflow.start_run(run_name="XGBoost_Regressor"):
    # Narrower search space (48 combinations x 5 folds) over ensemble size,
    # tree depth and learning rate only.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
    }

    # MAPE is an error measure, so it is declared as lower-is-better for model selection.
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

    xgb = XGBRegressor(random_state=42, verbosity=0)
    grid_search = GridSearchCV(xgb, param_grid, scoring=mape_scorer, cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Record the hyperparameters of the selected model on the run.
    best_params_to_log = {
        "best_n_estimators": best_model.n_estimators,
        "best_max_depth": best_model.max_depth,
        "best_learning_rate": best_model.learning_rate,
    }
    for param_name, param_value in best_params_to_log.items():
        mlflow.log_param(param_name, param_value)

    evaluate_and_log_model("xgboost", "XGBoost Regressor", best_model, X_test, y_test)

2025/08/01 11:29:22 INFO mlflow.sklearn.utils: Logging the 5 best runs, 43 runs will be omitted.
  self.get_booster().save_model(fname)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model XGBoost Regressor logged with MSE: 2609672.0, MAE: 889.0443115234375, R2: 0.6911491751670837, MAPE: 0.14123447239398956
🏃 View run XGBoost_Regressor at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/40c78fc23c3948f4b24b08b3bdfed1a9
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0


## Registro de Modelo em Produção

In [24]:
# ID of the "XGBoost_Regressor_Advanced" run whose model is being registered.
run_id = "f8fbb6c7a5a34cff86517c85e612aa6c"

# NOTE(review): evaluate_and_log_model logs the tuned estimator under the name
# "XGBoost Regressor" for this run; the "model" path used here is presumably
# the artifact written by autologging - confirm it is the intended one.
model_uri = f"runs:/{run_id}/model"

mlflow.register_model(model_uri=model_uri, name="laptop-pricing-model-brl")

Registered model 'laptop-pricing-model-brl' already exists. Creating a new version of this model...
2025/08/01 13:49:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: laptop-pricing-model-brl, version 3
Created version '3' of model 'laptop-pricing-model-brl'.


<ModelVersion: aliases=[], creation_timestamp=1754066983344, current_stage='None', description='', last_updated_timestamp=1754066983344, name='laptop-pricing-model-brl', run_id='f8fbb6c7a5a34cff86517c85e612aa6c', run_link='', source='mlflow-artifacts:/21f7f6e4f12e499bb47b61591420461f/f8fbb6c7a5a34cff86517c85e612aa6c/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='3'>