# Desenvolvimento do modelo de predição de preço

Exploração inicial de modelos de base

In [46]:
import pandas as pd
from dagshub.data_engine import datasources
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
import mlflow.sklearn
import mlflow.catboost
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import mlflow.models.signature
from mlflow.models import infer_signature
from catboost import CatBoostRegressor

## Carregando Dataset

In [47]:
ds = datasources.get_datasource("wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance", "processed")

In [48]:
ds.all().dataframe

Output()

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,quantum_finance_test_processed.csv,103597636,https://dagshub.com/api/v1/repos/wagnerdataset...,text/plain,12949087
1,quantum_finance_train_processed.csv,103597637,https://dagshub.com/api/v1/repos/wagnerdataset...,text/plain,26058129


In [49]:
res = ds.head()

for dp in res:
    dataset_url = dp.download_url

Output()

In [50]:
dataset_url

'https://dagshub.com/api/v1/repos/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance/raw/main/data/processed/quantum_finance_train_processed.csv'

In [51]:
df = pd.read_csv(dataset_url)
df.head()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score_Label
0,23.0,,,3,4,3,,,,11.27,...,2,,26.82262,265.0,No,49.574949,,High_spent_Small_value_payments,,0
1,23.0,,,3,4,3,,,,11.27,...,1,,31.94496,221.195405,No,49.574949,,Low_spent_Large_value_payments,,0
2,-500.0,,,3,4,3,,,,_,...,1,,28.609352,267.0,No,49.574949,,Low_spent_Medium_value_payments,,0
3,23.0,,,3,4,3,,,,6.27,...,1,,31.377862,268.0,No,49.574949,,Low_spent_Small_value_payments,,0
4,23.0,,,3,4,3,,,,11.27,...,1,,24.797347,269.0,No,49.574949,,High_spent_Medium_value_payments,,0


## Desenvolvimento e experimentos de modelos

In [52]:
dagshub.init(repo_owner="wagnerdataset", repo_name="fiap-ds-mlops-10dtsr-quantum-finance", mlflow=True)

In [53]:
mlflow.autolog()

2025/08/02 10:04:42 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/08/02 10:04:42 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/08/02 10:04:42 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [54]:
features = list(df.columns)
features.remove("Credit_Score_Label")

features

['Age',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Payment_of_Min_Amount',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance']

In [55]:
X = df[features]

In [56]:
len(features)

20

In [57]:
y = df["Credit_Score_Label"]
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    1
99996    1
99997    1
99998    2
99999    1
Name: Credit_Score_Label, Length: 100000, dtype: int64

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [59]:
def evaluate_and_log_model(kind, model_name, model, X_test, y_test):
    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    mape = mean_absolute_percentage_error(y_test, predictions)

    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)
    mlflow.log_metric("MAPE", mape)

    signature = infer_signature(X_test, predictions)

    if kind == "catboost":
        mlflow.catboost.log_model(model, model_name, signature=signature, input_example=X_test[:5])
    elif kind == "xgboost":
        mlflow.xgboost.log_model(model, model_name, signature=signature, input_example=X_test[:5])
    elif kind == "lightgbm":
        mlflow.lightgbm.log_model(model, model_name, signature=signature, input_example=X_test[:5])
    else:
        mlflow.sklearn.log_model(model, model_name, signature=signature, input_example=X_test[:5])

    print(f"Model {model_name} logged with MSE: {mse}, MAE: {mae}, R2: {r2}, MAPE: {mape}")

### Experimento com Ridge Regression

In [60]:
with mlflow.start_run(run_name="Ridge Regression"):
    param_grid = {
        'alpha': [0.1, 1.0, 10.0, 100.0],
        'fit_intercept': [True, False],
        }
    
    ridge = Ridge()

    grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False))
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    mlflow.log_param("best_alpha", grid_search.best_params_['alpha'])
    mlflow.log_param("best_fit_intercept", grid_search.best_params_['fit_intercept'])

    evaluate_and_log_model("sklearn", "ridge_regression", best_model, X_test, y_test)



🏃 View run Ridge Regression at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/6550530899054743a829cc1497db0e39
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0


ValueError: 
All the 40 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 483, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 182, in patch_with_managed_run
    result = patch_function(original, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\mlflow\sklearn\__init__.py", line 1662, in patched_fit
    return original(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 474, in call_original
    return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 425, in call_original_fn_with_event_logging
    original_fn_result = original_fn(*og_args, **og_kwargs)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 471, in _original_fn
    original_result = original(*_og_args, **_og_kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\linear_model\_ridge.py", line 1239, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '_'

--------------------------------------------------------------------------------
39 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 402, in safe_patch_function
    return original(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\linear_model\_ridge.py", line 1239, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Repository\Git\fiap-ds-mlops-10dtsr-quantum-finance\.venv\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '_'


### Decision Tree Regressor

In [None]:
with mlflow.start_run(run_name="Decision Tree Regression"):
    param_grid = {
        'max_depth': [None, 3, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    
    ridge = DecisionTreeRegressor()

    grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False))
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    mlflow.log_param("best_max_depth", grid_search.best_params_['max_depth'])
    mlflow.log_param("best_min_samples_split", grid_search.best_params_['min_samples_split'])
    mlflow.log_param("best_min_samples_leaf", grid_search.best_params_['min_samples_leaf'])

    evaluate_and_log_model("sklearn", "ridge_regression", best_model, X_test, y_test)

2025/08/01 11:10:23 INFO mlflow.sklearn.utils: Logging the 5 best runs, 40 runs will be omitted.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model ridge_regression logged with MSE: 3796751.874021301, MAE: 1067.3024379859905, R2: 0.5506599804185538, MAPE: 0.16683040944056576
🏃 View run Decision Tree Regression at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/5426d15b2a8e40d3a2cca2b1d1161164
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0


## XGBoost

In [None]:
with mlflow.start_run(run_name="XGBoost_Regressor_Advanced"):
   
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 1],
        'reg_alpha': [0, 0.1],
        'reg_lambda': [1, 5],
        'min_child_weight': [1, 3]
    }
   
    xgb = XGBRegressor(random_state=42, verbosity=0)
    grid_search = GridSearchCV(xgb, param_grid, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False), cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    mlflow.log_param("best_n_estimators", best_model.n_estimators)
    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_learning_rate", best_model.learning_rate)
    evaluate_and_log_model("xgboost", "XGBoost Regressor", best_model, X_test, y_test)

2025/08/01 11:26:45 INFO mlflow.sklearn.utils: Logging the 5 best runs, 763 runs will be omitted.
  self.get_booster().save_model(fname)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model XGBoost Regressor logged with MSE: 2506782.75, MAE: 882.2275390625, R2: 0.7033259868621826, MAPE: 0.1387338936328888
🏃 View run XGBoost_Regressor_Advanced at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/f8fbb6c7a5a34cff86517c85e612aa6c
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0


## Nova abordagem XGBoost

In [None]:
with mlflow.start_run(run_name="XGBoost_Regressor"):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2, 0.3]
    }
    xgb = XGBRegressor(random_state=42, verbosity=0)
    grid_search = GridSearchCV(xgb, param_grid, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False), cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    mlflow.log_param("best_n_estimators", best_model.n_estimators)
    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_learning_rate", best_model.learning_rate)
    evaluate_and_log_model("xgboost", "XGBoost Regressor", best_model, X_test, y_test)

2025/08/01 11:29:22 INFO mlflow.sklearn.utils: Logging the 5 best runs, 43 runs will be omitted.
  self.get_booster().save_model(fname)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model XGBoost Regressor logged with MSE: 2609672.0, MAE: 889.0443115234375, R2: 0.6911491751670837, MAPE: 0.14123447239398956
🏃 View run XGBoost_Regressor at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0/runs/40c78fc23c3948f4b24b08b3bdfed1a9
🧪 View experiment at: https://dagshub.com/wagnerdataset/fiap-ds-mlops-10dtsr-quantum-finance.mlflow/#/experiments/0


## Registro de Modelo em Produção

In [None]:
run_id = "f8fbb6c7a5a34cff86517c85e612aa6c"

mlflow.register_model(model_uri=f"runs:/{run_id}/model", name="laptop-pricing-model-brl")

Registered model 'laptop-pricing-model-brl' already exists. Creating a new version of this model...
2025/08/01 13:49:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: laptop-pricing-model-brl, version 3
Created version '3' of model 'laptop-pricing-model-brl'.


<ModelVersion: aliases=[], creation_timestamp=1754066983344, current_stage='None', description='', last_updated_timestamp=1754066983344, name='laptop-pricing-model-brl', run_id='f8fbb6c7a5a34cff86517c85e612aa6c', run_link='', source='mlflow-artifacts:/21f7f6e4f12e499bb47b61591420461f/f8fbb6c7a5a34cff86517c85e612aa6c/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='3'>