In [1]:
import matplotlib.pyplot as plt
import pandas
import numpy as np
import pycaret.regression as pr

%matplotlib inline
# Name handed to pr.save_model() in the last cell when the final pipeline is exported.
model_name = "auto_model"

# Automóveis


## Leitura da Base 

In [5]:
# Load the automobile listings (semicolon-separated CSV).
df_auto = pandas.read_csv('../Data/dataset_auto.csv', sep=';')
# Work on a random subsample of at most 10,000 rows. The fixed seed (same value
# as the PyCaret session_id used later) makes Restart & Run All draw the same
# rows every time; capping n at the frame length avoids a ValueError if the
# file ever holds fewer rows than requested.
df_auto = df_auto.sample(n=min(10000, len(df_auto)), random_state=123)
print(df_auto.shape)
df_auto.head()

(10000, 10)


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
6958,X2,2019,28991,Semi-Auto,8384,Petrol,150,39.8,2.0,bmw
7534,2 Series,2016,10600,Manual,24313,Diesel,0,74.3,1.5,bmw
2975,TT,2017,16484,Manual,23641,Petrol,145,47.1,1.8,audi
3903,A1,2016,11300,Manual,12650,Diesel,0,76.3,1.6,audi
8437,X1,2016,13500,Manual,75220,Diesel,125,58.9,2.0,bmw


## Configuração do PyCaret 

In [6]:
# Target column name and the ignore-features setting (nothing ignored for now).
auto_target_col, ignore_features = 'price', None
# Preview the first five rows again before configuring the experiment.
df_auto.head(n=5)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
6958,X2,2019,28991,Semi-Auto,8384,Petrol,150,39.8,2.0,bmw
7534,2 Series,2016,10600,Manual,24313,Diesel,0,74.3,1.5,bmw
2975,TT,2017,16484,Manual,23641,Petrol,145,47.1,1.8,audi
3903,A1,2016,11300,Manual,12650,Diesel,0,76.3,1.6,audi
8437,X1,2016,13500,Manual,75220,Diesel,125,58.9,2.0,bmw


In [8]:
# Initialise the PyCaret regression experiment on the sampled frame.
experiment = pr.setup(session_id=123,
                      data = df_auto, # Data settings
                      train_size=0.6,
                      target = auto_target_col,
                      profile = False, # Interactive variable analysis
#                       fold_strategy = 'groupkfold', # Cross-validation
                      # NOTE(review): fold_groups is passed while the groupkfold
                      # strategy above is commented out; PyCaret documents
                      # fold_groups as applying to GroupKFold, so it presumably has
                      # no effect here -- confirm, then re-enable the strategy or
                      # drop the argument.
                      fold = 10,
                      fold_groups = 'brand',
                      normalize = True,  # Normalization, transformation and feature removal
                      transformation = True,
                      ignore_low_variance = True,
                      remove_multicollinearity = True,
                      multicollinearity_threshold = 0.95,
                      bin_numeric_features = ['year','tax'], # Binning of numeric variables
                      group_features = None,
                      categorical_features = ['fuelType', 'transmission'],
                      # Use the value defined in the configuration cell instead of a
                      # hard-coded None, so editing it there actually takes effect.
                      ignore_features = ignore_features,
                      log_experiment = True, # Experiment logging and related settings
                      experiment_name = 'auto-model',
                      silent=True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,price
2,Original Data,"(10000, 10)"
3,Missing Values,False
4,Numeric Features,5
5,Categorical Features,4
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(6000, 106)"


INFO:logs:Logging experiment in MLFlow
Traceback (most recent call last):
  File "C:\Users\BZ241WX\AppData\Local\Continuum\anaconda3\envs\infnet-ead\lib\site-packages\mlflow\store\tracking\file_store.py", line 261, in list_experiments
    experiment = self._get_experiment(exp_id, view_type)
  File "C:\Users\BZ241WX\AppData\Local\Continuum\anaconda3\envs\infnet-ead\lib\site-packages\mlflow\store\tracking\file_store.py", line 344, in _get_experiment
    meta = read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "C:\Users\BZ241WX\AppData\Local\Continuum\anaconda3\envs\infnet-ead\lib\site-packages\mlflow\utils\file_utils.py", line 176, in read_yaml
    raise MissingConfigException("Yaml file '%s' does not exist." % file_path)
mlflow.exceptions.MissingConfigException: Yaml file 'C:\Users\BZ241WX\OneDrive - EY\Documents\InfNet\2021_PGLLIA01C0L1LIA-A2\ead_datascience\Code\mlruns\2\meta.yaml' does not exist.
  File "C:\Users\BZ241WX\AppData\Roaming\Python\Python37\site-packages\pyc

## Seleção de Modelos

In [9]:
# PyCaret trains the candidates with default hyperparameters, without fine tuning.
# Despite the singular name, `model` receives the n_select=3 best candidates
# ranked by MSE; the next cell indexes into it.
model = pr.compare_models(
    include=['lasso', 'dt', 'svm', 'rf'],
    sort='MSE',
    n_select=3,
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,1751.0996,8759143.8515,2926.536,0.9235,0.1434,0.1009,1.254
lasso,Lasso Regression,2261.6471,12615857.8398,3533.3962,0.8892,0.2189,0.1361,4.624
dt,Decision Tree Regressor,2288.1386,15727398.8142,3950.8888,0.8609,0.188,0.1275,0.067
svm,Support Vector Regression,7254.6857,114119582.0781,10669.9162,-0.0008,0.5034,0.4406,3.126


INFO:logs:create_model_container: 4
INFO:logs:master_model_container: 4
INFO:logs:display_container: 2
INFO:logs:[RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=123, verbose=0, warm_start=False), Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=123,
      selection='cyclic', tol=0.0001, warm_start=False), DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_

## Ajuste de Hiperparâmetros 

In [10]:
# Tune the first entry returned by compare_models: a 10-iteration random
# search through scikit-learn, optimising MSE.
tuned_model = pr.tune_model(
    estimator=model[0],
    optimize='mse',
    search_library='scikit-learn',
    search_algorithm='random',
    n_iter=10,
)



Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2085.9201,10398082.3196,3224.6058,0.8994,0.148,0.1149
1,2177.5388,12685009.0896,3561.602,0.8852,0.1958,0.1383
2,2075.5,9823774.4982,3134.2901,0.9114,0.174,0.1265
3,2292.7356,13417866.6348,3663.0406,0.8765,0.1733,0.1315
4,2360.1427,21577227.0536,4645.1294,0.8367,0.1746,0.1275
5,2001.3759,8568210.6,2927.1506,0.927,0.1518,0.1145
6,2248.6644,18523679.9169,4303.9145,0.839,0.1887,0.1367
7,2250.6668,17464332.6336,4179.0349,0.8703,0.1679,0.1252
8,2001.4716,9669761.7731,3109.6241,0.9053,0.1705,0.1217
9,2064.6592,13061508.945,3614.071,0.8758,0.1549,0.1133


INFO:logs:create_model_container: 5
INFO:logs:master_model_container: 5
INFO:logs:display_container: 3
INFO:logs:RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=9, max_features=1.0, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.1,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=7, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=123, verbose=0, warm_start=False)
INFO:logs:tune_model() succesfully completed......................................


## Avaliação do Modelo 

In [11]:
# Open PyCaret's interactive evaluation widget for the tuned estimator.
pr.evaluate_model(estimator=tuned_model)

INFO:logs:Initializing evaluate_model()
INFO:logs:evaluate_model(estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=9, max_features=1.0, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.1,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=7, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=123, verbose=0, warm_start=False), fold=None, fit_kwargs=None, plot_kwargs=None, feature_name=None, groups=None, use_train_data=False)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

## Interpretação dos Resultados 

In [None]:
# Correlation-type interpretation plot of the tuned estimator for the 'mpg' feature.
pr.interpret_model(
    estimator=tuned_model,
    plot='correlation',
    feature='mpg',
)

## Avaliação do Teste 

In [None]:
# No data argument is passed, so predictions are made on the holdout (test) set.
pred_holdout = pr.predict_model(estimator=tuned_model)

## Finalização do Modelo para Produção

In [None]:
# Finalise the tuned estimator; the result is what gets exported below.
final_model = pr.finalize_model(estimator=tuned_model)

## Exportação do Experimento 

In [None]:
# Persist the finalised model under the name configured in the first cell.
pr.save_model(model=final_model, model_name=model_name)