# Corrigindo erro ao utilizar base de dados OOT
Ao utilizar a base OOT, eu recebia um erro porque o OneHotEncoder encontrava um valor que não estava presente no conjunto de treino. Neste notebook, faço a correção (usando `handle_unknown='ignore'` no encoder) e o retreinamento do modelo.

In [1]:
import pickle
import numpy as np
import pandas as pd
import gzip
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the previously trained pipeline from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('./app/model.pkl', mode='rb') as model_fh:
    model = pickle.load(model_fh)

In [3]:
# Inspect the steps of the loaded pipeline (preprocessing + estimator).
model.steps

[('simple_preprocessing',
  ColumnTransformer(transformers=[('pipeline-1',
                                   Pipeline(steps=[('num_imputer',
                                                    SimpleImputer(strategy='median'))]),
                                   <sklearn.compose._column_transformer.make_column_selector object at 0x000001F0B06A9910>),
                                  ('pipeline-2',
                                   Pipeline(steps=[('cat_imputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('encoder', OneHotEncoder())]),
                                   <sklearn.compose._column_transformer.make_column_selector object at 0x000001F0B06A30D0>)])),
 ('estimator',
  DecisionTreeClassifier(max_depth=9, min_samples_leaf=0.15371419169712677,
                         min_samples_split=0.2572078354486276))]

In [4]:
# Names of the input columns the loaded pipeline was fitted on; reused below
# to select the same columns from the training file.
# NOTE(review): this is read before step 0 is replaced — presumably the
# attribute is unavailable once the first step is an unfitted transformer;
# confirm before reordering cells.
train_cols = model.feature_names_in_
train_cols

array(['VAR2', 'IDADE', 'VAR5', 'VAR6', 'VAR7', 'VAR8', 'VAR9', 'VAR10',
       'VAR11', 'VAR12', 'VAR14', 'VAR15', 'VAR16', 'VAR18', 'VAR19',
       'VAR22', 'VAR24', 'VAR25', 'VAR32', 'VAR39', 'VAR40', 'VAR41',
       'VAR42', 'VAR47', 'VAR49', 'VAR50', 'VAR51', 'VAR52', 'VAR53',
       'VAR54', 'VAR55', 'VAR56', 'VAR57', 'VAR58', 'VAR59', 'VAR60',
       'VAR61', 'VAR62', 'VAR63', 'VAR64', 'VAR65', 'VAR66', 'VAR67',
       'VAR68', 'VAR69', 'VAR70', 'VAR71', 'VAR72', 'VAR73', 'VAR74',
       'VAR75', 'VAR76', 'VAR77', 'VAR78', 'VAR79', 'VAR80', 'VAR81',
       'VAR82', 'VAR83', 'VAR84', 'VAR85', 'VAR86', 'VAR87', 'VAR88',
       'VAR89', 'VAR90', 'VAR91', 'VAR92', 'VAR93', 'VAR94', 'VAR95',
       'VAR96', 'VAR97', 'VAR98', 'VAR99', 'VAR100', 'VAR101', 'VAR102',
       'VAR103', 'VAR104', 'VAR105', 'VAR106', 'VAR107', 'VAR108',
       'VAR109', 'VAR110', 'VAR111', 'VAR112', 'VAR113', 'VAR114',
       'VAR115', 'VAR116', 'VAR117', 'VAR118', 'VAR119', 'VAR120',
       'VAR121', 'VAR12

In [5]:
# Build a new ColumnTransformer step, reusing the step names shown in
# model.steps. The only visible difference from the fitted one is that the
# encoder is created with handle_unknown='ignore', so categories that were not
# present during fit no longer raise an error at transform time.
pipe_1 = Pipeline(steps=[
    ("num_imputer", SimpleImputer(strategy='median')),
])
pipe_2 = Pipeline(steps=[
    ("cat_imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns go through pipe_1, object-dtype columns through pipe_2.
ct = ColumnTransformer(transformers=[
    ('pipeline-1', pipe_1, make_column_selector(dtype_include=np.number)),
    ('pipeline-2', pipe_2, make_column_selector(dtype_include=object)),
])
ct

ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('num_imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001F0D04507F0>),
                                ('pipeline-2',
                                 Pipeline(steps=[('cat_imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001F0D0450550>)])

In [6]:
# Swap the first pipeline step (index 0, 'simple_preprocessing') for the new,
# still-unfitted ColumnTransformer; the pipeline has to be refit after this.
model.steps[0] = ('simple_preprocessing', ct)

In [7]:
# Load the training set used to refit the pipeline.
# The path is relative to the working directory, consistent with
# './app/model.pkl' and './app/fixed_model.pkl' used elsewhere in this
# notebook; the previous absolute 'D:\...\ml-monitoring\app\...' path only
# worked on a single machine.
# NOTE(review): assumes the notebook is run from the repository root (the
# directory that contains 'app/') — confirm.
# pandas decompresses the gzip file itself, so no explicit gzip handle is
# needed.
train_data = pd.read_csv('./app/datasets/credit_01/train.gz', compression='gzip')

# Target column and the feature columns the original pipeline was fitted on.
y = train_data['TARGET']
X = train_data[train_cols]

In [8]:
# Sanity check: features and target must have the same number of rows.
X.shape, y.shape

((101128, 118), (101128,))

In [9]:
# Refit the whole pipeline (new preprocessing step + existing estimator
# configuration) on the training data.
model.fit(X, y)

Pipeline(steps=[('simple_preprocessing',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001F0D04507F0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001

In [10]:
# Persist the refitted pipeline under a new name, leaving the original
# './app/model.pkl' untouched.
with open('./app/fixed_model.pkl', mode='wb') as fixed_model_fh:
    pickle.dump(model, fixed_model_fh)