## Carregar bibliotecas

In [5]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow import keras
from tensorflow.keras.utils import FeatureSpace


In [6]:
# Carregando o DataFrame (substitua 'seu_dataframe.csv' pelo nome do seu arquivo)
df = pd.read_csv('/Users/thiagohendler/Documents/Bootcamp Machine Learning - ENAP/CODIGO/equipe_8-2.csv', encoding='latin1')
df.isnull().sum()

placa                                 0
h_perm_front                          0
anoModelo                             1
categoria                             1
cor                                   1
dataEmissaoCrv                        1
descricaoTipoDocumentoProprietario    1
marcaModelo                           1
municipioEmplacamento                 1
tipo                                  1
ufEmplacamento                        1
descricao                             0
ilicito                               0
dtype: int64

In [7]:
df = df.dropna()

In [8]:
df.rename(columns={'h_perm_front': 'tempofronteira',
                   'anoModelo': 'anomodelo',
                   'dataEmissaoCrv': 'dataemissaocrv',
                   'descricaoTipoDocumentoProprietario': 'tipodocumentoproprietario',
                   'municipioEmplacamento': 'municipioemplacamento',
                   'ufEmplacamento': 'ufemplacamento',
                   'descricao': 'consulta_res',
                   'ilicito': 'target'
                   }, inplace=True)
df[['marca', 'modelo']] = df['marcaModelo'].str.split('/', expand=True)

# Converter para data
df['dataemissaocrv'] = pd.to_datetime(df['dataemissaocrv'], unit='ms')

#Adicionar essa coluna gera uma colinariedade com a coluna dataemissaocrv
df['dataemissaocrv'] = df['dataemissaocrv'].dt.year

df['anomodelo'] = df['anomodelo'].astype(int)

df['unicodono'] = df['dataemissaocrv'] <= df['anomodelo']

df['tempofronteira'] = df['tempofronteira'].astype(int)
df['anomodelo'] = df['anomodelo'].astype(int)
df['dataemissaocrv'] = df['dataemissaocrv'].astype(int)

# Apagar colunas não utlizadas mais
df = df.drop('placa', axis=1)
df = df.drop('marcaModelo', axis=1)


In [12]:
X = df.drop(['target'], axis=1)
y = df['target'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(
    "Usando %d amostras para o treino e %d para validação"
    % (len(X_train), len(X_test))
)

Usando 596 amostras para o treino e 149 para validação


In [146]:
def dataframe_to_dataset(X, y):
    # Combina X e y em um DataFrame temporário
    df = X.copy()
    df['target'] = y

    labels = df.pop("target")
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    ds = ds.shuffle(buffer_size=len(df))
    return ds

In [147]:
# Transformando os dados de treino e teste em datasets
train_ds = dataframe_to_dataset(X_train, y_train)
val_ds = dataframe_to_dataset(X_test, y_test)

# Verifique se os datasets foram criados corretamente
for features_batch, label_batch in train_ds.take(1):
    print(features_batch)
    print(label_batch)


{'tempofronteira': <tf.Tensor: shape=(), dtype=int64, numpy=23>, 'anomodelo': <tf.Tensor: shape=(), dtype=int64, numpy=2020>, 'categoria': <tf.Tensor: shape=(), dtype=string, numpy=b'PARTICULAR'>, 'cor': <tf.Tensor: shape=(), dtype=string, numpy=b'VERMELHA'>, 'dataemissaocrv': <tf.Tensor: shape=(), dtype=int64, numpy=2019>, 'tipodocumentoproprietario': <tf.Tensor: shape=(), dtype=string, numpy=b'CPF'>, 'municipioemplacamento': <tf.Tensor: shape=(), dtype=string, numpy=b'PONTA GROSSA'>, 'tipo': <tf.Tensor: shape=(), dtype=string, numpy=b'AUTOMOVEL'>, 'ufemplacamento': <tf.Tensor: shape=(), dtype=string, numpy=b'PR'>, 'consulta_res': <tf.Tensor: shape=(), dtype=string, numpy=b'N\xc3\xa3o h\xc3\xa1 restri\xc3\xa7\xc3\xb5es para este ve\xc3\xadculo no DENATRAN BASE 24h/7d.'>, 'marca': <tf.Tensor: shape=(), dtype=string, numpy=b'RENAULT'>, 'modelo': <tf.Tensor: shape=(), dtype=string, numpy=b'KWID ZEN 10MT'>, 'unicodono': <tf.Tensor: shape=(), dtype=bool, numpy=True>}
tf.Tensor(0, shape=(),

In [148]:
feature_space = FeatureSpace(
    features={
        # Categorical features encoded as integers
        "unicodono": "integer_categorical",
        # Categorical feature encoded as string
        "categoria": "string_categorical",
        "tipodocumentoproprietario": "string_categorical",
        "cor": "string_categorical",
        "tipo": "string_categorical",
        "ufemplacamento": "string_categorical",
        "consulta_res": "string_categorical",
        "marca": "string_categorical",
        "modelo": "string_categorical",
        "municipioemplacamento": "string_categorical",
        # Numerical features to discretize
        "anomodelo": "float_discretized",
        "dataemissaocrv": "float_discretized",
        # Numerical features to normalize
        "tempofronteira": "float_normalized",
    },
    # We create additional features by hashing
    # value co-occurrences for the
    # following groups of categorical features.
    crosses=[("categoria", "tipodocumentoproprietario"), ("ufemplacamento", "tipodocumentoproprietario"),("tipo", "tipodocumentoproprietario")],
    # The hashing space for these co-occurrences
    # wil be 32-dimensional.
    crossing_dim=32,
    # Our utility will one-hot encode all categorical
    # features and concat all features into a single
    # vector (one vector per sample).
    output_mode="concat",
)

In [149]:
train_ds_with_no_labels = train_ds.map(lambda x, _: x)
feature_space.adapt(train_ds_with_no_labels)

In [163]:
preprocessed_train_ds = train_ds.map(
    lambda x, y: ([feature_space(x)], tf.expand_dims(y, axis=-1)), num_parallel_calls=tf.data.AUTOTUNE
)
preprocessed_train_ds = preprocessed_train_ds.prefetch(tf.data.AUTOTUNE)

preprocessed_val_ds = val_ds.map(
    lambda x, y: ([feature_space(x)], tf.expand_dims(y, axis=-1)), num_parallel_calls=tf.data.AUTOTUNE
)
preprocessed_val_ds = preprocessed_val_ds.prefetch(tf.data.AUTOTUNE)

In [164]:

# 2. Adaptar o FeatureSpace
# Obter as entradas e características codificadas do FeatureSpace
dict_inputs = feature_space.get_inputs()
encoded_features = feature_space.get_encoded_features()

# Criar a arquitetura do modelo
x = keras.layers.Dense(32, activation="relu")(encoded_features)  # Camada oculta
x = keras.layers.Dropout(0.5)(x)  # Camada de dropout para regularização
predictions = keras.layers.Dense(1, activation="sigmoid")(x)  # Camada de saída

# 6. Criar o modelo de treinamento
training_model = keras.Model(inputs=[encoded_features], outputs=predictions)  # Note o uso de uma lista

# 7. Compilar o modelo
training_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

inference_model = keras.Model(inputs=dict_inputs, outputs=predictions)

# Treinamento do modelo
training_model.fit(
    preprocessed_train_ds,  # Conjunto de dados de treinamento pré-processados
    epochs=20,  # Número de épocas
    validation_data=preprocessed_val_ds,  # Conjunto de dados de validação pré-processados
    verbose=2,  # Nível de verbosidade
)

Epoch 1/20


596/596 - 4s - 7ms/step - accuracy: 0.6661 - loss: 0.6245 - val_accuracy: 0.6980 - val_loss: 0.5914
Epoch 2/20
596/596 - 1s - 2ms/step - accuracy: 0.8104 - loss: 0.4327 - val_accuracy: 0.7651 - val_loss: 0.5403
Epoch 3/20
596/596 - 3s - 5ms/step - accuracy: 0.8607 - loss: 0.3600 - val_accuracy: 0.7718 - val_loss: 0.5379
Epoch 4/20
596/596 - 2s - 4ms/step - accuracy: 0.8876 - loss: 0.2807 - val_accuracy: 0.7315 - val_loss: 0.5316
Epoch 5/20
596/596 - 5s - 8ms/step - accuracy: 0.9128 - loss: 0.2425 - val_accuracy: 0.7785 - val_loss: 0.5420
Epoch 6/20
596/596 - 4s - 6ms/step - accuracy: 0.9178 - loss: 0.1951 - val_accuracy: 0.7987 - val_loss: 0.5628
Epoch 7/20
596/596 - 2s - 3ms/step - accuracy: 0.9362 - loss: 0.1706 - val_accuracy: 0.8054 - val_loss: 0.5710
Epoch 8/20
596/596 - 2s - 3ms/step - accuracy: 0.9463 - loss: 0.1462 - val_accuracy: 0.8054 - val_loss: 0.5974
Epoch 9/20
596/596 - 6s - 9ms/step - accuracy: 0.9530 - loss: 0.1273 - val_accuracy: 0.8121 - val_loss: 0.6046
Epoch 10/20


<keras.src.callbacks.history.History at 0x16c65d040>

In [165]:
# Avaliação do modelo nos dados de teste
test_loss, test_accuracy = training_model.evaluate(preprocessed_val_ds, verbose=2)

print(f'Test loss: {test_loss:.4f}')
print(f'Test accuracy: {test_accuracy:.4f}')


149/149 - 0s - 904us/step - accuracy: 0.8121 - loss: 0.9287
Test loss: 0.9287
Test accuracy: 0.8121


In [71]:
# Fazer previsões nos dados de teste
predictions = training_model.predict(preprocessed_val_ds)

# Exibir algumas previsões
print(predictions[:5])  # Mostra as primeiras 5 previsões


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step  
[[0.2759936 ]
 [0.80588496]
 [0.19188108]
 [0.9091081 ]
 [0.3591524 ]]
