In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from pickle import dump, load
import bisect

In [2]:
######################################################################################################
# Load the training and test datasets
######################################################################################################

# Training split
datasetTrainHA = pd.read_csv('datasetTrainHA.csv', low_memory=False)

# Test split
datasetTestHA = pd.read_csv('datasetTestHA.csv', low_memory=False)

# Report the (rows, columns) shape of each split, training first
print(datasetTrainHA.shape)
print(datasetTestHA.shape)

(23, 5)
(23, 5)


In [3]:
######################################################################################################
# Build the encoders, apply them to the training data and persist them for reuse on the test data
######################################################################################################

###
### PART 1 - CREATE A NUMERIC (FACTOR) VARIABLE - THIS STEP COULD BE SKIPPED IF THE VARIABLE THAT
### UNDERGOES THE ONE-HOT-ENCODER TRANSFORMATION WERE ALREADY NUMERIC
###

# Fit a LabelEncoder that maps the values of the "classificacao" column to integer codes
le_classificacao = preprocessing.LabelEncoder()
le_classificacao.fit(datasetTrainHA.classificacao)

# Add '<unknown>' to the encoder's classes, keeping the list sorted.
# The result is stored back as a NumPy array instead of a plain list, so that operations which
# index classes_ with an array (e.g. inverse_transform) keep working on the patched encoder.
le_classificacao_classes = le_classificacao.classes_.tolist()
bisect.insort_left(le_classificacao_classes, '<unknown>')
le_classificacao.classes_ = np.array(le_classificacao_classes, dtype=object)

# Add the numeric code column to the training dataset
datasetTrainHA['le_classificacao'] = le_classificacao.transform(datasetTrainHA.classificacao)

# Persist the LabelEncoder for later use on the test data.
# The context manager guarantees the file is flushed and closed.
with open('le_classificacao.sav', 'wb') as le_file:
    dump(le_classificacao, le_file)

###
### PART 2 - CREATE THE DUMMY VARIABLES
###

# Input of the transformation: the key column plus the numeric code column
dataset_new = datasetTrainHA[['chave', 'le_classificacao']]

# Create the OneHotEncoder; the boolean mask marks only the second column as categorical.
# NOTE(review): the `categorical_features` argument was deprecated in scikit-learn 0.20 and
# removed in 0.22, so this cell needs an older release - confirm the pinned version.
encoder = OneHotEncoder(categorical_features = np.array([False, True]), dtype=bool, sparse=True)
encoder.fit(dataset_new)

# Persist the OneHotEncoder for later use on the test data
with open('encoder.sav', 'wb') as encoder_file:
    dump(encoder, encoder_file)

# Column names of the encoded dataset: every class except '<unknown>', followed by the key column
colnames = list(le_classificacao.classes_)
colnames.remove('<unknown>')
colnames.append('chave')

# Example for the case where more than one column goes through the transformation
#colnames = list(le_classificacao.classes_)
#index = colnames.index('<unknown>')
#del colnames[index]
#colnamesClass2 = list(le_classificacao2.classes_)
#for c in colnamesClass2:
#    if c != '<unknown>':
#        colnames.append(c)
#colnames.append('chave')

# Run the transformation on the training data
results = encoder.transform(dataset_new)
dataset_encode = pd.DataFrame(results.toarray(), columns=colnames)

# Join the encoded columns back onto the training dataset by key
dataset_merge = pd.merge(datasetTrainHA, dataset_encode, on='chave', how='inner')

# Export the encoded training dataset
dataset_merge.to_csv("dataset_merge.csv")

In [4]:
######################################################################################################
# Load the persisted encoders and apply them to the test data
######################################################################################################

# Load the LabelEncoder that was fitted on the training data.
# NOTE: unpickling can execute arbitrary code - only load .sav files produced by this notebook.
with open('le_classificacao.sav', 'rb') as le_file:
    le_classificacao = load(le_file)

# Replace with '<unknown>' every "classificacao" value of the test dataset that is not one of the
# LabelEncoder's classes, so that transform() is never given a label it does not know
datasetTestHA.loc[ ~datasetTestHA['classificacao'].isin(le_classificacao.classes_), 'classificacao' ] = '<unknown>'

# Apply the LabelEncoder to the test dataset
datasetTestHA['le_classificacao'] = le_classificacao.transform(datasetTestHA.classificacao)

# Input of the transformation: the key column plus the numeric code column
dataset_new = datasetTestHA[['chave', 'le_classificacao']]

# Load the OneHotEncoder that was fitted on the training data
with open('encoder.sav', 'rb') as encoder_file:
    encoder = load(encoder_file)

# Column names of the encoded dataset: every class except '<unknown>', followed by the key column
colnames = list(le_classificacao.classes_)
colnames.remove('<unknown>')
colnames.append('chave')

# Run the transformation on the test data
results = encoder.transform(dataset_new)
dataset_encode = pd.DataFrame(results.toarray(), columns=colnames)

# Join the encoded columns back onto the test dataset by key
dataset_merge = pd.merge(datasetTestHA, dataset_encode, on='chave', how='inner')

# Export the encoded test dataset to its own file, so that it does not overwrite the training
# export, which is written to "dataset_merge.csv"
dataset_merge.to_csv("dataset_merge_test.csv")