# Importações

In [40]:
import elm as extreme_learning_machine
import h5py
import numpy as np
import pandas as pd
import simplejson as json

In [21]:
from matplotlib import pyplot as plt
from sklearn import base, metrics, model_selection, neural_network, tree

In [3]:
%matplotlib inline

**Observação:** o pacote `elm` não se encontra no PyPI. Para instalá-lo, use o comando `pip install git+https://github.com/imatheussm/Python-ELM.git`

# Carregamento e consolidação dos dados

Os dados utilizados foram baixados do [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Dota2+Games+Results). Além disso, os arquivos `.json` utilizados foram baixados de um repositório no [GitHub](https://github.com/kronusme/dota2-api/tree/master/data).

In [4]:
# Read both CSV splits in one pass; header=None makes pandas label the
# columns with integers instead of consuming the first row as a header.
treino, teste = (
    pd.read_csv(caminho, header=None)
    for caminho in ("../dados/dota2Train.csv", "../dados/dota2Test.csv")
)

In [5]:
# Stack the two splits row-wise into a single frame (treino rows first, then teste).
dados = pd.concat((treino, teste))

In [6]:
# The first column is used as the target; every other column is a feature.
y = dados.iloc[:, 0].to_numpy()
X = dados.drop(columns=0).to_numpy()

In [7]:
# Persist the consolidated arrays to HDF5; X and y are gzip-compressed at level 9.
with h5py.File("../dados/dota2.h5", mode="w") as arquivo:
    arquivo.create_dataset(name="X", data=X, compression="gzip", compression_opts=9)
    arquivo.create_dataset(name="y", data=y, compression="gzip", compression_opts=9)
    # "K" is a scalar dataset holding the number of distinct values found in y.
    arquivo.create_dataset(name="K", data=len(np.unique(y)))

In [8]:
# Read the stored arrays back and wrap each one in a DataFrame.
# `[()]` materialises the whole dataset in memory before the file is closed.
with h5py.File("../dados/dota2.h5", mode="r") as arquivo:
    X, y = (pd.DataFrame(arquivo.get(chave)[()]) for chave in ("X", "y"))

# Divisão em treino e teste

In [9]:
# 70/30 stratified split with a fixed seed; each part is copied so that later
# in-place column assignments operate on independent frames.
X_train, X_test, y_train, y_test = (
    parte.copy()
    for parte in model_selection.train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
)

# Conversão de variáveis em versões _target-encoded_

O código abaixo foi adaptado de um artigo no [Medium](https://medium.com/@pouryaayria/k-fold-target-encoding-dfe9a594874b) sobre _$K$-Fold Target Encoding_, técnica que será utilizada no pré-processamento do conjunto de dados.

In [10]:
class KFoldStratifiedTargetEncoder(base.BaseEstimator, base.TransformerMixin):
    """Replace every categorical value with the mean target observed for it.

    Two different encodings are produced:

    * ``fit_transform(X, y)`` returns *out-of-fold* encodings: the rows of each
      stratified fold are encoded with target means computed on the remaining
      folds only, so a row never contributes to its own encoding.
    * ``transform(X)`` encodes with the per-category target means computed on
      the whole data passed to ``fit``.

    Consequently ``fit_transform(X, y)`` and ``fit(X, y).transform(X)`` do not
    return the same values.

    Parameters
    ----------
    number_of_folds : int, default=5
        Number of splits handed to ``StratifiedKFold``.
    verbose : bool, default=False
        Stored on the instance; not used by the current implementation.
    """

    def __init__(self, number_of_folds=5, verbose=False):
        self.number_of_folds = number_of_folds
        self.verbose = verbose

        # Learned state; both stay None until fit() has run.
        self._values = None  # {column index: {category: mean target}}
        self._global_mean = None  # mean of y, fallback for unseen categories

    @property
    def number_of_folds(self):
        return self.__number_of_folds

    @number_of_folds.setter
    def number_of_folds(self, new_number_of_folds):
        self.__number_of_folds = new_number_of_folds

    @property
    def verbose(self):
        return self.__verbose

    @verbose.setter
    def verbose(self, new_verbose):
        self.__verbose = new_verbose

    @staticmethod
    def _as_2d_array(X):
        """Coerce X to a NumPy array and insist on (n_samples, n_columns)."""
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got an array with {X.ndim} dimension(s).")
        return X

    def fit(self, X, y, **fit_params):
        """Learn the per-category target means of every column of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Categorical columns to encode.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Numeric target.
        **fit_params
            ``return_encoded_X=True`` makes this method return the out-of-fold
            encoding of X instead of ``self`` (this is how ``fit_transform``
            obtains its result).

        Returns
        -------
        self, or a float ndarray shaped like X when ``return_encoded_X`` is True.
        """
        X = self._as_2d_array(X)
        y = np.asarray(y).ravel()

        # np.float64 instead of np.float_: the alias was removed in NumPy 2.0.
        encoded_X = np.empty_like(X, dtype=np.float64)
        stratified_k_fold = model_selection.StratifiedKFold(n_splits=self.number_of_folds)

        for train_indices, test_indices in stratified_k_fold.split(X, y):
            X_train, y_train, X_test = X[train_indices, :], y[train_indices], X[test_indices, :]
            # Categories of the held-out fold that never occur in the other
            # folds fall back to the mean target of those other folds.
            fold_mean = y_train.mean()

            for column in range(X.shape[1]):
                X_train_column, X_test_column = X_train[:, column], X_test[:, column]
                encoded_X_column = np.full_like(X_test_column, fold_mean, dtype=np.float64)

                for category in np.unique(X_train_column):
                    encoded_X_column[X_test_column == category] = y_train[X_train_column == category].mean()

                encoded_X[test_indices, column] = encoded_X_column

        # Mapping used by transform(): target means over the complete data.
        self._global_mean = y.mean()
        self._values = {}
        for column in range(X.shape[1]):
            X_column = X[:, column]
            self._values[column] = {
                category: y[X_column == category].mean() for category in np.unique(X_column)
            }

        if fit_params.get("return_encoded_X") is True:
            return encoded_X

        return self

    def transform(self, X):
        """Encode X with the category means learned by ``fit``.

        Categories that were not present during ``fit`` are encoded with the
        overall mean of the training target.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or X does not have the number
            of columns seen during ``fit``.
        """
        if self._values is None:
            raise ValueError("This KFoldStratifiedTargetEncoder is not fitted yet; call fit() first.")

        X = self._as_2d_array(X)
        if X.shape[1] != len(self._values):
            raise ValueError(f"X has {X.shape[1]} column(s), but the encoder was fitted on {len(self._values)}.")

        # Pre-filling with the global mean guarantees unseen categories get a
        # defined value instead of whatever happened to be in memory.
        encoded_X = np.full(X.shape, self._global_mean, dtype=np.float64)

        for column in range(X.shape[1]):
            X_column = X[:, column]

            for category, value in self._values[column].items():
                encoded_X[X_column == category, column] = value

        return encoded_X

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on (X, y) and return the out-of-fold encoding of X.

        Raises
        ------
        ValueError
            If y is not given; target encoding is impossible without it.
        """
        if y is None:
            raise ValueError("fit_transform() requires y: target encoding cannot be computed without a target.")

        return self.fit(X, y, return_encoded_X=True)

In [11]:
# Encoder whose out-of-fold training encodings are computed over 10 stratified folds.
target_encoder = KFoldStratifiedTargetEncoder(number_of_folds=10)

In [12]:
# Compute both encodings from the raw category codes before touching the frames.
# The training rows get out-of-fold encodings; the test rows are encoded with
# the means learned on the whole training split.
encoded_train = target_encoder.fit_transform(
    X_train.iloc[:, 0:3].to_numpy().copy(), y_train.to_numpy().ravel().copy()
)
encoded_test = target_encoder.transform(X_test.iloc[:, 0:3].to_numpy().copy())

# The encodings are floats. Cast the destination columns explicitly first:
# writing floats into integer columns through .iloc relies on an implicit
# upcast that recent pandas versions deprecate.
float_columns = {column: np.float64 for column in X_train.columns[0:3]}
X_train = X_train.astype(float_columns)
X_test = X_test.astype(float_columns)

X_train.iloc[:, 0:3] = encoded_train
X_test.iloc[:, 0:3] = encoded_test

# Treino dos algoritmos

In [45]:
# Candidate classifiers, keyed by the short name used when reporting results.
# The decision tree and the MLP are seeded with the same value as the
# train/test split so that the cross-validation scores are reproducible.
# The ELM is left at its defaults: its constructor options are not visible
# here -- TODO confirm whether it accepts a seed.
models = {
    "dt": tree.DecisionTreeClassifier(random_state=42),
    "mlp": neural_network.MLPClassifier(random_state=42),
    "elm": extreme_learning_machine.ELM(),
}

In [46]:
# 10-fold stratified splitter, passed as `cv` to the cross-validation of every model.
stratified_k_fold = model_selection.StratifiedKFold(n_splits=10)

In [51]:
# One cross_validate() result per model, in the iteration order of `models`.
# Accuracy and F1 are both scored on every fold, using all available cores.
results = [
    model_selection.cross_validate(
        estimator,
        X_train.to_numpy(),
        y_train.to_numpy().ravel(),
        n_jobs=-1,
        cv=stratified_k_fold,
        scoring=["accuracy", "f1"],
    )
    for estimator in models.values()
]

In [61]:
# Report the fold-averaged scores; `results` is ordered like `models`.
for name, scores in zip(models.keys(), results):
    print(f"{name}: accuracy {scores['test_accuracy'].mean()}, f1 {scores['test_f1'].mean()}")

dt: accuracy 0.5189703025256731, f1 0.5430234310215969
mlp: accuracy 0.5423952262003886, f1 0.5731863271896556
elm: accuracy 0.5922980849292256, f1 0.635421688277177
