# Bibliotecas
As seguintes bibliotecas são carregadas:
- **Pandas:** Tratamento de dados
- **Numpy:** Tratamento de dados
- **Matplotlib.pyplot:** Gráficos
- **Sci-kit:** Aprendizado de Máquina
  - **decomposition:** Redução de dimensionalidade (PCA)
  - **svm:** Support Vector Machine
  - **tree:** Decision Tree
  - **neighbors:** Nearest Neighbors
  - **train_test_split:** Holdout

In [1]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install deap

Collecting deap
  Downloading deap-1.2.2.tar.gz (936kB)
[K    100% |████████████████████████████████| 942kB 625kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: deap
  Running setup.py bdist_wheel for deap ... [?25ldone
[?25h  Stored in directory: /home/nbuser/.cache/pip/wheels/82/aa/67/2c93e17c84646c86099fda53ee0b3329372dcf94dd8789fd13
Successfully built deap
Installing collected packages: deap
Successfully installed deap-1.2.2


In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn import decomposition

from sklearn import svm
from sklearn import tree
from sklearn import neighbors

from sklearn.model_selection import train_test_split

import array, random
from deap import creator, base, tools, algorithms

In [5]:
%matplotlib inline

# Carregar dados

In [5]:
# Carrega dados de treinamento e teste a partir de arquivo CSV
training = pd.read_csv('Data/aps_failure_training_set.csv', na_values='na')
test = pd.read_csv('Data/aps_failure_test_set.csv', na_values='na')

# Tratamento de dados

In [6]:
# Transforma coluna "Classe" em tipo Lógico
training['class'] = training['class'] != 'neg'
test['class'] = test['class'] != 'neg'

In [None]:
# TODO: Verificar Outliers

In [7]:
# Imputing: atribuição de dados faltantes (utilizando valor médio)
for i in range(2, training.shape[1]):
    training.iloc[:,i] = training.iloc[:,i].fillna(training.iloc[:,i].mean())
    test.iloc[:,i] = test.iloc[:,i].fillna(test.iloc[:,i].mean())
    
# TODO: testar outros métodos de imputing (moda, mediana, KNN, regressão)

In [8]:
# Função para criar média e desvio padrão a partir de features de histograma
def hist_features(data, column_name):
    
    # Média
    weights = np.array([1, 2, 3, 4, 5, 6, 7, 8 , 9,  10])
    counts = data.loc[:, (column_name + '_000'):(column_name + '_009')]
    means = np.array(np.sum(counts * weights, axis = 1) / counts.sum(axis = 1))
        
    mean_data = pd.DataFrame(means, columns=[column_name + '_mean'])
    
    # Desvio-padrão
    weights_matrix = np.array([weights,] * data.shape[0])
    differences = (weights_matrix.transpose() - means.transpose()).transpose()
    stds = np.sqrt(np.sum((differences ** 2) * counts, axis = 1) / counts.sum(axis=1))
        
    std_data = pd.DataFrame(stds, columns=[column_name + '_std'])
    
    # Novas colunas
    new_data = pd.concat([mean_data, std_data], axis=1)
        
    return new_data

In [9]:
# Criar novas features com função anterior
column_names = ['ag', 'ay', 'az', 'ba', 'cn', 'cs', 'ee']

for column_name in column_names:
    
    new_training = hist_features(training, column_name)
    new_test = hist_features(test, column_name)

    training = pd.concat([training, new_training], axis=1)
    test = pd.concat([test, new_test], axis=1)

In [10]:
# Limpar dados faltantes de novas features
training[pd.isnull(training)] = 0
test[pd.isnull(test)] = 0

In [11]:
# Undersample: balanceamento de categorias

# Separa todas as observações verdadeiras
true = training.loc[training['class']][:]

# Separa todas as observações falsas
false = training.loc[~training['class']][:]

# Separa uma amostra aleatória das observações falsas (10%)
false_sample = false.sample(frac = 0.1)

# Concatena observações
undersample = pd.concat([true, false_sample])

In [12]:
# Separar dados de treinamento e validação (70% e 30%)
training_data, validation_data = train_test_split(undersample, test_size=0.3)

X_train = training_data.loc[:, training_data.columns != 'class']
Y_train = training_data.loc[:, 'class']

X_val = validation_data.loc[:, validation_data.columns != 'class']
Y_val = validation_data.loc[:, 'class']

# Preparar dados de teste
X_test = test.loc[:, test.columns != 'class']
Y_test = test.loc[:, 'class']

In [12]:
# Redução de dimensionalidade (com principal component analysis)
pca = decomposition.PCA(n_components = 30) # Diminuindo de 182 para 30 atributos

pca.fit(X_train)

# Não será realizado, pois descaracteriza "Data Mining".
#X_train = pd.DataFrame(pca.transform(X_train))
#X_val = pd.DataFrame(pca.transform(X_val))
#X_test = pd.DataFrame(pca.transform(X_test))

PCA(copy=True, iterated_power='auto', n_components=30, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [13]:
# Ajustar dados por normalização
media = X_train.mean()
desvio_padrao = X_train.std()

X_train = (X_train - media) / desvio_padrao
X_val = (X_val - media) / desvio_padrao
X_test = (X_test - media) / desvio_padrao

In [14]:
# Remover classe sem variância (estava criando erros devido a divisão por zero da normalização)
classe_sem_variancia = desvio_padrao == 0

X_train = X_train.loc[:, ~classe_sem_variancia]
X_val = X_val.loc[:, ~classe_sem_variancia]
X_test = X_test.loc[:, ~classe_sem_variancia]

# Treinamento e validação de modelo

In [38]:
# Inicia modelo de classificação
#model = svm.SVC(kernel='rbf')
model = tree.DecisionTreeClassifier(random_state=0)
#model = neighbors.KNeighborsClassifier()

In [39]:
# Treinamento do modelo
model.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [40]:
# Previsão com dados de validação
y = model.predict(X_val)

# Analizar resultado
tp = sum(Y_val & y)
tn = sum(~Y_val & ~y)
fp = sum(~Y_val & y)
fn = sum(Y_val & ~y)

score = 10 * fp + 500 * fn

[score, tp, fp, fn, tn]

[31510, 271, 51, 62, 1686]

# Teste de modelo
**Objetivo:** Score < 9920.

In [20]:
# Previsão com dados de teste
y = model.predict(X_test)

# Analizar resultado
tp = sum(Y_test & y)
tn = sum(~Y_test & ~y)
fp = sum(~Y_test & y)
fn = sum(Y_test & ~y)

score = 10 * fp + 500 * fn

[score, tp, fp, fn, tn]

[29430, 325, 443, 50, 15182]

# Otimização de modelo por seleção de atributos

In [60]:
# Função para otimização
def my_evaluation(x):
    
    features = np.asarray(x) == 1
    features = list(features)

    # Inicializa modelo
    model = tree.DecisionTreeClassifier(random_state=0)
    #model = svm.SVC()

    # Treinamento com atributos selecionados
    model.fit(X_train.loc[:, features], Y_train)

    # Realiza previsão de dados de validação
    y = model.predict(X_val.loc[:, features])

    # Analizar resultado
    tp = sum(Y_val & y)
    tn = sum(~Y_val & ~y)
    fp = sum(~Y_val & y)
    fn = sum(Y_val & ~y)
    score = 10 * fp + 500 * fn
    
    # Retorna score do modelo
    return (score,)

In [64]:
creator.create("FitnessMin", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()

toolbox.register("bit", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.bit, 183)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

#evalOneMax = lambda individual: (sum(individual),)

toolbox.register("evaluate", my_evaluation)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=10)

population = toolbox.population(n=50)



In [65]:
NGEN=100
for gen in range(NGEN):
    offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1)
    fits = toolbox.map(toolbox.evaluate, offspring)
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit
    population = offspring

In [66]:
individual = tools.selWorst(population, 1, fit_attr='fitness')[0]
features = list(np.asarray(individual) == 1)

individual.fitness.values[0]

23500.0

# Teste de modelo otimizado

In [67]:
# Criação e treinamento de modelo
model = tree.DecisionTreeClassifier(random_state=0)
#model = svm.SVC()
model.fit(X_train.loc[:, features], Y_train)

# Teste de modelo
# Previsão com dados de teste
y = model.predict(X_test.loc[:, features])

# Analizar resultado
tp = sum(Y_test & y)
tn = sum(~Y_test & ~y)
fp = sum(~Y_test & y)
fn = sum(Y_test & ~y)

score = 10 * fp + 500 * fn

[score, tp, fp, fn, tn]

[31700, 319, 370, 56, 15255]