In [1]:
########################################################################################################
#Tiago Tambonis - 2017 - 2018 - 2019 
#Observação: Cuidado com as definições de variáveis no arquivo Geração_imuno.sh e Geração_Non_imuno.sh.
#Na atual definição estou considerando somente 1 vizinho. Atenção ao scoring associado à GridSearch.
#Objetivo: avaliar os resultados preditivos sem uso do Suvrel.
#FEATURE SELECTION WITH MRMR.
########################################################################################################

In [21]:
#Imports 

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
import pandas as pd
import numpy as np
from itertools import combinations
import matplotlib.pyplot as plt
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, matthews_corrcoef, roc_auc_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pickle
from scipy.spatial.distance import squareform, pdist

#random_state=101

In [22]:
## Função Suvrel.

def suvrel(X, y, gamma=2.0, norm=None, distance=False):
    """
    Compute per-feature Suvrel relevance weights (a diagonal metric tensor).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data matrix with one sample per ROW and one feature (dimension) per
        column (the class masks select rows and the weights have one entry
        per column).
    y : array-like, shape (n_samples,)
        Class label of every row of X. Labels may be any comparable values
        (ints, floats, strings); they are never used as array indices.
    gamma : float
        With gamma == 2 the raw weights are the sum over class pairs of the
        squared differences of the class means. Otherwise they are
        (gamma - 2) * sum_of_class_variances
        + gamma / (n_classes - 1) * sum_of_squared_mean_differences.
    norm : {None, "unity", "t-test"}
        None     -> no extra normalization.
        "unity"  -> divide the weights by the overall per-feature variance.
        "t-test" -> divide by the pooled (equal-variance) within-class
                    variance; only defined for exactly two classes.
    distance : bool
        If True, also return the pairwise Euclidean distance matrix of the
        samples after scaling each feature by sqrt(weight).

    Returns
    -------
    weights : ndarray, shape (n_features,)
        Non-negative weights scaled to unit Euclidean norm.
    (weights, D) : tuple, only when distance is True
        D is the (n_samples, n_samples) distance matrix, computed with the
        weights BEFORE the unit-norm scaling.
    None
        When `norm` is not a valid option, or when "t-test" is requested for
        a number of classes different from two (an error line is printed).
    """
    # Accept lists and DataFrames as well as arrays; boolean row masks and
    # element-wise arithmetic below rely on ndarray semantics.
    X = np.asarray(X)
    y = np.asarray(y)

    if norm is not None and norm not in ("unity", "t-test"):
        print("error: norm options are None, \"unity\" and  \"t-test\"")
        return None

    classes = np.unique(y)
    n_classes = len(classes)

    if norm == "t-test" and n_classes != 2:
        print("error: for t-test normalization the number" +
              " of classes must be equal 2")
        return None

    # One boolean row mask per class; statistics are stored by class POSITION,
    # never by class label (labels such as -1.0 / 1.0 are not valid indices).
    members = [y == cl for cl in classes]
    mean_cl = np.array([np.mean(X[rows], axis=0) for rows in members])
    var_cl = np.array([np.var(X[rows], axis=0) for rows in members])

    # Sum of squared differences between the means of every pair of classes.
    smeans = np.zeros(X.shape[1])
    for i, j in combinations(range(n_classes), 2):
        smeans += (mean_cl[i] - mean_cl[j]) ** 2

    if gamma != 2:
        svar = np.sum(var_cl, axis=0)
        # float(): keep true division even for an integer gamma on Python 2.
        weights = ((gamma - 2.) * svar
                   + float(gamma) / (n_classes - 1) * smeans)
    else:
        weights = smeans

    if norm == "t-test":
        # Pooled variance of the two classes (equal-variance case).
        n1, n2 = (int(np.sum(rows)) for rows in members)
        tnorm = (((n1 - 1) * var_cl[0] + (n2 - 1) * var_cl[1])
                 / float(n1 + n2 - 2))
        weights = weights / tnorm
        weights[weights < 0] = 0
    else:
        weights[weights < 0] = 0
        if norm == "unity":
            weights = weights / np.var(X, axis=0)

    unit_weights = weights / np.sqrt(np.sum(weights ** 2))
    if distance:
        return unit_weights, squareform(pdist(X * np.sqrt(weights)))
    return unit_weights

In [86]:
# Load the pickled training and test tables.
# The commented-out alternative read '../Sequencias/DadosTreinoFeaturizados' and
# '../Sequencias/DadosTesteFeaturizados'; this cell reads the "Cru" files instead,
# but still stores them under the "...Featurizados" variable names.
# NOTE(review): pickle.load can execute code embedded in the file - only open
# pickle files that come from a trusted source.

with open('../Sequencias/DadosTreinoCru', 'rb') as fp:
        DadosTreinoFeaturizados = pickle.load(fp)
with open('../Sequencias/DadosTesteCru', 'rb') as fp:
        DadosTesteFeaturizados = pickle.load(fp)

In [87]:
# Sanity check: print the shape of the loaded training and test tables.
print(DadosTreinoFeaturizados.shape)
print(DadosTesteFeaturizados.shape)

(3600, 401)
(400, 401)


In [88]:
# Preview the first three rows of the training table (features plus the 'Classe' label).
DadosTreinoFeaturizados.head(n=3)

Unnamed: 0,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AQ,AP,AS,AR,AT,AW,AV,AY,VK,Classe
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
1,0.0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,5.26,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,...,0.0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,0.0,-1.0


In [89]:
# Preview the first three rows of the test table (features plus the 'Classe' label).
DadosTesteFeaturizados.head(n=3)

Unnamed: 0,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AQ,AP,AS,AR,AT,AW,AV,AY,VK,Classe
0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,-1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.53,0.0,-1.0
2,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Run 

In [93]:
# Split each table into its label vector (the 'Classe' column) and its feature frame.
y_treino = np.array(DadosTreinoFeaturizados['Classe'])
y_teste = np.array(DadosTesteFeaturizados['Classe'])

# `axis` is passed by keyword: the positional form drop(labels, 1) was removed
# in pandas 2.0, while the keyword form works on old and new versions alike.
X_treino = DadosTreinoFeaturizados.drop(['Classe'], axis=1)
X_teste = DadosTesteFeaturizados.drop(['Classe'], axis=1)

# Independent NumPy copy of the training features; this is the matrix that the
# later cells scale/weight and export, while X_treino is the input to suvrel.
X_treino_efetivo = np.copy(X_treino)

In [94]:
# Normalization and Suvrel feature weighting.

normalizar = False   # standardize the features with StandardScaler (fit on the training set)
usarsuvrel = True    # multiply every feature by the square root of its Suvrel weight

# Rebuild the inputs from the raw tables so that this cell is idempotent:
# without this, running the cell a second time would scale / weight the
# already transformed X_treino_efetivo and X_teste again.
X_treino = DadosTreinoFeaturizados.drop(['Classe'], axis=1)
X_teste = DadosTesteFeaturizados.drop(['Classe'], axis=1)
X_treino_efetivo = np.copy(X_treino)

if normalizar:

    # The scaler is fitted on the training features only and then applied to
    # the effective training matrix and to the test features.
    scaler = StandardScaler()
    X_treino = scaler.fit_transform(X_treino)
    X_treino_efetivo = scaler.transform(X_treino_efetivo)
    X_teste = scaler.transform(X_teste)

if usarsuvrel:

    # Weights are estimated from the training data only; the square root is
    # taken so that squared Euclidean distances end up scaled by the weights.
    w = np.sqrt(suvrel(X=X_treino, y=y_treino))

    X_treino_efetivo = w * X_treino_efetivo

    X_teste = w * X_teste

In [95]:
# Preview the first five rows of X_treino (the matrix handed to suvrel).
pd.DataFrame(X_treino).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,-0.13244,-0.294863,-0.277945,-0.307749,-0.252616,-0.199482,-0.230176,-0.200524,-0.312761,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,-0.324893,-0.142907,-0.346564,-0.234359,-0.253167
1,-0.13244,-0.294863,-0.277945,-0.307749,3.355671,-0.199482,-0.230176,-0.200524,2.673496,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,-0.324893,-0.142907,-0.346564,-0.234359,-0.253167
2,-0.13244,-0.294863,-0.277945,-0.307749,-0.252616,-0.199482,-0.230176,-0.200524,2.673496,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,2.446391,-0.142907,-0.346564,-0.234359,-0.253167
3,-0.13244,-0.294863,-0.277945,-0.307749,-0.252616,-0.199482,-0.230176,4.548321,-0.312761,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,-0.324893,-0.142907,-0.346564,-0.234359,-0.253167
4,-0.13244,-0.294863,-0.277945,-0.307749,-0.252616,-0.199482,-0.230176,-0.200524,-0.312761,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,-0.324893,-0.142907,-0.346564,-0.234359,-0.253167


In [96]:
# Preview the first five rows of X_treino_efetivo (the training matrix that gets exported).
pd.DataFrame(X_treino_efetivo).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,-0.13244,-0.294863,-0.277945,-0.307749,-0.252616,-0.199482,-0.230176,-0.200524,-0.312761,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,-0.324893,-0.142907,-0.346564,-0.234359,-0.253167
1,-0.13244,-0.294863,-0.277945,-0.307749,3.355671,-0.199482,-0.230176,-0.200524,2.673496,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,-0.324893,-0.142907,-0.346564,-0.234359,-0.253167
2,-0.13244,-0.294863,-0.277945,-0.307749,-0.252616,-0.199482,-0.230176,-0.200524,2.673496,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,2.446391,-0.142907,-0.346564,-0.234359,-0.253167
3,-0.13244,-0.294863,-0.277945,-0.307749,-0.252616,-0.199482,-0.230176,4.548321,-0.312761,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,-0.324893,-0.142907,-0.346564,-0.234359,-0.253167
4,-0.13244,-0.294863,-0.277945,-0.307749,-0.252616,-0.199482,-0.230176,-0.200524,-0.312761,-0.213556,...,-0.280788,-0.277339,-0.304624,-0.356995,-0.291341,-0.324893,-0.142907,-0.346564,-0.234359,-0.253167


In [12]:
if True: # Export to libsvm (svmlight) format; the guard is always true, set it to False to skip the export.
    
    from sklearn.datasets import dump_svmlight_file
    
    # Training set: effective (scaled / weighted) features with their labels.
    # zero_based=False asks for one-based column indices in the written file.
    # NOTE(review): the file names say "StandarScaler", but the weighting cell
    # shown in this notebook has normalizar = False - confirm the names still
    # describe the exported data.
    dump_svmlight_file(X_treino_efetivo, y_treino, 'libsvm-3.23/tools/DadosTreinoFeatlibsvmStandarScaler.dat',
                       zero_based=False, multilabel=False)
    
    # Test set, written with the same options.
    dump_svmlight_file(X_teste, y_teste, 'libsvm-3.23/tools/DadosTesteFeatlibsvmStandarScaler.dat',
                       zero_based=False, multilabel=False)

In [13]:
# Prints a fixed marker; presumably used to confirm that the export cell finished - TODO confirm.
print("OK.")

OK.


# Checagem se os treinos são iguais

In [97]:
# Compare X_treino_efetivo with X_treino element by element (absolute tolerance 0.1).
# Both are converted to plain arrays so that positional [row, column] indexing
# works even when X_treino is still a DataFrame.
treino_array = np.asarray(X_treino)
efetivo_array = np.asarray(X_treino_efetivo)
mismatch = ~np.isclose(efetivo_array, treino_array, atol=0.1)

# Number of entries that differ.
print(np.sum(mismatch))

# Show the first differing entry from both matrices; skipped when the
# matrices agree everywhere (indexing an empty `diffs` would raise IndexError).
diffs = np.argwhere(mismatch)
if len(diffs) > 0:
    print(treino_array[diffs[0, 0], diffs[0, 1]])
    print(efetivo_array[diffs[0, 0], diffs[0, 1]])

0

# Checagem da multiplicação matricial

In [61]:
# Rebuild the unweighted training features from the raw table for the check below.
# `axis` is passed by keyword because pandas 2.0 removed the positional form.
X_treino = DadosTreinoFeaturizados.drop(['Classe'], axis=1)

In [62]:
# Preview the first four rows of the unweighted training features.
pd.DataFrame(X_treino).head(n=4)

Unnamed: 0,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AN,AQ,AP,AS,AR,AT,AW,AV,AY,VK
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,5.26,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,...,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Multiply the training features by w and preview the first four rows.
# Assumes w still holds the square-rooted Suvrel weights computed in the weighting cell - TODO confirm after re-runs.
pd.DataFrame(X_treino*w).head(n=4)

Unnamed: 0,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AN,AQ,AP,AS,AR,AT,AW,AV,AY,VK
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.628317,0.0,0.0,0.0,0.982132,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.982132,0.0,...,0.0,0.0,0.0,0.0,0.0,1.570381,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.470734,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
# Spot-check one column against the full product: the position of 'GG' is
# looked up from the column index instead of hard-coding 8, so the check stays
# valid if the column order changes.
pd.DataFrame(X_treino['GG'] * w[X_treino.columns.get_loc('GG')]).head(n=4)

Unnamed: 0,GG
0,0.0
1,0.982132
2,0.982132
3,0.0
