In [1]:
########################################################################################################
#Tiago Tambonis - 2017 - 2018 - 2019 
#Objetivo: aplicar Suvrel
########################################################################################################

In [2]:
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from scipy.spatial.distance import squareform, pdist
import pandas as pd

In [3]:
## Função Suvrel.

def suvrel(X, y, gamma=2.0, norm=None, distance=False):
    """
    Return: a metric tensor for the data
    X columns representing samples and lines dimentions
    y labels
    gamma is a float
    norm:{None,\"unity\",\"t-test\"}
    distance: {False, True} if True return a tuple (weights, D)
    where D is the distanca matrix of the data
    for the geometric approach method
    """

    classes = list(set(y))
    n_classes = len(classes)
    dim = X.shape[1]

    if norm is None or norm == "unity":
        mean_cl = np.zeros((n_classes, dim))
        for i, cl in enumerate(classes):
            mean_cl[i] = np.mean(X[y == cl], axis=0)

        smeans = np.zeros(dim)
        for i, j in combinations(range(n_classes), 2):
            smeans += (mean_cl[i] - mean_cl[j]) ** 2

        if gamma != 2:
            var_cl = np.zeros((n_classes, dim))
            for cl in classes:
                var_cl[cl] = np.var(X[y == cl], axis=0)
            svar = np.sum(var_cl, axis=0)
            weights = ((gamma - 2.) * svar 
                        +  gamma /( n_classes - 1) * smeans)
        else:
            weights = smeans

        weights[weights < 0] = 0

        if norm is "unity":
            weights = weights / np.var(X, axis=0)

        if distance:
            return (weights / np.sqrt(np.sum(weights ** 2)),
                    squareform(pdist(X * np.sqrt(weights))))
        else:
            return weights / np.sqrt(np.sum(weights ** 2))

    elif norm == "t-test":
        if n_classes == 2:
            mean_cl = np.zeros((n_classes, dim))
            var_cl = np.zeros((n_classes, dim))
            for i, cl in enumerate(classes):
                mean_cl[i] = np.mean(X[y == cl], axis=0)
                var_cl[i] = np.var(X[y == cl], axis=0)

            for i, j in combinations(range(n_classes), 2):
                smeans = (mean_cl[i] - mean_cl[j]) ** 2
                #tnorm = (var_cl[i] / np.sum([y == classes[i]])
                         #+ var_cl[j] / np.sum([y == classes[j]]))

                # case with equal variance. Edited by Marcelo 21/10/13
                n1 = np.sum([y == classes[i]])
                n2 = np.sum([y == classes[j]])
                tnorm = ((n1 - 1) * var_cl[i] + (n2 - 1) * var_cl[j]) \
                    / (n1 + n2 - 2)
            if gamma != 2:
                svar = np.sum(var_cl, axis=0)
                weights = ((gamma - 2.) * svar 
                            +  gamma /( n_classes - 1) * smeans)
            else:
                weights = smeans
            weights = weights / tnorm
            weights[weights < 0] = 0

            if distance:
                return (weights / np.sqrt(np.sum(weights ** 2)),
                        squareform(pdist(X * np.sqrt(weights))))
            else:
                return weights / np.sqrt(np.sum(weights ** 2))

        else:
            print ("error: for t-test normalization the number" +
                   " of classes must be equal 2")
            return None
    else:
        print "error: norm options are None, \"unity\" and  \"t-test\""
    return None

In [34]:
#Carregar dados processados 

with open('Dados/DadosTreino', 'rb') as fp:
        DadosTreino = pickle.load(fp)

with open('Dados/DadosTeste', 'rb') as fp:
        DadosTeste = pickle.load(fp)
        
DadosTreino_backup = DadosTreino

In [35]:
print(DadosTreino.shape)
print(DadosTeste.shape)

(14103, 255)
(1568, 255)


In [36]:
#Dividir classes e características

y_treino = np.array(DadosTreino['Classe'])           
y_teste = np.array(DadosTeste['Classe'])                         

X_treino = DadosTreino.drop(['Classe'], 1)
X_teste = DadosTeste.drop(['Classe'], 1)
X_treino_efetivo = np.copy(X_treino)

In [37]:
#Normalização e featurização. Dependendo do caso analisasdo, alterações devem ser feitas.

normalizar = True
usarsuvrel = True

if normalizar:

        scaler = StandardScaler()
        X_treino = scaler.fit(X_treino).transform(X_treino)
        #X_treino_efetivo = scaler.transform(X_treino_efetivo)

if usarsuvrel: 

    w = suvrel(X=X_treino, y=y_treino)
    w = np.sqrt(w)

    X_treino_efetivo = w*X_treino_efetivo

    X_teste = w*X_teste

In [38]:
pd.DataFrame(X_treino).head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,244,245,246,247,248,249,250,251,252,253
0,-0.068075,-0.312495,-0.248336,-0.269641,-0.191858,-0.069606,-0.269049,-0.173068,-0.139003,-0.334094,...,-0.131572,-0.228883,-0.154133,-0.067103,-0.070627,-0.163871,-0.165639,-0.330006,-0.136812,-0.310727
1,-0.068075,-0.312495,-0.248336,-0.269641,-0.191858,-0.069606,-0.269049,5.666007,-0.139003,-0.334094,...,-0.131572,-0.228883,-0.154133,-0.067103,-0.070627,-0.163871,-0.165639,-0.330006,-0.136812,-0.310727
2,-0.068075,-0.312495,-0.248336,-0.269641,-0.191858,-0.069606,-0.269049,5.666007,-0.139003,-0.334094,...,-0.131572,-0.228883,-0.154133,-0.067103,-0.070627,-0.163871,-0.165639,-0.330006,-0.136812,-0.310727


In [39]:
pd.DataFrame(X_treino_efetivo).head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,244,245,246,247,248,249,250,251,252,253
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001337,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001337,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
#Salvar

DadosTreino = pd.DataFrame(np.column_stack((X_treino_efetivo, y_treino)), 
                              columns=DadosTreino.columns)

DadosTeste = pd.DataFrame(np.column_stack((X_teste, y_teste)), 
                              columns=DadosTeste.columns)

#Salvar tabela de dados principal 

with open("Dados/DadosTreino", "wb") as fp:   #Pickling
    pickle.dump(DadosTreino, fp)   
with open("Dados/DadosTeste", "wb") as fp:   #Pickling
    pickle.dump(DadosTeste, fp) 

In [41]:
DadosTreino_backup.head() #Antes da aplciação Suvrel

Unnamed: 0,WC,EE,AN,VG,QP,WM,VP,IY,AW,LA,...,NP,SH,CM,CW,YQ,AM,LS,WL,SA,Classe
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010589,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010589,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012118,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [43]:
DadosTreino.head() #Depois so Suvrel

Unnamed: 0,WC,EE,AN,VG,QP,WM,VP,IY,AW,LA,...,NP,SH,CM,CW,YQ,AM,LS,WL,SA,Classe
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001337,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001337,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [47]:
#Teste 
print(w[7]*DadosTreino_backup['IY'].head())
print(w[7]*DadosTreino_backup['WL'].head())

0    0.000000
1    0.001337
2    0.001337
3    0.000000
4    0.000000
Name: IY, dtype: float64
0    0.00000
1    0.00000
2    0.00000
3    0.00153
4    0.00000
Name: WL, dtype: float64


In [13]:
print("OK.")

OK.
