In [1]:
########################################################################################################################
#Naive Bayes
#Tiago Tambonis
#14/02/19

In [2]:
import pickle
import numpy as np
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import squareform, pdist
from itertools import combinations
import pandas as pd 

In [3]:
## Função Suvrel.

def _suvrel_output(X, weights, distance):
    """Normalise `weights` to unit L2 norm and optionally attach the distance matrix."""
    unit_weights = weights / np.sqrt(np.sum(weights ** 2))
    if distance:
        # The pairwise distances use the *unnormalised* weights, as a
        # per-feature rescaling of X by sqrt(weights).
        return unit_weights, squareform(pdist(X * np.sqrt(weights)))
    return unit_weights


def suvrel(X, y, gamma=2.0, norm=None, distance=False):
    """
    Compute per-feature relevance weights (a diagonal metric tensor) for
    labelled data.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Rows are samples and columns are features (dimensions): the code
        selects rows with ``X[y == cl]`` and uses ``X.shape[1]`` as the
        number of dimensions.
    y : array-like of length n_samples
        Class label of each row of X. Labels may be any values that can be
        compared with ``==`` (integers, strings, ...).
    gamma : float
        When equal to 2 the weights are the summed squared differences of
        the class means; otherwise ``(gamma - 2) * sum of class variances +
        gamma / (n_classes - 1) * summed squared mean differences``.
    norm : {None, "unity", "t-test"}
        None     -> no extra normalisation.
        "unity"  -> weights are divided by the overall per-feature variance.
        "t-test" -> weights are divided by the pooled variance of the two
                    classes (only valid for exactly two classes).
    distance : bool
        If True, return ``(weights, D)`` where D is the square matrix of
        pairwise distances between the rows of ``X * sqrt(weights)``.

    Returns
    -------
    weights : ndarray of shape (n_features,)
        Non-negative weights scaled to unit L2 norm, or the tuple
        ``(weights, D)`` when ``distance`` is True. Returns None (after
        printing an error message) when ``norm`` is not recognised or when
        "t-test" is requested with a number of classes different from 2.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # np.unique gives a deterministic class ordering; every quantity below is
    # symmetric in the classes, so the ordering does not affect the result.
    classes = np.unique(y)
    n_classes = len(classes)
    dim = X.shape[1]

    if norm is None or norm == "unity":
        mean_cl = np.zeros((n_classes, dim))
        for i, cl in enumerate(classes):
            mean_cl[i] = np.mean(X[y == cl], axis=0)

        # Sum of squared mean differences over every pair of classes.
        smeans = np.zeros(dim)
        for i, j in combinations(range(n_classes), 2):
            smeans += (mean_cl[i] - mean_cl[j]) ** 2

        if gamma != 2:
            var_cl = np.zeros((n_classes, dim))
            # Index rows by class *position*, not by the label value: labels
            # are not guaranteed to be 0..n_classes-1 (or even integers).
            for i, cl in enumerate(classes):
                var_cl[i] = np.var(X[y == cl], axis=0)
            svar = np.sum(var_cl, axis=0)
            weights = ((gamma - 2.) * svar
                       + gamma / (n_classes - 1) * smeans)
        else:
            weights = smeans

        weights[weights < 0] = 0

        # Equality, not identity: `is` on a string literal only works by
        # accident of interning.
        if norm == "unity":
            weights = weights / np.var(X, axis=0)

        return _suvrel_output(X, weights, distance)

    elif norm == "t-test":
        if n_classes != 2:
            print("error: for t-test normalization the number" +
                  " of classes must be equal 2")
            return None

        first = (y == classes[0])
        second = (y == classes[1])
        mean_cl = np.array([np.mean(X[first], axis=0),
                            np.mean(X[second], axis=0)])
        var_cl = np.array([np.var(X[first], axis=0),
                           np.var(X[second], axis=0)])

        smeans = (mean_cl[0] - mean_cl[1]) ** 2

        # Pooled variance (equal-variance case).
        # NOTE(review): np.var is called with its default ddof, while this
        # pooled formula is conventionally written with sample variances -
        # confirm this is intended.
        n1 = np.sum(first)
        n2 = np.sum(second)
        tnorm = ((n1 - 1) * var_cl[0] + (n2 - 1) * var_cl[1]) \
            / (n1 + n2 - 2)

        if gamma != 2:
            svar = np.sum(var_cl, axis=0)
            weights = ((gamma - 2.) * svar
                       + gamma / (n_classes - 1) * smeans)
        else:
            weights = smeans
        weights = weights / tnorm
        weights[weights < 0] = 0

        return _suvrel_output(X, weights, distance)

    else:
        print("error: norm options are None, \"unity\" and  \"t-test\"")
    return None

In [4]:
# Load data

# Single switch between the two stored variants of the data set.
# True  -> raw data        ('DadosTreinoCru' / 'DadosTesteCru')
# False -> featurized data ('DadosTreinoFeaturizados' / 'DadosTesteFeaturizados')
# Either variant is bound to the same two names, because the cells below
# always read `DadosTreinoFeaturizados` and `DadosTesteFeaturizados`.
usar_dados_crus = True

if usar_dados_crus:
    caminho_treino = '../Sequencias/DadosTreinoCru'
    caminho_teste = '../Sequencias/DadosTesteCru'
else:
    caminho_treino = '../Sequencias/DadosTreinoFeaturizados'
    caminho_teste = '../Sequencias/DadosTesteFeaturizados'

# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load files produced by this project.
with open(caminho_treino, 'rb') as fp:
    DadosTreinoFeaturizados = pickle.load(fp)
with open(caminho_teste, 'rb') as fp:
    DadosTesteFeaturizados = pickle.load(fp)

# Procura por meio de função

In [9]:
# Search over a relevance threshold: features whose Suvrel weight exceeds the
# threshold are kept, then GaussianNB and BernoulliNB are scored on the test set.

results = []      # GaussianNB test accuracy, one entry per threshold
resultsnb = []    # BernoulliNB test accuracy, one entry per threshold

# Everything up to the loop is independent of the threshold, so it is
# computed once instead of on every iteration.
y_treino = np.array(DadosTreinoFeaturizados['Classe'])
y_teste = np.array(DadosTesteFeaturizados['Classe'])

# `columns=` instead of a positional axis argument: recent pandas versions
# no longer accept `drop(labels, 1)`.
X_treino = np.array(DadosTreinoFeaturizados.drop(columns=['Classe']))
X_teste_bruto = np.array(DadosTesteFeaturizados.drop(columns=['Classe']))

# Normalization and featurization
normalizar = False
usarsuvrel = True

# Data used to *estimate* the weights. When `normalizar` is on, the weights
# come from the standardized training data but are still applied to the
# unscaled data below.
X_treino_pesos = X_treino
if normalizar:
    X_treino_pesos = StandardScaler().fit_transform(X_treino)

if usarsuvrel:
    w = suvrel(X=X_treino_pesos, y=y_treino)
else:
    # Uniform weights keep the feature selection below well defined
    # (previously `w` was simply undefined when the flag was off).
    w = np.ones(X_treino.shape[1])

X_treino_ponderado = w * X_treino
X_teste_ponderado = w * X_teste_bruto

for limiar in np.linspace(0.0005005005005005005, 0.35, 200):

    selecionadas = w > limiar
    if not selecionadas.any():
        # Thresholds are increasing, so no later threshold can select a
        # feature either; the classifiers cannot be fitted on zero features.
        break

    X_treino_efetivo = X_treino_ponderado[:, selecionadas]
    X_teste = X_teste_ponderado[:, selecionadas]

    bnb = BernoulliNB(binarize=0)
    bnb.fit(X_treino_efetivo, y_treino)

    gnb = GaussianNB()
    gnb.fit(X_treino_efetivo, y_treino)

    results.append(gnb.score(X_teste, y_teste))
    resultsnb.append(bnb.score(X_teste, y_teste))

results = np.array(results)
resultsnb = np.array(resultsnb)

# NOTE(review): the maximum is taken over scores measured on the test set,
# so it is an optimistic estimate of the accuracy at the best threshold.
print(np.max(results))
print(np.max(resultsnb))

0.5535714285714286
0.5561224489795918


In [6]:
# Bernoulli: for each binarization threshold, search over the Suvrel relevance
# threshold and print the best BernoulliNB test accuracy.

# None of the preprocessing depends on the two loop variables, so it is done
# once here instead of inside the nested loops.
y_treino = np.array(DadosTreinoFeaturizados['Classe'])
y_teste = np.array(DadosTesteFeaturizados['Classe'])

# `columns=` instead of a positional axis argument: recent pandas versions
# no longer accept `drop(labels, 1)`.
X_treino_bruto = np.array(DadosTreinoFeaturizados.drop(columns=['Classe']))
X_teste_bruto = np.array(DadosTesteFeaturizados.drop(columns=['Classe']))

# Normalization and featurization
normalizar = True
usarsuvrel = True

# Data used to *estimate* the weights (standardized when `normalizar` is on).
X_treino_pesos = X_treino_bruto
if normalizar:
    X_treino_pesos = StandardScaler().fit_transform(X_treino_bruto)

if usarsuvrel:
    w = suvrel(X=X_treino_pesos, y=y_treino)

    # The weights are applied to the unscaled data, which is then
    # standardized with statistics fitted on the weighted training set only.
    # A dedicated scaler is created here so this branch no longer depends on
    # `normalizar` having defined one.
    scaler = StandardScaler()
    X_treino_ponderado = scaler.fit_transform(w * X_treino_bruto)
    X_teste_ponderado = scaler.transform(w * X_teste_bruto)
else:
    # Uniform weights keep the feature selection below well defined
    # (previously `w` was simply undefined when the flag was off).
    w = np.ones(X_treino_bruto.shape[1])
    X_treino_ponderado = X_treino_bruto
    X_teste_ponderado = X_teste_bruto

limiares = np.linspace(0.0005005005005005005, 0.35, 100)

for j in np.linspace(0, 1, 10):

    resultsnb = []

    for limiar in limiares:

        selecionadas = w > limiar
        if not selecionadas.any():
            # Thresholds are increasing, so no later threshold can select a
            # feature either; the classifier cannot be fitted on zero features.
            break

        X_treino_efetivo = X_treino_ponderado[:, selecionadas]
        X_teste = X_teste_ponderado[:, selecionadas]

        bnb = BernoulliNB(binarize=j)
        bnb.fit(X_treino_efetivo, y_treino)

        resultsnb.append(bnb.score(X_teste, y_teste))

    resultsnb = np.array(resultsnb)

    # NOTE(review): the maximum is taken over scores measured on the test
    # set, so it is an optimistic estimate of the best accuracy.
    print(np.max(resultsnb))

0.5548469387755102
0.5548469387755102
0.5548469387755102
0.5548469387755102
0.5548469387755102
0.5548469387755102
0.5548469387755102
0.5548469387755102
0.5548469387755102
0.5548469387755102
