# i) Regra da soma

In [14]:
import os
import numpy as np
import pandas as pd
import clustering

## Datasets

In [15]:
DATA_BASE_PATH = "./data"

FAC_FILE = os.path.join(DATA_BASE_PATH, "mfeat-fac")
FOU_FILE = os.path.join(DATA_BASE_PATH, "mfeat-fou")
KAR_FILE = os.path.join(DATA_BASE_PATH, "mfeat-kar")

fac = np.loadtxt(FAC_FILE, dtype=int)
fou = np.loadtxt(FOU_FILE, dtype=float)
kar = np.loadtxt(KAR_FILE, dtype=float)

## Importando melhor resultado

In [16]:
best_result = clustering.import_best_result("data/melhor_resultado_todas.pickle")
partition, y_true = clustering.get_hard_patitions(best_result["membership_degree"])


## Probabilidade à priori das classes

In [17]:
Pw = np.array([len(c)/2000 for c in partition])
Pw

array([0.1175, 0.116 , 0.1355, 0.1465, 0.0405, 0.01  , 0.1305, 0.151 ,
       0.0815, 0.071 ])

## Função de densidade

In [18]:
def calc_gaussian_density_prob(xk, d, means, var, cov_matrix):
    coef = np.power(2*np.pi, -d/2) 
    inv_cov_matrix = np.linalg.inv(cov_matrix)
    (sign, logdet) = np.linalg.slogdet(inv_cov_matrix)
    sqrt_det_inv_cov = np.sqrt(sign*np.exp(logdet))
    diff = xk - means
    exp_exp = np.dot((-1/2)*np.dot(diff.T, inv_cov_matrix), diff)
    exp_func = np.exp(exp_exp)
    
    return coef * sqrt_det_inv_cov * exp_func

In [19]:

def calc_gaussian_bayesian_data(x, partition):
    n,d = x.shape
    qtd_w = len(partition)
    means = np.array([x[idxs].mean(axis=0) for idxs in partition])
    var = np.array([((x[idxs]-means[i])**2).mean(axis=0) for i, idxs in enumerate(partition)])
    cov_matrix = [np.zeros((d,d)) for _ in range(qtd_w)]

    for i in range(qtd_w):
        np.fill_diagonal(cov_matrix[i], var[i])
                
    p_x_w = np.empty((n, qtd_w))
    
    for k in range(n):
        for i in range(qtd_w):
            p_x_w[k, i] = calc_gaussian_density_prob(x[k], d, means[i], var[i], cov_matrix[i])   
    
    return p_x_w

def calc_prob_posteriori(p_x_w, Pw):
    qtd_x, qtd_w = p_x_w.shape 
    p_w_x = np.empty((qtd_x, qtd_w))
    
    for k in range(qtd_x):
        for i in range(qtd_w):
            sum_all = np.dot(p_x_w[k], Pw)
            p_w_x[k,i] = (p_x_w[k,i] * Pw[i])/sum_all
    
    y_pred = p_w_x.argmax(axis = 1)
    
    return y_pred, p_w_x


## Densidades por dataset

In [20]:
fac_byn_density_probs = calc_gaussian_bayesian_data(fac, partition)
fou_byn_density_probs = calc_gaussian_bayesian_data(fou, partition)
kar_byn_density_probs = calc_gaussian_bayesian_data(kar, partition)

In [21]:
fac_byn_density_probs[0,0], fou_byn_density_probs[0,0], kar_byn_density_probs[0,0]

(0.0, 3.243154665015897e+60, 7.254547591145725e-49)

## Prob. à priori por view

In [22]:
fac_byn_y_pred, fac_byn_posteriori_probs = calc_prob_posteriori(fac_byn_density_probs, Pw)
fou_byn_y_pred, fou_byn_posteriori_probs = calc_prob_posteriori(fou_byn_density_probs, Pw)
kar_byn_y_pred, kar_byn_posteriori_probs = calc_prob_posteriori(kar_byn_density_probs, Pw)




## Regra da soma

Precisei tirar o fac porque seus valores são nulos. TENTAR CORRIGIR ISSO

In [23]:
def regra_soma_padrao(fac_p_w_x, fou_p_w_x, kar_p_w_x, Pw):
    x_sum_w = np.empty((10, 2000))
    
    for i in range(10):
        for k in range(2000):
            views_sum =  fou_p_w_x[i,k] + kar_p_w_x[i,k]
            # views_sum =  fac_p_w_x[i,k] fou_p_w_x[i,k] + kar_p_w_x[i,k]
            x_sum_w[i,k] = (1-2)*Pw[i] + views_sum
            
    y_pred = x_sum_w.argmax(axis = 0) 
    return y_pred

## Estimador do scikit

In [24]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

class ClassificaforBayesiano(BaseEstimator, ClassifierMixin):
    def __init__(self, partition, Pw):
        self.partition = partition
        self.Pw = Pw
    
    def fit(self, X, y):
        X, y = check_X_y(X, y)
         
        self.classes_ = unique_labels(y)
        self._fit_gaussian_bayesian_data(X)
        self.X_ = X
        self.y_ = y
        return self
    
    def _fit_gaussian_bayesian_data(self, X):
        n, d = X.shape
        qtd_w = len(self.partition)
        self.means = np.array([X[idxs].mean(axis=0) for idxs in self.partition])
        self.var = np.array([((X[idxs]-self.means[i])**2).mean(axis=0) for i, idxs in enumerate(self.partition)])
        self.cov_matrix = [np.zeros((d,d)) for _ in range(qtd_w)]

        for i in range(qtd_w):
            np.fill_diagonal(self.cov_matrix[i], self.var[i])

        return self
    
    def _calc_gaussian_density_prob(self, xk, cls):
        d = xk.shape[0]
        coef = np.power(2*np.pi, -d/2)
        inv_cov_matrix = np.linalg.inv(self.cov_matrix[cls])
        (sign, logdet) = np.linalg.slogdet(inv_cov_matrix)
        sqrt_det_inv_cov = np.sqrt(sign*np.exp(logdet))
        diff = xk - self.means[cls]
        exp_exp = np.dot((-1/2)*np.dot(diff.T, inv_cov_matrix), diff)
        exp_func = np.exp(exp_exp)

        return coef * sqrt_det_inv_cov * exp_func

    def predict_proba(self, X):
        check_is_fitted(self)
        X = check_array(X)
        
        desity_probs = np.empty((X.shape[0], len(self.classes_)))
        for k in range(desity_probs.shape[0]):
            for j in range(len(self.classes_)):
                desity_probs[k,j] = self._calc_gaussian_density_prob(X[k], j)
        
        _, post_probs = calc_prob_posteriori(desity_probs, self.Pw)
        
        return post_probs
        
        

In [25]:
class RegraSomaClasificadorBayesiano(BaseEstimator, ClassifierMixin):
    def __init__(self, partition, Pw):
#         self.views = views
        self.partition = partition
        self.Pw = Pw
        self.clfs = []        
    
    def fit(self, X, y):
#         X, y = check_X_y(X, y)
         
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y
        
        for x in X:
            clf = ClassificaforBayesiano(self.partition, Pw)
            clf.fit(x,y)
            self.clfs.append(clf)
            
        return self
    
    def predict(self, X):
        assert len(X) == len(self.clfs)
        # Check is fit had been called
        check_is_fitted(self)

        # Input validation
#         X = check_array(X)
        
        post_probs = [clf.predict_proba(x) for clf, x in zip(self.clfs, X)]
        
        return self.regra_soma(post_probs, Pw=self.Pw)
    
    def get_params(self, deep=True):
        return {"Pw": self.Pw, "partition": self.partition}
    
    def regra_soma(self, matrizes, Pw):
        qtd_x = matrizes[0].shape[0]
        qtd_w = len(Pw)
        x_sum_w = np.empty((qtd_x, qtd_w))
        
        for i in range(qtd_x):
            for k in range(qtd_w):
                views_sum = sum([v[i,k] for v in matrizes])
                # views_sum =  fac_p_w_x[i,k] fou_p_w_x[i,k] + kar_p_w_x[i,k]
                x_sum_w[i,k] = (1-len(matrizes))*Pw[k] + views_sum
                
        y_pred = x_sum_w.argmax(axis = 1) 
        return y_pred

# def regra_soma_padrao(fac_p_w_x, fou_p_w_x, kar_p_w_x, Pw):
# x_sum_w = np.empty((10, 2000))

# for i in range(10):
#     for k in range(2000):
#         views_sum =  fou_p_w_x[i,k] + kar_p_w_x[i,k]
#         # views_sum =  fac_p_w_x[i,k] fou_p_w_x[i,k] + kar_p_w_x[i,k]
#         x_sum_w[i,k] = (1-2)*Pw[i] + views_sum

# y_pred = x_sum_w.argmax(axis = 0) 
# return y_pred

## Validação cruzada

In [33]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

RANDOM_SEED = 42
FOLDS = 10
REPEATS = 30

def get_splited_partition(idxs, y_true):
    partition = [[] for i in range(10)]
    
    for i,indice in enumerate(idxs):
        partition[y_true[indice]].append(i)
        
    return partition
    
cv = RepeatedStratifiedKFold(FOLDS, REPEATS, RANDOM_SEED)
acc_scores = np.empty((cv.get_n_splits(),))
f1_scores = np.empty((cv.get_n_splits(),))

#views = [fac, fou, kar]
views = [fou, kar]

for i, (train_index, test_index) in enumerate(cv.split(fac, y_true)):
    print(f"{i+1}/{cv.get_n_splits()}", end=" ")
    
    test_views = [v[test_index] for v in views]
    train_views = [v[train_index] for v in views]
    
    local_partition = get_splited_partition(train_index, y_true)
    clf = RegraSomaClasificadorBayesiano(local_partition, Pw)
    
    clf.fit(train_views, y_true[train_index])
    
    y_pred = clf.predict(test_views)
    
    acc_scores[i] = accuracy_score(y_true[test_index], y_pred)
    f1_scores[i] = f1_score(y_true[test_index], y_pred, average="macro") 
    
    if (i+1)%50 == 0:
        print
        print(f"\n\nAcurácia parcial: {acc_scores[:i+1].mean()} +/- ({acc_scores[:i+1].std()})")
        print(f"Medida-F parcial: {f1_scores[:i+1].mean()} +/- ({f1_scores[:i+1].std()})\n")


1/300 2/300 3/300 4/300 5/300 6/300 7/300 8/300 9/300 10/300 11/300 12/300 13/300 14/300 15/300 16/300 17/300 18/300 19/300 20/300 21/300 22/300 23/300 24/300 25/300 26/300 27/300 28/300 29/300 30/300 31/300 32/300 33/300 34/300 35/300 36/300 37/300 38/300 39/300 40/300 41/300 42/300 43/300 44/300 45/300 46/300 47/300 48/300 49/300 50/300 

Acurácia parcial: 0.7238 +/- (0.027010368379568616)
Medida-F parcial: 0.6575911422781199 +/- (0.031279227732342436)

51/300 52/300 53/300 54/300 55/300 56/300 57/300 58/300 59/300 60/300 61/300 62/300 63/300 64/300 65/300 66/300 67/300 68/300 69/300 70/300 71/300 72/300 73/300 74/300 75/300 76/300 77/300 78/300 79/300 80/300 81/300 82/300 83/300 84/300 85/300 86/300 87/300 88/300 89/300 90/300 91/300 92/300 93/300 94/300 95/300 96/300 97/300 98/300 99/300 100/300 

Acurácia parcial: 0.7235 +/- (0.026091186251299504)
Medida-F parcial: 0.6583459406076162 +/- (0.030174762922527264)

101/300 102/300 103/300 104/300 105/300 106/300 107/300 108/300 109/30

# K-Vizinhos

## Normalizando 


In [36]:
from sklearn.preprocessing import minmax_scale

fac_norm = minmax_scale(fac)
fou_norm = minmax_scale(fou)
kar_norm = minmax_scale(kar)

## Distâncias entre os elementos

In [37]:
from sklearn.metrics.pairwise import euclidean_distances

fac_dist = euclidean_distances(fac_norm, fac_norm)
fou_dist = euclidean_distances(fou_norm, fou_norm)
kar_dist = euclidean_distances(kar_norm, kar_norm)

## Função de densidade

In [96]:
def calc_knn_density_prob(view_dists, k, Pw):
    qtd_x = view_dists.shape[0]
    qtd_w = len(Pw)
    
    p_x_w = np.empty((qtd_x, qtd_w))
    k_vizinhos = np.empty((qtd_x, k))
    kvizihos_temp = view_dists.argsort(axis=1)[:,:k+1]
    
    
    for i in range(qtd_x):
        k_vizinhos[i,:] = np.delete(kvizihos_temp[i], 0)

    del kvizihos_temp
    
    for j in range(qtd_x):
        for i in range(qtd_w):
            #p_w_x[j,i] = (((k_vizinhos[j] == i).sum()/len(partition[i]))/V)*Pw[i]
#             print("Vizinhos na classe: ", (k_vizinhos[j] == i).sum())
            p_x_w[j,i] = (k_vizinhos[j] == i).sum()/k
                
    #y_pred = p_x_w.argmax(axis = 0)
    y_pred = p_x_w.argmax(axis = 1)
    
    return y_pred, p_x_w
    

## Desidades por dataset

In [97]:
_, fac_knn_density_probs = calc_knn_density_prob(fac_dist, 15, Pw)
_, fou_knn_density_probs = calc_knn_density_prob(fou_dist, 15, Pw)
_, kar_knn_density_probs = calc_knn_density_prob(kar_dist, 15, Pw)


In [90]:
pd.DataFrame(fac_knn_density_probs).to_csv(f"data/fac_knn_density_probs.csv", decimal=",", index=False, header=False)

## Prob. à posteriori por view

In [98]:
def calc_prob_posteriori_knn(fac_knn_density_probs):
    pass
    
fac_knn_y_pred, fac_knn_posteriori_probs = calc_prob_posteriori(fac_knn_density_probs, Pw)
fou_knn_y_pred, fou_knn_posteriori_probs = calc_prob_posteriori(fou_knn_density_probs, Pw)
kar_knn_y_pred, kar_knn_posteriori_probs = calc_prob_posteriori(kar_knn_density_probs, Pw)



## Regra da soma

In [99]:
def regra_soma_knn(fac_p_w_x, fou_p_w_x, kar_p_w_x, Pw):
    x_sum_w = np.empty((2000, 10))
    
    for k in range(2000):
        for i in range(10):
            views_sum =  fou_p_w_x[k,i] + kar_p_w_x[k,i]
            #views_sum =  fac_p_w_x[i,k] + fou_p_w_x[i,k] + kar_p_w_x[i,k]
            x_sum_w[k,i] = (1-2)*Pw[i] + views_sum
    y_pred = x_sum_w.argmax(axis = 1) 
    return y_pred

In [100]:
y_pred_knn_all = regra_soma_knn(fac_knn_posteriori_probs, 
                                   fou_knn_posteriori_probs, 
                                   kar_knn_posteriori_probs, Pw)

# y_pred_knn_all = regra_soma_padrao([fac_knn_posteriori_probs, 
#                                    fou_knn_posteriori_probs, 
#                                    kar_knn_posteriori_probs], Pw)

print("Acurácia: ", accuracy_score(y_true, y_pred_knn_all))
print("Medida-F: ", f1_score(y_true, y_pred_knn_all, average="macro"))

Acurácia:  0.0985
Medida-F:  0.01793354574419663


Acurácia:  0.01
Medida-F:  0.0020554984583761563


# Janela de Parzen

## Função de densidade

In [None]:
# def parzen_density_function(view, h, partition):
#     p_x_w = np.empty((10, 2000))
#     dims = view.shape[1]
#     elements_in_window = np.empty((10, dims))
    
#     def K(x):
#         return np.exp(-x**2/2)/np.sqrt(2*np.pi)

#     for i in range(10):
#         n = len(partition[i])
#         for k in range(2000):
#             p = np.array([np.array([K((view[k,j] - view[e,j])/h) for j in range(dims)]).prod() 
#                  for e in partition[i]]).sum()
            
#             p_x_w[i,k] = (1/(n*h**dims))
            
#     return p_x_w

def parzen_density_function(view, h, partition):
    p_x_w = np.empty((2000, 10))
    dims = view.shape[1]
    
    for i in range(10):
        n = len(partition[i])
        x_view = view[partition[i],:]
        
        for k in range(2000):
            diff = (view[k] - x_view)/h
            gaussian_diff = np.exp(-diff**2/2)/np.sqrt(2*np.pi)
            prod_dims = gaussian_diff.prod(axis=0)
            p_x_w[k,i] = (1/(n*h**dims)) * prod_dims.sum()
            print(p_x_w[k,i])
            
    return p_x_w

In [None]:
fac_parzen_density_probs = parzen_density_function(fac, 50, partition)
fou_parzen_density_probs = parzen_density_function(fou, 50, partition)
kar_parzen_density_probs = parzen_density_function(kar, 50, partition)

## Probabilidade a priori

In [None]:
fac_prazen_y_pred, fac_prazen_posteriori_probs = calc_prob_posteriori(fac_parzen_density_probs, Pw)
fou_prazen_y_pred, fou_prazen_posteriori_probs = calc_prob_posteriori(fou_parzen_density_probs, Pw)
fou_prazen_y_pred, kar_prazen_posteriori_probs = calc_prob_posteriori(kar_parzen_density_probs, Pw)

In [None]:
y_prazen_pred_all = regra_soma_padrao(fac_prazen_posteriori_probs, 
                                    fou_prazen_posteriori_probs, 
                                    kar_prazen_posteriori_probs, Pw)

In [None]:
set(y_prazen_pred_all)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

y_true = clustering.get_instances_class()

print("Acurácia: ", accuracy_score(y_true, y_prazen_pred_all))
print("Medida-F: ", f1_score(y_true, y_prazen_pred_all, average="macro"))