# i) Regra da soma

In [29]:
import os
import numpy as np
import pandas as pd
import clustering

## Datasets

In [30]:
DATA_BASE_PATH = "./data"

FAC_FILE = os.path.join(DATA_BASE_PATH, "mfeat-fac")
FOU_FILE = os.path.join(DATA_BASE_PATH, "mfeat-fou")
KAR_FILE = os.path.join(DATA_BASE_PATH, "mfeat-kar")

fac = np.loadtxt(FAC_FILE, dtype=int)
fou = np.loadtxt(FOU_FILE, dtype=float)
kar = np.loadtxt(KAR_FILE, dtype=float)

## Importando melhor resultado

In [31]:
best_result = clustering.import_best_result("data/melhor_resultado_todas.pickle")
partition, y_true = clustering.get_hard_patitions(best_result["membership_degree"])


## Probabilidade à priori das classes

In [32]:
Pw = np.array([len(c)/2000 for c in partition])
Pw

array([0.1175, 0.116 , 0.1355, 0.1465, 0.0405, 0.01  , 0.1305, 0.151 ,
       0.0815, 0.071 ])

## Função de densidade

In [33]:
def calc_gaussian_density_prob(xk, d, means, var, cov_matrix):
    coef = np.power(2*np.pi, -d/2) 
    inv_cov_matrix = np.linalg.inv(cov_matrix)
    (sign, logdet) = np.linalg.slogdet(inv_cov_matrix)
    sqrt_det_inv_cov = np.sqrt(sign*np.exp(logdet))
    diff = xk - means
    exp_exp = np.dot((-1/2)*np.dot(diff.T, inv_cov_matrix), diff)
    exp_func = np.exp(exp_exp)
    
    return coef * sqrt_det_inv_cov * exp_func

In [34]:

def calc_gaussian_bayesian_data(x, partition):
    n,d = x.shape
    qtd_w = len(partition)
    means = np.array([x[idxs].mean(axis=0) for idxs in partition])
    var = np.array([((x[idxs]-means[i])**2).mean(axis=0) for i, idxs in enumerate(partition)])
    cov_matrix = [np.zeros((d,d)) for _ in range(qtd_w)]

    for i in range(qtd_w):
        np.fill_diagonal(cov_matrix[i], var[i])
                
    p_x_w = np.empty((n, qtd_w))
    
    for i in range(qtd_w):
        for k in range(n):
            p_x_w[k, i] = calc_gaussian_density_prob(x[k], d, means[i], var[i], cov_matrix[i])   
    
    return p_x_w

def calc_prob_posteriori(p_x_w, Pw):
    qtd_w = min(*p_x_w.shape)
    qtd_x = max(*p_x_w.shape)
    p_w_x = np.empty((qtd_w, qtd_x))
    
    for i in range(qtd_w):
        for k in range(qtd_x):
            sum_all = np.dot(p_x_w[k], Pw)
            p_w_x[i,k] = (p_x_w[k, i] * Pw[i])/sum_all
    
    y_pred = p_w_x.argmax(axis = 0)
    
    return y_pred, p_w_x


## Densidades por dataset

In [35]:
fac_byn_density_probs = calc_gaussian_bayesian_data(fac, partition)
fou_byn_density_probs = calc_gaussian_bayesian_data(fou, partition)
kar_byn_density_probs = calc_gaussian_bayesian_data(kar, partition)

In [36]:
fac_byn_density_probs[0,0], fou_byn_density_probs[0,0], kar_byn_density_probs[0,0]

(0.0, 3.243154665015897e+60, 7.254547591145725e-49)

## Prob. à priori por view

In [37]:
fac_byn_y_pred, fac_byn_posteriori_probs = calc_prob_posteriori(fac_byn_density_probs, Pw)
fou_byn_y_pred, fou_byn_posteriori_probs = calc_prob_posteriori(fou_byn_density_probs, Pw)
kar_byn_y_pred, kar_byn_posteriori_probs = calc_prob_posteriori(kar_byn_density_probs, Pw)




## Regra da soma

Precisei tirar o fac porque seus valores são nulos. TENTAR CORRIGIR ISSO

In [38]:
def regra_soma_padrao(fac_p_w_x, fou_p_w_x, kar_p_w_x, Pw):
    x_sum_w = np.empty((10, 2000))
    
    for i in range(10):
        for k in range(2000):
            views_sum =  fou_p_w_x[i,k] + kar_p_w_x[i,k]
            # views_sum =  fac_p_w_x[i,k] fou_p_w_x[i,k] + kar_p_w_x[i,k]
            x_sum_w[i,k] = (1-2)*Pw[i] + views_sum
            
    y_pred = x_sum_w.argmax(axis = 0) 
    return y_pred

## Estimador do scikit

In [39]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

class ClassificaforBayesiano(BaseEstimator, ClassifierMixin):
    def __init__(self, partition, Pw):
        self.partition = partition
        self.Pw = Pw
    
    def fit(self, X, y):
        X, y = check_X_y(X, y)
         
        self.classes_ = unique_labels(y)
        self._fit_gaussian_bayesian_data(X)
        self.X_ = X
        self.y_ = y
        return self
    
    def _fit_gaussian_bayesian_data(self, X):
        n, d = X.shape
        qtd_w = len(self.partition)
        self.means = np.array([X[idxs].mean(axis=0) for idxs in self.partition])
        self.var = np.array([((X[idxs]-self.means[i])**2).mean(axis=0) for i, idxs in enumerate(self.partition)])
        self.cov_matrix = [np.zeros((d,d)) for _ in range(qtd_w)]

        for i in range(qtd_w):
            np.fill_diagonal(self.cov_matrix[i], self.var[i])

        return self
    
    def _calc_gaussian_density_prob(self, xk, cls):
        d = xk.shape[0]
        coef = np.power(2*np.pi, -d/2)
        inv_cov_matrix = np.linalg.inv(self.cov_matrix[cls])
        (sign, logdet) = np.linalg.slogdet(inv_cov_matrix)
        sqrt_det_inv_cov = np.sqrt(sign*np.exp(logdet))
        diff = xk - self.means[cls]
        exp_exp = np.dot((-1/2)*np.dot(diff.T, inv_cov_matrix), diff)
        exp_func = np.exp(exp_exp)

        return coef * sqrt_det_inv_cov * exp_func

    def predict_proba(self, X):
        check_is_fitted(self)
        X = check_array(X)
        
        desity_probs = np.empty((X.shape[0], len(self.classes_)))
        for k in range(desity_probs.shape[0]):
            for j in range(len(self.classes_)):
                desity_probs[k,j] = self._calc_gaussian_density_prob(X[k], j)
                
        _, post_probs = calc_prob_posteriori(desity_probs, self.Pw)
        
        return post_probs
        
        

In [40]:
class RegraSomaClasificadorBayesiano(BaseEstimator, ClassifierMixin):
    def __init__(self, partition, Pw):
#         self.views = views
        self.partition = partition
        self.Pw = Pw
        self.clfs = []        
    
    def fit(self, X, y):
#         X, y = check_X_y(X, y)
         
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y
        
        for x in X:
            clf = ClassificaforBayesiano(self.partition, Pw)
            clf.fit(x,y)
            self.clfs.append(clf)
            
        return self
    
    def predict(self, X):
        assert len(X) == len(self.clfs)
        # Check is fit had been called
        check_is_fitted(self)

        # Input validation
#         X = check_array(X)
        
        post_probs = [clf.predict_proba(x) for clf, x in zip(self.clfs, X)]
        
        return self.regra_soma(post_probs, Pw=self.Pw)
    
    def get_params(self, deep=True):
        return {"Pw": self.Pw, "partition": self.partition}
    
    def regra_soma(self, matrizes, Pw):
        
        x_sum_w = np.empty((len(Pw), matrizes[0].shape[1]))
        for i in range(len(Pw)):
            for k in range(matrizes[0].shape[0]):
                views_sum = sum([v[i,k] for v in matrizes])
                # views_sum =  fac_p_w_x[i,k] fou_p_w_x[i,k] + kar_p_w_x[i,k]
                x_sum_w[i,k] = (1-len(matrizes))*Pw[i] + views_sum
        y_pred = x_sum_w.argmax(axis = 0) 
        return y_pred
    

## Validação cruzada

In [41]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

RANDOM_SEED = 42
FOLDS = 10
REPEATS = 30

def get_splited_partition(idxs, y_true):
    partition = [[] for i in range(10)]
    
    for i,indice in enumerate(idxs):
        partition[y_true[indice]].append(i)
        
    return partition
    
cv = RepeatedStratifiedKFold(FOLDS, REPEATS, RANDOM_SEED)
acc_scores = np.empty((cv.get_n_splits(),))
f1_scores = np.empty((cv.get_n_splits(),))

for i, (train_index, test_index) in enumerate(cv.split(fac, y_true)):
    print(f"Split {i+1}/{cv.get_n_splits()}")
    fac_train, fac_test = fac[train_index], fac[test_index]
    fou_train, fou_test = fou[train_index], fou[test_index]
    kar_train, kar_test = kar[train_index], kar[test_index]
    
    test_views = [fac_test, fou_test, kar_test]
    train_views = [fac_train, fou_train, kar_train]
    
    local_partition = get_splited_partition(train_index, y_true)
    clf = RegraSomaClasificadorBayesiano(local_partition, Pw)
    
    clf.fit(train_views, y_true[train_index])
    
    y_pred = clf.predict(test_views)
    
    acc_scores[i] = accuracy_score(y_true[test_index], y_pred)
    f1_scores[i] = f1_score(y_true[test_index], y_pred, average="macro") 
    
    print(f"Acurácia parcial: {acc_scores[:i+1].mean()} +/- {(acc_scores[:i+1].std())}", )
    print(f"Medida-F parcial: {f1_scores[:i+1].mean()} +/- {(f1_scores[:i+1].std())}", )


Split 1/300




Acurácia parcial: 0.13 +/- 0.0
Medida-F parcial: 0.1220266679921107 +/- 0.0
Split 2/300




Acurácia parcial: 0.14250000000000002 +/- 0.012499999999999997
Medida-F parcial: 0.12063237229876234 +/- 0.0013942956933483686
Split 3/300




Acurácia parcial: 0.155 +/- 0.020412414523193145
Medida-F parcial: 0.13323972162635428 +/- 0.017865792858314255
Split 4/300




Acurácia parcial: 0.15625 +/- 0.017809758560968756
Medida-F parcial: 0.13076830972276807 +/- 0.01605345901149367
Split 5/300




Acurácia parcial: 0.148 +/- 0.022934689882359426
Medida-F parcial: 0.12150595803531068 +/- 0.023437906734935707
Split 6/300




Acurácia parcial: 0.15 +/- 0.02140872096444188
Medida-F parcial: 0.12425622693391107 +/- 0.022262063138202847
Split 7/300




Acurácia parcial: 0.15285714285714286 +/- 0.02101991281366023
Medida-F parcial: 0.1268790687946448 +/- 0.021588777861675866
Split 8/300




Acurácia parcial: 0.15375 +/- 0.019803724397193575
Medida-F parcial: 0.12868864344725756 +/- 0.02075422454174903
Split 9/300




Acurácia parcial: 0.15833333333333333 +/- 0.02273030282830976
Medida-F parcial: 0.13260813786164372 +/- 0.022489499070731278
Split 10/300




Acurácia parcial: 0.15550000000000003 +/- 0.02317865397299852
Medida-F parcial: 0.12979732795742657 +/- 0.022941353108872783
Split 11/300




Acurácia parcial: 0.1581818181818182 +/- 0.023671302847802795
Medida-F parcial: 0.13166577023226086 +/- 0.022657680302197628
Split 12/300




Acurácia parcial: 0.15916666666666668 +/- 0.02289771944005681
Medida-F parcial: 0.13244780676072243 +/- 0.021847585589042602
Split 13/300




Acurácia parcial: 0.1580769230769231 +/- 0.022320950966995243
Medida-F parcial: 0.13190336493413451 +/- 0.021075039902585625
Split 14/300




Acurácia parcial: 0.15785714285714286 +/- 0.021523598819027663
Medida-F parcial: 0.13191769734369135 +/- 0.020308482351175133
Split 15/300




Acurácia parcial: 0.15766666666666665 +/- 0.020805982045769646
Medida-F parcial: 0.13150673392007262 +/- 0.01968002281871627
Split 16/300


KeyboardInterrupt: 

In [None]:
print(f"Acurácia: {acc_scores.mean()} +/- {(acc_scores.std())}", )
print(f"Medida-F: {f1_scores.mean()} +/- {(f1_scores.std())}", )

# K-Vizinhos

## Normalizando 


In [None]:
from sklearn.preprocessing import minmax_scale

fac_norm = minmax_scale(fac)
fou_norm = minmax_scale(fou)
kar_norm = minmax_scale(kar)

## Distâncias entre os elementos

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

fac_dist = euclidean_distances(fac_norm, fac_norm)
fou_dist = euclidean_distances(fou_norm, fou_norm)
kar_dist = euclidean_distances(kar_norm, kar_norm)

## Função de densidade

In [None]:
def calc_knn_density_prob(view_dists, k, Pw):
    p_x_w = np.empty((2000, 10))
    k_vizinhos = np.empty((2000, k))
    kvizihos_temp = view_dists.argsort(axis=1)[:,:k+1]
    
    for i in range(2000):
        k_vizinhos[i,:] = np.delete(kvizihos_temp[i], 0)
    
    del kvizihos_temp
    
    for i in range(10):
        for j in range(2000):
            #p_w_x[i,j] = (((k_vizinhos[j] == i).sum()/len(partition[i]))/V)*Pw[i]
            p_x_w[j,i] = (k_vizinhos[j] == i).sum()/k
                
    #y_pred = p_x_w.argmax(axis = 0)
    y_pred = k_vizinhos.argmax(axis = 0)
    
    return y_pred, p_x_w
    

## Desidades por dataset

In [None]:
_, fac_knn_density_probs = calc_knn_density_prob(fac_dist, 100, Pw)
_, fou_knn_density_probs = calc_knn_density_prob(fou_dist, 100, Pw)
_, kar_knn_density_probs = calc_knn_density_prob(kar_dist, 100, Pw)

## Prob. à priori por view

In [None]:
fac_knn_y_pred, fac_knn_posteriori_probs = calc_prob_posteriori(fac_knn_density_probs, Pw)
fou_knn_y_pred, fou_knn_posteriori_probs = calc_prob_posteriori(fou_knn_density_probs, Pw)
kar_knn_y_pred, kar_knn_posteriori_probs = calc_prob_posteriori(kar_knn_density_probs, Pw)

## Regra da soma

In [None]:
def regra_soma_knn(fac_p_w_x, fou_p_w_x, kar_p_w_x, Pw):
    x_sum_w = np.empty((10, 2000))
    
    for i in range(10):
        for k in range(2000):
            #views_sum =  fou_p_w_x[i,k] + kar_p_w_x[i,k]
            views_sum =  fac_p_w_x[i,k] + fou_p_w_x[i,k] + kar_p_w_x[i,k]
            x_sum_w[i,k] = (1-2)*Pw[i] + views_sum
            
    y_pred = x_sum_w.argmax(axis = 0) 
    return y_pred

In [None]:
y_pred_knn_all = regra_soma_knn(fac_knn_posteriori_probs, 
                                   fou_knn_posteriori_probs, 
                                   kar_knn_posteriori_probs, Pw)

In [None]:
print("Acurácia: ", accuracy_score(y_true, y_pred_knn_all))
print("Medida-F: ", f1_score(y_true, y_pred_knn_all, average="macro"))

# Janela de Parzen

## Função de densidade

In [None]:
# def parzen_density_function(view, h, partition):
#     p_x_w = np.empty((10, 2000))
#     dims = view.shape[1]
#     elements_in_window = np.empty((10, dims))
    
#     def K(x):
#         return np.exp(-x**2/2)/np.sqrt(2*np.pi)

#     for i in range(10):
#         n = len(partition[i])
#         for k in range(2000):
#             p = np.array([np.array([K((view[k,j] - view[e,j])/h) for j in range(dims)]).prod() 
#                  for e in partition[i]]).sum()
            
#             p_x_w[i,k] = (1/(n*h**dims))
            
#     return p_x_w

def parzen_density_function(view, h, partition):
    p_x_w = np.empty((2000, 10))
    dims = view.shape[1]
    
    for i in range(10):
        n = len(partition[i])
        x_view = view[partition[i],:]
        
        for k in range(2000):
            diff = (view[k] - x_view)/h
            gaussian_diff = np.exp(-diff**2/2)/np.sqrt(2*np.pi)
            prod_dims = gaussian_diff.prod(axis=0)
            p_x_w[k,i] = (1/(n*h**dims)) * prod_dims.sum()
            print(p_x_w[k,i])
            
    return p_x_w

In [None]:
fac_parzen_density_probs = parzen_density_function(fac, 50, partition)
fou_parzen_density_probs = parzen_density_function(fou, 50, partition)
kar_parzen_density_probs = parzen_density_function(kar, 50, partition)

## Probabilidade a priori

In [None]:
fac_prazen_y_pred, fac_prazen_posteriori_probs = calc_prob_posteriori(fac_parzen_density_probs, Pw)
fou_prazen_y_pred, fou_prazen_posteriori_probs = calc_prob_posteriori(fou_parzen_density_probs, Pw)
fou_prazen_y_pred, kar_prazen_posteriori_probs = calc_prob_posteriori(kar_parzen_density_probs, Pw)

In [None]:
y_prazen_pred_all = regra_soma_padrao(fac_prazen_posteriori_probs, 
                                    fou_prazen_posteriori_probs, 
                                    kar_prazen_posteriori_probs, Pw)

In [None]:
set(y_prazen_pred_all)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

y_true = clustering.get_instances_class()

print("Acurácia: ", accuracy_score(y_true, y_prazen_pred_all))
print("Medida-F: ", f1_score(y_true, y_prazen_pred_all, average="macro"))