# i) Regra da soma

In [59]:
import os
import numpy as np
import pandas as pd
import clustering

## Datasets

In [60]:
DATA_BASE_PATH = "./data"

FAC_FILE = os.path.join(DATA_BASE_PATH, "mfeat-fac")
FOU_FILE = os.path.join(DATA_BASE_PATH, "mfeat-fou")
KAR_FILE = os.path.join(DATA_BASE_PATH, "mfeat-kar")

fac = np.loadtxt(FAC_FILE, dtype=int)
fou = np.loadtxt(FOU_FILE, dtype=float)
kar = np.loadtxt(KAR_FILE, dtype=float)

## Importando melhor resultado

In [61]:
best_result = clustering.import_best_result("data/melhor_resultado_todas.pickle")
partition, _ = clustering.get_hard_patitions(best_result["membership_degree"])


## Probabilidade à priori das classes

In [62]:
Pw = np.array([len(c)/2000 for c in partition])
Pw, Pw.sum()

(array([0.1175, 0.116 , 0.1355, 0.1465, 0.0405, 0.01  , 0.1305, 0.151 ,
        0.0815, 0.071 ]),
 1.0)

In [63]:
for i, idxs in enumerate(partition):
    pd.DataFrame(fac[idxs]).to_csv(f"data/fac{i}.csv",decimal=",", index=False, header=False)

In [64]:
pd.DataFrame(partition).to_csv(f"data/partition.csv",decimal=",", index=False, header=False)

## Probabilidade à priori dos exemplos

In [65]:
def calc_p_x_w(xk, d, means, var, cov_matrix):
    coef = np.power(2*np.pi, -d/2) 
#     print(cov_matrix)
    inv_cov_matrix = np.linalg.inv(cov_matrix)
    (sign, logdet) = np.linalg.slogdet(inv_cov_matrix)
    sqrt_det_inv_cov = np.sqrt(sign*np.exp(logdet))
    diff = xk - means
    exp_exp = np.dot((-1/2)*np.dot(diff.T, inv_cov_matrix), diff)
    exp_func = np.exp(exp_exp)
    
    return coef * sqrt_det_inv_cov * exp_func

# def calc_p_x_w(xk, d, means, var, cov_matrix):
#     coef = math.pow(2*math.pi, d/2) 
#     inv_cov_matrix = np.linalg.inv(cov_matrix)
#     det_inv =  np.linalg.det(inv_cov_matrix)
#     diff = xk - means
#     exp_exp = np.dot(np.dot((-1/2)*(diff.T), inv_cov_matrix), diff)
#     exp_func = np.exp(exp_exp)
    
#     return exp_func/(coef * math.pow(det_inv, 1/2))

def calc_gaussian_bayesian_data(x, partition):
    n,d = x.shape
    qtd_w = len(partition)
    means = np.array([x[idxs].mean(axis=0) for idxs in partition])
    var = np.array([((x[idxs]-means[i])**2).mean(axis=0) for i, idxs in enumerate(partition)])
    cov_matrix = [np.zeros((d,d)) for _ in range(qtd_w)]

    for i in range(qtd_w):
        np.fill_diagonal(cov_matrix[i], var[i])
                
    p_x_w = np.empty((n, qtd_w))
    
    for i in range(qtd_w):
        for k in range(n):
            p_x_w[k, i] = calc_p_x_w(x[k], d, means[i], var[i], cov_matrix[i])   
    
    return p_x_w

def calc_prob_posteriori(p_x_w, Pw):
    p_w_x = np.empty((10, 2000))
    
    for i in range(10):
        for k in range(2000):
            sum_all = np.dot(p_x_w[k], Pw)
            p_w_x[i,k] = (p_x_w[k, i] * Pw[i])/sum_all
    
    y_pred = p_w_x.argmax(axis = 0)
    return y_pred, p_w_x


## Densidades por dataset

In [66]:
fac_p_x_w = calc_gaussian_bayesian_data(fac, partition)
fou_p_x_w = calc_gaussian_bayesian_data(fou, partition)
kar_p_x_w = calc_gaussian_bayesian_data(kar, partition)

In [67]:
fac_p_x_w[0,0], fou_p_x_w[0,0], kar_p_x_w[0,0]

(0.0, 3.243154665015897e+60, 7.254547591145725e-49)

## Classificação dos exemplos por view

In [68]:
fac_y_pred, fac_p_w_x = calc_prob_posteriori(fac_p_x_w, Pw)
fou_y_pred, fou_p_w_x = calc_prob_posteriori(fou_p_x_w, Pw)
kar_y_pred, kar_p_w_x = calc_prob_posteriori(kar_p_x_w, Pw)




## Classificador combinado com a regra da soma

Precisei tirar o fac porque seus valores são nulos. TENTAR CORRIGIR ISSO

In [69]:
def regra_soma_simples(fac_p_w_x, fou_p_w_x, kar_p_w_x, Pw):
    x_sum_w = np.empty((10, 2000))
    
    for i in range(10):
        for k in range(2000):
            views_sum =  fou_p_w_x[i,k] + kar_p_w_x[i,k]
            # views_sum =  fac_p_w_x[i,k] fou_p_w_x[i,k] + kar_p_w_x[i,k]
            x_sum_w[i,k] = (1-3)*Pw[i] + views_sum
            
    y_pred = x_sum_w.argmax(axis = 0) 
    return y_pred

In [75]:
y_pred_all = regra_soma_simples(fac_p_w_x, fou_p_w_x, kar_p_w_x, Pw)

In [76]:
from sklearn.metrics import accuracy_score, f1_score

y_true = clustering.get_instances_class()

print("Acurácia: ", accuracy_score(y_true, y_pred_all))
print("Medida-F: ", f1_score(y_true, y_pred_all, average="macro"))

Acurácia:  0.672
Medida-F:  0.6560764324030204


# K-Vizinhos

## Normalizando 


In [77]:
from sklearn.preprocessing import minmax_scale

fac_norm = minmax_scale(fac, copy=False)
fou_norm = minmax_scale(fou, copy=False)
kar_norm = minmax_scale(kar, copy=False)

## Distâncias entre os elementos

In [173]:
from sklearn.metrics.pairwise import euclidean_distances

fac_dist = euclidean_distances(fac_norm, fac_norm)
fou_dist = euclidean_distances(fou_norm, fou_norm)
kar_dist = euclidean_distances(kar_norm, kar_norm)

In [182]:
def calc_prob_posteriori_kvizinhos(view_dists, k, Pw, V):
    p_w_x = np.empty((10, 2000))
    k_vizinhos = np.empty((2000, k))
    kvizihos_temp = view_dists.argsort(axis=1)[:,:k+1]
    
    for i in range(2000):
        k_vizinhos[i,:] = np.delete(kvizihos_temp[i], 0)
    
    del kvizihos_temp
    
    for i in range(10):
        for j in range(2000):
            #p_w_x[i,j] = (((k_vizinhos[j] == i).sum()/len(partition[i]))/V)*Pw[i]
            p_w_x[i,j] = (k_vizinhos[j] == i).sum()/k
                
    y_pred = p_w_x.argmax(axis = 0)
    
    
    return y_pred, p_w_x
    

In [183]:
_, fac_knn_p_w_x = calc_prob_posteriori_kvizinhos(fac_dist, 10, Pw, 1)
_, fou_knn_p_w_x = calc_prob_posteriori_kvizinhos(fou_dist, 10, Pw, 1)
_, kar_knn_p_w_x = calc_prob_posteriori_kvizinhos(kar_dist, 10, Pw, 1)

In [187]:
pd.DataFrame(fac_knn_p_w_x).to_csv(f"data/fac_knn_p_w_x.csv",decimal=",", index=False, header=False)

## Regra da soma

In [188]:
def regra_soma_knn(fac_p_w_x, fou_p_w_x, kar_p_w_x, Pw):
    x_sum_w = np.empty((10, 2000))
    
    for i in range(10):
        for k in range(2000):
            #views_sum =  fou_p_w_x[i,k] + kar_p_w_x[i,k]
            views_sum =  fac_p_w_x[i,k] + fou_p_w_x[i,k] + kar_p_w_x[i,k]
            x_sum_w[i,k] = (1-3)*Pw[i] + views_sum
            
    y_pred = x_sum_w.argmax(axis = 0) 
    return y_pred

In [189]:
y_pred_knn_all = regra_soma_knn(fac_knn_p_w_x, fou_knn_p_w_x, kar_knn_p_w_x, Pw)

In [190]:
print("Acurácia: ", accuracy_score(y_true, y_pred_knn_all))
print("Medida-F: ", f1_score(y_true, y_pred_knn_all, average="macro"))

Acurácia:  0.1
Medida-F:  0.01851851851851852
