# i) Regra da soma (gaussiano)

In [4]:
import os
import numpy as np
import pandas as pd
import clustering

## Datasets

In [5]:
DATA_BASE_PATH = "./data"

FAC_FILE = os.path.join(DATA_BASE_PATH, "mfeat-fac")
FOU_FILE = os.path.join(DATA_BASE_PATH, "mfeat-fou")
KAR_FILE = os.path.join(DATA_BASE_PATH, "mfeat-kar")

fac = np.loadtxt(FAC_FILE, dtype=int)
fou = np.loadtxt(FOU_FILE, dtype=float)
kar = np.loadtxt(KAR_FILE, dtype=float)

## Importando melhor resultado

In [6]:
best_result = clustering.import_best_result("data/melhor_resultado_todas.pickle")
partition, y_true = clustering.get_hard_patitions(best_result["membership_degree"])


## Probabilidade à priori das classes

In [7]:
Pw = np.array([len(c)/2000 for c in partition])
Pw

array([0.1175, 0.116 , 0.1355, 0.1465, 0.0405, 0.01  , 0.1305, 0.151 ,
       0.0815, 0.071 ])

## Densidades por dataset

In [31]:
# fac_byn_density_probs = calc_gaussian_bayesian_data(fac, partition)
# fou_byn_density_probs = calc_gaussian_bayesian_data(fou, partition)
# kar_byn_density_probs = calc_gaussian_bayesian_data(kar, partition)

In [32]:
# fac_byn_density_probs[0,0], fou_byn_density_probs[0,0], kar_byn_density_probs[0,0]

## Prob. à posteriori por view

In [33]:
def calc_prob_posteriori(p_x_w, Pw):
    qtd_x, qtd_w = p_x_w.shape 
    p_w_x = np.empty((qtd_x, qtd_w))
    
    for k in range(qtd_x):
        for i in range(qtd_w):
            sum_all = np.dot(p_x_w[k], Pw)
            p_w_x[k,i] = (p_x_w[k,i] * Pw[i])/sum_all
       
    return p_w_x


# fac_byn_posteriori_probs = calc_prob_posteriori(fac_byn_density_probs, Pw)
# fou_byn_posteriori_probs = calc_prob_posteriori(fou_byn_density_probs, Pw)
# kar_byn_posteriori_probs = calc_prob_posteriori(kar_byn_density_probs, Pw)


## Regra da soma

In [34]:
def regra_soma_padrao(views, Pw):
    qtd_x = views[0].shape[0]
    qtd_w = len(Pw)
    x_sum_w = np.empty((qtd_x, qtd_w))

    for i in range(qtd_x):
        for k in range(qtd_w):
            views_sum = sum([v[i,k] for v in views])
            x_sum_w[i,k] = (1-len(views))*Pw[k] + views_sum
                    
    y_pred = x_sum_w.argmax(axis = 1) 
    return y_pred

## Estimador do scikit

In [35]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

class ClassificaforBayesiano(BaseEstimator, ClassifierMixin):
    def __init__(self, partition, Pw):
        self.partition = partition
        self.Pw = Pw
    
    def fit(self, X, y):
        X, y = check_X_y(X, y)
         
        self.classes_ = unique_labels(y)
        self._fit_gaussian_bayesian_data(X)
        self.X_ = X
        self.y_ = y
        return self
    
    def _fit_gaussian_bayesian_data(self, X):
        n, d = X.shape
        qtd_w = len(self.partition)
        self.means = np.array([X[idxs].mean(axis=0) for idxs in self.partition])
        self.var = np.array([((X[idxs]-self.means[i])**2).mean(axis=0) for i, idxs in enumerate(self.partition)])
        self.cov_matrix = [np.zeros((d,d)) for _ in range(qtd_w)]

        for i in range(qtd_w):
            np.fill_diagonal(self.cov_matrix[i], self.var[i])

        return self
    
    def _calc_gaussian_density_prob(self, xk, cls):
        d = xk.shape[0]
        coef = np.power(2*np.pi, -d/2)
        inv_cov_matrix = np.linalg.inv(self.cov_matrix[cls])
        (sign, logdet) = np.linalg.slogdet(inv_cov_matrix)
        sqrt_det_inv_cov = np.sqrt(sign*np.exp(logdet))
        diff = xk - self.means[cls]
        exp_exp = np.dot((-1/2)*np.dot(diff.T, inv_cov_matrix), diff)
        exp_func = np.exp(exp_exp)

        return coef * sqrt_det_inv_cov * exp_func

    def predict_proba(self, X):
        check_is_fitted(self)
        X = check_array(X)
        
        desity_probs = np.empty((X.shape[0], len(self.classes_)))
        for k in range(desity_probs.shape[0]):
            for j in range(len(self.classes_)):
                desity_probs[k,j] = self._calc_gaussian_density_prob(X[k], j)
        
        post_probs = calc_prob_posteriori(desity_probs, self.Pw)
        
        return post_probs
        
        

In [36]:
class RegraSomaClasificadorBayesiano(BaseEstimator, ClassifierMixin):
    def __init__(self, partition, Pw):
#         self.views = views
        self.partition = partition
        self.Pw = Pw
        self.clfs = []        
    
    def fit(self, X, y):
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y
        
        for x in X:
            clf = ClassificaforBayesiano(self.partition, Pw)
            clf.fit(x,y)
            self.clfs.append(clf)
            
        return self
    
    def predict(self, X):
        assert len(X) == len(self.clfs)
        
        check_is_fitted(self)

        post_probs = [clf.predict_proba(x) for clf, x in zip(self.clfs, X)]
        
        return self.regra_soma(post_probs, Pw=self.Pw)
    
    def get_params(self, deep=True):
        return {"Pw": self.Pw, "partition": self.partition}
    
    def regra_soma(self, matrizes, Pw):
        qtd_x = matrizes[0].shape[0]
        qtd_w = len(Pw)
        x_sum_w = np.empty((qtd_x, qtd_w))
        
        for i in range(qtd_x):
            for k in range(qtd_w):
                views_sum = sum([v[i,k] for v in matrizes])
                x_sum_w[i,k] = (1-len(matrizes))*Pw[k] + views_sum
                
        y_pred = x_sum_w.argmax(axis = 1) 
        return y_pred

## Validação cruzada

- **Observação**: No modelo gaussiano,  as desidades de probabilidades da VIEW1 (mfeat-fac) são todas zeradas, o que causa uma baixa na performance geral do modelo da regra da soma. Por esta razão, apenas neste modelo, ela foi desconsiderada. Portanto, no modelo gaussiano apresentamos os resultados os dados considerando somente VIEW2 e VIEW3.

- As médias das métricas para cada split da validação estão no arquivo **data/gaussian_training_data.csv**. Nota-se que muitos resultados aprensetam valores altos ou representações do infinito ("inf"). Isso se deu por conta de problemas de overflow na multiplicação usando valores muito pequenos fato que, aparentemente, foi causado pela quantidade de splits.

In [61]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


RANDOM_SEED = 42
FOLDS = 10
REPEATS = 30
REAPEATS_GAUSSIAN = 10

def get_splited_partition(idxs, y_true):
    partition = [[] for i in range(10)]
    
    for i,indice in enumerate(idxs):
        partition[y_true[indice]].append(i)
        
    return partition
    
cv = RepeatedStratifiedKFold(FOLDS, REPEATS, RANDOM_SEED)

train_precision_scores = np.empty((cv.get_n_splits(),))
train_recall_scores = np.empty((cv.get_n_splits(),))
train_f1_scores = np.empty((cv.get_n_splits(),))
train_acc_scores = np.empty((cv.get_n_splits(),))

test_precision_scores = np.empty((cv.get_n_splits(),))
test_recall_scores = np.empty((cv.get_n_splits(),))
test_f1_scores = np.empty((cv.get_n_splits(),))
test_acc_scores = np.empty((cv.get_n_splits(),))

views = [fou, kar]

for i, (train_index, test_index) in enumerate(cv.split(fac, y_true)):
    print(f"{i+1}/{cv.get_n_splits()}", end=" ")
    
    test_views = [v[test_index] for v in views]
    train_views = [v[train_index] for v in views]
    
    local_partition = get_splited_partition(train_index, y_true)
    clf = RegraSomaClasificadorBayesiano(local_partition, Pw)
    
    clf.fit(train_views, y_true[train_index])
    
    y_pred_train = clf.predict(train_views)
    y_pred_test = clf.predict(test_views)
    
    scores_train = precision_recall_fscore_support(y_true[train_index], 
                                                       y_pred_train, 
                                                       average="macro")

    train_precision_scores[i] = scores_train[0]
    train_recall_scores[i] = scores_train[1]
    train_f1_scores[i] = scores_train[2]
    train_acc_scores[i] = accuracy_score(y_true[train_index], y_pred_train)

    scores_test = precision_recall_fscore_support(y_true[test_index], 
                                                   y_pred_test, 
                                                   average="macro")

    test_precision_scores[i] = scores_test[0]
    test_recall_scores[i] = scores_test[1]
    test_f1_scores[i] = scores_test[2]
    test_acc_scores[i] = accuracy_score(y_true[test_index], y_pred_test)

    report = dict(
        runs = cv.get_n_splits(),
        split=i+1,
        train_accuracy = train_acc_scores[i],
        train_precision = train_precision_scores[i],
        train_recall = train_recall_scores[i],
        train_fscore = train_f1_scores[i],
        test_accuracy = test_acc_scores[i],
        test_precision = test_precision_scores[i],
        test_recall = test_recall_scores[i],
        test_fscore = test_f1_scores[i],

    )

    if os.path.exists("data/gaussian_300_splits.csv"):
        pd.DataFrame([report]).to_csv("data/gaussian_300_splits.csv", mode="a", header=False, decimal=",", index=False)
    else:
        pd.DataFrame([report]).to_csv("data/gaussian_300_splits.csv", mode="a", header=True, decimal=",", index=False)





1/300 2/300 3/300 4/300 5/300 6/300 7/300 8/300 9/300 10/300 11/300 12/300 13/300 14/300 15/300 16/300 17/300 18/300 19/300 20/300 21/300 22/300 23/300 24/300 25/300 26/300 27/300 28/300 29/300 30/300 31/300 32/300 33/300 34/300 35/300 36/300 37/300 38/300 39/300 40/300 41/300 42/300 43/300 44/300 45/300 46/300 47/300 48/300 49/300 50/300 51/300 52/300 53/300 54/300 55/300 56/300 57/300 58/300 59/300 60/300 61/300 62/300 63/300 64/300 65/300 66/300 67/300 68/300 69/300 70/300 71/300 72/300 73/300 74/300 75/300 76/300 77/300 78/300 79/300 80/300 81/300 82/300 83/300 84/300 85/300 86/300 87/300 88/300 89/300 90/300 91/300 92/300 93/300 94/300 95/300 96/300 97/300 98/300 99/300 100/300 101/300 102/300 103/300 104/300 105/300 106/300 107/300 108/300 109/300 110/300 111/300 112/300 113/300 114/300 115/300 116/300 117/300 118/300 119/300 120/300 121/300 122/300 123/300 124/300 125/300 126/300 127/300 128/300 129/300 130/300 131/300 132/300 133/300 134/300 135/300 136/300 137/300 138/300 139/

  _warn_prf(average, modifier, msg_start, len(result))


214/300 215/300 216/300 217/300 218/300 219/300 220/300 221/300 222/300 223/300 224/300 225/300 226/300 227/300 228/300 229/300 230/300 231/300 232/300 233/300 234/300 235/300 236/300 237/300 238/300 239/300 240/300 241/300 242/300 243/300 244/300 245/300 246/300 247/300 248/300 249/300 250/300 251/300 252/300 253/300 254/300 255/300 256/300 257/300 258/300 259/300 260/300 261/300 262/300 263/300 264/300 265/300 266/300 267/300 268/300 269/300 270/300 271/300 272/300 273/300 274/300 275/300 276/300 277/300 278/300 279/300 280/300 281/300 282/300 283/300 284/300 285/300 286/300 287/300 288/300 289/300 290/300 291/300 292/300 293/300 294/300 295/300 296/300 297/300 298/300 299/300 300/300 

## Treinamento com base completa

- O objetivo aqui é determinar o desempenho do classificador por classe, tendo em vista diversas métricas de classificação. Observar os resultados no arquivo **data/gaussian_classification_report.txt**

In [38]:
from sklearn.metrics import classification_report

views = [fou, kar]

clf = RegraSomaClasificadorBayesiano(partition, Pw)

clf.fit(views, y_true)

y_pred = clf.predict(views)

report = classification_report(y_true, y_pred, output_dict=False, digits=4)

acc_score_gaussian = accuracy_score(y_true, y_pred)

with open("data/gaussian_classification_report.txt", "w") as report_file:
    report_file.write(report)


## Estimativa pontual e intervalo de confiança

In [39]:
p = acc_score_gaussian
z_padrao = 1.96 # Confiança de 95%

def calc_intervalo_confiança(p, z=z_padrao, n=2000):
    diff = z*np.sqrt(p*(1-p)/n)
    return (round(p - diff, 4), round(p+diff, 4))

print("Estimativa pontual: ", p)
print("Intervalo de confiança: ", calc_intervalo_confiança(p))

Estimativa pontual:  0.7555
Intervalo de confiança:  (0.7367, 0.7743)


# ii) K-Vizinhos

## Normalizando 


In [40]:
from sklearn.preprocessing import minmax_scale

fac_norm = minmax_scale(fac)
fou_norm = minmax_scale(fou)
kar_norm = minmax_scale(kar)

## Distâncias entre os elementos

In [41]:
from sklearn.metrics.pairwise import euclidean_distances

fac_dist = euclidean_distances(fac_norm, fac_norm)
fou_dist = euclidean_distances(fou_norm, fou_norm)
kar_dist = euclidean_distances(kar_norm, kar_norm)

## Função de densidade

In [42]:
def calc_knn_density_prob(view_dists, k, Pw, y_true):
    qtd_x = view_dists.shape[0]
    qtd_w = len(Pw)
    
    p_x_w = np.empty((qtd_x, qtd_w))
    k_vizinhos = view_dists.argsort(axis=1)[:,:k+1]
    
    for j in range(qtd_x):
        w_vizinhos = y_true[k_vizinhos[j, 1:]]
        for i in range(qtd_w):
            p_x_w[j,i] = (w_vizinhos == i).sum()/k
                    
    return p_x_w
    

## Desidades por dataset

In [43]:
# fac_knn_density_probs = calc_knn_density_prob(fac_dist, 5, Pw, y_true)
# fou_knn_density_probs = calc_knn_density_prob(fou_dist, 5, Pw, y_true)
# kar_knn_density_probs = calc_knn_density_prob(kar_dist, 5, Pw, y_true)


## Prob. à posteriori por view

In [44]:
# fac_knn_posteriori_probs = calc_prob_posteriori(fac_knn_density_probs, Pw)
# fou_knn_posteriori_probs = calc_prob_posteriori(fou_knn_density_probs, Pw)
# kar_knn_posteriori_probs = calc_prob_posteriori(kar_knn_density_probs, Pw)

## Regra da soma

In [45]:
# y_pred_knn_all = regra_soma_padrao([fac_knn_posteriori_probs, 
#                                    fou_knn_posteriori_probs, 
#                                    kar_knn_posteriori_probs], Pw)

# print("Acurácia: ", accuracy_score(y_true, y_pred_knn_all))
# print("Medida-F: ", f1_score(y_true, y_pred_knn_all, average="macro"))

## Validação cruzada

In [46]:
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import os

cv = RepeatedStratifiedKFold(FOLDS, REPEATS, RANDOM_SEED)


k_range = range(3,4)
views = [fac, fou, kar]

for k in k_range:
    
    train_precision_scores = np.empty((cv.get_n_splits(),))
    train_recall_scores = np.empty((cv.get_n_splits(),))
    train_f1_scores = np.empty((cv.get_n_splits(),))

    test_precision_scores = np.empty((cv.get_n_splits(),))
    test_recall_scores = np.empty((cv.get_n_splits(),))
    test_f1_scores = np.empty((cv.get_n_splits(),))
    
    for i, (train_index, test_index) in enumerate(cv.split(fac, y_true)):
        print(f"[k={k}] {i+1}/{cv.get_n_splits()}", end=" ")

        train_views = [minmax_scale(v[train_index]) for v in views]
        test_views = [minmax_scale(v[test_index]) for v in views]

        train_views_dists = [euclidean_distances(v,v) for v in train_views]
        test_views_dists = [euclidean_distances(v,v) for v in test_views]

    #     y_true_split = get_splited_partition(train_index, y_true)

        train_views_density_probs = [calc_knn_density_prob(v, k, Pw, y_true[train_index]) 
                                     for v in train_views_dists]

        train_views_post_probs = [calc_prob_posteriori(v, Pw) 
                                  for v in train_views_density_probs]

        y_pred_train = regra_soma_padrao(train_views_post_probs, Pw)


        test_views_density_probs = [calc_knn_density_prob(v, k, Pw, y_true[test_index]) 
                                     for v in test_views_dists]

        test_views_post_probs = [calc_prob_posteriori(v, Pw) 
                                  for v in test_views_density_probs]

        y_pred_test = regra_soma_padrao(test_views_post_probs, Pw)
        
        scores_train = precision_recall_fscore_support(y_true[train_index], 
                                                       y_pred_train, 
                                                       average="macro")
        
#         train_precision_scores[i] = scores_train[0]
#         train_recall_scores[i] = scores_train[1]
#         train_f1_scores[i] = scores_train[2]

#         scores_test = precision_recall_fscore_support(y_true[test_index], 
#                                                        y_pred_test, 
#                                                        average="macro")
        
#         test_precision_scores[i] = scores_test[0]
#         test_recall_scores[i] = scores_test[1]
#         test_f1_scores[i] = scores_test[2]
        
        
        train_precision_scores[i] = scores_train[0]
        train_recall_scores[i] = scores_train[1]
        train_f1_scores[i] = scores_train[2]
        train_acc_scores[i] = accuracy_score(y_true[train_index], y_pred_train)

        scores_test = precision_recall_fscore_support(y_true[test_index], 
                                                       y_pred_test, 
                                                       average="macro")

        test_precision_scores[i] = scores_test[0]
        test_recall_scores[i] = scores_test[1]
        test_f1_scores[i] = scores_test[2]
        test_acc_scores[i] = accuracy_score(y_true[test_index], y_pred_test)
        
        report = dict(
            runs = cv.get_n_splits(),
            split=i+1,
            train_accuracy = train_acc_scores[i],
            train_precision = train_precision_scores[i],
            train_recall = train_recall_scores[i],
            train_fscore = train_f1_scores[i],
            test_accuracy = test_acc_scores[i],
            test_precision = test_precision_scores[i],
            test_recall = test_recall_scores[i],
            test_fscore = test_f1_scores[i],
        )

        if os.path.exists("data/knn_300_splits.csv"):
            pd.DataFrame([report]).to_csv("data/knn_300_splits.csv", mode="a", header=False, decimal=",", index=False)
        else:
            pd.DataFrame([report]).to_csv("data/knn_300_splits.csv", mode="a", header=True, decimal=",", index=False)
        
#     report = dict(
#         k = k,
#         runs = cv.get_n_splits(),
#         train_precision = train_precision_scores.mean(),
#         train_precision_std = train_precision_scores.std(),
#         train_recall = train_recall_scores.mean(),
#         train_recall_std = train_recall_scores.std(),
#         train_fscore = train_f1_scores.mean(),
#         train_fscore_std = train_f1_scores.std(),
#         test_precision = test_precision_scores.mean(),
#         test_precision_std = test_precision_scores.std(),
#         test_recall = test_recall_scores.mean(),
#         test_recall_std = test_recall_scores.std(),
#         test_fscore = test_f1_scores.mean(),
#         test_fscore_std = test_f1_scores.std(),
        
#     )
    
#     if os.path.exists("data/knn_best_k.csv"):
#         pd.DataFrame([report]).to_csv("data/knn_best_k.csv", mode="a", header=False, decimal=",", index=False)
#     else:
#         pd.DataFrame([report]).to_csv("data/knn_best_k.csv", mode="a", header=True, decimal=",", index=False)
    
    
    



[k=3] 1/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 2/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 3/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 4/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 5/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 6/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 7/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 8/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 9/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 10/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 11/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 12/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 13/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 14/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 15/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 16/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 17/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 18/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 19/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 20/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 21/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 22/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 23/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 24/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 25/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 26/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 27/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 28/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 29/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 30/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 31/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 32/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 33/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 34/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 35/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 36/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 37/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 38/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 39/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 40/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 41/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 42/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 43/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 44/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 45/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 46/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 47/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 48/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 49/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 50/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 51/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 52/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 53/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 54/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 55/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 56/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 57/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 58/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 59/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 60/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 61/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 62/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 63/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 64/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 65/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 66/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 67/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 68/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 69/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 70/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 71/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 72/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 73/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 74/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 75/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 76/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 77/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 78/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 79/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 80/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 81/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 82/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 83/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 84/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 85/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 86/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 87/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 88/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 89/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 90/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 91/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 92/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 93/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 94/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 95/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 96/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 97/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 98/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 99/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 100/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 101/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 102/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 103/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 104/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 105/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 106/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 107/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 108/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 109/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 110/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 111/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 112/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 113/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 114/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 115/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 116/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 117/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 118/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 119/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 120/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 121/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 122/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 123/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 124/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 125/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 126/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 127/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 128/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 129/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 130/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 131/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 132/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 133/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 134/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 135/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 136/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 137/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 138/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 139/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 140/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 141/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 142/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 143/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 144/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 145/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 146/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 147/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 148/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 149/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 150/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 151/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 152/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 153/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 154/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 155/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 156/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 157/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 158/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 159/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 160/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 161/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 162/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 163/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 164/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 165/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 166/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 167/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 168/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 169/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 170/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 171/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 172/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 173/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 174/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 175/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 176/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 177/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 178/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 179/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 180/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 181/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 182/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 183/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 184/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 185/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 186/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 187/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 188/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 189/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 190/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 191/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 192/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 193/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 194/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 195/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 196/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 197/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 198/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 199/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 200/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 201/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 202/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 203/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 204/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 205/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 206/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 207/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 208/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 209/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 210/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 211/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 212/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 213/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 214/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 215/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 216/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 217/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 218/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 219/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 220/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 221/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 222/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 223/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 224/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 225/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 226/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 227/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 228/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 229/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 230/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 231/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 232/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 233/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 234/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 235/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 236/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 237/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 238/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 239/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 240/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 241/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 242/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 243/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 244/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 245/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 246/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 247/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 248/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 249/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 250/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 251/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 252/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 253/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 254/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 255/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 256/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 257/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 258/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 259/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 260/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 261/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 262/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 263/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 264/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 265/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 266/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 267/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 268/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 269/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 270/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 271/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 272/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 273/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 274/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 275/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 276/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 277/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 278/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 279/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 280/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 281/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 282/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 283/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 284/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 285/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 286/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 287/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 288/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 289/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 290/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 291/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 292/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 293/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 294/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 295/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 296/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 297/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 298/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 299/300 

  _warn_prf(average, modifier, msg_start, len(result))


[k=3] 300/300 

  _warn_prf(average, modifier, msg_start, len(result))


## Treinamento com melhor valor de K

- O objetivo aqui é determinar o desempenho do classificador o valor de K com o qual ovtivemos a melhor medida-f média através validação cruzada. Observar os resultados no arquivo **data/knn_classification_report.txt**
- O melhor valor de K pode ser observado no arquivo **data/knn_best_k.csv**, gerado no experimento anterior

In [47]:
from sklearn.metrics import classification_report

BEST_K = 3
#views = [fac, fou, kar]
views = [fac, fou, kar]

train_views = [minmax_scale(v) for v in views]

train_views_dists = [euclidean_distances(v,v) for v in train_views]

train_views_density_probs = [calc_knn_density_prob(v, BEST_K, Pw, y_true) 
                             for v in train_views_dists]

train_views_post_probs = [calc_prob_posteriori(v, Pw) 
                          for v in train_views_density_probs]

y_pred = regra_soma_padrao(train_views_post_probs, Pw)
      
acc_score_knn = accuracy_score(y_true, y_pred)
    
report = classification_report(y_true, y_pred, output_dict=False, digits=4)

with open("data/knn_classification_report.txt", "w") as report_file:
    report_file.write(report)


  _warn_prf(average, modifier, msg_start, len(result))


## Estimativa pontual e intervalo de confiança

In [48]:
p = acc_score_knn
z_padrao = 1.96 # Confiança de 95%

print("Estimativa pontual: ", p)
print("Intervalo de confiança: ", calc_intervalo_confiança(p, z_padrao))

Estimativa pontual:  0.811
Intervalo de confiança:  (0.7938, 0.8282)


# iii) Janela de Parzen

## Função de densidade

In [49]:
def parzen_density_function(view, h, partition):
    qtd_x = view.shape[0]
    qtd_w = len(partition)
    
    p_x_w = np.empty((qtd_x, qtd_w))
    dims = view.shape[1]
    
    for i in range(qtd_w):
        n = len(partition[i])
        x_view = view[partition[i],:]
        for k in range(qtd_x):
            diff = (view[k] - x_view)/h
            gaussian_kernel = np.exp(-(diff**2)/2)/np.sqrt(2*np.pi)
            prod_dims = gaussian_kernel.prod(axis=1)
            p_x_w[k,i] = prod_dims.sum()/(n*h**dims)
            
    return p_x_w

In [50]:
# fac_parzen_density_probs = parzen_density_function(fac, 2, partition)
# fou_parzen_density_probs = parzen_density_function(fou, 2, partition)
# kar_parzen_density_probs = parzen_density_function(kar, 2, partition)

## Prob. à posteriori por view

In [51]:
# fac_prazen_posteriori_probs = calc_prob_posteriori(fac_parzen_density_probs, Pw)
# fou_prazen_posteriori_probs = calc_prob_posteriori(fou_parzen_density_probs, Pw)
# kar_prazen_posteriori_probs = calc_prob_posteriori(kar_parzen_density_probs, Pw)

## Regra da soma

In [52]:
# for h in range(2, 21, 1):
#     fac_parzen_density_probs = parzen_density_function(fac, h, partition)
#     fou_parzen_density_probs = parzen_density_function(fou, h, partition)
#     kar_parzen_density_probs = parzen_density_function(kar, h, partition)

#     fac_prazen_posteriori_probs = calc_prob_posteriori(fac_parzen_density_probs, Pw)
#     fou_prazen_posteriori_probs = calc_prob_posteriori(fou_parzen_density_probs, Pw)
#     kar_prazen_posteriori_probs = calc_prob_posteriori(kar_parzen_density_probs, Pw)

#     y_pred_prazen_all = regra_soma_padrao([fac_prazen_posteriori_probs, 
#                                        fou_prazen_posteriori_probs, 
#                                        kar_prazen_posteriori_probs], Pw)

#     print(f"h: {h}")
#     print("Acurácia: ", accuracy_score(y_true, y_pred_prazen_all))
#     print("Medida-F: ", f1_score(y_true, y_pred_prazen_all, average="macro"))

## Validação cruzada

- Variamos h de 2 a 12 para fins de comparação. Não foi possível, na validação cruzada, avaliar valores de janela maiores por limitação de tempo.

In [53]:
cv = RepeatedStratifiedKFold(FOLDS, REPEATS, RANDOM_SEED)
train_acc_scores = np.empty((cv.get_n_splits(),))
train_f1_scores = np.empty((cv.get_n_splits(),))

test_acc_scores = np.empty((cv.get_n_splits(),))
test_f1_scores = np.empty((cv.get_n_splits(),))


h_range = range(2,3)
best_data = {"f1_scores": np.zeros((1,)), 
             "h":None}

views = [fac, fou, kar]

y_true = y_true.astype(int)
for h in h_range:
    train_precision_scores = np.empty((cv.get_n_splits(),))
    train_recall_scores = np.empty((cv.get_n_splits(),))
    train_f1_scores = np.empty((cv.get_n_splits(),))
    train_acc_scores = np.empty((cv.get_n_splits(),))

    test_precision_scores = np.empty((cv.get_n_splits(),))
    test_recall_scores = np.empty((cv.get_n_splits(),))
    test_f1_scores = np.empty((cv.get_n_splits(),))
    test_acc_scores = np.empty((cv.get_n_splits(),))
    
    for i, (train_index, test_index) in enumerate(cv.split(fac, y_true)):
        print(f"[h={h}] {i+1}/{cv.get_n_splits()}", end=" ")

        train_views = [v[train_index] for v in views]
        test_views = [v[test_index] for v in views]

        
        y_true_split = get_splited_partition(train_index, y_true)
        

        train_views_density_probs = [parzen_density_function(v, h, y_true_split) 
                                     for v in train_views]

        train_views_post_probs = [calc_prob_posteriori(v, Pw) 
                                  for v in train_views_density_probs]

        y_pred_train = regra_soma_padrao(train_views_post_probs, Pw)


        y_true_split = get_splited_partition(test_index, y_true)
        
        test_views_density_probs = [parzen_density_function(v, h, y_true_split) 
                                     for v in test_views]

        test_views_post_probs = [calc_prob_posteriori(v, Pw) 
                                  for v in test_views_density_probs]

        y_pred_test = regra_soma_padrao(test_views_post_probs, Pw)

        scores_train = precision_recall_fscore_support(y_true[train_index], 
                                                       y_pred_train, 
                                                       average="macro")

        train_precision_scores[i] = scores_train[0]
        train_recall_scores[i] = scores_train[1]
        train_f1_scores[i] = scores_train[2]
        train_acc_scores[i] = accuracy_score(y_true[train_index], y_pred_train)

        scores_test = precision_recall_fscore_support(y_true[test_index], 
                                                       y_pred_test, 
                                                       average="macro")

        test_precision_scores[i] = scores_test[0]
        test_recall_scores[i] = scores_test[1]
        test_f1_scores[i] = scores_test[2]
        test_acc_scores[i] = accuracy_score(y_true[test_index], y_pred_test)
        
        report = dict(
            runs = cv.get_n_splits(),
            split=i+1,
            train_accuracy = train_acc_scores[i],
            train_precision = train_precision_scores[i],
            train_recall = train_recall_scores[i],
            train_fscore = train_f1_scores[i],
            test_accuracy = test_acc_scores[i],
            test_precision = test_precision_scores[i],
            test_recall = test_recall_scores[i],
            test_fscore = test_f1_scores[i],
        )

        if os.path.exists("data/parzen_300_splits.csv"):
            pd.DataFrame([report]).to_csv("data/parzen_300_splits.csv", mode="a", header=False, decimal=",", index=False)
        else:
            pd.DataFrame([report]).to_csv("data/parzen_300_splits.csv", mode="a", header=True, decimal=",", index=False)

        
#         report = dict(
#             h = h,
#             runs = cv.get_n_splits(),
#             train_precision = train_precision_scores.mean(),
#             train_precision_std = train_precision_scores.std(),
#             train_recall = train_recall_scores.mean(),
#             train_recall_std = train_recall_scores.std(),
#             train_fscore = train_f1_scores.mean(),
#             train_fscore_std = train_f1_scores.std(),
#             test_precision = test_precision_scores.mean(),
#             test_precision_std = test_precision_scores.std(),
#             test_recall = test_recall_scores.mean(),
#             test_recall_std = test_recall_scores.std(),
#             test_fscore = test_f1_scores.mean(),
#             test_fscore_std = test_f1_scores.std(),
#         )

#     if os.path.exists("data/parzen_300_splits.csv"):
#         pd.DataFrame([report]).to_csv("data/parzen_300_splits.csv", mode="a", header=False, decimal=",", index=False)
#     else:
#         pd.DataFrame([report]).to_csv("data/parzen_300_splits.csv", mode="a", header=True, decimal=",", index=False)
    
    



[h=2] 1/300 [h=2] 2/300 [h=2] 3/300 [h=2] 4/300 [h=2] 5/300 [h=2] 6/300 [h=2] 7/300 [h=2] 8/300 [h=2] 9/300 [h=2] 10/300 [h=2] 11/300 [h=2] 12/300 [h=2] 13/300 [h=2] 14/300 [h=2] 15/300 [h=2] 16/300 [h=2] 17/300 [h=2] 18/300 [h=2] 19/300 [h=2] 20/300 [h=2] 21/300 [h=2] 22/300 [h=2] 23/300 [h=2] 24/300 [h=2] 25/300 [h=2] 26/300 [h=2] 27/300 [h=2] 28/300 [h=2] 29/300 [h=2] 30/300 [h=2] 31/300 [h=2] 32/300 [h=2] 33/300 [h=2] 34/300 [h=2] 35/300 [h=2] 36/300 [h=2] 37/300 [h=2] 38/300 [h=2] 39/300 [h=2] 40/300 [h=2] 41/300 [h=2] 42/300 [h=2] 43/300 [h=2] 44/300 [h=2] 45/300 [h=2] 46/300 [h=2] 47/300 [h=2] 48/300 [h=2] 49/300 [h=2] 50/300 [h=2] 51/300 [h=2] 52/300 [h=2] 53/300 [h=2] 54/300 [h=2] 55/300 [h=2] 56/300 [h=2] 57/300 [h=2] 58/300 [h=2] 59/300 [h=2] 60/300 [h=2] 61/300 [h=2] 62/300 [h=2] 63/300 [h=2] 64/300 [h=2] 65/300 [h=2] 66/300 [h=2] 67/300 [h=2] 68/300 [h=2] 69/300 [h=2] 70/300 [h=2] 71/300 [h=2] 72/300 [h=2] 73/300 [h=2] 74/300 [h=2] 75/300 [h=2] 76/300 [h=2] 77/300 [h=2] 78

## Treinamento com melhor valor de h

- O objetivo aqui é determinar o desempenho do classificador o valor de h com o qual ovtivemos a melhor medida-f média através validação cruzada. Observar os resultados no arquivo **data/parzen_classification_report_h2.txt**
- Contudo, assim como com a validação cruzada, o modelo obteve 100% em todas as medidas, para todos os valores de h e em ambas as bases de treinamento e validação.

- Valores de h a partir de 30 causaram underflow no trenamento graças à poderação exponecial feita no cálculo da função de densidade do modelo e portanto não puderam ser computadas.

In [54]:
from sklearn.metrics import classification_report

BEST_H = 2
#views = [fac, fou, kar]
views = [fac, fou, kar]

train_views_density_probs = [parzen_density_function(v, BEST_H, partition) 
                             for v in views]

train_views_post_probs = [calc_prob_posteriori(v, Pw) 
                          for v in train_views_density_probs]

y_pred = regra_soma_padrao(train_views_post_probs, Pw)

report = classification_report(y_true, y_pred, output_dict=False, digits=4)

acc_score_parzen = accuracy_score(y_true, y_pred)

with open(f"data/parzen_classification_report_h{BEST_H}.txt", "w") as report_file:
    report_file.write(report)


## Estimativa pontual e intervalo de confiança

In [55]:
p = acc_score_parzen
z_padrao = 1.96 # Confiança de 95%

print("Estimativa pontual: ", p)
print("Intervalo de confiança: ", calc_intervalo_confiança(p, z_padrao))

Estimativa pontual:  1.0
Intervalo de confiança:  (1.0, 1.0)


## Friedman test (p-value)

Conclusão: 
- Nossa hipótese nula $H_0$ é que os resultados entre os classifcadores não tem diferença estatística significativa. Se o pvalue for maior que $0,05$ (nível de significância padrão), aceitamos $H_0$. Caso contrário, rejeitamos.

- O valor de pvalue fora inferior ao nível de significância. Logo, nossa hipótese é rejeitada pelo teste de Friedman e concluímos, portanto, que os classificadores possuem desempenho diferente. Para identificarmos quais deles são diferentes entre si, realizamos um pós-teste


In [2]:
from scipy.stats import friedmanchisquare
import pandas as pd

knn_scores = pd.read_csv("data/knn_300_splits.csv", decimal=",")
gaussian_scores = pd.read_csv("data/gaussian_300_splits.csv", decimal=",")
parzen_scores = pd.read_csv("data/parzen_300_splits.csv", decimal=",")

knn_acc = knn_scores["test_accuracy"].values
gaussian_acc = gaussian_scores["test_accuracy"].values
parzen_acc = parzen_scores["test_accuracy"].values

friedmanchisquare(knn_acc, gaussian_acc, parzen_acc)

FriedmanchisquareResult(statistic=513.6526138279932, pvalue=2.895690222219934e-112)

## Friedman test (pós-teste)

Conclusão:

- Conclui-se que todos os 3 classicadores são diferentes entre si. Os valores na tabela são os p-values calculados com o teste de Nemenyi. Como podemos observar, todos eles são menores que $0,05$.

- É importante lembrar que o modelo gaussiano padrão foi treinado apenas com duas matrizes, fato que dificulta a comparação direta de seus resultados com os outros dois modelos treinamos com todas as 3 matrizes. Essa diferença pode ter resultado numa diferença significativa nos testes de Friedman. 

In [25]:
## Dados extraídos de script em R

data = [[1.1e-13, ""], ["< 2e-16" , "< 2e-16"]]
pd.DataFrame(data, columns="gaussian knn".split(), index="knn parzen".split())

Unnamed: 0,gaussian,knn
knn,1.1e-13,
parzen,< 2e-16,< 2e-16
