# Imports e constantes

In [2]:
import random
from joblib import dump, load
from pathlib import Path
from aux_functions import read_data
from CNN_Net import Net

import tensorflow as tf
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold


from pytorch_metric_learning import distances, losses, reducers, testers
import numpy as np
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

import datetime
now = datetime.datetime.now()
timestamp = now.strftime('%Y-%m-%d')

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
SEED = 42
EXP_NAME = 'v2-ensble'
np.random.seed = SEED
random.seed = SEED
tf.random.set_seed(SEED)
D1_DATA_DIR = Path('../data/patches-filter_by_area/d1/') # dataset 1
D2_DATA_DIR = Path('../data/patches-filter_by_area/d2/') # dataset 2
DATA_DIR = Path('./data/v1-and-fba/')
MODEL_OUT_DIR = Path(f'models/{EXP_NAME}_{timestamp}')
MODEL_OUT_DIR.mkdir(parents=True, exist_ok=False)
TRAIN_SIZE = .7
BATCH_SIZE = 128
LEARNING_RATE = 0.01
EPOCHS = 10
EMBEDDING_SIZE = 128
NUM_CLASSES = 2
IMG_SIZE = (96, 96)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Dados mistos: Dataset v1+v2
v1 = 	melhoria de contraste na imagem inteira **(7341 imagens)** \
v2 =	melhoria de constraste em áreas específicas da imagem **(4277 imagens)** \
data aug = 	rotation_range=120, horizontal_flip=True, vertical_flip=True, zoom_range=0.3 **(10000 imagens)** \
**21390 exemplares da classe negativa**

**TOTAL: 21618 'LEISH' x 21390 'NO-LEISH'**

# Objetivos

- Definir a função de perda com base na escolha de loss_select.
- Inicializar o modelo de rede neural e o otimizador.
- Realizar o treinamento do modelo com os dados de treino por um número especificado de épocas.
- Gerar embeddings para os conjuntos de treino e teste.
- Utilizar esses embeddings para treinar e testar um classificador SVM printando no final, os melhores hiperparams do classificador (C e gamma) e o número de dimensões do PCA.
- Realizar a validação cruzada estratificada em vários splits para avaliar a estabilidade e confiabilidade do modelo.
- Calcular e imprimir a média e o desvio padrão do recall para as duas classes.
- Salvar o estado do modelo treinado para uso futuro, assim como o PCA e o Standard Scaler.

In [16]:
def save_model(loss_select, model):
    '''
    Salva o estado do modelo treinado em um arquivo.

    @params
    loss_select: Nome da função de perda utilizada durante o treinamento.
    model: Modelo de rede neural a ser salvo.

    @returns
    None
    '''
    model_filename = MODEL_OUT_DIR.joinpath(f'model_{loss_select}_{EXP_NAME}_{timestamp}.pth')
    torch.save(model.state_dict(), model_filename)

In [4]:
def train(model, loss_func, device, train_loader, optimizer, epoch):
    '''
    Treina o modelo fornecido com o conjunto de dados de treinamento.

    @params
    model: O modelo de rede neural a ser treinado.
    loss_func: A função de perda a ser utilizada.
    device: O dispositivo no qual o modelo está sendo treinado (CPU ou GPU).
    train_loader: DataLoader contendo os dados de treinamento.
    optimizer: O otimizador usado para ajuste dos parâmetros do modelo.
    epoch: Número atual da época de treinamento.

    @returns
    Retorna a perda após a última iteração de treinamento na época atual.
    '''
    model.train()
    for batch_idx, (data, label) in enumerate(train_loader):
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        embeddings = model(data)
        loss = loss_func(embeddings, label)
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(
                'Epoch {} Iteration {}: Loss = {}'.format(
                    epoch, batch_idx, loss
                )
            )

    return loss

def get_all_embeddings(dataset, model):
    '''
    Gera embeddings para todos os exemplos em um conjunto de dados usando o modelo fornecido.

    @params
    dataset: O conjunto de dados para o qual os embeddings serão gerados.
    model: Modelo de rede neural usado para gerar embeddings.

    @returns
    Retorna uma tupla contendo os embeddings e os rótulos correspondentes dos exemplos do conjunto de dados.
    '''
    tester = testers.BaseTester()
    return tester.get_all_embeddings(dataset, model)

In [5]:
def train_model(loss_select, x_train_loader):
    '''
    Treina a CNN com base na função de perda especificada.

    @params
    loss_select: Uma string que especifica a função de perda a ser utilizada.
    x_train_loader: DataLoader contendo os dados de treinamento.

    @returns
    Retorna o modelo treinado, o otimizador e uma lista contendo as perdas de treinamento para cada época.
    '''
    loss_func = None
    if loss_select == 'Triplet':
        distance = distances.CosineSimilarity()
        reducer = reducers.ThresholdReducer(low=0)
        loss_func = losses.TripletMarginLoss(margin=0.2, distance=distance, reducer=reducer)

    if loss_select == 'NPairs':
        loss_func = losses.NPairsLoss()

    if loss_select == 'CosFace':
        loss_func = losses.CircleLoss()

    if loss_select == 'MultiSimilarity':
        loss_func = losses.MultiSimilarityLoss(alpha = 2, beta = 50, base=0.5)

    model = Net(EMBEDDING_SIZE, (1, *IMG_SIZE)).to(device)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    lss_train = []
    for epoch in range(1, EPOCHS + 1):
        lss = train(model, loss_func, device, x_train_loader, optimizer, epoch)
        lss_train.append(lss)

    return model, optimizer, lss_train

def get_all_embeddings(dataset, model):
    '''
    Gera embeddings para todos os exemplos em um conjunto de dados usando o modelo fornecido.

    @params
    dataset: O conjunto de dados para o qual os embeddings serão gerados.
    model: Modelo de rede neural usado para gerar embeddings.

    @returns
    Retorna uma tupla contendo os embeddings e os rótulos correspondentes dos exemplos do conjunto de dados.
    '''
    tester = testers.BaseTester()
    return tester.get_all_embeddings(dataset, model)

def train_eval_model(loss_select, train_set, test_set, model, C=None, gamma=None, perform_grid_search=False, to_save=False):
    '''
    Realiza Grid Search para hiperparâmetros do SVM se necessário, treina e avalia o modelo SVM.

    @params
    train_set: Conjunto de dados de treinamento.
    test_set: Conjunto de dados de teste.
    model: Modelo de rede neural para gerar embeddings.
    C: Parâmetro de regularização para o modelo SVM. Se None, é determinado via Grid Search.
    gamma: Parâmetro de kernel para o modelo SVM. Se None, é determinado via Grid Search.
    perform_grid_search: Booleano para realizar ou não o Grid Search.
    to_save: Booleano indicando se o modelo SVM e os pré-processadores devem ser salvos.

    @returns
    Retorna os rótulos verdadeiros de teste, as previsões feitas pelo modelo SVM, e os parâmetros C e gamma.
    '''
    print('Computing embeddings...')
    train_embeddings, train_labels = get_all_embeddings(train_set, model)
    test_embeddings, test_labels = get_all_embeddings(test_set, model)

    scaler = StandardScaler()
    pca = PCA(n_components=0.9)
    train_embeddings = pca.fit_transform(scaler.fit_transform(train_embeddings.cpu().numpy()))
    test_embeddings = pca.transform(scaler.transform(test_embeddings.cpu().numpy()))

    if perform_grid_search:
        print('Performing GridSearch...')
        clf = svm.SVC(kernel='rbf')
        params = {'C': [1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
        search = GridSearchCV(clf, params, verbose=3, scoring='recall_macro', n_jobs=-1)
        search.fit(train_embeddings, train_labels.cpu().numpy().ravel())
        C, gamma = search.best_params_['C'], search.best_params_['gamma']
        print(f'Best Params: C={C}, gamma={gamma}')
        print(f'PCA dim ={pca.n_components_}')

    clf = svm.SVC(C=C, gamma=gamma, kernel='rbf')
    clf.fit(train_embeddings, train_labels.cpu().numpy().ravel())
    predictions = clf.predict(test_embeddings)
    y_true_test = test_labels.cpu().numpy().ravel()
    print(f'RESULTS FOR LOSS = [{loss_select}]')
    print(classification_report(y_true_test, predictions))

    if to_save:
        dump(clf, MODEL_OUT_DIR.joinpath(f'clf_{loss_select}_{EXP_NAME}_{timestamp}.joblib'))
        dump(scaler, MODEL_OUT_DIR.joinpath(f'scaler_{loss_select}_{EXP_NAME}_{timestamp}.joblib'))
        dump(pca, MODEL_OUT_DIR.joinpath(f'pca_{loss_select}_{EXP_NAME}_{timestamp}.joblib'))
        print('~ Joblib files saved.')

    return y_true_test, predictions, C, gamma

def cross_val_model(loss_select, train_set, model, best_C, best_gamma, n_folds=15):
    '''
    Realiza a validação cruzada estratificada do modelo SVM com os melhores hiperparâmetros encontrados.

    @params
    loss_select: Nome da função de perda utilizada durante o treinamento.
    train_set: Conjunto de dados de treinamento. (torch.Dataset)
    model: Modelo de rede neural.
    best_C: Melhor valor de C encontrado para o SVM.
    best_gamma: Melhor valor de gamma encontrado para o SVM.
    n_folds: Número de folds para a validação cruzada.

    @returns
    None.
    '''
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True)
    iteration = 1
    recalls_fold_0 = []
    recalls_fold_1 = []

    for train_index, test_index in kf.split(train_set.indices, y_true_train):
        print(f'-------> RUN {iteration}')
        train_subset = torch.utils.data.Subset(train_set.dataset, train_index)
        test_subset = torch.utils.data.Subset(train_set.dataset, test_index)

        y_true_test, y_pred, *_ = train_eval_model(loss_select, train_subset, test_subset, model, C=best_C, gamma=best_gamma)

        report = classification_report(y_true_test, y_pred, output_dict=True)
        recalls_fold_0.append(report['0']['recall'])
        recalls_fold_1.append(report['1']['recall'])

        iteration += 1

    mean_recall_class_0 = np.mean(recalls_fold_0)
    mean_recall_class_1 = np.mean(recalls_fold_1)
    std_recall_class_0 = np.std(recalls_fold_0)
    std_recall_class_1 = np.std(recalls_fold_1)

    print(f'\n Loss ===== {loss_select}')
    print(f'Mean Recall (Class 0): {mean_recall_class_0}')
    print(f'Mean Recall (Class 1): {mean_recall_class_1}')
    print(f'Std Recall (Class 0): {std_recall_class_0}')
    print(f'Std Recall (Class 1): {std_recall_class_1}')

def save_model(loss_select, model):
    model_filename = MODEL_OUT_DIR.joinpath(f'model_{loss_select}_{EXP_NAME}_{timestamp}.pth')
    torch.save(model.state_dict(), model_filename)

## Função principal

In [6]:
def run_full_pipeline(loss_select, x_train, x_test, x_train_loader):
    '''
    Executa todo o pipeline de treinamento, avaliação e validação cruzada do modelo.

    @params
    loss_select: Nome da função de perda a ser utilizada. (string)
    x_train: Conjunto de dados de treinamento. (torch.Dataset)
    x_test: Conjunto de dados de teste. (torch.Dataset)
    x_train_loader: DataLoader para o conjunto de treinamento. (torch.DataLoader)

    @returns
    None
    '''
    model, optim, losses_training = train_model(loss_select, x_train_loader)
    *_, best_C, best_gamma = train_eval_model(loss_select, x_train, x_test, model, perform_grid_search=True, to_save=True)
    cross_val_model(loss_select, x_train, model, best_C, best_gamma)
    save_model(loss_select, model)

In [8]:
x_train, x_test, x_train_loader, x_test_loader, y_true_train, y_true_test = read_data(DATA_DIR, BATCH_SIZE, TRAIN_SIZE)
loss = ['Triplet', 'NPairs', 'CosFace', 'MultiSimilarity']
run_full_pipeline(loss[2], x_train, x_test, x_train_loader)

label format = {'leish': 0, 'no-leish': 1}
train test split proportion = train[30105], test[12903]
leish in training set = 15124
leish in testing set = 6494
Epoch 1 Iteration 0: Loss = 71.53922271728516
Epoch 1 Iteration 100: Loss = 57.454227447509766
Epoch 1 Iteration 200: Loss = 55.692989349365234
Epoch 2 Iteration 0: Loss = 54.74848175048828
Epoch 2 Iteration 100: Loss = 55.07919692993164
Epoch 2 Iteration 200: Loss = 54.35858154296875
Epoch 3 Iteration 0: Loss = 53.963783264160156
Epoch 3 Iteration 100: Loss = 52.793792724609375
Epoch 3 Iteration 200: Loss = 51.48012161254883
Epoch 4 Iteration 0: Loss = 51.56048583984375
Epoch 4 Iteration 100: Loss = 51.264259338378906
Epoch 4 Iteration 200: Loss = 49.38299560546875
Epoch 5 Iteration 0: Loss = 49.887168884277344
Epoch 5 Iteration 100: Loss = 49.3829345703125
Epoch 5 Iteration 200: Loss = 48.67662048339844
Epoch 6 Iteration 0: Loss = 47.674869537353516
Epoch 6 Iteration 100: Loss = 48.48349380493164
Epoch 6 Iteration 200: Loss = 47.

100%|██████████| 941/941 [00:21<00:00, 44.68it/s]
100%|██████████| 404/404 [00:15<00:00, 26.01it/s]


Performing GridSearch...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params: C=10, gamma=0.01
PCA dim =26
RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      6494
           1       0.91      0.88      0.89      6409

    accuracy                           0.90     12903
   macro avg       0.90      0.90      0.90     12903
weighted avg       0.90      0.90      0.90     12903

~ Joblib files saved.
-------> RUN 1
Computing embeddings...


100%|██████████| 879/879 [00:44<00:00, 19.81it/s]
100%|██████████| 63/63 [00:10<00:00,  5.84it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.91      0.94      0.93      1450
           1       0.84      0.77      0.80       557

    accuracy                           0.90      2007
   macro avg       0.88      0.86      0.87      2007
weighted avg       0.89      0.90      0.89      2007

-------> RUN 2
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 46.09it/s]
100%|██████████| 63/63 [00:07<00:00,  8.22it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1449
           1       0.88      0.78      0.83       558

    accuracy                           0.91      2007
   macro avg       0.90      0.87      0.88      2007
weighted avg       0.91      0.91      0.91      2007

-------> RUN 3
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 45.54it/s]
100%|██████████| 63/63 [00:08<00:00,  7.63it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      1420
           1       0.86      0.77      0.81       587

    accuracy                           0.90      2007
   macro avg       0.88      0.86      0.87      2007
weighted avg       0.89      0.90      0.89      2007

-------> RUN 4
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 44.82it/s]
100%|██████████| 63/63 [00:08<00:00,  7.71it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.91      0.94      0.92      1444
           1       0.83      0.75      0.79       563

    accuracy                           0.89      2007
   macro avg       0.87      0.84      0.85      2007
weighted avg       0.88      0.89      0.88      2007

-------> RUN 5
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 45.15it/s]
100%|██████████| 63/63 [00:08<00:00,  7.72it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      1437
           1       0.85      0.75      0.80       570

    accuracy                           0.89      2007
   macro avg       0.88      0.85      0.86      2007
weighted avg       0.89      0.89      0.89      2007

-------> RUN 6
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 45.10it/s]
100%|██████████| 63/63 [00:07<00:00,  8.15it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      1457
           1       0.83      0.77      0.80       550

    accuracy                           0.89      2007
   macro avg       0.87      0.86      0.86      2007
weighted avg       0.89      0.89      0.89      2007

-------> RUN 7
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 44.63it/s]
100%|██████████| 63/63 [00:08<00:00,  7.87it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      1406
           1       0.85      0.76      0.80       601

    accuracy                           0.89      2007
   macro avg       0.88      0.85      0.86      2007
weighted avg       0.89      0.89      0.89      2007

-------> RUN 8
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 45.02it/s]
100%|██████████| 63/63 [00:07<00:00,  7.92it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.91      0.94      0.93      1458
           1       0.82      0.76      0.79       549

    accuracy                           0.89      2007
   macro avg       0.87      0.85      0.86      2007
weighted avg       0.89      0.89      0.89      2007

-------> RUN 9
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 45.23it/s]
100%|██████████| 63/63 [00:08<00:00,  7.84it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      1434
           1       0.85      0.75      0.79       573

    accuracy                           0.89      2007
   macro avg       0.88      0.85      0.86      2007
weighted avg       0.89      0.89      0.89      2007

-------> RUN 10
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 45.56it/s]
100%|██████████| 63/63 [00:08<00:00,  7.67it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      1429
           1       0.86      0.78      0.82       578

    accuracy                           0.90      2007
   macro avg       0.89      0.87      0.88      2007
weighted avg       0.90      0.90      0.90      2007

-------> RUN 11
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 44.51it/s]
100%|██████████| 63/63 [00:08<00:00,  7.70it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      1437
           1       0.86      0.76      0.81       570

    accuracy                           0.90      2007
   macro avg       0.89      0.86      0.87      2007
weighted avg       0.90      0.90      0.90      2007

-------> RUN 12
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 44.46it/s]
100%|██████████| 63/63 [00:08<00:00,  7.65it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.91      0.94      0.93      1465
           1       0.83      0.76      0.79       542

    accuracy                           0.89      2007
   macro avg       0.87      0.85      0.86      2007
weighted avg       0.89      0.89      0.89      2007

-------> RUN 13
Computing embeddings...


100%|██████████| 879/879 [00:18<00:00, 46.31it/s]
100%|██████████| 63/63 [00:07<00:00,  8.06it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      1450
           1       0.85      0.75      0.79       557

    accuracy                           0.89      2007
   macro avg       0.88      0.85      0.86      2007
weighted avg       0.89      0.89      0.89      2007

-------> RUN 14
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 45.32it/s]
100%|██████████| 63/63 [00:08<00:00,  7.57it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      1427
           1       0.85      0.73      0.79       580

    accuracy                           0.89      2007
   macro avg       0.87      0.84      0.86      2007
weighted avg       0.88      0.89      0.88      2007

-------> RUN 15
Computing embeddings...


100%|██████████| 879/879 [00:19<00:00, 45.54it/s]
100%|██████████| 63/63 [00:07<00:00,  7.98it/s]


RESULTS FOR LOSS = [CosFace]
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      1455
           1       0.85      0.77      0.81       552

    accuracy                           0.90      2007
   macro avg       0.88      0.86      0.87      2007
weighted avg       0.90      0.90      0.90      2007


 Loss ===== CosFace
Mean Recall (Class 0): 0.9464393471322047
Mean Recall (Class 1): 0.761307872865164
Std Recall (Class 0): 0.005418160385782211
Std Recall (Class 1): 0.012872774179429749
