#### Imports and Constants

In [1]:
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from joblib import dump, load
from pytorch_metric_learning import testers
from scipy.stats import mode
from sklearn.metrics import classification_report
from torchvision import datasets, transforms

# Fraction of the dataset that read_data() assigns to the training split.
TRAIN_SIZE = .7
# Mini-batch size of both DataLoaders created by read_data().
BATCH_SIZE = 128
# NOTE(review): LEARNING_RATE, EPOCHS and NUM_CLASSES are not referenced by
# any cell visible in this notebook; presumably kept from the training
# notebook. Confirm before removing.
LEARNING_RATE = 0.01
EPOCHS = 10
# Output width of Net's final linear layer.
EMBEDDING_SIZE = 128
NUM_CLASSES = 2
# (channels, height, width) of one input sample; Net uses it to size fc1.
IMG_SIZE = (1, 96, 96)
# Seed for torch's global RNG so that "Restart & Run All" reproduces the same
# random train/test split instead of a different one on every run.
SEED = 42

torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Feature extractor class (CNN)

In [2]:
class Net(nn.Module):
    """CNN feature extractor that maps an image batch to embeddings.

    Pipeline: two 3x3, stride-1 convolutions (32 then 64 output channels),
    each followed by ReLU, then 2x2 max pooling, channel dropout, flattening
    and a single linear layer producing ``embedding_size`` values per sample.

    Args:
        embedding_size: Number of output features of the final linear layer.
        input_size: Optional ``(channels, height, width)`` of one input
            sample. When omitted, the notebook-level ``IMG_SIZE`` is used.
    """

    def __init__(self, embedding_size, input_size=None):
        super(Net, self).__init__()
        if input_size is None:
            input_size = IMG_SIZE
        self.conv1 = nn.Conv2d(input_size[0], 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        # dropout2 is never applied in forward(); it is kept only so the
        # module layout stays the same as before.
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc_input_size = self.calculate_fc_input_size(input_size)
        self.fc1 = nn.Linear(self.fc_input_size, embedding_size)

    def _features(self, x):
        """Run the convolutional stack and flatten each sample to a vector."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        return torch.flatten(x, 1)

    def forward(self, x):
        """Return the embedding of every sample in the batch ``x``."""
        return self.fc1(self._features(x))

    def calculate_fc_input_size(self, input_size):
        """Return the flattened feature count for one sample of ``input_size``.

        A random probe batch of one sample is pushed through the same stack
        that ``forward`` uses, so the two can never disagree.
        """
        # Create the probe on the device of the convolution weights so the
        # method also works after the model has been moved with .to(device).
        probe = torch.randn(1, *input_size, device=self.conv1.weight.device)
        # Only the resulting shape matters; skip autograd bookkeeping.
        with torch.no_grad():
            return self._features(probe).size(1)

#### Function for simultaneous reading and splitting data from folders
Expected folder structure:
<br>

dataset<br>
...|___ leish<br>
...|___ no-leish<br>


In [3]:
def read_data(data_path, seed=None):
    """Load an image-folder dataset and split it randomly into train/test.

    Every image is converted to single-channel grayscale, resized to the
    spatial size given by ``IMG_SIZE`` and turned into a tensor. A random
    permutation of the sample indices is cut at ``TRAIN_SIZE`` to form the
    two subsets. A short summary of the split is printed.

    Args:
        data_path: Root folder with one sub-folder per class.
        seed: Optional integer. When given, the split permutation is drawn
            from a dedicated generator seeded with it, so the split is
            reproducible. When ``None`` the global torch RNG is used.

    Returns:
        Tuple ``(train_dataset, test_dataset, train_loader, test_loader,
        train_true_labels, test_true_labels)``. The label lists follow the
        order of the samples in the corresponding subset.
    """
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        # Keep the resize target tied to the shape Net was sized for.
        transforms.Resize(IMG_SIZE[1:]),
        transforms.ToTensor(),
    ])
    dataset = datasets.ImageFolder(root=data_path, transform=transform)

    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    train_size = int(TRAIN_SIZE * len(dataset))
    indices = torch.randperm(len(dataset), generator=generator).tolist()
    train_indices, test_indices = indices[:train_size], indices[train_size:]
    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    test_dataset = torch.utils.data.Subset(dataset, test_indices)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # True labels of each subset, read straight from the dataset metadata.
    train_true_labels = [dataset.targets[idx] for idx in train_indices]
    test_true_labels = [dataset.targets[idx] for idx in test_indices]

    # Count from the label lists instead of iterating the subsets: iterating
    # would decode and transform every image only to discard it.
    leish_idx = dataset.class_to_idx.get('leish', 0)
    leish_train = train_true_labels.count(leish_idx)
    leish_test = test_true_labels.count(leish_idx)

    print(f'label format = {dataset.class_to_idx}')
    print(f'train test split proportion = train[{len(train_dataset)}], test[{len(test_dataset)}]')
    print(f'leish in training set = {leish_train}')
    print(f'leish in testing set = {leish_test}')

    return train_dataset, test_dataset, train_loader, test_loader, train_true_labels, test_true_labels

#### Loading pre-trained models with their respective StandardScaler, PCA and SVM
ALL MODELS SHOULD BE USED FROM \v1-ensble_2023-11-07\ <br>

zip file name = v1-ensble_2023-11-07.zip <br>
!gdown 1hVZE8yOAL6txiscunA5egvErwfBxZaEc

In [4]:
# All artifacts of one ensemble release live in a single folder. Building the
# paths with pathlib avoids backslashes inside string literals, which are
# invalid escape sequences and only resolve as separators on Windows.
MODEL_VERSION = 'v1-ensble_2023-11-07'
MODEL_DIR = Path('./models') / MODEL_VERSION


def load_pipeline(loss_name):
    """Load the feature extractor and its classifier, scaler and PCA.

    Args:
        loss_name: Tag used in the artifact file names, e.g. ``'Triplet'``.

    Returns:
        Tuple ``(model, clf, scaler, pca)`` with the model already moved to
        ``device`` and switched to evaluation mode.
    """
    suffix = f'{loss_name}_{MODEL_VERSION}'

    model = Net(EMBEDDING_SIZE).to(device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    state_dict = torch.load(MODEL_DIR / f'model_{suffix}.pth', map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    # SECURITY: joblib.load (like torch.load) unpickles arbitrary objects.
    # Only load artifacts obtained from a trusted source.
    clf = load(MODEL_DIR / f'clf_{suffix}.joblib')
    scaler = load(MODEL_DIR / f'scaler_{suffix}.joblib')
    pca = load(MODEL_DIR / f'pca_{suffix}.joblib')
    return model, clf, scaler, pca


triplet_model, triplet_clf, triplet_scaler, triplet_pca = load_pipeline('Triplet')

################################

cosface_model, cosface_clf, cosface_scaler, cosface_pca = load_pipeline('CosFace')

################################

multisim_model, multisim_clf, multisim_scaler, multisim_pca = load_pipeline('MultiSimilarity')

### Model ensembling functions (technique: majority voting)

In [5]:
def get_all_embeddings(dataset, model):
    """Embed every sample of ``dataset`` with ``model``.

    Thin wrapper that builds a fresh ``testers.BaseTester`` and forwards the
    call to its ``get_all_embeddings``; the tester's result is returned as is.
    The caller in this notebook unpacks it as a pair and uses the first
    element as the embeddings.
    """
    return testers.BaseTester().get_all_embeddings(dataset, model)

def get_predictions(x_test, models, classifiers, scalers, pcas):
    """Return one prediction array per ensemble member.

    For each position ``i`` the embeddings of ``x_test`` are extracted with
    ``models[i]``, transformed by ``scalers[i]`` and ``pcas[i]``, and then
    classified by ``classifiers[i]``.

    Args:
        x_test: Dataset to embed and classify.
        models: Feature extractors, one per ensemble member.
        classifiers: Fitted classifiers, aligned with ``models``.
        scalers: Fitted scalers, aligned with ``models``.
        pcas: Fitted PCA transformers, aligned with ``models``.

    Returns:
        List with the output of ``clf.predict`` for every member, in order.

    Raises:
        ValueError: If the four component collections differ in length.
    """
    models = list(models)
    classifiers = list(classifiers)
    scalers = list(scalers)
    pcas = list(pcas)

    # zip() would silently drop members when a list is shorter than the
    # others, which would change the vote without any visible error.
    sizes = (len(models), len(classifiers), len(scalers), len(pcas))
    if len(set(sizes)) != 1:
        raise ValueError(
            'models, classifiers, scalers and pcas must have the same length, '
            f'got {sizes}'
        )

    predictions = []
    for model, clf, scaler, pca in zip(models, classifiers, scalers, pcas):
        test_embed, _ = get_all_embeddings(x_test, model)
        test_embed_scaled = scaler.transform(test_embed.cpu().numpy())
        test_embed_pca = pca.transform(test_embed_scaled)

        preds = clf.predict(test_embed_pca)
        predictions.append(preds)

    return predictions

def combine_predictions(predictions):
    """Combine per-model predictions by majority vote.

    Args:
        predictions: Sequence of equally sized prediction arrays, one per
            model (first axis = model, remaining axes = samples).

    Returns:
        Array holding the most frequent value across models for each sample,
        shaped like a single model's prediction array.
    """
    stacked = np.asarray(predictions)
    # axis=0 runs the vote across models, independently for every sample.
    votes = mode(stacked, axis=0)[0]
    # Older SciPy releases keep the reduced axis (result shaped (1, n)),
    # newer ones drop it. Normalise so callers always get one vote per sample.
    return np.asarray(votes).reshape(stacked.shape[1:])

#### Start flow

In [6]:
# Root folder with one sub-folder per class  <--- REPLACE WITH YOUR DATASET
data_path = Path('./data-9500train/all_patches/')

# read_data() returns the two subsets, their loaders and the true labels.
(
    x_train,
    x_test,
    x_train_loader,
    x_test_loader,
    y_true_train,
    y_true_test,
) = read_data(data_path)

label format = {'leish': 0, 'no-leish': 1}
train test split proportion = train[7357], test[3154]
leish in training set = 5152
leish in testing set = 2189


In [7]:
# One (model, classifier, scaler, pca) tuple per ensemble member; the four
# parallel lists expected by get_predictions() are derived from it.
ensemble_members = [
    (triplet_model, triplet_clf, triplet_scaler, triplet_pca),
    (cosface_model, cosface_clf, cosface_scaler, cosface_pca),
    (multisim_model, multisim_clf, multisim_scaler, multisim_pca),
]
models, classifiers, scalers, pcas = (list(column) for column in zip(*ensemble_members))

# Per-model predictions on the test subset, then the majority vote.
individual_preds = get_predictions(x_test, models, classifiers, scalers, pcas)
ensemble_prediction = combine_predictions(individual_preds)

print('---- Ensemble result for Triplet, Cosface, Multisimilarity')
report = classification_report(
    y_true_test,
    ensemble_prediction,
    target_names=['Leish', 'No Leish'],
)
print(report)

100%|██████████| 99/99 [00:13<00:00,  7.18it/s]
100%|██████████| 99/99 [00:07<00:00, 13.16it/s]
100%|██████████| 99/99 [00:07<00:00, 13.71it/s]


---- Ensemble result for Triplet, Cosface, Multisimilarity
              precision    recall  f1-score   support

       Leish       0.97      0.94      0.95      2189
    No Leish       0.87      0.93      0.90       965

    accuracy                           0.94      3154
   macro avg       0.92      0.93      0.93      3154
weighted avg       0.94      0.94      0.94      3154

