In [None]:
!pip install medmnist
!pip install numpy
!pip install scikit-learn
!pip install torch
!pip install torchvision


Collecting medmnist
  Downloading medmnist-3.0.2-py3-none-any.whl.metadata (14 kB)
Collecting fire (from medmnist)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading medmnist-3.0.2-py3-none-any.whl (25 kB)
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=2bd0ef1bc55aaf19901a45ad44b2be422f125217aa349dd1b955c507251ae6ff
  Stored in directory: /root/.cache/pip/wheels/19/39/2f/2d3cadc408a8804103f1c34ddd4b9f6a93497b11fa96fe738e
Successfully built fire
Installing collected packages: fire, medmnist
Successfully installed fire-0.7.0 medmnist-3.0.2


In [None]:
from medmnist import PneumoniaMNIST 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold


##############################
**DATA LOADING AND PROCESSING**
##############################

In [None]:
train_dataset = PneumoniaMNIST(split='train', download=True)
test_dataset = PneumoniaMNIST(split='test', download=True)

# Assume train_dataset.imgs is a numpy array of shape (N, H, W, C)
x_train = train_dataset.imgs.astype('float32') / 255.0
y_train = train_dataset.labels.flatten()

x_test = test_dataset.imgs.astype('float32') / 255.0
y_test = test_dataset.labels.flatten()

# Flatten images for traditional model
x_train_flat = x_train.reshape(x_train.shape[0], -1)
x_test_flat = x_test.reshape(x_test.shape[0], -1)


Downloading https://zenodo.org/records/10519652/files/pneumoniamnist.npz?download=1 to /root/.medmnist/pneumoniamnist.npz


100%|██████████| 4.17M/4.17M [00:01<00:00, 3.29MB/s]


Using downloaded and verified file: /root/.medmnist/pneumoniamnist.npz


In [None]:
# For PyTorch CNN, convert images to tensors.
# Images are grayscale, shape (1, H, W)
def numpy_to_tensor(x):
    if x.ndim == 3:
        # x is (N, H, W) - add channel dimension to get (N, 1, H, W)
        x = np.expand_dims(x, axis=1)
    elif x.ndim == 4 and x.shape[-1] == 1:
        # x is (N, H, W, 1) - convert to (N, 1, H, W)
        x = x.transpose(0, 3, 1, 2)
    # Otherwise, assume it's already in the desired format.
    return torch.tensor(x, dtype=torch.float32)


x_train_tensor = numpy_to_tensor(x_train)
x_test_tensor  = numpy_to_tensor(x_test)


In [None]:
# Split training set into a new training set and a validation set for GA optimization.
# We use validation for computing ensemble fitness.
(x_train_flat_model, x_val_flat, 
 x_train_tensor_model, x_val_tensor, 
 y_train_model, y_val) = train_test_split(x_train_flat, x_train_tensor, y_train, test_size=0.2, random_state=42)


**DECISION TREE MODEL**

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)
dt_model.fit(x_train_flat_model, y_train_model)
dt_val_probs = dt_model.predict_proba(x_val_flat)[:, 1]


**GAUSSIAN NAIVE BAYES MODEL**

In [None]:
from sklearn.naive_bayes import GaussianNB
bayes_model = GaussianNB()
bayes_model.fit(x_train_flat_model, y_train_model)
bayes_val_probs = bayes_model.predict_proba(x_val_flat)[:, 1]


**PYTORCH CNN MODEL**

In [None]:
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        # Input channels = 1, output channels = 32, kernel_size = 3
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2) ## Reduces 28x28 -> 14x14
        self.fc1 = nn.Linear(32 * 14 * 14, 128)
        self.fc2 = nn.Linear(128, 1)
        
    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model = SimpleCNN().to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)


In [None]:
# Create a simple DataLoader for training
batch_size = 32
train_dataset_tensor = torch.utils.data.TensorDataset(x_train_tensor_model, torch.tensor(y_train_model, dtype=torch.float32).unsqueeze(1))
train_loader = torch.utils.data.DataLoader(train_dataset_tensor, batch_size=batch_size, shuffle=True)


In [None]:
cnn_model.train()
n_epochs = 7
for epoch in range(n_epochs):
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = cnn_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()


In [None]:
cnn_model.eval()
with torch.no_grad():
    x_val_tensor = x_val_tensor.to(device)
    cnn_val_outputs = cnn_model(x_val_tensor).cpu().numpy().flatten()
cnn_val_probs = cnn_val_outputs


**Ensembling the predictions & Custom GA for Weight Optimization**

In [None]:
# The ensemble prediction is a weighted average of the three models’ probability outputs.
def ensemble_prediction(weights, rf_probs, cnn_probs, bayes_probs):
    # weights: [w_rf, w_cnn, w_bayes]
    total_weight = sum(weights)
    ensemble_probs = (weights[0]*rf_probs + weights[1]*cnn_probs + weights[2]*bayes_probs) / total_weight
    preds = (ensemble_probs > 0.5).astype(int)
    return preds


In [None]:
# Fitness function: returns accuracy on the validation set.
def fitness(weights, rf_probs, cnn_probs, bayes_probs, true_labels):
    preds = ensemble_prediction(weights, rf_probs, cnn_probs, bayes_probs)
    return accuracy_score(true_labels, preds)


In [None]:
def initialize_population(pop_size):
    # Initialize individuals with 3 weights in the range [0.1, 1.0]
    return [ [random.uniform(0.1, 1.0) for _ in range(3)] for _ in range(pop_size) ]


In [None]:
def tournament_selection(pop, fitnesses, tournament_size=3):
    selected = []
    pop_size = len(pop)
    for _ in range(pop_size):
        # Randomly choose individuals for tournament
        candidates = [ random.randrange(pop_size) for _ in range(tournament_size) ]
        best = max(candidates, key=lambda idx: fitnesses[idx])
        selected.append(pop[best])
    return selected


In [None]:
def blend_crossover(parent1, parent2, alpha=0.5):
    # For each gene, blend between parents
    child1, child2 = [], []
    for gene1, gene2 in zip(parent1, parent2):
        d = abs(gene1 - gene2)
        lower = min(gene1, gene2) - alpha*d
        upper = max(gene1, gene2) + alpha*d
        c1 = random.uniform(lower, upper)
        c2 = random.uniform(lower, upper)
        child1.append(c1)
        child2.append(c2)
    return child1, child2


In [None]:
def mutate(individual, mutation_rate=0.2, mutation_strength=0.1):
    # Mutate each gene with a probability and add a small random value.
    for i in range(len(individual)):
        if random.random() < mutation_rate:
            individual[i] += random.uniform(-mutation_strength, mutation_strength)
            # Keep gene in [0.1, 1.0]
            individual[i] = max(0.1, min(1.0, individual[i]))
    return individual


In [None]:
def fitness_cv(weights, dt_probs, cnn_probs, bayes_probs, true_labels, k=5, reg_lambda=0.01):
    # Prepare cross-validation splits (indices for the validation set)
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    auc_scores = []
    
    # Convert probabilities to numpy arrays if necessary.
    dt_probs = np.array(dt_probs)
    cnn_probs = np.array(cnn_probs)
    bayes_probs = np.array(bayes_probs)
    true_labels = np.array(true_labels)
    
    for train_idx, val_idx in kf.split(true_labels):
        # Get the fold's predictions and labels
        dt_fold = dt_probs[val_idx]
        cnn_fold = cnn_probs[val_idx]
        bayes_fold = bayes_probs[val_idx]
        labels_fold = true_labels[val_idx]
        
        total_weight = sum(weights)
        ensemble_probs = (weights[0] * dt_fold +
                          weights[1] * cnn_fold +
                          weights[2] * bayes_fold) / total_weight
        
        # Compute AUC for this fold; if all labels are the same, skip the fold.
        if len(np.unique(labels_fold)) > 1:
            auc = roc_auc_score(labels_fold, ensemble_probs)
            auc_scores.append(auc)
    
    # Average AUC across folds.
    avg_auc = np.mean(auc_scores) if auc_scores else 0.0
    
    # Regularization: Penalize extreme weights.
    reg_penalty = reg_lambda * np.sum(np.square(weights))
    
    # The fitness could be defined as average AUC minus the penalty.
    fitness_value = avg_auc - reg_penalty
    return fitness_value


In [None]:
def run_ga(dt_probs, cnn_probs, bayes_probs, true_labels, pop_size=20, generations=10):
    population = initialize_population(pop_size)
    best_individual = None
    best_fitness = -np.inf

    for gen in range(generations):
        fitnesses = [fitness_cv(ind, dt_probs, cnn_probs, bayes_probs, true_labels) for ind in population]
        # Track best
        gen_best_idx = np.argmax(fitnesses)
        if fitnesses[gen_best_idx] > best_fitness:
            best_fitness = fitnesses[gen_best_idx]
            best_individual = population[gen_best_idx]
        print(f"Generation {gen}: Best Fitness = {fitnesses[gen_best_idx]:.4f}, Weights = {population[gen_best_idx]}")
        
        # Selection
        selected = tournament_selection(population, fitnesses)
        # Crossover (pair up and create children)
        next_population = []
        for i in range(0, pop_size - 1, 2):
            child1, child2 = blend_crossover(selected[i], selected[i+1])
            next_population.append(child1)
            next_population.append(child2)
        # If odd population, add last individual
        if len(next_population) < pop_size:
            next_population.append(selected[-1])
        # Mutation
        next_population = [ mutate(ind, mutation_rate=0.3, mutation_strength=0.1) for ind in next_population ]
        population = next_population
    return best_individual, best_fitness


In [None]:
# Run the GA on the validation predictions.
# Change between rf and dt here
best_weights, best_acc = run_ga(dt_val_probs, cnn_val_probs, bayes_val_probs, y_val, pop_size=20, generations=30)
print("\nOptimized Ensemble Weights:", best_weights)
print("Validation Accuracy with Optimized Weights:", best_acc)


Generation 0: Best Fitness = 0.9745, Weights = [0.12435297564590883, 0.5421802206771655, 0.5591086898797808]
Generation 1: Best Fitness = 0.9832, Weights = [0.1, 0.6416518843498931, 0.10030349214125137]
Generation 2: Best Fitness = 0.9844, Weights = [0.1, 0.3912848412890316, 0.07153506248963104]
Generation 3: Best Fitness = 0.9856, Weights = [0.1, 0.37653477015506504, 0.01994245092834762]
Generation 4: Best Fitness = 0.9832, Weights = [0.1, 0.3733535879946029, 0.12088589997771951]
Generation 5: Best Fitness = 0.9839, Weights = [0.048369099918866516, 0.5362672468197539, 0.12076797577507653]
Generation 6: Best Fitness = 0.9839, Weights = [0.048369099918866516, 0.5362672468197539, 0.12076797577507653]
Generation 7: Best Fitness = 0.9849, Weights = [0.026560369800360376, 0.5026526107266767, 0.10721263711824836]
Generation 8: Best Fitness = 0.9849, Weights = [0.012016692672770158, 0.5039342662388241, 0.11437954199116138]
Generation 9: Best Fitness = 0.9851, Weights = [0.012354827817499324, 

**FINAL ENSEMBLE PREDICTION ON TEST SET**

In [None]:
# After optimizing, get predictions from each model on the test set.
bayes_test_probs = bayes_model.predict_proba(x_test_flat)[:, 1]
dt_test_probs = dt_model.predict_proba(x_test_flat)[:, 1]

# CNN on test set:
cnn_model.eval()
with torch.no_grad():
    x_test_tensor = x_test_tensor.to(device)
    cnn_test_outputs = cnn_model(x_test_tensor).cpu().numpy().flatten()
cnn_test_probs = cnn_test_outputs

# Use optimized weights for ensemble on test set.
def final_ensemble(weights):
    total_weight = sum(weights)
    #Change value here also
    ensemble_probs = (weights[0]*dt_test_probs + weights[1]*cnn_test_probs + weights[2]*bayes_test_probs) / total_weight
    preds = (ensemble_probs > 0.5).astype(int)
    return preds

test_preds = final_ensemble(best_weights)
test_acc = accuracy_score(y_test, test_preds)
print("\nTest Accuracy with Optimized Ensemble:", test_acc)



Test Accuracy with Optimized Ensemble: 0.8237179487179487


In [23]:
from sklearn.metrics import f1_score, confusion_matrix

# Assume these probability outputs have been computed on the test set:
# rf_test_probs, bayes_test_probs, cnn_test_probs, and ensemble_preds from final_ensemble(best_weights)
# For example, if using your final_ensemble function:
ensemble_preds = final_ensemble(best_weights)



dt_preds    = (dt_test_probs > 0.5).astype(int)
bayes_preds = (bayes_test_probs > 0.5).astype(int)
cnn_preds   = (cnn_test_probs > 0.5).astype(int)

# Compute F1 Scores.
dt_f1       = f1_score(y_test, dt_preds)
bayes_f1    = f1_score(y_test, bayes_preds)
cnn_f1      = f1_score(y_test, cnn_preds)
ensemble_f1 = f1_score(y_test, ensemble_preds)

# Compute Confusion Matrices.
dt_cm      = confusion_matrix(y_test, dt_preds)
bayes_cm    = confusion_matrix(y_test, bayes_preds)
cnn_cm      = confusion_matrix(y_test, cnn_preds)
ensemble_cm = confusion_matrix(y_test, ensemble_preds)

# Print results.
print("=== F1 Scores ===")
print('Decision Tree: ', dt_f1)
print("Bayesian Model:", bayes_f1)
print("CNN:", cnn_f1)
print("Ensemble:", ensemble_f1)

print("\n=== Confusion Matrices ===")
print('Decision Tree:\n', dt_cm )
print("\nBayesian Model:\n", bayes_cm)
print("\nCNN:\n", cnn_cm)
print("\nEnsemble:\n", ensemble_cm)


=== F1 Scores ===
Decision Tree:  0.8574739281575897
Bayesian Model: 0.8667496886674969
CNN: 0.8660714285714286
Ensemble: 0.8755656108597285

=== Confusion Matrices ===
Decision Tree:
 [[131 103]
 [ 20 370]]

Bayesian Model:
 [[169  65]
 [ 42 348]]

CNN:
 [[116 118]
 [  2 388]]

Ensemble:
 [[127 107]
 [  3 387]]
