In [1]:

from datetime import datetime
import tensorflow as tf
import keras
from keras import layers
from keras.datasets import mnist
from keras import backend as K
import numpy as np
from numpy import random as rand
import random
import threading


2024-05-11 16:35:21.942432: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-11 16:35:21.943998: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 16:35:21.969659: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-11 16:35:21.969692: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-11 16:35:21.970409: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [2]:
num_classes = 10
img_rows, img_cols = 28, 28

(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

def build():
    inputs = keras.Input(shape=(28, 28, 1))
    x = layers.Conv2D(32, kernel_size=(3, 3), activation="relu", padding="same")(inputs)
    x = layers.MaxPooling2D(pool_size=(2, 2), strides=2)(x)
    x = layers.Conv2D(32, kernel_size=(3, 3), activation="relu", padding="same")(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), strides=2)(x)
    x = layers.Flatten()(x)
    x = layers.Dense(128, activation="relu")(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)
    return keras.Model(inputs=inputs, outputs=outputs)


x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


In [8]:
def sgd_fit(model, batch_size = 32, learning_rate = 0.001 , x_train = x_train, y_train = y_train, validation_data=(x_test, y_test)):
    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size)
    validation_data = tf.data.Dataset.from_tensor_slices(validation_data).batch(batch_size)

    optimizer = keras.optimizers.SGD(
        learning_rate=learning_rate
    )
    loss_fn = keras.losses.CategoricalCrossentropy(from_logits=False)

    val_accuracy_metric = keras.metrics.CategoricalAccuracy()

    @tf.function
    def run_train_step(images, labels):
        with tf.GradientTape() as tape:
            logits = model(images)
            loss = loss_fn(labels, logits)
            if model.losses:
                loss += tf.math.add_n(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    @tf.function
    def run_val_step(images, labels):
        logits = model(images)
        val_accuracy_metric.update_state(labels, logits)

    best_val_accuracy = 0.0
    timedelt = 0.0
    i=0
    init_time = datetime.now()
    for epoch in range(2):
        # print(f"Epoch: {int(epoch)+1}")

        for images, labels in train_ds:
            run_train_step(images, labels)

        val_accuracy_metric.reset_states()

        for images, labels in validation_data:
            run_val_step(images, labels)

        val_accuracy = float(val_accuracy_metric.result().numpy())

        # print(f"Validation Accuracy: {val_accuracy}")
        if val_accuracy < best_val_accuracy:
            i = i + 1    
            
        best_val_accuracy = max(best_val_accuracy, val_accuracy)

        if i > 3: # Early stopping criteria
            break
        
    end_time = datetime.now()
    timedelt = round((end_time - init_time).total_seconds(), 4)
    

    return float(best_val_accuracy), timedelt


In [13]:
def adam_fit(model, batch_size = 32, learning_rate = 0.001, beta_1 = 0.9, beta_2 = 0.99, x_train = x_train, y_train = y_train, validation_data=(x_test, y_test)):
    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size)
    validation_data = tf.data.Dataset.from_tensor_slices(validation_data).batch(batch_size)

    optimizer = keras.optimizers.Adam(
        learning_rate = learning_rate,
        beta_1 = beta_1,
        beta_2 = beta_2,
    )
    
    loss_fn = keras.losses.CategoricalCrossentropy(from_logits=False)

    val_accuracy_metric = keras.metrics.CategoricalAccuracy()

    @tf.function
    def run_train_step(images, labels):
        with tf.GradientTape() as tape:
            logits = model(images)
            loss = loss_fn(labels, logits)
            if model.losses:
                loss += tf.math.add_n(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    @tf.function
    def run_val_step(images, labels):
        logits = model(images)
        val_accuracy_metric.update_state(labels, logits)

    best_val_accuracy = 0.0
    timedelt = 0.0
    i=0
    init_time = datetime.now()
    for epoch in range(2):
        # print(f"Epoch: {int(epoch)+1}")

        for images, labels in train_ds:
            run_train_step(images, labels)

        val_accuracy_metric.reset_states()

        for images, labels in validation_data:
            run_val_step(images, labels)

        val_accuracy = float(val_accuracy_metric.result().numpy())

        # print(f"Validation Accuracy: {val_accuracy}")
        if val_accuracy < best_val_accuracy:
            i = i + 1    
            
        best_val_accuracy = max(best_val_accuracy, val_accuracy)

        if i > 3: # Early stopping criteria
            break
        
    end_time = datetime.now()
    timedelt = end_time - init_time
    

    return best_val_accuracy, timedelt



In [25]:
#GA Hyperparameters
num_gens = 5
population_num = 8 #Must be a number that is divisible by 4
mutation_prob = 0.1


In [28]:
class SGD_GA:
    def __init__(self, num_gens, num_pop, mut_prob):
        self.num_gens = num_gens
        self.num_pop = num_pop
        self.mutation_chance = rand.poisson(lam=mut_prob, size=2*num_pop*num_gens)
        self.genes = []
        self.pop_eval = {}
        self.ranked_keys = []
        self.mutation_index = 0
        
    
    def generate_population(self):
        batch_sizes = 32+(32*rand.randint(low = 0, high = 15, size = self.num_pop)) #random multiples of 32 from 32 to 512
        learning_rates = np.logspace(start = 0.0001, stop = 0.01, num = self.num_pop, endpoint=0.01)-1 # evenly spaced on a log scale, technically not random but random enough for my experiment
        rand.shuffle(learning_rates)
        
        for i in range(self.num_pop):
            self.genes.append((batch_sizes[i], learning_rates[i]))  
            
        
    
    def calculate_fitness(self):
        self.pop_eval = {}
        self.ranked_keys = []
        for i in range(self.num_pop):
            val_score, train_time = sgd_fit(model = build(), batch_size=self.genes[i][0], learning_rate=self.genes[i][1])
            self.pop_eval[val_score] = (self.genes[i][0], self.genes[i][1], train_time)
        
    
    def rankNelim(self):        
        self.ranked_keys = list(self.pop_eval.keys())
        self.ranked_keys.sort(reverse=True)
        
        self.genes = []
        
        #Populate with highest performing genes
        for i in range(0,int(self.num_pop/4),1):
            self.genes.append((self.pop_eval[self.ranked_keys[i]][0], self.pop_eval[self.ranked_keys[i]][1]))
        
    
    def crossover(self):
        #Populate with crossovers of highest performing genes(HARD CODED SO MUST BE ALTERED IF GENE STRUCTURE IS CHANGED[E.G. NEW HYPERPARAMETERS ARE INTRODUCED])
        for i in range(0, int(self.num_pop/4), 2): # cross over every 2 genes' chromosomes and push it to the list of genes
            self.genes.append((self.genes[i][0], self.genes[i+1][1]))
            self.genes.append((self.genes[i+1][0], self.genes[i][1]))
            
        
        #New values to replace low performing genes
        batch_sizes = 32+(32*rand.randint(low = 0, high = 15, size = int(self.num_pop/2)))
        learning_rates = np.logspace(start = 0.001, stop = 0.01, num = int(self.num_pop/2), endpoint=0.01)-1 
        rand.shuffle(learning_rates)
        
        for i in range(int(self.num_pop/2)):
            self.genes.append((batch_sizes[i], learning_rates[i]))
        
    
    def mutate(self):
        mut_gene = []
        
        for i in range(int(self.num_pop)):
            for j in range(2):
                if self.mutation_chance[self.mutation_index] > 0:
                    if j == 0:
                        mut_gene = list(self.genes[i])
                        mut_gene[j] = 32+(32*rand.randint(low = 0, high = 15))
                        self.genes.pop(i)
                        self.genes.append(mut_gene)
                        
                    
                    if j == 1:
                        mut_gene = list(self.genes[i])
                        mut_gene[j] = random.choice([0.95, 1.05]) * (self.genes[i][j])
                        self.genes.pop(i)
                        self.genes.append(mut_gene)
                    
                
                self.mutation_index = self.mutation_index + 1
                
            
        
    
    def GA(self):
        print("Starting GA\n")
        self.generate_population()
        
        for i in range(self.num_gens):
            init_time = datetime.now()
            print(f"Generation {i+1} started")
            self.calculate_fitness()
            self.rankNelim()
            self.crossover()
            self.mutate()
            print(f"Best")
            print(f"Accuracy: {self.ranked_keys[0]}")
            print(f"Batch Size: {self.pop_eval[self.ranked_keys[0]][0]}")
            print(f"Learning Rate: {self.pop_eval[self.ranked_keys[0]][1]}")
            print(f"Time: {self.pop_eval[self.ranked_keys[0]][2]}\n\n\n")
            print(f"Generation Time: {round((datetime.now()-init_time).total_seconds(), 4)}\n\n\n")
        
        self.calculate_fitness()
        self.rankNelim()
        self.mutation_index = 0
        
        
        return self.ranked_keys[0], self.pop_eval[self.ranked_keys[0]][0], self.pop_eval[self.ranked_keys[0]][1], self.pop_eval[self.ranked_keys[0]][2]
    

In [29]:
sgd_ga = SGD_GA(num_pop = population_num, num_gens = num_gens, mut_prob = mutation_prob)
top_val, top_batchsize, top_learningrate, top_traintime = sgd_ga.GA()

Starting GA

Generation 1 started
Best
Accuracy: 0.9714999794960022
Batch Size: 32
Learning Rate: 0.016649914753727568
Time: 23.9106



Generation Time: 161.3771



Generation 2 started
Best
Accuracy: 0.9757000207901001
Batch Size: 32
Learning Rate: 0.019966045204601546
Time: 24.3951



Generation Time: 166.5048



Generation 3 started
Best
Accuracy: 0.9754999876022339
Batch Size: 32
Learning Rate: 0.019966045204601546
Time: 24.5349



Generation Time: 170.9543



Generation 4 started
Best
Accuracy: 0.9757000207901001
Batch Size: 32
Learning Rate: 0.0232929922807541
Time: 24.1636



Generation Time: 171.1751



Generation 5 started
Best
Accuracy: 0.9782000184059143
Batch Size: 32
Learning Rate: 0.0232929922807541
Time: 24.1182



Generation Time: 164.2066





In [30]:
print(top_val)
print(top_batchsize)
print(top_learningrate)
print(top_traintime)
print(sgd_ga.pop_eval)

0.9760000109672546
32
0.0232929922807541
21.975
{0.9760000109672546: (32, 0.0232929922807541, 21.975), 0.9754999876022339: (32, 0.020964347464831622, 23.2916), 0.9732999801635742: (32, 0.020964347464831622, 22.3228), 0.9746000170707703: (32, 0.0232929922807541, 24.803), 0.8787000179290771: (64, 0.0023052380778996184, 25.3439), 0.8959000110626221: (128, 0.009252886076684508, 19.9707), 0.8716999888420105: (480, 0.016248692870695525, 13.9792), 0.8888999819755554: (480, 0.022128342666716393, 13.6163)}


In [31]:
class AdaM_GA():
    def __init__(self, num_gens, num_pop, mut_prob):
        self.num_gens = num_gens
        self.num_pop = num_pop
        self.mutation_chance = rand.poisson(mut_prob, 4*num_pop*num_gens)
        self.genes = []
        self.pop_eval = {}
        self.ranked_keys = []
        self.mutation_index = 0
    
    def generate_population(self):
        batch_sizes = 32+(32*rand.randint(low = 0, high = 15, size = self.num_pop)) #random multiples of 32 from 32 to 512
        
        learning_rates = np.logspace(start = 0.0001, stop = 0.01, num = self.num_pop, endpoint=0.01)-1 # evenly spaced on a log scale, technically not random but random enough for my experiment
        rand.shuffle(learning_rates)
        
        beta_1 = np.logspace(start = 0.27, stop = 0.3, num = self.num_pop, endpoint=False)-1
        rand.shuffle(beta_1)
        
        beta_2 = np.logspace(start = 0.29, stop = 0.3, num = self.num_pop, endpoint=False)-1
        rand.shuffle(beta_2)
        
        for i in range(self.num_pop):
            self.genes.append((batch_sizes[i], learning_rates[i], beta_1[i], beta_2[i]))  

    
    def calculate_fitness(self):
        self.pop_eval = {}
        self.ranked_keys = []
        
        for i in range(self.num_pop):
            val_score, train_time = adam_fit(model = build(), batch_size=self.genes[i][0], learning_rate= self.genes[i][1], beta_1 = self.genes[i][2], beta_2 = self.genes[i][3])
            self.pop_eval[val_score] = (self.genes[i][0], self.genes[i][1], self.genes[i][2], self.genes[i][3], train_time)
    
    def rankNelim(self):        
        self.ranked_keys = list(self.pop_eval.keys())
        self.ranked_keys.sort(reverse = True)
        
        self.genes = []
        
        #Populate with highest performing genes
        for i in range(0, int(self.num_pop/4)):
            self.genes.append((self.pop_eval[self.ranked_keys[i]][0], self.pop_eval[self.ranked_keys[i]][1], self.pop_eval[self.ranked_keys[i]][2], self.pop_eval[self.ranked_keys[i]][3]))

    
    def crossover(self):
        #Crossover and append highest performing genes
        for i in range(0, int(self.num_pop/4), 2): # cross over every 2 genes' chromosomes and push it to the list of genes
            self.genes.append((self.genes[i][0], self.genes[i+1][1], self.genes[i][2], self.genes[i+1][3]))
            self.genes.append((self.genes[i+1][0], self.genes[i][1], self.genes[i+1][2], self.genes[i][3]))
        
        #New values to replace low performing genes
        batch_sizes = 32+(32*rand.randint(low = 0, high = 15, size = int(self.num_pop/2)))
        
        learning_rates = np.logspace(start = 0.001, stop = 0.01, num = int(self.num_pop/2), endpoint=False)-1 
        rand.shuffle(learning_rates)
        
        beta_1 = np.logspace(start = 0.2, stop = 0.3, num = int(self.num_pop/2), endpoint=False)-1 
        rand.shuffle(beta_1)
        
        beta_2 = np.logspace(start = 0.29, stop = 0.3, num = int(self.num_pop/2), endpoint=False)-1 
        rand.shuffle(beta_2)
        
        for i in range(int(self.num_pop/2)):
            self.genes.append((batch_sizes[i], learning_rates[i], beta_1[i], beta_2[i]))
    
    def mutate(self):
        for i in range(int(self.num_pop)):
            for j in range(4):
                if self.mutation_chance[self.mutation_index] > 0:
                    if j == 0:
                        mut_gene = list(self.genes[i])
                        mut_gene[j] = 32+(32*rand.randint(low = 0, high = 15))
                        self.genes.pop(i)
                        self.genes.append(mut_gene)
                        
                    
                    if j == 1:
                        mut_gene = list(self.genes[i])
                        mut_gene[j] = random.choice([0.95, 1.05]) * (self.genes[i][j])
                        self.genes.pop(i)
                        self.genes.append(mut_gene)
                    
                
                self.mutation_index = self.mutation_index + 1
    
    def GA(self):
        self.generate_population()
        print("Starting GA\n")
        
        for i in range(self.num_gens):
            init_time = datetime.now()
            print(f"Generation {i+1} started")
            self.calculate_fitness()
            self.rankNelim()
            self.crossover()
            self.mutate()
            print(f"Best")
            print(f"Accuracy: {self.ranked_keys[0]}")
            print(f"Batch Size: {self.pop_eval[self.ranked_keys[0]][0]}")
            print(f"Learning Rate: {self.pop_eval[self.ranked_keys[0]][1]}")
            print(f"Beta_1: {self.pop_eval[self.ranked_keys[0]][2]}")
            print(f"Beta_2: {self.pop_eval[self.ranked_keys[0]][3]}")
            print(f"Time: {self.pop_eval[self.ranked_keys[0]][4]}\n\n\n")
            print(f"Generation Time: {round((datetime.now()-init_time).total_seconds(), 4)}\n\n\n")
        
        self.calculate_fitness()
        self.rankNelim()
        self.mutation_index = 0
        
        return self.ranked_keys[0], self.pop_eval[self.ranked_keys[0]][0], self.pop_eval[self.ranked_keys[0]][1], self.pop_eval[self.ranked_keys[0]][2], self.pop_eval[self.ranked_keys[0]][3], self.pop_eval[self.ranked_keys[0]][4]
    

In [32]:
adam_ga = AdaM_GA(num_pop = population_num, num_gens = num_gens, mut_prob = mutation_prob)
tops_val, tops_batchsize, tops_learningrate, top_beta1, top_beta2, tops_traintime = adam_ga.GA()

Starting GA

Generation 1 started
Best
Accuracy: 0.9866999983787537
Batch Size: 320
Learning Rate: 0.0034928575720136745
Beta_1: 0.9611011754760475
Beta_2: 0.9781080029332396
Time: 0:00:16.335079



Generation Time: 161.0203



Generation 2 started
Best
Accuracy: 0.987500011920929
Batch Size: 320
Learning Rate: 0.0034928575720136745
Beta_1: 0.9611011754760475
Beta_2: 0.9781080029332396
Time: 0:00:15.913208



Generation Time: 137.9849



Generation 3 started
Best
Accuracy: 0.9868000149726868
Batch Size: 320
Learning Rate: 0.0075114721791962324
Beta_1: 0.9611011754760475
Beta_2: 0.9724227361148536
Time: 0:00:16.067331



Generation Time: 148.1605



Generation 4 started
Best
Accuracy: 0.9887999892234802
Batch Size: 320
Learning Rate: 0.0034928575720136745
Beta_1: 0.9611011754760475
Beta_2: 0.9781080029332396
Time: 0:00:16.731608



Generation Time: 149.9056



Generation 5 started
Best
Accuracy: 0.9868999719619751
Batch Size: 416
Learning Rate: 0.0034928575720136745
Beta_1: 0.9611011754

In [33]:
print(tops_val)
print(tops_batchsize)
print(tops_learningrate)
print(top_beta1)
print(top_beta2)
print(tops_traintime)


0.9871000051498413
320
0.0034928575720136745
0.9611011754760475
0.9781080029332396
0:00:16.815866
