<a href="https://colab.research.google.com/github/tevrat-aksoy/GlobalAIHubDLCourse/blob/main/Homeworks/Homework_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Design Choices in Convolutional Neural Networks

###  Importing packages

In [1]:
import numpy as np
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras import backend as K
from keras.preprocessing import image
from keras.applications.mobilenet import MobileNet
from keras.applications.vgg16 import preprocess_input, decode_predictions
from keras.models import Model
import timeit

import warnings
warnings.filterwarnings('ignore')

### Preparing Dataset

In [2]:
batch_size = 128
num_classes = 10
epochs = 2

# input image dimensions
img_rows, img_cols = 28, 28

# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


## Part 1: Influence of convolution size

Try the models with different convolution sizes 5x5, 7x7 and 9x9 etc.

Analyze the number of model parameters, accuracy and training time

### Model with (3 x 3) Convolution

In [3]:
K.clear_session()
start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 26, 26, 8)         80        
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 24, 24, 16)        1168      
_________________________________________________________________
flatten (Flatten)            (None, 9216)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                294944    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                330       
Total params: 296,522
Trainable params: 296,522
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 67.28864693800006 seconds


### Try models with different Convolution sizes

In [4]:
# Write your code here. Use the same architecture as above. 

K.clear_session()
start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(5, 5), activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (5, 5), activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 24, 24, 8)         208       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 20, 20, 16)        3216      
_________________________________________________________________
flatten (Flatten)            (None, 6400)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                204832    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                330       
Total params: 208,586
Trainable params: 208,586
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 89.65510916699998 seconds


In [5]:
# Write your code here. Use the same architecture as above. 

K.clear_session()
start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(7, 7), activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (7, 7), activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds") 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 22, 22, 8)         400       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 16, 16, 16)        6288      
_________________________________________________________________
flatten (Flatten)            (None, 4096)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                131104    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                330       
Total params: 138,122
Trainable params: 138,122
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 105.62451993400009 seconds


In [6]:
# Write your code here. Use the same architecture as above. 
# Write your code here. Use the same architecture as above. 

K.clear_session()
start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(9, 9), activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (9, 9), activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds") 


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 20, 20, 8)         656       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 12, 12, 16)        10384     
_________________________________________________________________
flatten (Flatten)            (None, 2304)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                73760     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                330       
Total params: 85,130
Trainable params: 85,130
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 109.27956164399995 seconds


### Write your findings about activations here?

1.   When model validation accuracies are checked, we can see that bigger convolution filtered models don't have better accuracy. For this dataset 5x5 convolution seems a better choice but there can be a better filter size for a different dataset


2.   Also bigger convulution filter increase training time. 




## Part 2: Influence of Striding

Try the models with different stride sizes such as 2,3,4 etc.

Analyze the number of model parameters, accuracy and training time

### Model with Convolution with 2 Steps

In [7]:
start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), strides=2, activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (3, 3), strides=2, activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 13, 13, 8)         80        
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 6, 6, 16)          1168      
_________________________________________________________________
flatten_1 (Flatten)          (None, 576)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                18464     
_________________________________________________________________
dense_3 (Dense)              (None, 10)                330       
Total params: 20,042
Trainable params: 20,042
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 13.973207538999986 seconds


In [8]:
# Write your code here. Use the same architecture as above. 

start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), strides=3, activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (3, 3), strides=3, activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 9, 9, 8)           80        
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 3, 3, 16)          1168      
_________________________________________________________________
flatten_2 (Flatten)          (None, 144)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                4640      
_________________________________________________________________
dense_5 (Dense)              (None, 10)                330       
Total params: 6,218
Trainable params: 6,218
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 7.603855410999927 seconds


In [9]:
# Write your code here. Use the same architecture as above. 
start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 4), strides=4, activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (3, 3), strides=4, activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 7, 7, 8)           104       
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 2, 2, 16)          1168      
_________________________________________________________________
flatten_3 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_7 (Dense)              (None, 10)                330       
Total params: 3,682
Trainable params: 3,682
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 6.074844444000064 seconds


In [10]:
# Write your code here. Use the same architecture as above. 

start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), strides=5, activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (3, 3), strides=5, activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 6, 6, 8)           80        
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 1, 1, 16)          1168      
_________________________________________________________________
flatten_4 (Flatten)          (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_9 (Dense)              (None, 10)                330       
Total params: 2,122
Trainable params: 2,122
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 5.021497554999996 seconds


### Write your findings about influence of striding here?

1.   It is seen that the increasing stride value decreases the training time.
2.   Also, it would be more appropriate to choose low values as it may cause the model to skip some data. Because of that increasing stride value may cause lower accuracy.




## Part 3: Influence of Padding

Try the models with padding and without padding.

Analyze the number of model parameters, accuracy and training time

### Model with (3 x 3) Convolution with Same Padding

In [11]:
start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), strides=1, padding='same', activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (3, 3), strides=1, padding='same', activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 28, 28, 8)         80        
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 28, 28, 16)        1168      
_________________________________________________________________
flatten_5 (Flatten)          (None, 12544)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 32)                401440    
_________________________________________________________________
dense_11 (Dense)             (None, 10)                330       
Total params: 403,018
Trainable params: 403,018
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 83.48119941699997 seconds


In [19]:
# Write your code here. Use the same architecture as above. 

start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), strides=1, padding='valid', activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (3, 3), strides=1, padding='valid', activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_16 (Conv2D)           (None, 26, 26, 8)         80        
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 24, 24, 16)        1168      
_________________________________________________________________
flatten_7 (Flatten)          (None, 9216)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 32)                294944    
_________________________________________________________________
dense_15 (Dense)             (None, 10)                330       
Total params: 296,522
Trainable params: 296,522
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 67.26940655599992 seconds


In [20]:
# Write your code here. Use the same architecture as above. 

start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), strides=1 , activation='relu', input_shape=input_shape))
model.add(Conv2D(16, (3, 3), strides=1, activation='relu'))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_18 (Conv2D)           (None, 26, 26, 8)         80        
_________________________________________________________________
conv2d_19 (Conv2D)           (None, 24, 24, 16)        1168      
_________________________________________________________________
flatten_8 (Flatten)          (None, 9216)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 32)                294944    
_________________________________________________________________
dense_17 (Dense)             (None, 10)                330       
Total params: 296,522
Trainable params: 296,522
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Time Taken to run the model: 67.93638893100001 seconds


### Write your findings about influence of padding here?

1.   With padding model learned more features and reached higher accuracy.
2.   Since there are more parameters with padding training time also increased.




## Part 4: Influence of Pooling

Try the models with different pooling window sizes such as 2x2, 3x3, 4x4 etc.

Analyze the number of model parameters, accuracy and training time

### Model with (3 x 3) Convolution with Pooling (2 x 2) 

In [21]:
start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_20 (Conv2D)           (None, 26, 26, 8)         80        
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 13, 13, 8)         0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 11, 11, 16)        1168      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 5, 5, 16)          0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 32)                12832     
_________________________________________________________________
dense_19 (Dense)             (None, 10)              

### Model with (3 x 3) Convolution with Pooling (3 x 3) 

In [22]:
# Write your code here 

# Use the same model design from the above cell 

start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_22 (Conv2D)           (None, 26, 26, 8)         80        
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 8, 8, 8)           0         
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 6, 6, 16)          1168      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 2, 2, 16)          0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 64)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_21 (Dense)             (None, 10)              

In [25]:
# Write your code here 
#Model with (3 x 3) Convolution with Pooling (4 x 4)
# Use the same model design from the above cell 
start = timeit.default_timer()   
model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(4, 4)))
model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(4, 4)))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))
end = timeit.default_timer()
print("Time Taken to run the model:",end - start, "seconds")  

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_26 (Conv2D)           (None, 26, 26, 8)         80        
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 6, 6, 8)           0         
_________________________________________________________________
conv2d_27 (Conv2D)           (None, 4, 4, 16)          1168      
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 1, 1, 16)          0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 16)                0         
_________________________________________________________________
dense_24 (Dense)             (None, 32)                544       
_________________________________________________________________
dense_25 (Dense)             (None, 10)              

### Write your findings about influence of pooling here?
1.   Increasing pooling size decreased learnable parameter number and training time.  
2.   4x4 pooling size has the best validation accuracy compared to other sizes but this model has a less learnable parameter. If we increase training epochs I think 2x2 filter will perform better

