Week 9 Coding Assignment 1

Steven VanOmmeren

In [1]:
import tensorflow as tf
# Dataset handle used by every data-loading cell below. Note that, despite the
# short variable name, this refers to `fashion_mnist`.
mnist = tf.keras.datasets.fashion_mnist

# Basic Dense Network
First, we train a basic model with 2 dense layers. The first layer has 128 neurons and uses a ReLU activation, while the second layer has 10 neurons (corresponding to the 10 output classes) and uses softmax to ensure that the model is predicting probabilities. 

In [2]:
# Seed Python, NumPy and TensorFlow *before* the model exists so that weight
# initialisation (and not only batch shuffling) is repeatable across runs.
tf.keras.utils.set_random_seed(1234)

# Prepare data: load the train/test splits and rescale pixel values by 1/255
(training_images, training_labels), (test_images, test_labels) = mnist.load_data()
training_images = training_images / 255.0
test_images = test_images / 255.0

# Define the model: flatten each image, one hidden ReLU layer, softmax over 10 classes
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(training_images, training_labels, epochs=5)

# The model declares no input shape, so it is only built during fit();
# calling summary() afterwards lets it report layer shapes and parameter counts.
model.summary()

test_loss, test_accuracy = model.evaluate(test_images, test_labels)
print('Test loss: {}, Test accuracy: {}'.format(test_loss, test_accuracy*100))

Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.8267 - loss: 0.4947
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.8658 - loss: 0.3702
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.8798 - loss: 0.3292
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8880 - loss: 0.3043
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8942 - loss: 0.2865


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 885us/step - accuracy: 0.8751 - loss: 0.3459
Test loss: 0.345913290977478, Test accuracy: 87.51000165939331


The Dense model has 87.5% accuracy on the test set, with 101,770 trainable parameters. Training time was 14 seconds.

# Basic CNN
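Next, we add convolutional layers to the beginning of the network, with max pooling layers to downsample the feature maps for later layers. We have to prepare the data again because `Conv2D` expects every image to carry an explicit channel dimension: each 28×28 grayscale image is reshaped to 28×28×1. (The Dense model did not need this, since its `Flatten` layer turned each image into a flat vector inside the network.)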

In [3]:
# Seed Python, NumPy and TensorFlow before any layer is created: the layers
# below have a known input shape, so their weights are initialised as soon as
# the Sequential model is constructed.
tf.keras.utils.set_random_seed(1234)

# Prepare data for CNN
(training_images, training_labels), (test_images, test_labels) = mnist.load_data()
# Add the trailing channel axis that Conv2D expects; -1 lets NumPy infer the
# number of samples instead of hard-coding 60000 / 10000.
training_images = training_images.reshape(-1, 28, 28, 1) / 255.0
test_images = test_images.reshape(-1, 28, 28, 1) / 255.0

# Define the CNN model
model = tf.keras.models.Sequential([
  # Explicit Input object instead of `input_shape=` on the first layer, which
  # Keras warns about for Sequential models.
  tf.keras.Input(shape=(28, 28, 1)),
  tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
  tf.keras.layers.MaxPooling2D(2, 2),
  tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
  tf.keras.layers.MaxPooling2D(2, 2),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit(training_images, training_labels, epochs=5)
test_loss, test_accuracy = model.evaluate(test_images, test_labels)
print('Test loss: {}, Test accuracy: {}'.format(test_loss, test_accuracy*100))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.8419 - loss: 0.4372
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.8936 - loss: 0.2897
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9101 - loss: 0.2436
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9227 - loss: 0.2087
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9335 - loss: 0.1809
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9026 - loss: 0.2766
Test loss: 0.27658793330192566, Test accuracy: 90.25999903678894


The CNN model has 90.3% accuracy on the test set, with 243,786 trainable parameters. Training time was 68.6 seconds, much longer than the Dense network.

In [12]:
# Define some helper functions to avoid repetitive model definition code
def make_model(
        conv_layers=(64, 64),
        use_max_pooling=(True, True),
        dense_layers=(128, 10),
        ):
    """Build (but do not compile) a Sequential CNN for 28x28x1 inputs.

    Parameters
    ----------
    conv_layers : sequence of int
        Number of filters for each 3x3 ReLU convolution, in order.
    use_max_pooling : sequence of bool
        One flag per entry of ``conv_layers``; when true, a 2x2 max-pooling
        layer is inserted right after that convolution.
    dense_layers : sequence of int
        Unit counts of the dense layers that follow the flatten step. All but
        the last use ReLU; the last one uses softmax and is the output layer.

    Returns
    -------
    tf.keras.models.Sequential
        The assembled, uncompiled model.

    Raises
    ------
    ValueError
        If ``use_max_pooling`` and ``conv_layers`` differ in length, or if
        ``dense_layers`` is empty.
    """
    # Materialise the arguments so any iterable (list, tuple, ...) is accepted
    # and the caller's objects are never shared with the model-building code.
    conv_layers = list(conv_layers)
    use_max_pooling = list(use_max_pooling)
    dense_layers = list(dense_layers)

    # Validate up front: a shorter flag list used to fail with an opaque
    # IndexError mid-build, and a longer one was silently truncated.
    if len(use_max_pooling) != len(conv_layers):
        raise ValueError(
            'use_max_pooling must have one entry per convolution layer '
            '(got {} flags for {} layers)'.format(len(use_max_pooling), len(conv_layers))
        )
    if not dense_layers:
        raise ValueError('dense_layers must contain at least the output layer size')

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.InputLayer(shape=(28, 28, 1)))
    for filters, pool in zip(conv_layers, use_max_pooling):
        model.add(tf.keras.layers.Conv2D(filters, (3, 3), activation='relu'))
        if pool:
            model.add(tf.keras.layers.MaxPooling2D(2, 2))
    model.add(tf.keras.layers.Flatten())
    for units in dense_layers[:-1]:
        model.add(tf.keras.layers.Dense(units, activation='relu'))
    # The final dense layer is the classifier head and outputs class probabilities.
    model.add(tf.keras.layers.Dense(dense_layers[-1], activation='softmax'))

    return model

def run_model(model, epochs=5, seed=1234):
    """Compile, summarise, train and evaluate ``model`` on the notebook's data.

    Uses the notebook-level ``training_images`` / ``training_labels`` for
    fitting and ``test_images`` / ``test_labels`` for evaluation, then prints
    the test loss and the test accuracy (as a percentage).

    Parameters
    ----------
    model : Keras model
        An already constructed, not yet compiled model.
    epochs : int, optional
        Number of training epochs (default 5).
    seed : int, optional
        Value passed to ``tf.random.set_seed`` right before training
        (default 1234).
    """
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the cell output.
    model.summary()
    # The model handed in has already been constructed by the caller, so this
    # seed only affects what happens from here on, not how the model was created.
    tf.random.set_seed(seed)
    model.fit(training_images, training_labels, epochs=epochs)
    test_loss, test_accuracy = model.evaluate(test_images, test_labels)
    print('Test loss: {}, Test accuracy: {}'.format(test_loss, test_accuracy*100))

# Exercise 1
Try editing the convolutions. Change the number of convolutions from 32 to either 16 or 64. What impact does that have on accuracy and training time?

In [13]:
# Seed Python, NumPy and TensorFlow before the model is built: run_model only
# seeds after the layers already exist, which leaves the initial weights unseeded.
tf.keras.utils.set_random_seed(1234)
model = make_model(conv_layers=[16, 16], use_max_pooling=[True, True], dense_layers=[128, 10])
run_model(model)

None
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8175 - loss: 0.5034
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8744 - loss: 0.3418
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8902 - loss: 0.3001
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8985 - loss: 0.2737
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9056 - loss: 0.2536
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8914 - loss: 0.2990
Test loss: 0.2989948093891144, Test accuracy: 89.13999795913696


Changing to use 16 filters in each convolution layer dramatically reduced the parameter count from 243,786 to 55,098. Accuracy decreased slightly from 90.3% to 89.1%, which is still a notable improvement over the Dense model.

# Exercise 2
Remove the final convolution. What impact does that have on accuracy or training time?

In [14]:
# Seed Python, NumPy and TensorFlow before the model is built: run_model only
# seeds after the layers already exist, which leaves the initial weights unseeded.
tf.keras.utils.set_random_seed(1234)
model = make_model(conv_layers=[16], use_max_pooling=[True], dense_layers=[128, 10])
run_model(model)

None
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8601 - loss: 0.3970
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9009 - loss: 0.2771
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9165 - loss: 0.2330
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9283 - loss: 0.1992
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9390 - loss: 0.1699
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9115 - loss: 0.2655
Test loss: 0.2655012309551239, Test accuracy: 91.14999771118164


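Surprisingly, removing one of the convolution layers *increased* test accuracy from 90.3% to 91.1%. The parameter count increased from 243,786 to 347,690 because we also removed the second max pooling layer, so that after flattening the output of our convolution layer, there are more neurons to connect to our dense layers. Note that this model uses a single 16-filter convolution, so it differs from the original CNN in filter count as well as depth; compared with the two-layer 16-filter model from Exercise 1 (89.1% accuracy, 55,098 parameters), the improvement is larger still.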

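Another surprising finding to me is that training time on this model is less than the original CNN, taking 33 seconds vs 68 seconds in the first CNN. Even though the model has more parameters, it trains faster. A likely explanation is that most of the extra parameters sit in the dense layer, where each weight is used once per image, whereas convolution weights are reused at every spatial position, so the cost of a convolution layer depends on its filter count and feature-map size rather than on its parameter count.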

# Exercise 3
Add more convolutions. What impact does that have?

In [15]:
# Seed Python, NumPy and TensorFlow before the model is built: run_model only
# seeds after the layers already exist, which leaves the initial weights unseeded.
tf.keras.utils.set_random_seed(1234)
model = make_model(conv_layers=[64, 32, 16], use_max_pooling=[True, True, False], dense_layers=[128, 10])
run_model(model)

None
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.8015 - loss: 0.5405
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8706 - loss: 0.3556
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.8870 - loss: 0.3099
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.8964 - loss: 0.2830
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9038 - loss: 0.2611
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8917 - loss: 0.3015
Test loss: 0.30146539211273193, Test accuracy: 89.1700029373169


I added another convolution layer to the model, but did not apply max pooling to the final convolution. This is because applying another max pooling layer would make the resulting image 1 by 1 pixels. The resulting model has 43,578 parameters and obtains about the same accuracy as my model from Exercise 1. This is really impressive performance for the parameter count. The model beats a Dense network with more than twice as many parameters.

In [16]:
# Seed Python, NumPy and TensorFlow before the model is built: run_model only
# seeds after the layers already exist, which leaves the initial weights unseeded.
tf.keras.utils.set_random_seed(1234)
# Same three convolutions as the previous cell, but pooling only after the first one.
model = make_model(conv_layers=[64, 32, 16], use_max_pooling=[True, False, False], dense_layers=[128, 10])
run_model(model)

None
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.8419 - loss: 0.4381
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.8972 - loss: 0.2831
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.9119 - loss: 0.2383
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.9239 - loss: 0.2059
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9352 - loss: 0.1783
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9057 - loss: 0.2749
Test loss: 0.2748929560184479, Test accuracy: 90.57000279426575


# Exercise 4
Remove all convolutions but the first. What impact does that have? Experiment with it.

In [26]:
# Seed Python, NumPy and TensorFlow before the model is built: run_model only
# seeds after the layers already exist, which leaves the initial weights unseeded.
tf.keras.utils.set_random_seed(1234)
model = make_model(conv_layers=[32], use_max_pooling=[True], dense_layers=[128, 10])
run_model(model)

None
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.8597 - loss: 0.3997
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9030 - loss: 0.2686
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9195 - loss: 0.2216
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9333 - loss: 0.1849
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9449 - loss: 0.1548
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9130 - loss: 0.2639
Test loss: 0.2639484107494354, Test accuracy: 91.29999876022339


When using just one 32-filter convolutional layer, the model has a total of 693,962 parameters and gets the best test accuracy so far at 91.3%! I was surprised that training only took 1 minute, which is less than other models with far fewer parameters.

In [27]:
# Seed Python, NumPy and TensorFlow before the model is built: run_model only
# seeds after the layers already exist, which leaves the initial weights unseeded.
tf.keras.utils.set_random_seed(1234)
# No pooling here, so the flattened convolution output feeding the dense layer is much larger.
model = make_model(conv_layers=[32], use_max_pooling=[False], dense_layers=[50, 10])
run_model(model)

None
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.8587 - loss: 0.3985
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9028 - loss: 0.2668
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9212 - loss: 0.2151
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9365 - loss: 0.1747
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.9497 - loss: 0.1407
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8960 - loss: 0.3294
Test loss: 0.3293915390968323, Test accuracy: 89.60000276565552


I tried this much larger model with over 1 million parameters, but it did not improve performance.

In [30]:
# Seed Python, NumPy and TensorFlow before any layer is created so that the
# initial weights and the dropout masks are repeatable.
tf.keras.utils.set_random_seed(1234)

# Relies on the 4-D (N, 28, 28, 1) training_images / test_images prepared in
# the "Basic CNN" cell.
model = tf.keras.models.Sequential([
  # Explicit Input object instead of `input_shape=` on the first layer, which
  # Keras warns about for Sequential models.
  tf.keras.Input(shape=(28, 28, 1)),
  tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
  tf.keras.layers.MaxPooling2D(2, 2),
  # The original passed input_shape=(28, 28, 1) here as well; this layer is not
  # the first one and its real input is the pooled output above, so the
  # argument was misleading and has been dropped.
  tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
  tf.keras.layers.MaxPooling2D(2, 2),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit(training_images, training_labels, epochs=5)
test_loss, test_accuracy = model.evaluate(test_images, test_labels)
print('Test loss: {}, Test accuracy: {}'.format(test_loss, test_accuracy*100))

Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.7939 - loss: 0.5610
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 10ms/step - accuracy: 0.8679 - loss: 0.3653
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.8859 - loss: 0.3115
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.8962 - loss: 0.2827
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.9051 - loss: 0.2584
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9016 - loss: 0.2707
Test loss: 0.2706586420536041, Test accuracy: 90.16000032424927


This was my last-ditch effort to improve test accuracy, where I use two convolutional layers with lots of filters, an additional dense layer, and I incorporate dropout to reduce overfitting. This model has half a million parameters, and gets 90.2% accuracy, which is not as good as other models I tried.