In [3]:
import tensorflow as tf 
import numpy as np 
import matplotlib.pyplot as plt

# Let's build GoogLeNet


## This architecture is built from the Inception module, which we explain below :)


<h3>Important notes from a first reading of the paper:</h3>
<br>
<ul> 
    <li>
        <i>" 1 × 1 convolutions have dual purpose: most critically, they are used mainly as dimension reduction modules to remove computational bottlenecks "</i>
    </li>
    <li>
        <i>The most straightforward way of improving the performance of deep neural networks is by increasing their size</i>
    </li>
    <li>
        <i>Bigger size typically means a larger number of parameters, which makes the enlarged network more
            prone to overfitting</i>
    </li>
    <li>
        <i>Another drawback of uniformly increased network size is the dramatically increased use of computational resources</i>
    </li>
    <li>
        <i> current incarnations of the Inception architecture are restricted to filter sizes 1×1,
3×3 and 5×5, however this decision was based more on convenience rather than necessity</i>
    </li>
    <li>
        <img src="inception.PNG"/>
    </li>
    <li>
        <i>Besides being used as reductions, they also include the use of rectified linear activation which makes them dual-purpose</i>
    </li>
    <li>
        <i>occasional max-pooling layers with stride 2 to halve the resolution of the grid</i>
    </li>
    <li>
        <img src="GoogleNetSummary.PNG">
    </li>
    <li>
        <i>
            We have also
used a deeper and wider Inception network, the quality of which was slightly inferior, but adding it
to the ensemble seemed to improve the results marginally.
        </i>
    </li>
</ul>

<h2>Topology</h2>

<ul>
    <li><i>size of the receptive field in our network is 224×224 taking RGB color channels with mean subtraction</i></li>
    <li><i>“#3×3 reduce” and “#5×5 reduce” stands for the number of 1×1 filters in the reduction</i></li>
    <li><i>The network is 22 layers deep when counting only layers with
        parameters (or 27 layers if we also count pooling)</i></li>
    <li><i>The overall number of layers (independent building blocks) used for the construction of the network is about 100.</i></li>
    <li><i>however the use of dropout remained
        essential even after removing the fully connected layers</i></li>
    <li><i>By adding auxiliary classifiers connected to
these intermediate layers, we would expect to encourage discrimination in the lower stages in the
classifier, increase the gradient signal that gets propagated back, and provide additional regularization</i></li>
    <li>First extra classifier:<br><img src="first_classifier.PNG"></li>
    <li>Second extra classifier:<br><img src="extra_classifier2.PNG"></li>
    <li><i>An average pooling layer with 5×5 filter size and stride 3, resulting in an 4×4×512 output
        for the (4a), and 4×4×528 for the (4d) stage</i></li>
    <li><i>A 1×1 convolution with 128 filters for dimension reduction and rectified linear activation.</i></li>
    <li><i>A fully connected layer with 1024 units and rectified linear activation.</i></li>
    <li><i>A dropout layer with 70% ratio of dropped outputs</i></li>
    <li><i>A linear layer with softmax loss as the classifier (predicting the same 1000 classes as the
        main classifier, but removed at inference time).</i></li>
    <li><img src="full_GoogleNet.PNG"></li>
</ul>
<h2>Training </h2>

<ul>
    <li>Note, this describes the paper's test-time cropping rather than a training step: <i>Specifically, we resize the image to 4 scales where the shorter dimension (height or
        width) is 256, 288, 320 and 352 respectively</i></li>
    <li><i>During training, their loss gets added to the total loss of the
network with a discount weight (the losses of the auxiliary classifiers were weighted by 0.3). At
        inference time, these auxiliary networks are discarded.</i></li>
</ul>

## The Inception block

In [17]:
def inception_block(x, conv1, conv2, conv3, conv4, classifier=False, num_classes=1000):
    """Build one Inception module (with dimension reductions) on top of `x`.

    Four parallel branches read `x` and are concatenated on axis 3:

    1. a 1x1 convolution with ``conv1[0]`` filters;
    2. a 1x1 "reduce" convolution with ``conv2[0]`` filters followed by a
       3x3 convolution with ``conv2[1]`` filters;
    3. a 1x1 "reduce" convolution with ``conv3[0]`` filters followed by a
       5x5 convolution with ``conv3[1]`` filters;
    4. a 3x3, stride-1 max pooling followed by a 1x1 projection with
       ``conv4[0]`` filters.

    Every convolution uses a ReLU activation. All branches use stride 1 and
    "SAME" padding so their spatial sizes match for the concatenation.

    Args:
        x: Input Keras tensor. Assumed to be 4-D channels-last
            (batch, height, width, channels), since branches are joined on
            axis 3.
        conv1: Sequence whose first item is the filter count of the 1x1 branch.
        conv2: Sequence ``[reduce_filters, conv3x3_filters]``.
        conv3: Sequence ``[reduce_filters, conv5x5_filters]``.
        conv4: Sequence whose first item is the filter count of the pool
            projection.
        classifier: When True, also build an auxiliary classifier head on the
            module output (5x5 average pooling with stride 3, 1x1 convolution
            with 128 filters, 1024-unit dense layer, 70% dropout, softmax).
        num_classes: Number of units of the auxiliary softmax layer; only used
            when `classifier` is True.

    Returns:
        The concatenated module output when `classifier` is False. When
        `classifier` is True, a tuple ``(module_output, auxiliary_output)``.
    """
    layers = tf.keras.layers

    # Branch 1: plain 1x1 convolution.
    branch_1x1 = layers.Conv2D(
        filters=conv1[0], kernel_size=1, strides=1, padding="SAME", activation="relu"
    )(x)

    # Branch 2: the 3x3 convolution must consume the *reduced* tensor; feeding it
    # `x` directly would leave the 1x1 reduction unused.
    reduce_3x3 = layers.Conv2D(
        filters=conv2[0], kernel_size=1, strides=1, padding="SAME", activation="relu"
    )(x)
    branch_3x3 = layers.Conv2D(
        filters=conv2[1], kernel_size=3, strides=1, padding="SAME", activation="relu"
    )(reduce_3x3)

    # Branch 3: same pattern with a 5x5 convolution.
    reduce_5x5 = layers.Conv2D(
        filters=conv3[0], kernel_size=1, strides=1, padding="SAME", activation="relu"
    )(x)
    branch_5x5 = layers.Conv2D(
        filters=conv3[1], kernel_size=5, strides=1, padding="SAME", activation="relu"
    )(reduce_5x5)

    # Branch 4: "SAME" padding keeps the pooled map the same spatial size as the
    # other branches; the 1x1 projection then runs on the pooled tensor.
    pooled = layers.MaxPooling2D(pool_size=3, strides=1, padding="SAME")(x)
    branch_pool = layers.Conv2D(
        filters=conv4[0], kernel_size=1, strides=1, padding="SAME", activation="relu"
    )(pooled)

    # Concatenate the four branches along axis 3.
    y = layers.Concatenate(axis=3)([branch_1x1, branch_3x3, branch_5x5, branch_pool])

    if not classifier:
        return y

    # Auxiliary classifier, attached to the module output.
    aux = layers.AveragePooling2D(pool_size=(5, 5), strides=3, padding="VALID")(y)
    aux = layers.Conv2D(
        filters=128, kernel_size=1, strides=1, padding="SAME", activation="relu"
    )(aux)
    aux = layers.Flatten()(aux)
    aux = layers.Dense(units=1024, activation="relu")(aux)
    aux = layers.Dropout(rate=0.7)(aux)
    aux = layers.Dense(units=num_classes, activation="softmax")(aux)

    return y, aux

#### Let's test our inception_block with the first Inception topology in the paper (3a). According to the paper, we should get a 28x28x256 output tensor.

In [18]:
# Symbolic input with the 28x28x192 shape used for this check.
A = tf.keras.Input(shape=(28,28,192))
# Build the block with the filter counts used for the (3a) topology.
y = inception_block(A, conv1=[64], conv2=[96, 128], conv3=[16,32], conv4=[32])
# Enforce the expected 28x28x256 output instead of only eyeballing it, so a
# regression in the block fails loudly on Restart & Run All.
assert tuple(y.shape[1:]) == (28, 28, 256), f"unexpected Inception (3a) output shape: {y.shape}"
# Display the full shape (including the batch dimension) as the cell output.
y.shape

TensorShape([None, 28, 28, 256])