# Checking the median dimension of the images

In [1]:
import cv2
import os
from skimage.io import imread_collection
from skimage.transform import rescale, resize, downscale_local_mean
import statistics
from random import shuffle

In [2]:
# Glob pattern covering the training JPEGs; scikit-image builds the collection from it
train_dir = "C:/Users/weeho/Desktop/Deep Learning Bootcamp/Datathon/eye_gender_data/train/*.jpg"
train = imread_collection(train_dir)

# Report how many training images the collection holds
print(len(train))

9220


In [3]:
# Glob pattern covering the testing JPEGs; scikit-image builds the collection from it
test_dir = "C:/Users/weeho/Desktop/Deep Learning Bootcamp/Datathon/eye_gender_data/test/*.jpg"
test = imread_collection(test_dir)

# Report how many testing images the collection holds
print(len(test))

2305


In [4]:
# Get the median image height (shape[0]) across the training and testing collections;
# it is used later as the common side length when resizing.
# NOTE(review): only the height is sampled - presumably the images are square; confirm.
lst = [image.shape[0] for collection in (train, test) for image in collection]

# statistics.median averages the two middle values when the number of samples is even,
# which produces a float. The value is later passed to cv2.resize and used in the Keras
# input shape, both of which need an integer, so coerce it here.
size = int(statistics.median(lst))
print(size)

56


# Loading Images

In [5]:
import glob
import pandas as pd
import cv2
import numpy as np
import matplotlib.pyplot as plt
# Class names indexed by model output position. The one-hot training labels put male at
# index 0 ([1, 0]) and female at index 1 ([0, 1]), so the list must follow that order for
# class_names[argmax] to name the right class.
class_names = ['male','female']
IMG_SIZE = size # pick the median dimension for resizing of image

In [6]:
# Load the training CSV; its 'filename' and 'label' columns are used to tag each image
df = pd.read_csv(
    "C:/Users/weeho/Desktop/Deep Learning Bootcamp/Datathon/eye_gender_data/Training_set.csv"
)

In [7]:
#Create a function that reads the data from a training data set and tag it with the correct label based on the csv
# Square (size, size) pair built from the median dimension computed earlier.
# NOTE(review): not referenced by any of the cells shown in this notebook.
IMG_DIMS = (size, size)
def get_data(path):
    """Load every image directly under ``path`` as a shuffled list of labelled samples.

    Each sample is ``[image, one_hot]``. ``image`` is the picture resized to
    ``IMG_SIZE`` x ``IMG_SIZE`` and divided by 255. ``one_hot`` is ``[0, 1]`` when
    the label is "female" and ``[1, 0]`` for any other label. Labels are looked up
    by file name in the module-level ``df`` (columns ``filename`` and ``label``).

    Args:
        path: Directory holding the image files.

    Returns:
        A list of ``[np.ndarray, np.ndarray]`` pairs in random order.

    Raises:
        ValueError: If a file has no label in ``df`` or cannot be read as an image.
    """
    # Build the filename -> label lookup once instead of filtering df for every file.
    # If a filename appears more than once in df, the last row wins.
    labels = dict(zip(df['filename'], df['label']))
    data = []
    for f in glob.glob(path+"/*"): # every entry directly inside the folder
        name = os.path.basename(f)
        if name not in labels:
            raise ValueError("no label found in the training csv for file: " + f)
        img = cv2.imread(f) #read the image
        if img is None:
            # cv2.imread reports an unreadable or non-image file by returning None
            raise ValueError("could not read image file: " + f)
        img = cv2.resize(img,(IMG_SIZE,IMG_SIZE)) #resize the image
        if labels[name]=="female":
            x=[0,1] #female is class index 1
        else:
            x=[1,0] #male is class index 0
        data.append([np.array(img)/255,np.array(x)])
    shuffle(data)
    return data

In [8]:
# Build the shuffled list of [image, one-hot label] samples from the train folder
training_data = get_data(
    'C:/Users/weeho/Desktop/Deep Learning Bootcamp/Datathon/eye_gender_data/train'
)

In [9]:
# Sanity check: shape of the image array (element 0) of the second training sample
training_data[1][0].shape

(56, 56, 3)

# Using VGG19 Model

In [10]:
import tensorflow as tf
# Input tensor shape: size x size images with 3 colour channels
INPUT_SHAPE = (size, size, 3)

# VGG19 convolutional base with ImageNet weights and without its classifier head
vgg_layers = tf.keras.applications.vgg19.VGG19(
    include_top=False,
    weights='imagenet',
    input_shape=INPUT_SHAPE,
)

# Show the layer-by-layer summary of the base network
vgg_layers.summary()

Model: "vgg19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 56, 56, 3)]       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 56, 56, 64)        1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 56, 56, 64)        36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 28, 28, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 28, 28, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 28, 28, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 14, 14, 128)       0     

In [11]:
# Mark every layer of the VGG base as trainable so the whole network is fine-tuned
for vgg_layer in vgg_layers.layers:
    vgg_layer.trainable = True

# Print each layer with its trainable flag to confirm the setting
for vgg_layer in vgg_layers.layers:
    print(vgg_layer, vgg_layer.trainable)

<tensorflow.python.keras.engine.input_layer.InputLayer object at 0x00000262BA8C52C8> True
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x00000262C459E608> True
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x00000262C45DC588> True
<tensorflow.python.keras.layers.pooling.MaxPooling2D object at 0x00000262C4803788> True
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x00000262C480F088> True
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x00000262C4811908> True
<tensorflow.python.keras.layers.pooling.MaxPooling2D object at 0x00000262C481C608> True
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x00000262C4816A08> True
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x00000262C48263C8> True
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x00000262C263D488> True
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x00000262C4826208> True
<tensorflow.python.keras.layer

In [12]:
# Sequential model: VGG base -> flatten -> wide dense layer with dropout -> softmax head
model = tf.keras.models.Sequential([
    vgg_layers,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(131072*2, activation='relu'),
    tf.keras.layers.Dropout(rate=0.8),
    # two softmax units because the training labels are one-hot encoded
    tf.keras.layers.Dense(2, activation='softmax'),
])

# Adam with a small learning rate; categorical cross-entropy pairs with the one-hot labels
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-6),
    metrics=['accuracy'],
)

# Show the layers of the assembled model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg19 (Functional)           (None, 1, 1, 512)         20024384  
_________________________________________________________________
flatten (Flatten)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 262144)            134479872 
_________________________________________________________________
dropout (Dropout)            (None, 262144)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 524290    
Total params: 155,028,546
Trainable params: 155,028,546
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Upper bound on the number of training epochs
EPOCHS = 100

# Each sample is [image, label]: pull the labels into a list and the images into one array
train_labels = [sample[1] for sample in training_data]
training = np.array([sample[0] for sample in training_data]).reshape(-1, size, size, 3)

In [14]:
# Convert the image array and the label list into TensorFlow tensors for model.fit
training = tf.stack(training)
train_labels = tf.stack(train_labels)

In [15]:
# Early stopping on validation loss (patience of 2 epochs) with the best weights
# restored afterwards, to limit overfitting
es_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

# Train the model, holding out 10% of the samples for validation
history = model.fit(
    training,
    train_labels,
    epochs=EPOCHS,
    batch_size=32,
    validation_split=0.1,
    callbacks=[es_callback],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Restoring model weights from the end of the best epoch.
Epoch 00006: early stopping


# Model Prediction

In [16]:
# Separate function to prepare the testing data, uses the testing data csv to ensure that the data is
# being read into prediction model in the correct order
def test_data(path, excel):
    excel=pd.read_csv(excel)
    test = []
    files = glob.glob(path+"/*") # get files in each folder(class)
    for i in range(len(files)):
        img = cv2.imread(path+'/'+excel['filename'].iloc[i]) #read the image
        img = cv2.resize(img,(IMG_SIZE,IMG_SIZE)) #resize the image
        test.append([np.array(img)/255,excel['filename'].iloc[i]])
    return test

In [17]:
# Load the test images in the order given by the testing CSV
testing_data = test_data(
    'C:/Users/weeho/Desktop/Deep Learning Bootcamp/Datathon/eye_gender_data/test',
    'C:/Users/weeho/Desktop/Deep Learning Bootcamp/Datathon/eye_gender_data/Testing_set.csv',
)

In [18]:
# Sanity check: display the first five [image array, filename] pairs of the testing data
testing_data[:5]

[[array([[[0.43137255, 0.50980392, 0.67058824],
          [0.43921569, 0.51372549, 0.67843137],
          [0.44313725, 0.50980392, 0.67843137],
          ...,
          [0.49019608, 0.58431373, 0.86666667],
          [0.49019608, 0.58431373, 0.86666667],
          [0.50196078, 0.6       , 0.89019608]],
  
         [[0.42352941, 0.49803922, 0.66666667],
          [0.41176471, 0.48627451, 0.65490196],
          [0.43137255, 0.49803922, 0.67058824],
          ...,
          [0.47843137, 0.57254902, 0.85490196],
          [0.47058824, 0.56470588, 0.84313725],
          [0.4745098 , 0.57254902, 0.8627451 ]],
  
         [[0.44313725, 0.51764706, 0.69411765],
          [0.41960784, 0.49411765, 0.67058824],
          [0.40392157, 0.47843137, 0.65490196],
          ...,
          [0.47058824, 0.56470588, 0.83921569],
          [0.45490196, 0.55294118, 0.82745098],
          [0.46666667, 0.56470588, 0.84705882]],
  
         ...,
  
         [[0.42745098, 0.50588235, 0.75686275],
          [0.4

In [19]:
# Gather the test images into a single (N, IMG_SIZE, IMG_SIZE, 3) array and convert it to a tensor
test_x = np.array([sample[0] for sample in testing_data]).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
test_x = tf.stack(test_x)

# Run the trained model over the test batch
prediction = model.predict(test_x)

In [20]:
# Display the first fifteen rows of the prediction array (one row per test image)
prediction[:15]

array([[9.99985337e-01, 1.46421180e-05],
       [7.06477940e-01, 2.93522060e-01],
       [8.53563070e-01, 1.46436870e-01],
       [9.94184673e-01, 5.81538444e-03],
       [9.71890330e-01, 2.81096939e-02],
       [9.88357961e-01, 1.16420379e-02],
       [1.90202962e-03, 9.98097956e-01],
       [9.81286943e-01, 1.87130719e-02],
       [9.46390331e-01, 5.36097065e-02],
       [1.13956441e-04, 9.99886036e-01],
       [5.16945720e-01, 4.83054280e-01],
       [1.16520934e-01, 8.83479059e-01],
       [8.91323611e-02, 9.10867631e-01],
       [9.94980991e-01, 5.01901843e-03],
       [9.84669685e-01, 1.53302625e-02]], dtype=float32)

In [21]:
# For every test image keep the index of the class with the highest predicted probability
test_result = prediction.argmax(axis=1)

In [22]:
# Display the predicted class indices
test_result

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# Preparing data for export and submission

In [23]:
# Wrap the predicted class indices in a one-column dataframe ready for export
result = pd.DataFrame({"label": test_result})

In [24]:
# Convert the numerical output of the model back to its categorical name
# (0 -> male, 1 -> female, matching the one-hot encoding used for the training labels)
new_value = {0:'male', 1:'female'}
# Assign the mapped column back instead of calling replace(inplace=True) on
# result['label']: that chained form acts on an intermediate Series and does not
# update `result` when pandas Copy-on-Write is active.
result['label'] = result['label'].replace(new_value)
result

Unnamed: 0,label
0,male
1,male
2,male
3,male
4,male
...,...
2300,female
2301,female
2302,male
2303,male


In [25]:
# Write the label column to the submission CSV, overwriting any existing file and omitting the index
result["label"].to_csv(
    'C:/Users/weeho/Desktop/Deep Learning Bootcamp/Datathon/eye_gender_data/submission_3.csv',
    index=False,
    mode='w',
)

In [None]:
# Persist the trained model to disk
model.save(
    'C:/Users/weeho/Desktop/Deep Learning Bootcamp/Datathon/eye_gender_data/my_model_3'
)