# Downloading Data

In [6]:
%%bash

# checking if kaggle API is configured
#
# The kaggle CLI takes its credentials either from the KAGGLE_USERNAME and
# KAGGLE_KEY environment variables or from a kaggle.json file in its config
# directory (~/.kaggle unless KAGGLE_CONFIG_DIR points elsewhere). Checking only
# that ~/.kaggle/ exists would accept an empty directory and reject a valid
# environment-variable setup, so both real credential sources are checked here.

kaggle_config_dir="${KAGGLE_CONFIG_DIR:-$HOME/.kaggle}"

if [ -n "${KAGGLE_USERNAME:-}" ] && [ -n "${KAGGLE_KEY:-}" ]; then

    echo "Kaggle API is already configured"

elif [ -f "$kaggle_config_dir/kaggle.json" ]; then

    echo "Kaggle API is already configured"

else

    echo "Kaggle credentials are not configured"

fi

Kaggle API is already configured


In [18]:
%%bash

# checking if data is downloaded
#
# data/ doubles as the "download finished" marker, so it must only appear once
# every archive has been extracted successfully. Everything is therefore
# unpacked into a staging directory that is renamed to data/ as the last step,
# and the script aborts on the first failing command instead of carrying on.
set -euo pipefail

COMPETITION="dogs-vs-cats-redux-kernels-edition"
STAGING_DIR="data.partial"

if [ ! -d data/ ]; then

  echo "Downloading dataset..."
  kaggle competitions download "$COMPETITION"

  # start from a clean staging directory in case a previous run was interrupted
  rm -rf "$STAGING_DIR"
  mkdir "$STAGING_DIR"

  echo "Unzipping datasets"
  # -o: overwrite leftovers from an interrupted run instead of prompting
  unzip -qq -o "$COMPETITION.zip"
  unzip -qq test.zip -d "$STAGING_DIR"
  unzip -qq train.zip -d "$STAGING_DIR"

  mv sample_submission.csv "$STAGING_DIR"

  # publish the finished dataset; from now on the download is skipped
  mv "$STAGING_DIR" data

  rm -rf test.zip train.zip "$COMPETITION.zip"

else

  echo "Dataset already downloaded."

fi

Downloading dataset...
Downloading dogs-vs-cats-redux-kernels-edition.zip to /home/xiormeesh/git/ML-notebooks/nlp-getting-started

Unzipping datasets


  0%|          | 0.00/814M [00:00<?, ?B/s]  0%|          | 1.00M/814M [00:00<01:21, 10.4MB/s]  1%|          | 5.00M/814M [00:00<01:03, 13.4MB/s]  1%|          | 7.00M/814M [00:00<01:12, 11.7MB/s]  1%|          | 9.00M/814M [00:00<01:23, 10.1MB/s]  1%|▏         | 11.0M/814M [00:01<01:40, 8.36MB/s]  1%|▏         | 12.0M/814M [00:01<01:51, 7.56MB/s]  2%|▏         | 13.0M/814M [00:01<01:57, 7.12MB/s]  2%|▏         | 14.0M/814M [00:01<02:04, 6.73MB/s]  2%|▏         | 15.0M/814M [00:01<02:07, 6.55MB/s]  2%|▏         | 16.0M/814M [00:01<02:10, 6.42MB/s]  2%|▏         | 17.0M/814M [00:02<02:11, 6.33MB/s]  2%|▏         | 18.0M/814M [00:02<02:14, 6.21MB/s]  2%|▏         | 19.0M/814M [00:02<02:15, 6.16MB/s]  2%|▏         | 20.0M/814M [00:02<02:15, 6.13MB/s]  3%|▎         | 21.0M/814M [00:02<02:15, 6.15MB/s]  3%|▎         | 22.0M/814M [00:02<02:18, 6.01MB/s]  3%|▎         | 23.0M/814M [00:03<02:18, 5.98MB/s]  3%|▎         | 24.0M/814M [00:03<02:14, 6.15MB/s]  3%|▎         | 25.

In [28]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split

import tensorflow as tf
print("Tensorflow version is", tf.__version__)

# Dataset locations, all derived from the directory the notebook is run from.
BASE_DIR = f"{os.getcwd()}/data/"

TRAIN_DIR = f"{BASE_DIR}train/"
TEST_DIR = f"{BASE_DIR}test/"

# Every entry in a directory is counted as one image.
TRAIN_SIZE = len(os.listdir(TRAIN_DIR))
TEST_SIZE = len(os.listdir(TEST_DIR))
print("Number of training images:", TRAIN_SIZE)
print("Number of test images:", TEST_SIZE)

# Training hyper-parameters.
VALID_FRACTION = 0.1  # share of the training images held out for validation
BATCH_SIZE = 100
EPOCHS = 10

# Images are resized to a square of this side length before entering the model.
IMAGE_HEIGHT = 150
IMAGE_WIDTH = IMAGE_HEIGHT

Tensorflow version is 2.0.0
Number of training images: 25000
Number of test images: 12500


In [29]:
# creating df with train labels
# the label is whatever precedes the first dot in the file name
train_filenames = os.listdir(TRAIN_DIR)
train_labels = [filename.split('.')[0] for filename in train_filenames]

train_df = pd.DataFrame({
    'id': train_filenames,
    'label': train_labels
})

# splitting to train & valid
# fixed random_state so that Restart & Run All reproduces the same split
train_df, valid_df = train_test_split(
    train_df,
    test_size=VALID_FRACTION,
    random_state=42
)

# augmentation settings
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    rescale=1./255.,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# no data augmentation on the validation set, only the same rescaling
valid_datagen  = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255.)

# creating train and valid generators (not using valid_split to avoid doing data augmentation on validation set)
# NOTE(review): Keras documents target_size as (height, width); the order used
# here is only interchangeable because IMAGE_WIDTH == IMAGE_HEIGHT. Keep it in
# sync with the model's input_shape if the image ever becomes non-square.
train_generator = train_datagen.flow_from_dataframe(
    train_df,
    TRAIN_DIR,
    x_col='id',
    y_col='label',
    target_size=(IMAGE_WIDTH, IMAGE_HEIGHT),
    class_mode='binary',
    batch_size=BATCH_SIZE
)

valid_generator = valid_datagen.flow_from_dataframe(
    valid_df,
    TRAIN_DIR,
    x_col='id',
    y_col='label',
    target_size=(IMAGE_WIDTH, IMAGE_HEIGHT),
    class_mode='binary',
    batch_size=BATCH_SIZE
)

Found 22500 validated image filenames belonging to 2 classes.
Found 2500 validated image filenames belonging to 2 classes.


In [30]:
# Binary image classifier: four Conv2D + MaxPooling2D stages, dropout, then a
# dense head that ends in a single sigmoid unit.
model = tf.keras.models.Sequential([
    # the images were resized by ImageDataGenerator to 150x150 with 3 color channels
    # NOTE(review): Keras image tensors are (height, width, channels); the order
    # below matches the generators' target_size and is only interchangeable
    # because IMAGE_WIDTH == IMAGE_HEIGHT.
    tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, 3)),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(128, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(128, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Flatten(),
    # 512 neuron hidden layer
    tf.keras.layers.Dense(512, activation='relu'),
    # since we have only 2 classes to predict we can use 1 neuron and sigmoid
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that only survives for backward compatibility.
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics = ['accuracy'])

# stop once val_loss has not improved for 5 epochs and roll back to the best weights
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
    mode='min',
    restore_best_weights=True,
    verbose=1,
    patience=5)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 148, 148, 32)      896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 72, 72, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 15, 15, 128)       1

In [None]:
%%time

# training
# Step counts are taken from the generators themselves: len() is the number of
# batches needed to cover every image the generator actually validated,
# including a final partial batch. Deriving them from TRAIN_SIZE (a raw
# directory-entry count) with round() could disagree with the generators and
# silently skip images when the sizes are not a multiple of BATCH_SIZE.
# NOTE(review): fit_generator is kept for the TensorFlow 2.0 used here; newer
# releases deprecate it in favour of model.fit, which accepts generators.
history = model.fit_generator(train_generator,
    validation_data=valid_generator,
    steps_per_epoch=len(train_generator),
    validation_steps=len(valid_generator),
    epochs=EPOCHS,
    callbacks=[es],
    verbose=1)

Epoch 1/10

In [None]:
#plotting

import matplotlib.pyplot as plt

#-----------------------------------------------------------
# Per-epoch metrics recorded in `history` for the training
# and validation data
#-----------------------------------------------------------
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc)) # one point per completed epoch (zero-based)

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
# labels + legend so the two curves can be told apart
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, label='training')
ax_acc.plot(epochs, val_acc, label='validation')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.set_ylabel('accuracy')
ax_acc.legend()

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, label='training')
ax_loss.plot(epochs, val_loss, label='validation')
ax_loss.set_title('Training and validation loss')
ax_loss.set_xlabel('epoch')
ax_loss.set_ylabel('loss')
ax_loss.legend()

plt.show()

In [None]:
%%time

# preparing testing data
test_filenames = os.listdir(TEST_DIR)
test_df = pd.DataFrame({
    'id': test_filenames
})

# same rescaling as for training/validation, no augmentation
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale = 1.0/255.)

# shuffle=False keeps the predictions in the same order as the rows of test_df
test_generator = test_datagen.flow_from_dataframe(
    test_df,
    TEST_DIR,
    x_col='id',
    y_col=None,
    class_mode=None,
    target_size=(IMAGE_WIDTH, IMAGE_HEIGHT),
    batch_size=BATCH_SIZE,
    shuffle=False
)

# len(test_generator) is the integer number of batches covering every validated
# image; np.ceil(TEST_SIZE/BATCH_SIZE) produced a float based on the raw
# directory-entry count instead.
# NOTE(review): predict_generator is kept for the TensorFlow 2.0 used here;
# newer releases deprecate it in favour of model.predict.
yhat = model.predict_generator(test_generator, steps=len(test_generator))

In [None]:
# sigmoid returns probability between 0 and 1, need to convert it to an integer class
# `yhat` itself is left untouched so the cell can be re-run safely.
pred_class_idx = (np.asarray(yhat).reshape(-1) > 0.5).astype(int)

# predictions are matched to file names by position, so the counts must agree
assert len(pred_class_idx) == len(test_df), "number of predictions differs from number of test files"

test_df['label'] = pred_class_idx

# restoring back to class names (dog|cat)
label_map = {index: name for name, index in train_generator.class_indices.items()}
test_df['label'] = test_df['label'].map(label_map)

# encoding according to submission format, 1 = dog, 0 = cat
test_df['label'] = test_df['label'].map({ 'dog': 1, 'cat': 0 })

# The submission id is the image id without the file extension ("1", not
# "1.jpg") - NOTE(review): confirm against data/sample_submission.csv.
# NOTE(review): the competition is presumably scored on predicted probability
# (log loss), in which case submitting raw `yhat` would score better than 0/1.
submission_df = test_df.assign(
    id=test_df['id'].map(lambda filename: os.path.splitext(filename)[0])
)

submission_df.to_csv('submission.csv', index=False)

In [None]:
!kaggle competitions submit -c dogs-vs-cats-redux-kernels-edition -f sample_submission.csv -m "testing API submission"