In [45]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aerial-cactus-identification/train.csv
/kaggle/input/aerial-cactus-identification/test.zip
/kaggle/input/aerial-cactus-identification/train.zip
/kaggle/input/aerial-cactus-identification/sample_submission.csv


Import the necessary libraries

In [47]:
import glob
import os
import shutil
import zipfile

import pandas as pd
import pkg_resources
import tensorflow as tf
from matplotlib import pyplot as plt
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

TERMCOLOR = True
if TERMCOLOR:
    from termcolor import colored
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth', 500)
pd.set_option('display.width', 10000)

Num GPUs Available:  1


A function to list the packages and versions installed in the environment

In [84]:
def list_packages_versions():
    installed_packages = pkg_resources.working_set
    installed_packages_list = sorted(["%s==%s" % (i.key, i.version) for i in installed_packages])
    for item in installed_packages_list:
        print(item)
    print('\n')
    return


A function that converts the raw directory structure into one that is compatible with ImageDataGenerator, a Keras class that facilitates easy train/validation/test data preprocessing.

In [113]:
def data_raw2ImageDataGenerator(path):
    
    if not os.path.exists(path + 'data'):
        os.makedirs(path+'data')
        
    if not os.path.exists(path + 'train.csv'):
        shutil.copyfile('/kaggle/input/aerial-cactus-identification/train.csv', path + 'train.csv')
    if not os.path.exists(path + 'test.zip'):
        shutil.copyfile('/kaggle/input/aerial-cactus-identification/test.zip', path + 'test.zip')
    if not os.path.exists(path + 'train.zip'):
        shutil.copyfile('/kaggle/input/aerial-cactus-identification/train.zip', path + 'train.zip')
    if not os.path.exists(path + 'sample_submissoin.csv'):
        shutil.copyfile('/kaggle/input/aerial-cactus-identification/sample_submission.csv', path + 'sample_submission.csv')
    
    with zipfile.ZipFile(path+'train.zip', 'r') as zip_ref:
        zip_ref.extractall(path)
    with zipfile.ZipFile(path+'test.zip', 'r') as zip_ref:
        zip_ref.extractall(path)
        
    if not os.path.exists(path+'test/no_label'):    
        os.makedirs(path+'test/no_label')
    source_dir = path+'test'
    target_dir = path+'test/no_label'
    file_names = os.listdir(source_dir)

    for file_name in file_names:
        try:
            shutil.move(os.path.join(source_dir, file_name), target_dir)
        except shutil.Error:
            pass
    
    # create a dictionary of image labels. it is best to cache this information now to avoid an N^2 image sorting
    # algorithm. Now it is 2*N. Much better
    label_dict = {}
    with open(path + 'train.csv', 'r') as a_file:
        for line in a_file:
            pair = line.split(',')
            if pair[1].rstrip() == '0':
                label = 'not_has_cactus'
            else:
                label = 'has_cactus'
            label_dict[pair[0]] = label
    a_file.close()

    # create the flow_from_directory folders
    if not os.path.exists(path + '/train/has_cactus'):
        os.makedirs(path + '/train/has_cactus')
    if not os.path.exists(path + '/train/not_has_cactus'):
        os.makedirs(path + '/train/not_has_cactus')
    if not os.path.exists(path + '/submissions'):
        os.makedirs(path + '/submissions')

    # go through each image in the original folder and look up its placement in the dictionary
    for image in glob.glob(path + '/train/*.jpg'):
        image = image.split('/')[-1]
        shutil.move(path + 'train/' + image, path + '/train/' + label_dict[image] + '/' + image)
      

    return

The 2DConvNet. A simple model implemented from scratch.

In [111]:
def build_model(data_root_path):
    DESIRED_ACCURACY = 0.99

    class myCallback(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs={}):
            if logs.get('accuracy') >= DESIRED_ACCURACY:  # this is the stopping criterion for the training
                print("\nReached " + str(DESIRED_ACCURACY * 100) + "% accuracy so cancelling training!")
                self.model.stop_training = True

    callbacks = myCallback()

    model = tf.keras.models.Sequential([
        # Note the input shape is the desired size of the image 300x300 with 3 bytes color
        # This is the first convolution
        tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=(32, 32, 3)),
        tf.keras.layers.MaxPooling2D(2, 2),
        # The second convolution
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D(2, 2),
        # The third convolution
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D(2, 2),
        # # The fourth convolution
        # tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        # tf.keras.layers.MaxPooling2D(2, 2),
        # # The fifth convolution
        # tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        # tf.keras.layers.MaxPooling2D(2, 2),
        # Flatten the results to feed into a DNN
        tf.keras.layers.Flatten(),
        # 512 neuron hidden layer
        tf.keras.layers.Dense(512, activation='relu'),
        # Only 1 output neuron. It will contain a value from 0-1 where 0 for 1 class ('horses') and 1 for the other ('humans')
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    print(model.summary())

    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(lr=0.001),
                  metrics=['accuracy'])

    # This code block should create an instance of an ImageDataGenerator called train_datagen
    # And a train_generator by calling train_datagen.flow_from_directory
    train_datagen = ImageDataGenerator(featurewise_center=True,
                                       featurewise_std_normalization=True,
                                       rotation_range=40,
                                       width_shift_range=0.2,
                                       zoom_range=0.2,
                                       horizontal_flip=True,
                                       rescale=1 / 255,
                                       validation_split=0.2)

    # Flow training images in batches of 128 using train_datagen generator
    train_generator = train_datagen.flow_from_directory(data_root_path + 'train',
                                                        subset='training',
                                                        target_size=(32, 32),
                                                        batch_size=128,
                                                        class_mode='binary')
    validation_generator = train_datagen.flow_from_directory(data_root_path + 'train',
                                                             subset='validation',
                                                             target_size=(32, 32),
                                                             batch_size=128,
                                                             class_mode='binary')

    # model fitting
    history = model.fit_generator(
        train_generator,
        validation_data=validation_generator,
        validation_steps=20,
        steps_per_epoch=20,
        epochs=100,
        verbose=1,
        callbacks=[callbacks]
    )

    # summarize history for accuracy and loss
    plt.figure(figsize=(6, 4))
    plt.plot(history.history['accuracy'], "g--", label="Accuracy of training data")
    plt.plot(history.history['val_accuracy'], "g", label="Accuracy of validation data")
    plt.plot(history.history['loss'], "r--", label="Loss of training data")
    plt.plot(history.history['val_loss'], "r", label="Loss of validation data")
    plt.title('Model Accuracy and Loss')
    plt.ylabel('Accuracy and Loss')
    plt.xlabel('Training Epoch')
    plt.ylim(0)
    plt.legend()
    plt.show()

    return model

A function to evaluate the trained model

In [66]:
def test_model(data_root_path, model):
    test_datagen = ImageDataGenerator(featurewise_center=True,
                                      featurewise_std_normalization=True,
                                      rotation_range=40,
                                      width_shift_range=0.2,
                                      zoom_range=0.2,
                                      horizontal_flip=True,
                                      rescale=1 / 255)

    test_generator = test_datagen.flow_from_directory(data_root_path + '/test/',
                                                      target_size=(32, 32),
                                                      batch_size=16,
                                                      class_mode=None,  # only data, no labels
                                                      shuffle=False)

    probabilities = model.predict_generator(test_generator)

    return probabilities


A function to write the submission file.

In [52]:
def write_submission(raw_data_path, probs):
    test_result_dict = {}

    for i, image in enumerate(glob.glob(raw_data_path + 'test/no_label/*.jpg')):
        image = image.split('/')[-1]
        test_result_dict[image] = probs[i][0]

    df_submission = pd.read_csv(raw_data_path + 'sample_submission.csv')

    def result_fill(x):
        return test_result_dict[x[0]]

    df_submission['has_cactus'] = df_submission.apply(result_fill, axis=1)
    df_submission.sort_values(by=['id']).to_csv(raw_data_path + 'submissions/submission.csv')

    return

Driver code:

In [53]:
# look at versions to avoid confusion with deprecation/compatibility
list_packages_versions()

absl-py==0.11.0
adal==1.2.2
affine==2.3.0
aiobotocore==1.1.2
aiohttp-cors==0.7.0
aiohttp==3.7.2
aioitertools==0.7.1
aioredis==1.3.1
alabaster==0.7.12
albumentations==0.5.1
alembic==1.4.3
allennlp==1.2.1
altair==4.1.0
anaconda-client==1.7.2
anaconda-project==0.8.3
annoy==1.17.0
ansiwrap==0.8.4
appdirs==1.4.3
argh==0.26.2
arrow==0.15.5
arviz==0.10.0
asn1crypto==1.3.0
astroid==2.3.3
astropy==4.0.1.post1
astunparse==1.6.3
async-generator==1.10
async-timeout==3.0.1
atomicwrites==1.3.0
attrs==19.3.0
audioread==2.1.9
autopep8==1.5.1
babel==2.8.0
backcall==0.1.0
backports.shutil-get-terminal-size==1.0.0
baker==1.3
basemap==1.2.1
bayesian-optimization==1.2.0
bayespy==0.5.20
bcolz==1.2.1
beautifulsoup4==4.9.0
binaryornot==0.4.4
biopython==1.78
bitarray==1.2.1
bkcharts==0.2
black==19.10b0
bleach==3.1.4
blessings==1.7
blinker==1.4
blis==0.4.1
bokeh==2.2.3
boruta==0.3
boto3==1.16.19
boto==2.49.0
botocore==1.19.19
bottleneck==1.3.2
bq-helper==0.4.1
bqplot==0.12.19
branca==0.4.1
brewer2mpl==1.4.1
bro

In [114]:
# do some proprocessing to make it compatible with the flow_from_directory() method
raw_data_path = 'data/'
data_raw2ImageDataGenerator(raw_data_path)

In [None]:
# model the training data with simple 2DConvNet from scratch
model_cactus = build_model(raw_data_path)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 30, 30, 16)        448       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 15, 15, 16)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 13, 13, 32)        4640      
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 6, 6, 32)          0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 4, 4, 64)          18496     
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 2, 2, 64)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 256)              

It you look at the images found, it should correspond to the specified train/validation split. Look at the number of found classes and make sure it's correct. If not, you probably set up your directory structure incorrectly.

In [90]:
# get model results    
probabilities = test_model(raw_data_path, model_cactus)

/kaggle/working/data/test
/kaggle/working
Found 0 images belonging to 0 classes.


ValueError: Asked to retrieve element 0, but the Sequence has length 0

In [None]:
# write model results in the format required by the Kaggle competition.
write_submission(raw_data_path, probabilities)