# BirdBrain Data Wrangling and EDA

## Data Source:
ImageNet (used for transfer learning): https://www.kaggle.com/c/imagenet-object-localization-challenge/overview/description

Kaggle Dataset: https://www.kaggle.com/gpiosenka/100-bird-species

## Problem Statement:
How can we detect birds in images and identify their species as the volume of photographs grows,
both from the current bird camera and from the 5 cameras that will be added to the network?

In [3]:
#!pip3 install pydot

In [7]:
#imports
import os
from os.path import exists
import sys
import time
import numpy as np
from typing import Any, List, Tuple, Union
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from skimage import io
from skimage import data
from skimage.util import compare_images
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import models
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetV2L, EfficientNetV2S, EfficientNetV2B0
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint
import pickle

#Use this to check if the GPU is configured correctly: prints every device TensorFlow reports.
# NOTE(review): the output recorded below lists only a CPU device, so no GPU was
# visible in that run -- confirm the setup before starting a long training job.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9105478188730521232
xla_global_id: -1
]


In [8]:
#constants
# Root folder of the project data: the train/valid/test image folders are read
# from here, and the models/ and logs/ folders are created underneath it.
PROJECT_DATA_PATH = './Data/'
#image expected size
# Height and width, in pixels, that images are resized to when they are loaded.
IMAGE_HEIGHT = 224
IMAGE_WIDTH = 224

In [9]:
#helper functions
def imageSizedCorrectly(expectedHeight, expectedWidth, imagedata):
    """Return True when ``imagedata`` is a multi-channel image of the expected size.

    Args:
        expectedHeight: required size of the first axis (rows).
        expectedWidth: required size of the second axis (columns).
        imagedata: array-like object exposing ``.shape``, or None.

    Returns:
        True only if the data has at least three axes, the first two match the
        expected height and width, and the third axis has more than one entry.
        None and 2-D (single-channel) data yield False instead of raising.
    """
    # getImageData returns None for a missing file; treat that as "not valid".
    if imagedata is None:
        return False

    shape = imagedata.shape
    # A 2-D array has no channel axis, so shape[2] would raise IndexError.
    if len(shape) < 3:
        return False

    return shape[0] == expectedHeight and shape[1] == expectedWidth and shape[2] > 1

def getImageData(imagePath):
    """Read the image stored at ``imagePath``.

    Returns:
        Whatever ``io.imread`` returns for the file, or None when the path
        does not exist.
    """
    if not exists(imagePath):
        return None
    return io.imread(imagePath)

def loadFilePointers(csvPath, dataset):
    """Load the rows of a dataset index CSV that belong to one split.

    Args:
        csvPath: path to a CSV file containing a ``"data set"`` column.
        dataset: value of the ``"data set"`` column to keep.

    Returns:
        A DataFrame holding only the rows whose ``"data set"`` equals ``dataset``.

    Raises:
        FileNotFoundError: if ``csvPath`` does not exist. This is a subclass of
            Exception, so callers catching Exception keep working.
    """
    if not exists(csvPath):
        raise FileNotFoundError('File Not Found: ' + csvPath)
    csv = pd.read_csv(csvPath)
    return csv[csv["data set"] == dataset]
    
def showDuplicateImages(set1ImagePaths, set2ImagePaths):
    """Show suspected duplicate images side by side, one figure per pair.

    Images are paired by position: the first path of ``set1ImagePaths`` with
    the first path of ``set2ImagePaths``, and so on. If one sequence is longer,
    its extra entries are ignored.

    Args:
        set1ImagePaths: paths of the images drawn on the left.
        set2ImagePaths: paths of the images drawn on the right.
    """
    for leftPath, rightPath in zip(set1ImagePaths, set2ImagePaths):
        # Each figure always holds exactly two images, so the grid is fixed at
        # 1 row x 2 columns instead of growing with the loop index.
        fig, (axLeft, axRight) = plt.subplots(1, 2, figsize=(10, 5))
        axLeft.imshow(mpimg.imread(leftPath))
        axLeft.set_title(leftPath)
        axRight.imshow(mpimg.imread(rightPath))
        axRight.set_title(rightPath)
        plt.show()
        # Release the figure so many pairs do not accumulate in memory.
        plt.close(fig)
        
def remove_bad_images(folder_path):
    """Delete every file in ``folder_path`` that lacks a JFIF marker at its start.

    Only regular files directly inside ``folder_path`` are inspected;
    sub-directories are skipped. Each deletion is printed, followed by a total
    when at least one file was removed.

    NOTE(review): any file without the bytes ``JFIF`` near its start is deleted
    permanently. That includes images in other formats, not only corrupted
    files -- confirm this is acceptable for the dataset.

    Args:
        folder_path: directory whose files are checked.
    """
    num_skipped = 0
    for fname in os.listdir(folder_path):
        fpath = os.path.join(folder_path, fname)
        # open() fails on directories, so skip anything that is not a file.
        if not os.path.isfile(fpath):
            continue

        # The context manager closes the handle even if peek() raises, and
        # never touches an unbound or stale handle when open() itself fails.
        with open(fpath, "rb") as fobj:
            # peek() does not advance the file position and may return more
            # bytes than requested.
            is_jfif = b"JFIF" in fobj.peek(10)

        if not is_jfif:
            num_skipped += 1
            # Delete corrupted image
            os.remove(fpath)
            print(f'Removed: {fpath}')
    if num_skipped > 0:
        print(f'Total Removed: {num_skipped}')


In [10]:
# Remove non-JFIF files from every class folder of each dataset split.
for split in ("train", "valid", "test"):
    split_dir = os.path.join(PROJECT_DATA_PATH, split)
    # Only the first os.walk() level is used: its directory list holds the
    # class folders directly under the split. Deeper levels would yield names
    # that cannot be joined to split_dir correctly. A missing split folder
    # produces no entries and is silently skipped.
    _, class_dirs, _ = next(os.walk(split_dir), (split_dir, [], []))
    for class_dir in class_dirs:
        remove_bad_images(os.path.join(split_dir, class_dir))
print('done!')

done!


In [11]:
# Keras expects image sizes as (height, width).
image_size = (IMAGE_HEIGHT, IMAGE_WIDTH)
batch_size = 32

# Labels are inferred from the sub-folder names and one-hot encoded
# ('categorical'), which matches the categorical cross-entropy loss used later.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    PROJECT_DATA_PATH + "train",
    labels='inferred',
    label_mode='categorical',
    seed=1337,
    image_size=image_size,
    batch_size=batch_size,
)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    PROJECT_DATA_PATH + "valid",
    labels='inferred',
    label_mode='categorical',
    seed=1337,
    image_size=image_size,
    batch_size=batch_size,
)

# The label tensor has shape (batch, num_classes); read the class count from it.
num_classes = train_ds.element_spec[1].shape[1]
print(f'Train has {num_classes} classes.')


Found 47332 files belonging to 325 classes.
Found 1625 files belonging to 325 classes.
Train has 325 classes.


In [15]:
class Logger(object):
    """Redirect stderr to stdout, optionally print stdout to a file, and
    optionally force flushing on both stdout and the file.

    Creating an instance replaces ``sys.stdout`` and ``sys.stderr`` with the
    instance itself. ``close()`` (or leaving the ``with`` block) restores the
    streams that were active when the instance was created.
    """

    def __init__(self, file_name: Union[str, None] = None, file_mode: str = "w",
                 should_flush: bool = True):
        """Start mirroring output.

        Args:
            file_name: path of the log file, or None to mirror to stdout only.
            file_mode: mode passed to ``open`` for the log file.
            should_flush: flush stdout and the file after every write.
        """
        self.file = None

        if file_name is not None:
            self.file = open(file_name, file_mode)

        self.should_flush = should_flush
        # Remember the original streams so close() can restore them.
        self.stdout = sys.stdout
        self.stderr = sys.stderr

        sys.stdout = self
        sys.stderr = self

    def __enter__(self) -> "Logger":
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.close()

    def write(self, text: str) -> None:
        """Write text to stdout (and a file) and optionally flush."""
        if len(text) == 0:
            return

        if self.file is not None:
            self.file.write(text)

        self.stdout.write(text)

        if self.should_flush:
            self.flush()

    def flush(self) -> None:
        """Flush written text to both stdout and a file, if open."""
        if self.file is not None:
            self.file.flush()

        self.stdout.flush()

    def close(self) -> None:
        """Flush, close possible files, and remove stdout/stderr mirroring.

        Safe to call more than once: after the first call the file handle is
        dropped, so later calls only flush the original stdout.
        """
        self.flush()

        # if using multiple loggers, prevent closing in wrong order
        if sys.stdout is self:
            sys.stdout = self.stdout
        if sys.stderr is self:
            sys.stderr = self.stderr

        if self.file is not None:
            self.file.close()
            # Forget the closed handle; flushing it again would raise ValueError.
            self.file = None


class MyModelCheckpoint(ModelCheckpoint):
    """ModelCheckpoint that also pickles the optimizer config after each epoch.

    After the parent callback has run, a ``.pkl`` file is written next to the
    checkpoint path (same name, extension replaced). It holds the optimizer's
    ``get_config()`` dictionary and the number of the next epoch to run, which
    is what ``load_model_data`` reads back when resuming.

    Note that only the optimizer *configuration* is stored here, not its
    internal variables.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Run the normal checkpoint logic, then save the optimizer config.

        Args:
            epoch: zero-based index of the epoch that just finished.
            logs: metric dictionary supplied by Keras; its entries fill the
                placeholders of the checkpoint file name.
        """
        super().on_epoch_end(epoch, logs)

        print(f'Epoch: {epoch}')
        if logs is None:
            # Placeholder so the '{val_loss}' file-name field can be formatted.
            logs = {'val_loss': 1.0}

        print(logs)

        # Build the file name from the public ``filepath`` template, using
        # one-based epoch numbering. The private ``_get_file_path`` helper is
        # avoided because its signature differs between Keras releases.
        filepath = self.filepath.format(epoch=epoch + 1, **logs)
        filepath = filepath.rsplit(".", 1)[0]
        filepath += ".pkl"

        with open(filepath, 'wb') as fp:
            pickle.dump(
                {
                    # Use the model attached to this callback rather than a
                    # notebook-level global that may point at another model.
                    'opt': self.model.optimizer.get_config(),
                    'epoch': epoch + 1
                    # Add additional keys if you need to store more values
                }, fp, protocol=pickle.HIGHEST_PROTOCOL)
        print('\nEpoch %05d: saving optimizaer to %s' % (epoch + 1, filepath))

In [20]:
#image augmentation
# Small pipeline of random horizontal flips followed by random rotations.
# NOTE(review): nothing in this notebook adds this pipeline to the model or to
# the datasets, so it currently has no effect on training.
data_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.1),
    ]
)
#model building
def make_model(input_shape, num_classes):
    """Build a fresh transfer-learning classifier on top of EfficientNetV2B0.

    The ImageNet-pretrained base is frozen; only the new head (global max
    pooling, dropout, softmax dense layer) is trainable. The module-level
    ``data_augmentation`` pipeline is not part of the model.

    Args:
        input_shape: shape of one input image, passed to the base network.
        num_classes: number of output classes of the final dense layer.

    Returns:
        A tuple ``(initial_epoch, model, optimizer_config)`` with the same
        layout as ``load_model_data``: epoch 0, the uncompiled model, and the
        config dictionary of an Adam optimizer with learning rate 1e-3.
    """
    # ``classifier_activation`` is omitted: it only applies when
    # include_top=True, and the top is replaced by the custom head below.
    base = EfficientNetV2B0(weights="imagenet", include_top=False, input_shape=input_shape)
    # Freeze the pretrained feature extractor.
    base.trainable = False

    model = models.Sequential()
    model.add(base)
    # NOTE: despite the layer name "gap", this is global *max* pooling.
    model.add(layers.GlobalMaxPooling2D(name="gap"))
    #avoid overfitting
    model.add(layers.Dropout(0.2, name="dropout_out"))
    model.add(layers.Dense(num_classes, activation="softmax", name="fc_out"))
    return 0, model, keras.optimizers.Adam(1e-3).get_config()

def load_model_data(model_path, opt_path):
    """Restore a saved model together with its pickled training metadata.

    Args:
        model_path: path handed to ``load_model``.
        opt_path: path of the pickle file holding the ``'epoch'`` and ``'opt'``
            entries.

    Returns:
        A tuple ``(epoch, model, opt)`` built from the pickle's ``'epoch'``
        entry, the loaded model, and the pickle's ``'opt'`` entry.
    """
    restored_model = load_model(model_path)
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that were written by this notebook.
    with open(opt_path, 'rb') as handle:
        saved = pickle.load(handle)
        resume_epoch = saved['epoch']
        optimizer_config = saved['opt']
    return resume_epoch, restored_model, optimizer_config
    
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``."""
    hours, remainder = divmod(sec_elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours)}:{int(minutes):02d}:{seconds:05.2f}"


def train_model(model, initial_epoch=0, max_epochs=50):
    """Fit ``model`` on the notebook-level datasets and report the elapsed time.

    Training reads the global ``train_ds`` and validates on the global
    ``val_ds``. After every epoch a checkpoint (and the optimizer config, via
    ``MyModelCheckpoint``) is written under ``PROJECT_DATA_PATH + 'models/'``.

    Args:
        model: compiled Keras model to train.
        initial_epoch: epoch index to start from (non-zero when resuming).
        max_epochs: epoch index at which training stops.
    """
    start_time = time.time()

    callbacks = [
        MyModelCheckpoint(
            os.path.join(PROJECT_DATA_PATH + 'models/', 'model-{epoch:02d}-{val_loss:.2f}.hdf5'),
            monitor='val_loss', verbose=1),
    ]

    model.fit(
        train_ds,
        epochs=max_epochs,
        initial_epoch=initial_epoch,
        verbose=2,
        callbacks=callbacks,
        validation_data=val_ds,
    )

    # hms_string is defined alongside this function so that the report cannot
    # fail with a NameError once a long training run has finished.
    elapsed_time = time.time() - start_time
    print("Elapsed time: {}".format(hms_string(elapsed_time)))
    

In [21]:
# List the checkpoints saved so far, or create the folder on the first run.
models_dir = PROJECT_DATA_PATH + "models"
if exists(models_dir):
    for subdirs, dirs, files in os.walk(models_dir):
        for file in files:
            print(file)
else:
    # makedirs also creates missing parent folders and tolerates a folder
    # that appears between the check and the call.
    os.makedirs(models_dir, exist_ok=True)

# Keras image shapes are (height, width, channels).
epoch, model, opt = make_model(input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 3), num_classes=num_classes)
# To resume from a checkpoint instead, use:
#epoch, model, opt = load_model_data(model_path, opt_path)

In [22]:

#keras.utils.plot_model(model, show_shapes=True)

# Recreate the Adam optimizer from its config dictionary, then compile the
# model with categorical cross-entropy, the loss matching one-hot labels.
optimizer = tf.keras.optimizers.Adam.from_config(opt)
loss_fn = tf.keras.losses.CategoricalCrossentropy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

In [None]:
# Make sure the log folder exists, then train while mirroring all console
# output into log.txt.
logs_dir = PROJECT_DATA_PATH + "logs"
if not exists(logs_dir):
    os.mkdir(logs_dir)

log_file = os.path.join(logs_dir, 'log.txt')
with Logger(log_file):
    train_model(model, initial_epoch=epoch, max_epochs=50)

Epoch 1/50


In [None]:
# Classify a single test image and report the most probable species.
img = keras.preprocessing.image.load_img(
    PROJECT_DATA_PATH + "test/BELTED KINGFISHER/4.jpg", target_size=image_size
)
img_array = keras.preprocessing.image.img_to_array(img)
img_array = tf.expand_dims(img_array, 0)  # Create batch axis

predictions = model.predict(img_array)
# One softmax probability per class for the single image in the batch.
score = predictions[0]
# The class names of the training dataset are in the same order as the label
# indices, so the arg-max index maps directly onto a species name.
best_idx = int(np.argmax(score))
print(
    "This image is most likely %s (%.2f percent confidence)."
    % (train_ds.class_names[best_idx], 100 * score[best_idx])
)