In [75]:
from extract_zip import extract_zip_to_memory # function for decompressing zip in memory
from PIL import Image, ImageOps # image handling
import pandas as pd # data manipulation
import numpy as np
import keras
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.models import Sequential

# directory prefix that the cells below prepend to every data file name (image archive and CSV tables)
basepath = "/mnt/datasets/plankton/flowcam/"

Using TensorFlow backend.


In [44]:
# decompress the image archive in memory; the result is indexed by archive member name
# (e.g. 'imgs/32582800.jpg') and its values are later handed to PIL's Image.open
img_files = extract_zip_to_memory(basepath + "imgs.zip")

In [60]:
# archive member names; later cells slice this collection to pick train/test images
img_files_keys = img_files.keys()

# how many images the archive holds
n_images = len(img_files_keys)
print(n_images)

# preview of the first five member names
list(img_files_keys)[:5]

243610


['imgs/32582800.jpg',
 'imgs/32601208.jpg',
 'imgs/32674954.jpg',
 'imgs/32722436.jpg',
 'imgs/32531609.jpg']

In [2]:
# dataset: the three CSV tables that live under basepath, read in a fixed order
features_native_RawDF, meta_RawDF, taxo_RawDF = (
    pd.read_csv(basepath + file_name)
    for file_name in ('features_native.csv.gz', 'meta.csv', 'taxo.csv')
)

In [82]:
# (rows, columns) of each raw table, one per line
for raw_table in (features_native_RawDF, meta_RawDF, taxo_RawDF):
    print(raw_table.shape)

(243610, 65)
(243610, 14)
(178, 8)


In [76]:
# change datatype of meta
# objid is cast to int64 so it can be compared with the integer ids parsed from the image names.
# errors='ignore' keeps the cell from failing when the cast is impossible, but it then hands back
# the column unchanged without any message - so that case is reported explicitly here.
objid_as_int = meta_RawDF['objid'].astype(np.int64, errors='ignore')
if objid_as_int.dtype != np.int64:
    print("WARNING: meta_RawDF['objid'] could not be cast to int64; dtype is still", objid_as_int.dtype)
meta_RawDF['objid'] = objid_as_int

In [19]:
# largest values of the 'height' and 'width' columns of the native feature table
max_height, max_width = (
    features_native_RawDF[dimension].max() for dimension in ('height', 'width')
)
print(max_height, max_width, sep='\n')

739
972


In [58]:
# code inspired by https://jdhao.github.io/2017/11/06/resize-image-to-square-with-padding/

def image_processing(path, desired_size=972, final_size=(100, 100), save_path='new_image.png'):
    """Scale an image onto a padded square, shrink it to the CNN input size and return it.

    Steps:
      1. scale the image (aspect ratio preserved) so its longer side equals ``desired_size``;
      2. pad it with the value 255 (white for 8-bit grayscale images) to a
         ``desired_size`` x ``desired_size`` square, keeping the content centred;
      3. resize the square to ``final_size`` with ``resample=0`` (nearest neighbour).

    Parameters
    ----------
    path : str or file object
        Anything accepted by ``PIL.Image.open``.
    desired_size : int, default 972
        Side length of the intermediate square (972 is the max of width and height in the dataset).
    final_size : tuple of int, default (100, 100)
        (width, height) of the returned image, i.e. the input size for the CNN.
    save_path : str or None, default 'new_image.png'
        Where to write the result; the default keeps the previous behaviour.
        Pass ``None`` to skip writing a file.

    Returns
    -------
    PIL.Image.Image
        The processed image (previously nothing was returned).
    """
    im_original = Image.open(path)
    old_size = im_original.size  # (width, height)

    ratio = float(desired_size) / max(old_size)
    # clamp to 1 px: plain truncation yields a 0-pixel side for extremely elongated
    # images, and such an image cannot be resized
    new_size = tuple(max(1, int(side * ratio)) for side in old_size)
    im_scaled = im_original.resize(new_size)  # scale up before padding to keep information

    delta_w = desired_size - new_size[0]
    delta_h = desired_size - new_size[1]
    # border as (left, top, right, bottom); an odd remainder goes to the right/bottom edge
    padding = (delta_w // 2, delta_h // 2, delta_w - delta_w // 2, delta_h - delta_h // 2)
    im_padded = ImageOps.expand(im_scaled, padding, fill=255)

    im_final = im_padded.resize(final_size, resample=0)
    if save_path is not None:
        im_final.save(save_path)
    return im_final
    
image_processing(img_files['imgs/32582800.jpg']) # smoke test on one archive member; writes new_image.png to the working directory



In [None]:
# processes image before training

def autoloader():
    """Placeholder for the image loading / preprocessing step that runs before training.

    TODO: not implemented yet. The stub raises instead of being left without a
    body, because a ``def`` with no body is a syntax error and breaks running
    the notebook from top to bottom.
    """
    raise NotImplementedError("autoloader is not implemented yet")

In [87]:
# split to train and test data

# materialise the key list once instead of once per slice
all_keys = list(img_files_keys)

# 9000 first elements as train
train_keys = all_keys[:9000]

# 1000 next elements as test
test_keys = all_keys[9000:10000]

# object id = file name without directory and extension ('imgs/32582800.jpg' -> 32582800);
# parsing the name instead of slicing fixed offsets also works for ids that are not 8 digits long
train_ids = [int(key.rsplit('/', 1)[-1].rsplit('.', 1)[0]) for key in train_keys]

# one indexed lookup table instead of two full scans of meta_RawDF per image;
# keep='first' mirrors the former `.values[0]` (the first matching row wins)
labels_by_objid = (
    meta_RawDF
    .drop_duplicates(subset='objid', keep='first')
    .set_index('objid')[['level1', 'level2']]
)

# fail with a readable message when an image has no metadata row
missing_ids = pd.Index(train_ids).difference(labels_by_objid.index)
if len(missing_ids) > 0:
    raise KeyError("no meta row for objid(s): %s" % list(missing_ids[:5]))

# labels of the training images, indexed by object id, in the order of train_keys
y = labels_by_objid.loc[train_ids].rename(columns={'level1': 'label 1', 'level2': 'label 2'})
y.index.name = None  # the index was unnamed before; keep it that way

y.head()

Unnamed: 0,label 1,label 2
32582800,silks,silks
32601208,Neoceratium furca (Neoceratium),Neoceratium
32674954,detritus,detritus
32722436,detritus,detritus
32531609,detritus,detritus


In [None]:
# input dimensions
# 100 x 100 matches the final_size produced by image_processing; presumably laid out as
# (height, width, channels) with a single channel - TODO confirm the Keras image_data_format
input_shape = (100, 100, 1)

# network parameters
batch_size = 128
# NOTE(review): taxo.csv has 178 rows and the labels are taken from meta level1/level2 -
# confirm that 10 really is the number of target classes before training
num_classes = 10
epochs = 5 # starting value; further fine tuning can be done


In [None]:
# Keras CNN model, declared as one ordered list of layers

cnn_layers = [
    # two stacked 3x3 convolutions; the first one also fixes the input shape
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    Conv2D(64, (3, 3), activation='relu'),
    # one 2x2 max pooling layer followed by dropout
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # classifier head: flatten -> dense -> dropout -> softmax over num_classes
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
]
model = Sequential(cnn_layers)

# compile the model and view its architecture
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'],
)

model.summary()