# Sorting images into per-category folders

In [None]:
from sklearn.model_selection import train_test_split

import os
import pandas as pd
from shutil import copyfile
from tqdm.auto import tqdm

def check_path(path):
    """Create directory ``path`` (and any missing parents) if it does not exist.

    Implemented with ``os.makedirs`` rather than a shell one-liner, so paths
    containing spaces or shell metacharacters are handled literally and the
    call does not depend on a POSIX shell being available.

    Args:
        path: Directory to create. An already existing directory is left as is.

    Raises:
        OSError: If the directory cannot be created (for example, ``path``
            exists but is a regular file).
    """
    os.makedirs(path, exist_ok=True)

In [None]:
# CSV with the training labels; later cells read its 'name' and 'category' columns.
labels_path = 'train.csv'
labels = pd.read_csv(labels_path)
# Preview the first two rows as a quick sanity check of the parsed file.
display(labels.head(2))

In [None]:
# Hold out 10% of the labelled rows for validation. The seed is fixed so that a
# fresh kernel run reproduces the same split (and therefore the same folder contents).
train_X, val_X = train_test_split(labels, test_size=0.1, random_state=42)

In [None]:
train_X.head(2)

In [None]:
# For each split, build a dict: category -> list of image names in that category.
categories_train = train_X.groupby(['category'])['name'].apply(list).to_dict()
categories_val = val_X.groupby(['category'])['name'].apply(list).to_dict()

In [None]:
def make_cat(categories,train_path,output_path):
    """Copy images into one sub-folder per category below ``output_path``.

    Args:
        categories: Mapping of category key -> iterable of image file names.
        train_path: Directory that holds the source images.
        output_path: Destination root. A sub-folder named ``str(key)`` is
            created for every category key.

    Paths are combined with ``os.path.join``, so both directories work with or
    without a trailing separator. A file that cannot be copied is reported on
    stdout and skipped, so one bad entry does not abort the whole run.
    """
    check_path(output_path)
    for key, pict_names in tqdm(categories.items()):
        category_dir = os.path.join(output_path, str(key))
        check_path(category_dir)
        for pict_name in pict_names:
            source = os.path.join(train_path, pict_name)
            try:
                copyfile(source, os.path.join(category_dir, pict_name))
            except Exception as e:
                # Best effort: report the failing file and carry on with the rest.
                print("Problem with ", source)
                print(e)

In [None]:
# os.listdir("train/train")

In [None]:
# Source folder of the raw images and the destination roots for the two splits.
train_path = "train/train/"
output_train_path = "image_data/train/"
output_val_path = "image_data/val/"

# Materialise each split as a <root>/<category>/<image> folder tree.
make_cat(categories_train,train_path,output_train_path)
make_cat(categories_val,train_path,output_val_path)

# Augmentation pipeline

In [1]:
import glob
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
import PIL.Image as img

# Loading an image from disk
def img_array(path):
    """Open the image at ``path`` and return its contents as a NumPy array.

    The file is opened in a ``with`` block so the underlying handle is closed
    even when the conversion to an array raises.

    Args:
        path: Location of the image file, passed to ``PIL.Image.open``.

    Returns:
        The result of ``np.array`` applied to the opened image.
    """
    with img.open(path) as image:
        return np.array(image)

In [None]:
import Augmentor
from Augmentor import Pipeline

def augmentation(path, n):
    """Generate ``n`` augmented samples from the images in ``path`` with Augmentor.

    The pipeline applies, each with its own probability: a small rotation,
    a random grid distortion, random erasing and a zoom.

    Args:
        path: Directory of source images handed to ``Augmentor.Pipeline``.
        n: Number of augmented images to produce. Values ``<= 0`` are a no-op.

    Returns:
        None.
    """
    if n <= 0:
        # Nothing to generate. This also avoids ``sample(0)``, which Augmentor
        # uses internally to mean "process every image once" rather than
        # "produce nothing" (NOTE(review): confirm against the installed version).
        return
    p = Augmentor.Pipeline(path)
    # The same fixed seed is re-applied on every call.
    Pipeline.set_seed(100)
    p.rotate(probability=0.3, max_left_rotation=3, max_right_rotation=3)
    p.random_distortion(probability=0.9, grid_width=20, grid_height=20, magnitude=2)
    p.random_erasing(probability=0.5, rectangle_area=0.2)
    p.zoom(probability=0.5, min_factor=1.1, max_factor=1.5)
    p.sample(n)

path = output_train_path
# Desired number of images per category: the originals plus the augmented samples.
TARGET_PER_CLASS = 400

# Walk the categories that are actually present in the training split rather than
# a hard-coded range(100); a category missing from the random split would
# otherwise raise KeyError.
for category, names in categories_train.items():
    shortfall = TARGET_PER_CLASS - len(names)
    if shortfall <= 0:
        # This class already has enough images; never ask for <= 0 samples.
        continue
    augmentation(path+str(category)+'/', shortfall)

# Modelling

In [1]:
import os
# Restrict CUDA to the device with index 1. Set here, ahead of the Keras imports
# below, because the variable is read when the CUDA runtime initialises.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Enumerate devices by PCI bus ID so the index above refers to a stable ordering
# (NOTE(review): presumably chosen to match nvidia-smi numbering — confirm).
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'

# Standard data science libraries
import psutil
import humanize
import os
from IPython.display import display_html

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

dataDirectory= "" 

In [3]:
import numpy as np
import keras
from keras import backend as K
from keras.models import Sequential
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.layers import Activation
from keras.layers.core import Dense, Flatten
from keras.optimizers import RMSprop
from keras.metrics import categorical_crossentropy
from keras.preprocessing.image import ImageDataGenerator
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dropout
from keras.layers.convolutional import *
from keras.callbacks import ModelCheckpoint
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from keras.applications.inception_v3 import decode_predictions
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from keras.models import model_from_json
import itertools
import matplotlib.pyplot as plt
import time
import pandas as pd

In [4]:
# Dataset locations relative to dataDirectory. os.path.join supplies the separator
# when dataDirectory has no trailing slash and leaves the relative paths unchanged
# when dataDirectory is empty.
# NOTE(review): the sorting section writes to 'image_data/...', while this reads
# 'image_data2/...' — confirm which folder is intended.
train_path = os.path.join(dataDirectory, 'image_data2/train')
val_path = os.path.join(dataDirectory, 'image_data2/val')
test_path  = os.path.join(dataDirectory, 'test/test')
# print(os.listdir(train_path))
# print(os.listdir(test_path))

In [5]:
# Training-time generator: pixel values are scaled by 1/255 and random rotation,
# shift, shear and zoom are applied; gaps created by the transforms are filled
# with the 'nearest' mode.
train_datagen = ImageDataGenerator(
        rescale = 1./255,
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        fill_mode='nearest') # NOTE(review): no validation_split is passed here; validation data comes from a separate directory

In [6]:
# Batch size shared by the training and validation generators.
batchSize=64

# Class sub-folder names '0'..'99', passed explicitly to both generators.
selectedClasses = [str(i) for i in range(100)]

# Augmented training batches read from train_path, resized to 224x224.
train_generator = train_datagen.flow_from_directory(
    train_path,
    target_size=(224, 224),
    batch_size=batchSize,
    classes=selectedClasses,
    subset='training') # NOTE(review): train_datagen has no validation_split, so this subset presumably selects every image — confirm

# Validation batches: only rescaled, no augmentation.
validation_generator = ImageDataGenerator(rescale = 1./255).flow_from_directory(
    val_path, # separate validation directory (not the training directory)
    target_size=(224, 224),
    batch_size=batchSize,
    classes=selectedClasses) # set as validation data

# test_generator = ImageDataGenerator().flow_from_directory(
#     'test/', 
#     target_size=(224,224), 
#     classes=None,
#     shuffle= False,
#     batch_size = batchSize)# set as test data

Found 40000 images belonging to 100 classes.
Found 1686 images belonging to 100 classes.


In [7]:
#InceptionV3

# InceptionV3 with ImageNet weights and without its classification head; the base
# is left trainable, so the whole network is fine-tuned.
base_model = InceptionV3(weights='imagenet',
                         include_top=False,
                         input_shape=(224, 224, 3))
base_model.trainable = True

# New head: global average pooling -> dropout -> softmax over the 100 classes.
x = base_model.output
x = keras.layers.GlobalAveragePooling2D()(x)
x = Dropout(0.5)(x)
predictions = Dense(100, activation='softmax')(x)

# This is the model we will train. Keras 2 names these keyword arguments
# `inputs`/`outputs`; the singular `input`/`output` spellings are legacy names
# that emit an "update your call" warning and are not accepted by later releases.
model = Model(inputs=base_model.input, outputs=predictions)


model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 111, 111, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 111, 111, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activat

  app.launch_new_instance()


In [8]:
# Alternative: automatic file name containing the epoch number and val accuracy:
#filepath="checkpoints/weights-improvement-epeoch-{epoch:02d}-val_acc-{val_acc:.2f}.hdf5"
modelName= "InceptionTutorial"
# Save the best weights over the same file, named after the model.

#filepath="checkpoints/"+modelName+"_bestweights.hdf5"
filepath=modelName+"_bestweights_2_pipi.hdf5"
# Write a checkpoint only when the monitored validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# RMSprop with learning rate 1e-4, categorical cross-entropy loss, accuracy metric.
model.compile(RMSprop(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

In [9]:
# Number of batches needed to see every sample once: ceil(samples / batchSize),
# written as a negated floor division so the arithmetic stays in integers.
stepsPerEpoch = -(-train_generator.samples // batchSize)
print("stepsPerEpoch: ", stepsPerEpoch)

validationSteps = -(-validation_generator.samples // batchSize)
print("validationSteps: ", validationSteps)


#validationSteps=(test_generator.samples+ (batchSize-1)) // batchSize
#print("validationSteps: ", validationSteps)

stepsPerEpoch:  625
validationSteps:  27


In [None]:
# Reset both generators before training starts.
train_generator.reset()
validation_generator.reset()

# Fit the model, validating after each epoch and saving the best weights through
# the checkpoint callback.
# NOTE(review): epochs is 5 here, while the submission file written later is named
# '..._6_epoch_...' — confirm which is intended.
history = model.fit_generator(
    train_generator, 
    validation_data = validation_generator,
    epochs = 5,
    steps_per_epoch = stepsPerEpoch,
    validation_steps= validationSteps,
    callbacks=callbacks_list,
    verbose=1)

Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.53618, saving model to InceptionTutorial_bestweights_2_pipi.hdf5
Epoch 2/5

Epoch 00002: val_acc improved from 0.53618 to 0.59193, saving model to InceptionTutorial_bestweights_2_pipi.hdf5
Epoch 3/5

Epoch 00003: val_acc improved from 0.59193 to 0.60142, saving model to InceptionTutorial_bestweights_2_pipi.hdf5
Epoch 4/5

In [67]:
# Unlabelled test images: rescaled like the training data, unshuffled and served
# one image per batch so predictions stay aligned with the file list.
test_generator = ImageDataGenerator(rescale = 1./255).flow_from_directory(
        'test/',
        target_size=(224, 224),
        batch_size=1,
        class_mode=None,
        shuffle = False)

# File names relative to 'test/' (the sub-directory prefix is still included here).
filenames = list(test_generator.filenames)

# One prediction per file, so this is also the number of predictions to expect.
nb_samples = len(filenames)

Found 16858 images belonging to 1 classes.


In [68]:
# Reset before each call to predict
test_generator.reset()

pred = model.predict_generator(test_generator, steps = len(test_generator), verbose = 1)

predicted_class_indices = np.argmax(pred, axis = 1)



In [69]:
# Map predictions to the correct labels
labels = (train_generator.class_indices)
labels = dict((v,k) for k,v in labels.items())
predictions = [labels[k] for k in predicted_class_indices]

ff = [f[5:] for f in filenames]

# Submit file
submission = pd.DataFrame({'name':ff,'category':predictions})
submission.to_csv('Inception_6_epoch_augmentation_+rescaling.csv', index=False)
print("Done!")

Done!
