## Train your own model

1. Install Python and pip (the Python package manager) on your computer (instructions are available online). I used Python 2.7.
2. Install these packages with pip: keras with tensorflow, jupyter notebook, sklearn, requests, h5py (google how to do it on your platform)
3. Download [train_sample.csv](https://hackiit.slack.com/files/U70G56S3C/F75UCE2CE/train_sample.csv) (reduced training data set with 5 labels)
4. Download this notebook and put it in a folder!
5. Follow [me](http://github.com/bolein) on GitHub :D
6. Run Jupyter Notebook and open this file (the notebook)
7. Run the cells below one by one, following the instructions
8. Ask in Slack if you have any questions or problems
9. Have fun!

For a good introduction to neural networks, see this set of articles: http://neuralnetworksanddeeplearning.com/

In [None]:
# install my custom directory iterator for keras (feel free to go and star it)
# https://github.com/bolein/keras_img_iterator
!pip install --user git+https://github.com/bolein/keras_img_iterator.git --upgrade

### Now restart your kernel with Kernel -> Restart

In [None]:
# Download the data (Run this cell only once! It may take some time!)

## Load Libraries
import os
import requests, zipfile, io

# Download the dataset archive and unpack it into the 'data/' directory.
# NOTE: despite its name, `url` holds the HTTP response object, not a URL string.
url = requests.get('https://he-s3.s3.amazonaws.com/media/hackathon/deep-learning-challenge-1/identify-the-objects/a0409a00-8-dataset_dp.zip')
# Stop right here with the HTTP status if the download failed; otherwise the
# body of an error response would be passed to ZipFile below and the failure
# would show up as a much less helpful zip-format error.
url.raise_for_status()
# The archive is opened straight from memory, so no temporary .zip file is written.
data = zipfile.ZipFile(io.BytesIO(url.content))
data.extractall('data/')

In [None]:
# Wait until the asterisk (*) next to the previous cell disappears,
# then check that the files have been downloaded into the 'data' directory.
os.listdir('data')

### Now put the file train_sample.csv inside the data directory next to your notebook file!

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np

from keras.models import save_model, load_model
from keras.preprocessing.image import ImageDataGenerator
from keras_img_iterator import SingleDirectoryIterator

from sklearn.model_selection import train_test_split

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.regularizers import l2


def convnet(num_classes, image_size):
    """Build the convolutional classifier used in this notebook.

    The network is a Sequential stack of four 3x3 convolution stages
    (32, 32, 64 and 64 filters; ReLU activation; 'same' padding), each
    followed by 2x2 max pooling. The first two convolutions carry an L2
    kernel regularizer of 0.001. The feature maps are then flattened and
    passed through dropout-guarded dense layers of 1024 and 512 units and a
    final softmax layer.

    Args:
        num_classes: number of units in the softmax output layer.
        image_size: edge length of the input images; the declared input
            shape is (image_size, image_size, 3).

    Returns:
        The assembled Sequential model. It is not compiled here.
    """
    weight_decay = 0.001
    net = Sequential()

    # Regularized convolution stages; the first one also declares the input shape.
    net.add(Conv2D(32, (3, 3),
                   input_shape=(image_size, image_size, 3),
                   activation='relu',
                   padding='same',
                   kernel_regularizer=l2(weight_decay)))
    net.add(MaxPooling2D(pool_size=(2, 2)))
    net.add(Conv2D(32, (3, 3),
                   activation='relu',
                   padding='same',
                   kernel_regularizer=l2(weight_decay)))
    net.add(MaxPooling2D(pool_size=(2, 2)))

    # Two identical, unregularized 64-filter stages.
    for _ in range(2):
        net.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
        net.add(MaxPooling2D(pool_size=(2, 2)))

    # Turn the 3D feature maps into 1D vectors for the dense layers.
    net.add(Flatten())

    # Hidden dense layers, each preceded by its own dropout layer.
    for drop_rate, units in ((0.3, 1024), (0.3, 512)):
        net.add(Dropout(drop_rate))
        net.add(Dense(units, activation='relu'))

    # Classification head.
    net.add(Dropout(0.5))
    net.add(Dense(num_classes, activation='softmax'))
    return net


In [None]:
# Load the meta data
# Make sure you downloaded the train_sample.csv file
meta_data = pd.read_csv('data/train_sample.csv', header=0)  # change filename to train.csv for full data
# Image files are named after the image id with a '.png' extension.
filenames = meta_data['image_id'].apply(lambda image_id: image_id + '.png').values
labels = meta_data['label'].values

# Sort the unique labels so the class-index -> label mapping is the same in
# every session. A bare list(set(...)) has no guaranteed order, so after a
# kernel restart the indices of a previously saved model could point at
# different labels when predictions are decoded through `classes`.
classes = sorted(set(labels))

# split into train and validation
files_train, files_validate, labels_train, labels_validate = \
    train_test_split(filenames, labels, test_size=0.2, random_state=42)

num_train_samples = files_train.shape[0]
num_val_samples = files_validate.shape[0]
num_classes = len(classes)

In [None]:
# Batch size and the square edge length (in pixels) requested as target size
batch_size = 32
image_size = 128

# Training generator: rescale by 1/255 plus random augmentation
# (shifts, shear, zoom and horizontal flips)
train_gen = ImageDataGenerator(
    rescale=1. / 255,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

# Validation/test generator: rescaling only, no augmentation
test_gen = ImageDataGenerator(rescale=1. / 255)

# Settings shared by the training and validation iterators
_shared_iterator_args = dict(
    directory='data/train_img/',
    classes=classes,
    batch_size=batch_size,
    target_size=(image_size, image_size),
    seed=1337)

# Iterator over the training split, fed through the augmenting generator
train_iterator = SingleDirectoryIterator(
    filenames=files_train,
    labels=labels_train,
    image_data_generator=train_gen,
    **_shared_iterator_args)

# Iterator over the validation split, fed through the rescale-only generator
validation_iterator = SingleDirectoryIterator(
    filenames=files_validate,
    labels=labels_validate,
    image_data_generator=test_gen,
    **_shared_iterator_args)

In [None]:
# Build the network for our number of classes and image size, then compile it
# with the Adam optimizer, categorical cross-entropy loss and accuracy metric
model = convnet(num_classes=num_classes, image_size=image_size)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [None]:
# OPTIONAL: run this cell only if you have a previously saved model.
# It replaces `model` with the one loaded from the saved file,
# so don't forget to put the right file name here.
model = load_model(filepath='model_0.321674930874.h5', compile=True)

In [None]:
# Train the model
num_epochs = 2

# Start both iterators from the beginning of their data
validation_iterator.reset()
train_iterator.reset()

# Ceiling division: one pass over the data takes ceil(samples / batch_size)
# batches. `samples // batch_size + 1` would schedule one batch too many
# whenever the sample count is an exact multiple of the batch size.
# (Integer arithmetic is used so the result is the same on Python 2 and 3.)
history = model.fit_generator(
    train_iterator,
    steps_per_epoch=(num_train_samples + batch_size - 1) // batch_size,
    epochs=num_epochs,
    validation_data=validation_iterator,
    validation_steps=(num_val_samples + batch_size - 1) // batch_size)

In [None]:
# Visualize learning
import matplotlib.pyplot as plt

# One figure per metric, training curve first and validation curve second.
# Each entry is (training key, validation key, figure title, y-axis label).
for train_key, val_key, plot_title, y_label in (
        ('loss', 'val_loss', 'model loss', 'loss'),
        ('acc', 'val_acc', 'model accuracy', 'accuracy')):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# Evaluate the model on the validation set.
# NOTE: `score` is the metric the model was compiled with (accuracy in the
# compile cell of this notebook), not an F1-score.
validation_iterator.reset()
# Ceiling division covers every validation sample exactly once. With
# `num_val_samples // batch_size + 1` an extra batch is requested whenever the
# sample count is a multiple of the batch size, so some samples would be
# counted twice in the reported score.
loss, score = model.evaluate_generator(
    validation_iterator,
    steps=(num_val_samples + batch_size - 1) // batch_size)

print("model scored {} on validation set".format(score))

In [None]:
# OPTIONAL: run this cell and the following ones only if you want to run your
# model over the test data provided by the competition
# (make sure you've trained on the full data first).
# Read the test set description; image files are named '<image_id>.png'.
test_data = pd.read_csv('data/test.csv', header=0)
files_test = test_data['image_id'].apply(lambda image_id: image_id + '.png').values

In [None]:
# Set up iterator for test set (no labels; shuffling is disabled so the
# predictions stay in the same order as `files_test`)
test_iterator = SingleDirectoryIterator(
    directory='data/test_img/',
    filenames=files_test,
    image_data_generator=test_gen,
    batch_size=batch_size,
    target_size=(image_size, image_size),
    shuffle=False)

# make prediction
num_test_samples = files_test.shape[0]
# Ceiling division requests just enough batches to cover the test set once.
# `num_test_samples // batch_size + 1` asks for one batch too many whenever the
# sample count is a multiple of the batch size, which yields more predictions
# than test rows and breaks the submission table built from them.
predictions = model.predict_generator(
    generator=test_iterator,
    steps=(num_test_samples + batch_size - 1) // batch_size)
# Defensive: keep exactly one prediction per test file.
predictions = predictions[:num_test_samples]

# Map each row's highest-scoring class index back to its label
test_labels = [classes[i] for i in np.argmax(predictions, axis=1)]

In [None]:
# function for downloading results
from IPython.display import HTML
import base64

def create_download_link(df, filename):
    """Return an HTML link that downloads *df* as a CSV file.

    The frame is serialized with ``df.to_csv(index=False)``, base64-encoded
    and embedded in a ``data:text/csv`` URI, so no file is written.

    Args:
        df: object with a ``to_csv`` method (a pandas DataFrame in this notebook).
        filename: name used for the ``download`` attribute and as the link text.

    Returns:
        An ``IPython.display.HTML`` object wrapping the anchor tag.
    """
    encoded_csv = base64.b64encode(df.to_csv(index=False).encode()).decode()
    link_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{filename}</a>'
    return HTML(link_template.format(payload=encoded_csv, filename=filename))

In [None]:
# Build the submission table from the test image ids and the predicted labels,
# then show a download link for it (the cell output is the link)
submission = pd.DataFrame({'image_id': test_data['image_id'], 'label': test_labels})
create_download_link(df=submission, filename="submission.csv")

In [None]:
# save model (the validation score is embedded in the file name)
model_file = 'model_{}.h5'.format(score)
save_model(model, model_file)
# Build the message with format() so one plain string is printed on both
# Python 2 (where print('a', b) would print a tuple) and Python 3.
print('Training complete. model was saved as {}'.format(model_file))