In [1]:
# For easier dev of local modules:
# autoreload mode 2 reloads imported modules before each cell runs, so edits
# to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures as interactive widgets (needs the ipympl backend).
%matplotlib widget

In [2]:

import os, sys

# Make the locally cloned ml_toolkit importable.
# os.path.dirname('../../..') drops the final path component and yields
# '../..', so the directory added below is two levels above the notebook's
# working directory (not three, as the literal might suggest).
path = os.path.abspath(os.path.dirname('../../..'))
print(path)
# Guard the append so re-running this cell does not pile up duplicate
# entries in sys.path.
if path not in sys.path:
    sys.path.append(path)
import ml_toolkit as utils

from ml_toolkit.visualization import Visualization
# NOTE(review): later cells call utils.visualization.draw_class_distribution
# rather than this instance -- confirm whether it is still needed.
visualization = Visualization()

/home/ec2-user/SageMaker


## Computer Vision I Final Project

In this walkthrough, we will read the training and test data and create a submission file for your final project. Once you have trained your model and generated your predictions, submit your model's .csv output to the class [Leaderboard](https://leaderboard.corp.amazon.com/tasks/312).

## 1. Load training data and convert

In [3]:
import pandas as pd
import os.path
import ml_toolkit as utils


# Folder that is checked for the converted train/validation pickles and that
# is handed to the toolkit's load / convert helpers.
file_folder = '/home/ec2-user/SageMaker'

# The conversion only counts as done when both pickle files are present.
conversion_exists = all(
    os.path.isfile(file_folder + pickle_name)
    for pickle_name in ('/train.pkl', '/val.pkl')
)

if conversion_exists:
    print("conversion exists :-) loading it...")
    train_df, validation_df = utils.data.load_train_validation_pickle(file_folder)
else:
    print("conversion doesn't exists, converting...")
    # NOTE(review): unpickling can execute arbitrary code -- only read
    # pickle files that come from a trusted source.
    df = pd.read_pickle('/tmp/training_data.pkl')
    train_df, validation_df = utils.images.split_convert_and_write(df, folder=file_folder)


conversion exists :-) loading it...


In [4]:
# Hand both frames to the toolkit's Gluon dataset helper, then preview the
# first rows of the training frame.
# NOTE(review): train_dataset / validation_dataset are not referenced again in
# the cells shown here, and they are created before the augmentation and
# upsampling cells rewrite train_df -- confirm they are still needed.
train_dataset, validation_dataset = utils.images.prepare_dataset_with_gluon(train_df, validation_df)
train_df.head()

Unnamed: 0,ID,data,label
0,1588,"[[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0...",4
1,2011,"[[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0...",1
2,1321,"[[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0...",4
3,669,"[[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0...",3
4,1674,"[[[0.99607843, 0.99607843, 0.99607843, 0.99607...",1


## Plot the class distribution

In [5]:
# Pull the training labels out as a NumPy array and plot their class
# distribution.
train_labels = train_df['label'].values
utils.visualization.draw_class_distribution(train_labels)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [6]:
# Same plot for the validation split, so the two distributions can be compared.
validation_labels = validation_df['label'].values
utils.visualization.draw_class_distribution(validation_labels)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [8]:
# Augmentation settings passed to the toolkit: rotations of +30/-30, random
# noise, and flip code 1.
# NOTE(review): units of `rotations` (presumably degrees) and the meaning of
# flip code 1 are defined inside ml_toolkit -- confirm there.
augmentation_options = dict(
    with_rotate=True,
    rotations=[30, -30],
    with_random_noise=True,
    with_flip=True,
    flips=[1],
)

# train_df is replaced by the augmented frame, so re-running this cell
# augments the already-augmented data again.
# NOTE(review): this cell uses utils.image (singular) while the loading cells
# use utils.images -- confirm both submodules exist.
train_df = utils.image.augment.generate_augmented_images(train_df, **augmentation_options)

# Re-plot the class distribution of the augmented training set.
train_labels = train_df['label'].values
utils.visualization.draw_class_distribution(train_labels)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [9]:

# Upsample the training frame over the class ids 0-4 and re-plot the class
# distribution.
# NOTE(review): the balancing strategy is implemented inside ml_toolkit's
# upsample helper -- confirm what target count per class it uses.
# train_df is overwritten, so re-running this cell upsamples the
# already-upsampled frame.
train_df = utils.image.upsample.upsample(train_df, train_df['label'], [0,1,2,3,4])
train_labels = train_df['label'].values
utils.visualization.draw_class_distribution(train_labels)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Training and Validation

In [10]:
import mxnet as mx

# Pick the compute device automatically: use a GPU when MXNet can see one,
# otherwise fall back to the CPU so the notebook also runs on CPU-only
# instances. Override by assigning mx.cpu() or mx.gpu() explicitly.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

# Training hyper-parameters.
epochs = 20
batch_size = 16
learning_rate = 0.01

# Human-readable class names.
# NOTE(review): presumably the name at index i belongs to label id i --
# confirm against the dataset description.
class_labels = ['Inconclusive', 'Two wheels', 'Four wheels', 'Not luggage', 'Zero wheels']

# Create the network with one output per class (5 classes), derived from the
# label list so the two cannot drift apart.
num_outputs = len(class_labels)

net = utils.mxnet.network.build_alexnet_network(ctx, num_outputs)
# Alternative architecture (uncomment to try):
# net = utils.mxnet.network.build_vgg16_network(ctx, num_outputs)

# Train with SGD; the recorded output shows the helper printing per-epoch
# training/validation loss and accuracy.
utils.mxnet.training.train(ctx, net, 'sgd', learning_rate, batch_size, epochs, train_df, validation_df)

Downloading /home/ec2-user/.mxnet/models/alexnet-44335d1f.zip46a2d09d-b983-4cca-a2f4-8dbafb57cbd1 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/alexnet-44335d1f.zip...
Epoch 0, training loss: 1.37, validation loss: 0.96, training accuracy: 0.37, validation accuracy: 0.69
Epoch 1, training loss: 0.98, validation loss: 0.69, training accuracy: 0.58, validation accuracy: 0.74
Epoch 2, training loss: 0.76, validation loss: 0.89, training accuracy: 0.68, validation accuracy: 0.72
Epoch 3, training loss: 0.63, validation loss: 0.68, training accuracy: 0.73, validation accuracy: 0.76
Epoch 4, training loss: 0.52, validation loss: 0.76, training accuracy: 0.78, validation accuracy: 0.77
Epoch 5, training loss: 0.45, validation loss: 0.71, training accuracy: 0.81, validation accuracy: 0.77
Epoch 6, training loss: 0.38, validation loss: 0.73, training accuracy: 0.84, validation accuracy: 0.81
Epoch 7, training loss: 0.33, validation loss: 0.90, training accuracy: 0

## 2. Test Data

In [14]:
# If you're unsure of how to submit to Leaderboard, no problem: the training
# file loaded above is used to build your ML model, and the model then
# predicts on the test file loaded below.
import matplotlib.pyplot as plt
# NOTE(review): unpickling can execute arbitrary code -- only read pickle
# files that come from a trusted source.
test_df = pd.read_pickle("/tmp/test_data.pkl")
# Show the test image stored under row label 90 as a quick sanity check.
sample_image = test_df['data'][90]
plt.imshow(sample_image)

<matplotlib.image.AxesImage at 0x7f8021a1a160>

In [15]:
# Preview the first rows of the test frame; the recorded output shows only
# ID and data columns (no label column).
test_df.head()

Unnamed: 0,ID,data
0,1453,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
1,655,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
2,1178,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
3,548,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
4,1547,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."


## Sample zero submission file

In [16]:
# Below is an example submission of a very poor model
# (every label in the recorded preview is 0.0). It illustrates the ID/label
# layout of a submission file.

test_submission = pd.read_csv('/tmp/sample_model_output.csv', header=0)
test_submission.head(5)

Unnamed: 0,ID,label
0,1453,0.0
1,655,0.0
2,1178,0.0
3,548,0.0
4,1547,0.0


In [19]:
from skimage.transform import resize
def getImages(images, size=(224, 224)):
    """Resize images to one fixed size and pack them into a channel-first batch.

    Args:
        images: Sequence of images (list, 1-D object array, ...) in
            (row, column, channel) layout with 3 channels. The individual
            images may have different heights and widths.
        size: Target (rows, columns) every image is resized to.
            Defaults to (224, 224).

    Returns:
        numpy.ndarray of dtype float32 with shape
        (len(images), 3, rows, columns), i.e. the (channel, row, column)
        layout Gluon/MXNet expects.

    Raises:
        ValueError: If a resized image is not 3-dimensional with 3 channels
            as its last axis (for example grayscale or RGBA input).
    """
    # Imported locally so the function does not rely on another cell having
    # imported numpy before it is called.
    import numpy as np

    num_channels = 3
    target_shape = tuple(size)

    # Create the image holder array; len() (rather than .shape[0]) also
    # accepts plain lists.
    image_arr = np.zeros((len(images), num_channels) + target_shape, dtype="float32")

    for i, im in enumerate(images):
        # The network needs a fixed-size input, but the images come in
        # different sizes.
        # NOTE(review): with its default preserve_range=False, skimage's
        # resize returns floats scaled to [0, 1] for integer images --
        # confirm this matches the preprocessing used for training.
        im = resize(im, output_shape=target_shape)

        # Fail loudly instead of letting a 2-D image be transposed and
        # broadcast across the channel axis by the assignment below.
        if im.ndim != 3 or im.shape[-1] != num_channels:
            raise ValueError(
                "image {} has shape {} after resizing; expected "
                "(rows, columns, {})".format(i, im.shape, num_channels)
            )

        # (row, column, channel) -> (channel, row, column)
        image_arr[i] = np.moveaxis(im, -1, 0)
    return image_arr

In [22]:
import numpy as np
from mxnet import gluon

# Resize the raw test images and pack them into one channel-first array.
test_images = getImages(test_df["data"].values)
# shuffle is left at the DataLoader default (off), so the predictions stay in
# the same order as the rows of test_df.
test_loader = gluon.data.DataLoader(test_images, batch_size=batch_size)

test_predictions = []
for data in test_loader:
    # Move the batch to the device the network lives on.
    data = data.as_in_context(ctx)
    output = net(data)
    # The predicted class is the index of the largest network output; extend
    # in place instead of rebuilding the whole list for every batch.
    test_predictions.extend(np.argmax(output.asnumpy(), axis=1).tolist())
print(test_predictions)

[4, 1, 4, 1, 2, 1, 0, 1, 4, 3, 2, 4, 2, 1, 1, 4, 4, 1, 4, 1, 3, 4, 1, 1, 0, 2, 1, 1, 1, 1, 1, 3, 4, 2, 1, 0, 2, 3, 3, 2, 2, 4, 2, 4, 1, 2, 2, 1, 2, 4, 1, 4, 1, 2, 1, 1, 4, 2, 2, 1, 1, 4, 4, 4, 2, 0, 1, 1, 1, 2, 4, 2, 4, 1, 4, 4, 4, 4, 1, 2, 4, 2, 1, 4, 4, 4, 1, 1, 2, 3, 1, 1, 1, 2, 2, 2, 1, 1, 2, 4, 1, 1, 4, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 4, 1, 4, 1, 2, 4, 2, 0, 1, 1, 2, 4, 2, 4, 4, 0, 2, 1, 4, 2, 2, 2, 2, 3, 4, 2, 1, 4, 1, 4, 3, 4, 4, 1, 1, 2, 2, 4, 4, 1, 4, 2, 2, 2, 2, 2, 4, 4, 3, 2, 0, 4, 2, 1, 1, 2, 1, 2, 4, 4, 2, 4, 4, 2, 1, 1, 4, 4, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 3, 4, 4, 2, 2, 2, 2, 2, 0, 2, 2, 4, 4, 1, 0, 2, 2, 4, 4, 2, 4, 1, 1, 2, 2, 1, 1, 1, 4, 4, 4, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 4, 2, 2, 1, 4, 2, 1, 1, 4, 4, 1, 2, 4, 1, 1, 1, 4, 2, 1, 2, 2, 1, 1, 4, 2, 2, 4, 1, 2, 1, 4, 4, 2, 2, 2, 1, 4, 1, 2, 0, 1, 1, 1, 1, 2, 1, 1, 3, 2, 1, 2, 2, 1, 2, 2, 1, 4, 1, 1, 1, 4, 0, 2, 4, 2, 1, 2, 2, 1, 2, 1, 1, 4, 1, 2, 1, 2, 0, 2, 0, 2, 2, 2, 4, 4, 

## Your submission file

In [23]:
import pandas as pd
import os

# Assemble the submission frame: each test ID paired with the model's
# predicted class. (Use test_submission['label'].values instead of
# test_predictions to reproduce the all-zero sample submission.)
result_df = pd.DataFrame({
    "ID": test_df["ID"],
    "label": test_predictions,
})

# index=False keeps the row index out of the file, leaving only ID and label.
result_df.to_csv("results_cv_project.csv", index=False)

If you navigate to the day1/results folder in the Jupyter file browser, you can select results_cv_project.csv and download it locally. Or just click this [link...](./results_cv_project.csv)

## Getting our model output into Leaderboard

We now have our model's output .csv and are ready to upload it to Leaderboard.
1. Go to your class [Leaderboard instance](https://leaderboard.corp.amazon.com/tasks/312) and go to the 'Make a Submission' section
2. Upload your local file and include your notebook version URL for tracking
3. Your score on the public leaderboard should now appear. Marvel at how much room for improvement there is.