In [None]:
import sagemaker
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torchvision
import os
import json
import boto3

  from .autonotebook import tqdm as notebook_tqdm


Each image has a corresponding metadata file that indicates the number of items in the image. These metadata files are accessed and parsed to identify only the images that contain 1-5 items. The metadata keys of these images are grouped by class, starting from the per-class lists loaded from `full_list.json`, and the result is written to `check.json` (with a checkpoint every 10,000 matches). Before running the cell, `full_list.json` needs to be a JSON file with an empty list for each class (keys `"1"` through `"5"`).

In [None]:
import pandas as pd
import json

# Source bucket and the boto3 handles used to list its objects.
bucket = 'aft-vbi-pds'
s3_client = boto3.client('s3')
s3 = boto3.resource('s3')
data_bucket = s3.Bucket(bucket)

# Start from the existing per-class lists: file_data is indexed by the item
# count as a string and each entry is a list that keys are appended to.
with open('full_list.json','r+') as file:
    file_data = json.load(file)

count = 0
for object_summary in data_bucket.objects.filter(Prefix="metadata"):
    key = object_summary.key
    if not key.endswith(".json"):
        continue

    data_location = f's3://{bucket}/{key}'
    data = pd.read_json(data_location)
    num = data['EXPECTED_QUANTITY']

    # Only the first value of the column is examined: the loop body ends with
    # an unconditional break.
    for n in num:
        if 0 < n < 6:
            file_data[str(n)].append(key)
            count += 1

            # Checkpoint the lists and report progress every 10000 matches.
            if not count % 10000:
                with open('check.json', 'w+') as file:
                    json.dump(file_data, file)
                print(count)
        break

# Final write of the complete lists.
with open('check.json','w+') as file:
    json.dump(file_data, file)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000


Download images of the target classes, resize the images and upload to bucket. The `large_list.json` file contains the filenames of all the images that belong to the target classes.

In [None]:
from tqdm import tqdm

def download_and_arrange_data(list_path='large_list.json', dest_root='large_train_data',
                              bucket='aft-vbi-pds', prefix='bin-images',
                              skip_existing=False):
    """Download the images named in a class-list JSON file into per-class folders.

    The JSON file maps a class name to a list of file paths. For each entry,
    the base name (text before the first dot) with a ``.jpg`` extension is
    fetched from ``<prefix>/`` in ``bucket`` and stored under
    ``<dest_root>/<class name>/``.

    Args:
        list_path: Path of the JSON file holding the per-class file lists.
        dest_root: Local directory that receives one sub-folder per class.
        bucket: Name of the S3 bucket to download from.
        prefix: Key prefix (folder) of the images inside the bucket.
        skip_existing: When True, files already present locally are not
            downloaded again, which lets an interrupted run be resumed.
            The default (False) re-downloads everything, as before.
    """
    s3_client = boto3.client('s3')

    with open(list_path, 'r') as f:
        d = json.load(f)

    for k, v in d.items():
        print(f"Downloading Images with {k} objects")
        directory = os.path.join(dest_root, k)
        # exist_ok avoids the check-then-create race and makes re-runs safe.
        os.makedirs(directory, exist_ok=True)
        for file_path in tqdm(v):
            file_name = os.path.basename(file_path).split('.')[0] + '.jpg'
            local_path = os.path.join(directory, file_name)
            if skip_existing and os.path.exists(local_path):
                continue
            # S3 keys always use '/', so the key is not built with os.path.join
            # (which would insert a backslash on Windows).
            s3_client.download_file(bucket, f"{prefix}/{file_name}", local_path)

# Fetch every image listed in large_list.json into large_train_data/<class>/.
download_and_arrange_data()

Downloading Images with 1 objects


100%|██████████| 25987/25987 [38:54<00:00, 11.13it/s] 


Downloading Images with 2 objects


100%|██████████| 48103/48103 [1:13:14<00:00, 10.95it/s]


Downloading Images with 3 objects


100%|██████████| 56409/56409 [1:27:47<00:00, 10.71it/s] 


Downloading Images with 4 objects


100%|██████████| 50384/50384 [1:17:35<00:00, 10.82it/s]


Downloading Images with 5 objects


100%|██████████| 39117/39117 [1:01:22<00:00, 10.62it/s]


In [None]:
# Print the number of files present in each class folder, in class order 1-5.
for class_name in ("1", "2", "3", "4", "5"):
    print(len(os.listdir(f'large_train_data/{class_name}')))

1229
2300
2666
2373
1875


Create a random train-test split of 90-10

In [None]:
# Load the downloaded images as a folder-per-class dataset. No transform is
# passed here; later cells call img.save() on the samples and write each one
# under the folder named label + 1.
dataset = torchvision.datasets.ImageFolder("large_train_data")

In [None]:
# Seed the split so that re-running the notebook reproduces the same
# train/test membership instead of drawing a new random partition each time.
SPLIT_SEED = 42

# Hold out 10% of the samples (rounded down) for testing.
test_size = int(0.1 * len(dataset))
train_data, test_data = torch.utils.data.random_split(
    dataset,
    [len(dataset) - test_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

Save the training and test splits locally and then upload them to the S3 bucket. This way the train and test data can be downloaded straight from the S3 bucket and stay consistent across instances and training runs. Otherwise, the train-test split would be random each time the data is downloaded for training.

In [None]:
# Write every training sample to large_train/<label + 1>/<running index>.jpg.
count = 0
for img, label in train_data:
    class_dir = os.path.join("large_train", str(label + 1))
    # The class folders are not created anywhere else, so create them on
    # demand; without this img.save fails on a fresh checkout.
    os.makedirs(class_dir, exist_ok=True)
    img.save(os.path.join(class_dir, str(count) + ".jpg"))
    count += 1

In [None]:
# Write every test sample to large_test/<label + 1>/<running index>.jpg.
count = 0
for img, label in test_data:
    class_dir = os.path.join("large_test", str(label + 1))
    # The class folders are not created anywhere else, so create them on
    # demand; without this img.save fails on a fresh checkout.
    os.makedirs(class_dir, exist_ok=True)
    img.save(os.path.join(class_dir, str(count) + ".jpg"))
    count += 1

Resize the images to avoid this preprocessing step when training

In [None]:
# Create resized_train/ with one sub-folder per class (1-5). makedirs with
# exist_ok=True keeps the cell re-runnable: os.mkdir raises FileExistsError
# when the folders are already there.
for class_name in ("1", "2", "3", "4", "5"):
    os.makedirs(os.path.join("resized_train", class_name), exist_ok=True)

In [None]:
# Create resized_test/ with one sub-folder per class (1-5). makedirs with
# exist_ok=True keeps the cell re-runnable: os.mkdir raises FileExistsError
# when the folders are already there.
for class_name in ("1", "2", "3", "4", "5"):
    os.makedirs(os.path.join("resized_test", class_name), exist_ok=True)

In [None]:
import torchvision.transforms as T

# Resize-only pipeline (224x224). No tensor conversion is included; the cells
# that iterate the resized datasets call img.save() on each sample.
transforms = T.Compose([T.Resize((224, 224))])

In [None]:
# Re-open the saved train/test splits with the resize transform applied on load.
train_dataset = torchvision.datasets.ImageFolder("large_train", transform=transforms)
test_dataset = torchvision.datasets.ImageFolder("large_test", transform=transforms)

In [None]:
# Store each resized training image as resized_train/<label + 1>/<index>.jpg,
# where the index counts samples in iteration order starting at 0.
count = 0
for img, label in train_dataset:
    out_path = os.path.join("resized_train", f"{label + 1}", f"{count}.jpg")
    img.save(out_path)
    count += 1

In [None]:
# Store each resized test image as resized_test/<label + 1>/<index>.jpg,
# where the index counts samples in iteration order starting at 0.
count = 0
for img, label in test_dataset:
    out_path = os.path.join("resized_test", f"{label + 1}", f"{count}.jpg")
    img.save(out_path)
    count += 1

Upload the images to an S3 bucket, including the originals just in case.

In [None]:
# Upload the local resized_train folder to the bincapstone bucket; the value
# returned by upload_data is kept in s3_path_to_data.
s3_path_to_data = sagemaker.Session().upload_data(
    path='resized_train',
    bucket='bincapstone',
    key_prefix='capstone/data/resized_train',
)

In [None]:
# Upload the local resized_test folder to the bincapstone bucket; the value
# returned by upload_data is kept in s3_path_to_data.
s3_path_to_data = sagemaker.Session().upload_data(
    path='resized_test',
    bucket='bincapstone',
    key_prefix='capstone/data/resized_test',
)

In [None]:
# Upload the original-size training split (large_train) to the bincapstone
# bucket; the value returned by upload_data is kept in s3_path_to_data.
s3_path_to_data = sagemaker.Session().upload_data(
    path='large_train',
    bucket='bincapstone',
    key_prefix='capstone/data/large_train',
)

In [None]:
# Upload the original-size test split (large_test) to the bincapstone bucket;
# the value returned by upload_data is kept in s3_path_to_data_test.
s3_path_to_data_test = sagemaker.Session().upload_data(
    path='large_test',
    bucket='bincapstone',
    key_prefix='capstone/data/large_test',
)

Determine the normalization values for this dataset. Here the training and test data is assumed to be moved and renamed to local folders named `data/train_data` and `data/test_data`.

In [None]:
# The statistics cell batches samples through a DataLoader and reads
# images.shape, so the samples must be tensors. Without a transform
# ImageFolder yields PIL images, which the default collate cannot batch.
# NOTE(review): batching also needs all images to share one size - presumably
# these folders hold the 224x224 resized images; confirm.
to_tensor = torchvision.transforms.ToTensor()
train_dataset = torchvision.datasets.ImageFolder("data/train_data", transform=to_tensor)
test_dataset = torchvision.datasets.ImageFolder("data/test_data", transform=to_tensor)

In [None]:
from torch.utils.data import DataLoader

# Both loaders share the same settings: batches of 128, no shuffling, one worker.
loader_options = dict(batch_size=128, shuffle=False, num_workers=1)
train_data_loader = DataLoader(train_dataset, **loader_options)
test_data_loader = DataLoader(test_dataset, **loader_options)

In [None]:
def batch_mean_std(loader):
    """Compute per-channel mean and standard deviation over all batches.

    Args:
        loader: Iterable yielding ``(images, target)`` pairs where ``images``
            is a 4-D tensor unpacked as ``(batch, channels, height, width)``.
            The second element of each pair is ignored.

    Returns:
        Tuple ``(mean, std)`` of 1-D tensors with one entry per channel. The
        standard deviation is the population value, sqrt(E[x^2] - E[x]^2).
        An empty loader yields zeros of length 3.
    """
    cnt = 0
    # Start from zeros: torch.empty returns uninitialised memory, and
    # 0 * garbage can be NaN/inf and poison the running moments.
    fst_moment = torch.zeros(3)
    snd_moment = torch.zeros(3)

    for images, _ in loader:
        b, c, h, w = images.shape
        nb_pixels = b * h * w
        sum_ = torch.sum(images, dim=[0, 2, 3])
        sum_of_squares = torch.sum(images ** 2, dim=[0, 2, 3])

        # Running update of E[x] and E[x^2], weighted by pixels seen so far.
        fst_moment = (cnt * fst_moment + sum_) / (cnt + nb_pixels)
        snd_moment = (cnt * snd_moment + sum_of_squares) / (cnt + nb_pixels)
        # Without this increment every batch overwrites the previous result
        # and only the last batch contributes.
        cnt += nb_pixels

    mean = fst_moment
    # Clamp guards against a tiny negative variance from rounding.
    std = torch.sqrt(torch.clamp(snd_moment - fst_moment ** 2, min=0))
    return mean, std
    

In [None]:
# Per-channel statistics of the training loader, printed one tensor per line.
mean, std = batch_mean_std(train_data_loader)
print(mean, std, sep="\n")

tensor([0.5300, 0.4495, 0.3624])
tensor([0.1691, 0.1476, 0.1114])


In [None]:
# Per-channel statistics of the test loader, printed one tensor per line.
mean, std = batch_mean_std(test_data_loader)
print(mean, std, sep="\n")

tensor([0.5206, 0.4400, 0.3570])
tensor([0.1658, 0.1467, 0.1113])
