In [None]:
'''Importing Modules'''

'''
Prerequisites:
NumPy             https://numpy.org/doc/stable/
Matplotlib        https://matplotlib.org/stable/index.html
PyTorch           https://pytorch.org/docs/stable/index.html
Torchvision       https://pytorch.org/vision/stable/index.html
PIL               https://pillow.readthedocs.io/en/stable/
GitPython         https://gitpython.readthedocs.io/en/stable/
split-folders:    https://pypi.org/project/split-folders/
python-dotenv:    https://pypi.org/project/python-dotenv/
'''

# vanilla:
import os
import json
import time
from time import strptime
import datetime
from datetime import timedelta
import shutil
from collections import OrderedDict
import random
import sys
# external:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision
from torchvision import datasets, transforms, models
import PIL.Image as Image
import git
import splitfolders
from dotenv import load_dotenv
# custom:
from routines import *
from displays import *
import myTransforms

# Report that the import cell ran to completion. `println` is not defined in
# this cell; presumably it is provided by one of the star-imported custom
# modules above (TODO confirm which one). 'g' is the tag this notebook uses
# for green output.
println([('done.', 'g')])

In [None]:
'''Cloning the Remote Data-Repository'''


# Clones the remote data repository described by the .env file into the working
# directory (replacing any leftover clone) and derives the dataset paths.
# Required .env keys: GH_TOKEN, GH_USERNAME, REMOTE_REPO_NAME.
bad_repo_url = False
# Load the required .env file:
if not load_dotenv('./.env'):
    raise FileNotFoundError('cannot find the required .env file')
gh_token = os.getenv('GH_TOKEN')
gh_username = os.getenv('GH_USERNAME')
remote_repo_name = os.getenv('REMOTE_REPO_NAME')
# Fail early when a setting is absent: otherwise the literal text "None" would be
# interpolated into the URL (or passed to os.path.exists) and surface later as a
# misleading error.
missing_env_vars = [
    name
    for name, value in (('GH_TOKEN', gh_token),
                        ('GH_USERNAME', gh_username),
                        ('REMOTE_REPO_NAME', remote_repo_name))
    if not value
]
if missing_env_vars:
    raise SystemExit(f'missing required .env variable(s): {", ".join(missing_env_vars)}')
local_repo_name = remote_repo_name
remote_repo_url = f'https://{gh_token}@github.com/{gh_username}/{remote_repo_name}.git'
# Checks if a leftover repo already exists, if so will overwrite it:
if os.path.exists(local_repo_name):
    git.rmtree(local_repo_name)
# Clones the repo. The failure is only flagged here and reported after the handler,
# so the SystemExit below is not chained to the underlying git error (whose text
# may echo the clone command, token included).
try:
    git.Repo.clone_from(remote_repo_url, local_repo_name)
except Exception:
    bad_repo_url = True
if bad_repo_url:
    e_msg = 'bad remote repository URL'
    raise SystemExit(e_msg)
# os.path.join keeps the paths valid on every OS (and yields the same strings as
# the former '\\' concatenation on Windows).
dataset_path = os.path.join(local_repo_name, 'dataset')
classes_path = os.path.join(local_repo_name, 'classes.json')

println([('done.', 'g')])

In [None]:
'''Parsing the JSON File from the Data Repository'''


# Reads classes.json from the cloned repo. The file is expected to hold a
# two-element list: [ {class id: class name, ...}, {"images_per_class": N} ].
# Produces `classes`, `images_per_class` and `dir_names` (the expected class
# sub-directories of the dataset, named 01, 02, ...).
json_not_found = False
try:
    with open(classes_path, 'r') as f:
        json_file = json.load(f)
    classes = OrderedDict(json_file[0])
    images_per_class = json_file[1]['images_per_class']
    # Create a list of all subdirs of dataset dir. os.path.join uses the platform
    # separator, so the entries match what the OS reports when walking the
    # dataset directory (identical to the former '\\' concatenation on Windows):
    dir_names = [os.path.join(dataset_path, '%.2d' % i) for i in range(1, len(classes) + 1)]
    # Displays the JSON file metadata:
    println(['total classes', 'images per class'], header=True)
    println([len(classes), images_per_class])
except FileNotFoundError:
    # Reported below, outside the handler, so the exit is not chained to this error.
    json_not_found = True
if json_not_found:
    e_msg = (f'cannot locate the "classes.json" file in "{local_repo_name}".'
             + '\nre-run \'Data Repository Cloning\' cell and try again!')
    raise SystemExit(e_msg)

println([('done.', 'g')])

In [None]:
'''Validating the Dataset Directory'''


# Validates the cloned dataset against the JSON metadata before training:
#   1. the dataset dir must contain exactly as many entries as there are classes;
#   2. every expected class sub-directory must hold `images_per_class` files.
# Raises SystemExit with an explanatory message when either check fails.
println([('performing a valdiation of the repo according to the JSON file,\n\
before any further training can take place...', 'y')])

files_per_class = []
bad_dirs = []

# Validates the number of classes defined in the JSON equals to number of classes subdirs:
total_dirs = len(os.listdir(dataset_path))
json_ne_dirs = total_dirs != len(classes)

# Validates that the number of images in each class subdir equals to the one defined in the JSON.
# The expected directories are visited in class order (instead of in directory-walk
# order) so files_per_class[i] always belongs to the i-th class shown in the table
# below; a missing directory counts as 0 images and is reported as bad rather than
# silently shortening the list. Anything not listed in dir_names is ignored.
for class_dir in dir_names:
    class_dir_exists = os.path.isdir(class_dir)
    if class_dir_exists:
        with os.scandir(class_dir) as entries:
            # Count plain files only, nested directories are not images.
            images_in_dir = sum(1 for entry in entries if not entry.is_dir())
    else:
        images_in_dir = 0
    files_per_class.append(images_in_dir)
    if not class_dir_exists or images_in_dir != images_per_class:
        bad_dirs.append(class_dir)

# Raise exceptions if needed:
if json_ne_dirs:
    e_msg = (f'number of classes according to the JSON file ({len(classes)})'
             + f' does not correlate with total dirs ({total_dirs})'
             + f' in "{dataset_path}".'
             + '\nre-run \'Data Repository Cloning\' cell then re-run this cell.')
    raise SystemExit(e_msg)
elif bad_dirs:
    e_msg = f'image count in the following directories is incorrect: {bad_dirs}'
    raise SystemExit(e_msg)

# If the number of files found in a class subdir does not strictly equal
#  to the defined number (from the JSON file), the number will be highlighted
#  with red color; elsewise, in green.
# NOTE(review): assumes the JSON lists the classes in directory order (01, 02, ...) — confirm.
println(['id', 'parsed class', 'images found'], header=True)
for i, (ID, Class) in enumerate(classes.items()):
    println([ID,
            Class.upper() if Class in ['uk', 'usa'] else Class.capitalize(),
            (files_per_class[i], ('g' if files_per_class[i] == images_per_class else 'r'))])
total_images = sum(files_per_class)
println(['', '', 'total images'], header=True)
println(['', '', (total_images, ('g' if total_images == len(classes) * images_per_class else 'r'))])

println([('done.', 'g')])

In [None]:
'''Splitting the Dataset'''


# Copies the dataset into a fresh 'sets' directory that is split into train,
# validation and test image folders (80% / 10% / 10%, fixed seed).
println([('creating a new \'sets\' dir, with three subdirs of images: \
\'train\', \'valid\', \'test\'...', 'y')])

# Deleting a leftover 'sets' directory if such exists:
sets_path = 'sets'
if os.path.exists(sets_path):
    shutil.rmtree(sets_path)

# Randomly splitting the dataset into 'train', 'val', 'test' image directories.
# A failed split is reported right here instead of being swallowed: otherwise the
# first visible symptom would be a confusing error in the DataLoaders cell.
# (Catching Exception rather than everything also lets a kernel interrupt through.)
split_failed = False
try:
    splitfolders.ratio(
        dataset_path,
        output=sets_path,
        seed=1337,
        ratio=(.8, .1, .1),
        group_prefix=None,
        move=False)
except Exception as e:
    split_failed = True
    split_error = f'{type(e).__name__}: {e}'
if split_failed:
    raise SystemExit(f'cannot split "{dataset_path}" into "{sets_path}" ({split_error}).')

# Defining the paths for the subdirs (the validation subdir is named 'val'):
train_set_path = os.path.join(sets_path, 'train')
valid_set_path = os.path.join(sets_path, 'val')
test_set_path = os.path.join(sets_path, 'test')

println([('done.', 'g')])

In [None]:
'''Creating DataLoaders'''


# Builds one ImageFolder dataset and one DataLoader per split.
batch_size = 32
print(f'batch size is {batch_size}')

# One dataset per split, each paired with its own transform pipeline:
train_data, valid_data, test_data = (
    datasets.ImageFolder(split_root, transform=split_transform)
    for split_root, split_transform in (
        (train_set_path, myTransforms.train_transforms),
        (valid_set_path, myTransforms.valid_transforms),
        (test_set_path, myTransforms.test_transforms),
    )
)

# Only the training loader shuffles its samples:
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
)
# The evaluation loaders keep the default (sequential) order:
valid_loader, test_loader = (
    torch.utils.data.DataLoader(split_data, batch_size=batch_size)
    for split_data in (valid_data, test_data)
)

println([('done.', 'g')])

In [None]:
'''Instantiating a Model and a Classifier'''


# Creates a VGG16 network, freezes all of its parameters and replaces its
# classifier head with a small trainable one sized for this dataset.
pretrained = True
model_name = 'VGG16'
if pretrained:
    weights = 'DEFAULT'
else:
    weights = None
model = models.vgg16(weights=weights)
# Freeze every parameter of the base network; the new head assigned below is
# created afterwards, so it is not affected by this loop.
for param in model.parameters():
    param.requires_grad = False

# Head layout: fc1 -> drop -> relu -> fc2 -> output (log-probabilities).
dropout_probability = .5
in_features = 25088
out_features = 1024
od = OrderedDict()
od['fc1'] = nn.Linear(in_features, out_features)
od['drop'] = nn.Dropout(p=dropout_probability)
od['relu'] = nn.ReLU()
od['fc2'] = nn.Linear(out_features, len(classes))
od['output'] = nn.LogSoftmax(dim=1)
classifier = nn.Sequential(od)
model.classifier = classifier

# Summary of what was built:
println(['Model'], header=True)
if pretrained:
    println([f'{model_name}, Pretrained '])
else:
    println([f'{model_name}, Not Pretrained'])
println(['Classifier'], header=True)
print([str(layer) for layer in od])

println([('done.', 'g')])

In [None]:
'''Loading a Model Checkpoint'''

# Announce what this cell is about to do. 'y' is presumably a colour tag
# understood by println (TODO confirm).
println([('looking for \'.pth\' files in root, will load the latest one; \
if none were found it is still OK...', 'y')])

# Template for loading a checkpoint:
def load_checkpoint(file_path):
    """Rebuild a model from a checkpoint file written by the saving cell.

    Args:
        file_path: path of a '.pth' file holding a dict with the keys
            'network', 'classifier', 'epochs', 'optimizer', 'state_dict'
            and 'class_to_idx'.

    Returns:
        The torchvision model named by checkpoint['network'], carrying the
        saved classifier and weights, with `epochs`, `optimizer` (the saved
        optimizer state dict) and `class_to_idx` attached as attributes.
        The model is returned on the CPU.
    """
    # map_location='cpu' lets a checkpoint that was saved on a GPU be loaded on
    # a machine without one; the caller moves the model to its device afterwards.
    # NOTE(review): the file contains a pickled classifier module, so only load
    # checkpoints you trust. On PyTorch releases where torch.load defaults to
    # weights_only=True this call presumably needs weights_only=False — confirm
    # against the installed version.
    checkpoint = torch.load(file_path, map_location='cpu')
    # No pretrained weights are requested: every parameter is overwritten by
    # load_state_dict below, so fetching them would be wasted work.
    model = getattr(torchvision.models, checkpoint['network'])(weights=None)
    # Freeze the base network, as the model-instantiation cell does, so a
    # restored model keeps training only its classifier.
    for param in model.parameters():
        param.requires_grad = False
    model.classifier = checkpoint['classifier']
    model.epochs = checkpoint['epochs']
    model.optimizer = checkpoint['optimizer']
    model.load_state_dict(checkpoint['state_dict'])
    model.class_to_idx = checkpoint['class_to_idx']

    return model

# Collect the checkpoints in the working directory. Their file names are creation
# timestamps (see timestamp_format), which is what decides which one is the latest.
timestamp_format = '%H%M%S_%d%m%y'
checkpoints = []
skipped_checkpoints = []
for entry_name in os.listdir():
    if not entry_name.endswith('.pth'):
        continue
    filename, ext = os.path.splitext(entry_name)
    try:
        created_at = time.strptime(filename, timestamp_format)
    except ValueError:
        # A '.pth' file that is not named after a timestamp cannot be ordered;
        # skip it instead of aborting the whole cell.
        skipped_checkpoints.append(entry_name)
        continue
    checkpoints.append((created_at, entry_name))

if skipped_checkpoints:
    print(f'ignoring \'.pth\' files without a timestamp name: {skipped_checkpoints}')

if checkpoints:
    # Get the latest checkpoint and load it onto the model instance
    latest_checkpoint = max(checkpoints, key=lambda x: x[0])[1]
    model = load_checkpoint(latest_checkpoint)
    print(f'checkpoint loaded from: {latest_checkpoint}')
else:
    print('no \'.pth\' files were found.')
println([('done.', 'g')])

In [None]:
'''Defining the Training Hyperparameters'''


# Hyperparameters:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 10
learning_rate = .001
# Negative log-likelihood loss; the classifier's last layer emits log-probabilities.
criterion = nn.NLLLoss()
# Only train the CLASSIFIER parameters, FEATURE parameters are frozen!
optimizer = optim.Adam(model.classifier.parameters(), lr = learning_rate)
# Casting the model instance to the available hardware:
model.to(device)
# Hyperparameters names for displaying:
device_name = ('GPU' if device.type == 'cuda' else 'CPU')
criterion_name = 'Negative Log Loss'
optimizer_name = 'Adam'
# Displaying. The pretrained flag is the `pretrained` variable set in the
# model-instantiation cell (the former `model_is_pretrained` name is not
# defined anywhere in this notebook).
println(['model', 'pretrained', 'device', 'epochs'], header=True)
println([model_name, ('yes' if pretrained else 'no'),
        (device_name, ('g' if device_name == 'GPU' else 'r')), epochs])
println(['learning rate', 'loss function', 'optimizer'], header=True)
println([learning_rate, criterion_name, optimizer_name])

println([('done.', 'g')])

In [None]:
'''Model Training, Validation, and Testing'''


# Runs the full training schedule: each epoch trains on the train set and then
# evaluates on the validation set; afterwards the model is evaluated once on the
# test set. `train`, `test`, `collect`, `displayTrain` and `displayTest` are not
# defined in this notebook (presumably they come from the custom modules).
println([(f'training on {device_name} started...', 'y')])


# Training and validating part:
train_metadata = []
start_training_time = time.time()
println(['epoch', 'time', 'train loss', 'valid loss', 'accuracy'], header=True)
for idx in range(epochs):
    # Keep the model object up-to-date (because we send it to another function):
    hyperparams = (model, optimizer, device, criterion)
    # Epoch metadata:
    start_time = time.time()
    train_loss = 0
    valid_loss = 0
    accuracy = 0
    # Switching model mode to TRAINING:
    model.train()
    # Training the model using the entire train image set:
    for inputs, labels in train_loader:
        train_loss += train(hyperparams, inputs, labels)
    # Switching model mode to EVALUATION. This must happen after the whole training
    # loop: doing it inside the loop would put the model in evaluation mode (dropout
    # disabled) for every training batch after the first one.
    model.eval()
    # Validating the model using the entire valid image set:
    with torch.no_grad():
        for inputs, labels in valid_loader:
            loss, acc = test(hyperparams, inputs, labels)
            valid_loss += loss
            accuracy += acc
    end_time = time.time()
    aggregated_metadata = (idx, start_time, end_time, train_loss, valid_loss, accuracy, (train_loader, valid_loader))
    # Collect this epoch's metadata and add it to the list:
    collect(train_metadata, aggregated_metadata)
    # Display this epoch's metadata:
    displayTrain(train_metadata, idx)
end_training_time = time.time()
total_training_time = (end_training_time - start_training_time)
# Displaying the collected training metadata:
println([('training finished, results:', 'y')])
displayTrain(train_metadata)


# Testing part:
println([('testing the trained model:', 'y')])
test_loss = 0
accuracy = 0
# Testing loop (no gradients are needed, same as in the validation loop above):
model.eval()
hyperparams = (model, optimizer, device, criterion)
with torch.no_grad():
    for inputs, labels in test_loader:
        loss, acc = test(hyperparams, inputs, labels)
        test_loss += loss
        accuracy += acc
displayTest(test_loss, accuracy, test_loader)

println([('done.', 'g')])

In [None]:
'''Saving a Model Checkpoint'''


# Saves the current model as '<timestamp>.pth' in the working directory.
println([('saving a new \'.pth\' file with a timestamp of current time...', 'y')])

# Must stay identical to the format used by the checkpoint-loading cell, which
# parses these file names to find the latest checkpoint.
timestamp_format = '%H%M%S_%d%m%y'
timestamp = datetime.datetime.now().strftime(timestamp_format)
checkpoint_name = f'{timestamp}.pth'
model.class_to_idx = train_data.class_to_idx
checkpoint = {'network': 'vgg16',
              'input_size': in_features,
              'output_size': len(classes),
              'learning_rate': learning_rate,
              'batch_size': batch_size,
              # Save the head that is actually attached to the model: after a
              # checkpoint was loaded, the notebook-level `classifier` variable
              # still refers to the discarded, freshly built head.
              'classifier': model.classifier,
              'epochs': epochs,
              'optimizer': optimizer.state_dict(),
              'state_dict': model.state_dict(),
              'class_to_idx': model.class_to_idx}
torch.save(checkpoint, checkpoint_name)
# abspath resolves the name against the current working directory using the
# platform's own separator.
checkpoint_path = os.path.abspath(checkpoint_name)
print(f'checkpoint saved to \"{checkpoint_path}\"')

println([('done.', 'g')])

In [None]:
# def process_image(pil_image):
#     ''' Scales, crops, and normalizes a PIL image for a PyTorch model,
#         returns a NumPy array
#     '''
    
#     img_loader = transforms.Compose([transforms.Resize(size_resize),
#                                      transforms.CenterCrop(size_crop), 
#                                      transforms.ToTensor()])
    
#     #pil_image = Image.open(image)
#     pil_image = img_loader(pil_image).float()
    
#     np_image = np.array(pil_image)    
    
#     mean = np.array(normalize_mean)
#     std = np.array(normalize_std)
#     np_image = (np.transpose(np_image, (1, 2, 0)) - mean) / std    
#     np_image = np.transpose(np_image, (2, 0, 1))
            
#     return np_image

# def imshow(np_image, ax = None, title = None):
#     if ax is None:
#         fig, ax = plt.subplots()
    
#     # PyTorch tensors assume the color channel is the first dimension,
#     # but matplotlib assumes it is the third dimension
#     np_image = np.transpose(np_image, (1, 2, 0))
    
#     # Undo preprocessing
#     mean = np.array(normalize_mean)
#     std = np.array(normalize_std)
#     np_image = std * np_image + mean
    
#     # Image needs to be clipped between 0 and 1 or it looks like noise when displayed
#     np_image = np.clip(np_image, 0, 1)
    
#     ax.imshow(np_image)
    
#     return ax


# images_paths = result = [os.path.join(dp, f) for dp, dn, filenames in os.walk(sets_path) for f in filenames if os.path.splitext(f)[1] == '.jpg']
# random.seed()
# random_image_path = random.choice(images_paths)
# random_image = Image.open(random_image_path)
# imshow(process_image(random_image))

In [None]:
# # Predict the class from an image file:

# def predict(pil_image, model, top_k_probabilities = 5):
#     ''' Predict the class (or classes) of an image using a trained deep learning model.
#     '''
    
#     # Use GPU if it's available
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     #print(device)

#     model.to(device)
#     model.eval()
    
#     np_image = process_image(pil_image)
#     tensor_image = torch.from_numpy(np_image)
    
#     inputs = Variable(tensor_image)
    
#     if torch.cuda.is_available():
#         inputs = Variable(tensor_image.float().cuda())           
        
#     inputs = inputs.unsqueeze(dim = 0)
#     log_probabilities = model.forward(inputs)
#     probabilities = torch.exp(log_probabilities)    

#     top_probabilities, top_classes = probabilities.topk(top_k_probabilities, dim = 1)
#     #print(top_probabilities)
#     #print(top_classes)
    
#     class_to_idx_inverted = {model.class_to_idx[c]: c for c in model.class_to_idx}
#     top_mapped_classes = list()
    
#     for label in top_classes.cpu().detach().numpy()[0]:
#         top_mapped_classes.append(class_to_idx_inverted[label])
    
#     return top_probabilities.cpu().detach().numpy()[0], top_mapped_classes

# # with open(classes_json, 'r') as f:
# #     category_label_to_name = json.load(f)

# top_probabilities, top_classes = predict(random_image, model, top_k_probabilities = 5)

# for c in top_classes:
#     if int(c) < 10: # overcome the '0' padding in the filename
#         c = str(int(c))
#     print(classes[c])

In [None]:
# # Display an image along with the top 5 classes

# max_index = np.argmax(top_probabilities)
# max_probability = top_probabilities[max_index]
# label = top_classes[max_index]

# if int(label) < 10: # overcome the '0' padding in the filename
#     label = str(int(label))

# fig = plt.figure(figsize=(6,6))
# ax1 = plt.subplot2grid((15,9), (0,0), colspan = 9, rowspan = 9)
# ax2 = plt.subplot2grid((15,9), (9,2), colspan = 5, rowspan = 5)

# ax1.axis('off')
# ax1.set_title(classes[label])
# ax1.imshow(random_image)

# labels = []
# for c in top_classes:
#     if int(c) < 10: # overcome the '0' padding in the filename
#         c = str(int(c))
#     labels.append(classes[c])

# y_pos = np.arange(5)
# ax2.set_yticks(y_pos)
# ax2.set_yticklabels(labels)
# ax2.set_xlabel('Probability')
# ax2.invert_yaxis()
# ax2.barh(y_pos, top_probabilities, xerr = 0, align = 'center', color = 'blue')

# plt.show()

In [None]:
# # Sanity check - display an image along with the top 5 classes

# test_case = 10

# for i in range(test_case):
#     random.seed()
#     random_class_path = random.choice(os.listdir(test_set_path))
#     random.seed()
#     random_image_path = test_set_path + '\\' + random_class_path + '\\' + random.choice(os.listdir(test_set_path + '\\' + random_class_path))

#     pil_image = Image.open(random_image_path)
#     plt.imshow(pil_image)

#     top_probabilities, top_classes = predict(pil_image, model, top_k_probabilities = 5)
#     max_index = np.argmax(top_probabilities)
#     max_probability = top_probabilities[max_index]
#     label = top_classes[max_index]

#     fig = plt.figure(figsize=(6,6))
#     ax1 = plt.subplot2grid((15,9), (0,0), colspan = 9, rowspan = 9)
#     ax2 = plt.subplot2grid((15,9), (9,2), colspan = 5, rowspan = 5)

#     ax1.axis('off')
#     ax1.set_title(classes[flower_class]) #Real class
#     ax1.imshow(pil_image)

#     labels = []
#     for c in top_classes:
#         if int(c) < 10: # overcome the '0' padding in the filename
#             c = str(int(c))
#         labels.append(classes[c])

#     y_pos = np.arange(5)
#     ax2.set_yticks(y_pos)
#     ax2.set_yticklabels(labels)
#     ax2.set_xlabel('Probability')
#     ax2.invert_yaxis()
#     ax2.barh(y_pos, top_probabilities, xerr = 0, align = 'center', color = 'blue')

#     plt.show()