# Pytorch - Densnet201 - 16 classes - Training

In [1]:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

In [2]:
from torchvision import datasets, models, transforms



import matplotlib as mpl
#  ######################################################
#  #### Matplotlib X display error - removing for server#
#  ######################################################
mpl.use('Agg')  # This has to run before pyplot import

import matplotlib.pyplot as plt
import time
import datetime
import os
import copy
import sys
import pandas as pd

import ipywidgets as widgets # for ipython widgets

In [3]:
plt.ion()   # interactive mode

# Training 

In [4]:
# Take date and time for saving points
now = datetime.datetime.now()
date_and_time = now.strftime("%Y%m%d%H%M")

## Initial varaibles to change for different running

In [23]:
main_data_dir = "../../../data/data_generated_medicotask_70_30_v2"  # Main data directory to be handled
model_name = date_and_time + "_13_3_densenet201_70_30"
checkpoint_name_format = date_and_time + "_13_3_weights-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"

## Number of epochs and batchsize (to be changed)

In [9]:
number_of_epochs = 1

batch_size = 25

## Directories to save output data

In [11]:
data_dir = main_data_dir
model_dir = data_dir + '/pytorch_models'
plot_dir  = data_dir + '/pytorch_plots'
history_dir = data_dir + '/pytorch_history'

In [28]:
acc_loss_plot_name = 'acc_loss_plot_' + model_name
accuracy_plot_name = 'accuracy_plot_' + model_name
loss_plot_name = 'loss_plot_' + model_name

In [12]:
########################################################################
#  Managin Directory structure
########################################################################
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

if not os.path.exists(plot_dir):
    os.mkdir(plot_dir)

if not os.path.exists(history_dir):
    os.mkdir(history_dir)


## Preparing Data - Training and Validation

In [13]:
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(229),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(90),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
    'validation': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(229),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
}

In [14]:
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'validation']}

dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
                                              shuffle=True, num_workers=1)
               for x in ['train', 'validation']}


dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'validation']}

## Selecting the computing device

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## The Main method to train a model

In [16]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    history_tensor = torch.empty((num_epochs, 4), device=device)  # 4- trai_acc, train_loss, val_acc, val_loss
    checkpoint_name = None

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)



        # Each epoch has a training and validation phase
        for phase in ['train', 'validation']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            indicator = 0  # just for print batch processing status (no of batches)

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:



                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                  #  print("outputs=", outputs) # only for testing - vajira
                  #  print("labels = ", labels) # only for testing - vajira
                    print(indicator, sep='-', end='=', flush=True)
                    indicator += 1

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Collecting data for making plots
            if phase == 'train':
                history_tensor[epoch, 0] = epoch_acc
                history_tensor[epoch, 1] = epoch_loss
            if phase == 'validation':
                history_tensor[epoch, 2] = epoch_acc
                history_tensor[epoch, 3] = epoch_loss

            # deep copy the model
            if phase == 'validation' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                checkpoint_name = checkpoint_name_format.format(epoch=epoch, val_acc=best_acc)
                print("Found a best model:", checkpoint_name)
            elif phase== 'validation':
                print("No improvement from the previous best model ")

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, history_tensor, checkpoint_name


## A methods to plot and save plots

In [17]:
###########################################################
#  Ploting history and save plots to plots directory
###########################################################
def plot_and_save_training_history(history_tensor):
    history_data = history_tensor.cpu().numpy()
    df = pd.DataFrame(history_data, columns=['train_acc', 'train_loss', 'val_acc', 'val_loss'])
    pie = df.plot()
    fig = pie.get_figure()
    fig.savefig(os.path.join(plot_dir, "_training_" + acc_loss_plot_name))

## Loading a pretrained mode and modifying the last layers

In [19]:
model_ft = models.densenet201(pretrained=False) # without pretrained weights
num_ftrs = model_ft.classifier.in_features
model_ft.fc = nn.Linear(num_ftrs, 16)

  nn.init.kaiming_normal(m.weight.data)


## Setting parameters

In [20]:
#  # Setting model parameters
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.1, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=10, gamma=0.1)

## If multiple GPUs are there, then use all of them

In [21]:
## #######################################################
# If multiple GPUS are there, run on multiple GPUS
##########################################################
#  Setting model in multiple GPUs
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model_ft = nn.DataParallel(model_ft)
elif torch.cuda.device_count() == 1:
    print("Found only one GPU")
else:
    print("No GPU.. Runing on CPU")

Found only one GPU


## Loading the model to the GPUs and run it to train

In [25]:
##############################################################
#  Loading model to GPUs and setting parameters
##############################################################
model_ft = model_ft.to(device)


#############################################################
### start Training
############################################################

model_ft, history_tensor, check_point_name = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=number_of_epochs)


Epoch 0/0
----------
0=1=2=3=4=5=6=7=8=9=10=11=12=13=14=15=16=17=18=19=20=21=22=23=24=25=26=27=28=29=30=31=32=33=34=35=36=37=38=39=40=41=42=43=44=45=46=47=48=49=50=51=52=53=54=55=56=57=58=59=60=61=62=63=64=65=66=67=68=69=70=71=72=73=74=75=76=77=78=79=80=81=82=83=84=85=86=87=88=89=90=91=92=93=94=95=96=97=98=99=100=101=102=103=104=105=106=107=108=109=110=111=112=113=114=115=116=117=118=119=120=121=122=123=124=125=126=127=128=129=130=131=132=133=134=135=136=137=138=139=140=141=142=143=144=145=146=147=train Loss: 1.6422 Acc: 0.4011
0=1=2=3=4=5=6=7=8=9=10=11=12=13=14=15=16=17=18=19=20=21=22=23=24=25=26=27=28=29=30=31=32=33=34=35=36=37=38=39=40=41=42=43=44=45=46=47=48=49=50=51=52=53=54=55=56=57=58=59=60=61=62=63=validation Loss: 1.4097 Acc: 0.4317
Found a best model: 201808201525_13_3_weights-improvement-00-0.4317.hdf5

Training complete in 2m 2s
Best val Acc: 0.431704


## Save the best model to the model directory

In [26]:
############################################################
### Save the model to the directory
############################################################

if not os.path.exists(model_dir):
    os.mkdir(model_dir)  # to save plots

if not check_point_name==None:
    print(check_point_name)
    torch.save(model_ft.state_dict(), os.path.join(model_dir, check_point_name))
    print("Model saved")

201808201525_13_3_weights-improvement-00-0.4317.hdf5
Model saved


## Plot and save training history 

In [29]:
plot_and_save_training_history(history_tensor)

print("Plots saved to", plot_dir)

Plots saved to ../../../data/data_generated_medicotask_70_30_v2/pytorch_plots


# Re-Training

## Loading the model to retrain

In [31]:
# Take date and time for saving points
now = datetime.datetime.now()
date_and_time = now.strftime("%Y%m%d%H%M")

In [None]:
best_weight_file_name = input('Please, enter the best weights value file name:')

In [None]:
model_ft.load_state_dict(torch.load(os.path.join(model_dir, best_weight_file_name)))
print('Model loaded')