# Import Libraries

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
import pickle
from PIL import Image
from torch.utils import data
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import os
import numpy as np

# Path and Parameter Settings
- Path: Datasets and Models
- Parameter: CNN, LSTM, Training

In [None]:
# Dataset and checkpoint locations (relative to the working directory).
data_path = "./data/task04/"        # dataset root; its entries are enumerated with os.listdir further down
save_model_path = "./crnn_model/"   # directory that validation() writes the per-epoch .pth files into

In [None]:
# CNN encoder hyperparameters
CNN_fc_hidden1 = 1024   # width of the first fully connected layer
CNN_fc_hidden2 = 768    # width of the second fully connected layer
cnn_embed_dim = 512     # size of the per-frame embedding
img_x = 256             # frame size after resizing (first entry given to transforms.Resize)
img_y = 342             # frame size after resizing (second entry given to transforms.Resize)
dropout_p = 0.0         # dropout probability

In [None]:
# lstm architecture
# NOTE(review): presumably meant for the LSTM constructor's h_RNN_layers, h_RNN and
# h_FC_dim arguments -- confirm that the model-creation cell actually passes them.
RNN_hidden_layers = 3    # number of stacked recurrent layers
RNN_hidden_nodes = 512   # hidden units per recurrent layer
RNN_FC_dim = 256         # width of the fully connected layer that follows the LSTM

In [None]:
# training parameters
k = 4                   # number of target categories (task_labels holds 4 distinct action names)
epochs = 120            # number of iterations of the epoch loop
batch_size = 1          # clips per batch -- NOTE(review): confirm the DataLoader params really use this value
learning_rate = 1e-4    # learning rate handed to the Adam optimizer
log_interval = 10       # print training info every `log_interval` batches

In [None]:
# Frame window read from every clip; the values feed np.arange(b_frame, e_frame, s_frame),
# so e_frame itself is excluded.
b_frame = 1    # first frame index
e_frame = 29   # stop index (exclusive)
s_frame = 1    # step between consecutive frame indices

# Model Definition
- CNN model: A CNN encoder compresses every 2D image (video frame) into a 1D feature vector
- LSTM model: An RNN receives the sequence of feature vectors from the CNN encoder and outputs a 1D vector of class scores

In [None]:
class CNN(nn.Module):
    """Frame-wise convolutional encoder.

    Every time step of the input clip goes through four strided
    Conv2d -> BatchNorm2d -> ReLU stages followed by three fully connected
    layers, producing one embedding of size ``CNN_embed_dim`` per frame.
    """

    def __init__(self, img_x=90, img_y=120, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        """Create the layers.

        Args:
            img_x, img_y: spatial size of the input frames; together with the
                conv settings it fixes the input width of ``fc1``.
            fc_hidden1, fc_hidden2: widths of the first two fully connected layers.
            drop_p: dropout probability used after the second fully connected layer.
            CNN_embed_dim: size of the embedding produced for each frame.
        """
        super().__init__()
        self.img_x, self.img_y = img_x, img_y
        self.CNN_embed_dim = CNN_embed_dim
        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        # Settings of the four conv stages: channels, kernel sizes, strides, paddings.
        self.ch1, self.ch2, self.ch3, self.ch4 = 32, 64, 128, 256
        self.k1, self.k2, self.k3, self.k4 = (5, 5), (3, 3), (3, 3), (3, 3)
        self.s1, self.s2, self.s3, self.s4 = (2, 2), (2, 2), (2, 2), (2, 2)
        self.pd1, self.pd2, self.pd3, self.pd4 = (0, 0), (0, 0), (0, 0), (0, 0)

        # Feature-map size after each stage; the last one determines fc1's input width.
        input_shape = (self.img_x, self.img_y)
        self.conv1_outshape = conv2D_output_size(input_shape, self.pd1, self.k1, self.s1)
        self.conv2_outshape = conv2D_output_size(self.conv1_outshape, self.pd2, self.k2, self.s2)
        self.conv3_outshape = conv2D_output_size(self.conv2_outshape, self.pd3, self.k3, self.s3)
        self.conv4_outshape = conv2D_output_size(self.conv3_outshape, self.pd4, self.k4, self.s4)

        # Stages are created in order 1..4 so parameter initialisation order is stable.
        self.conv1 = self._conv_stage(3, self.ch1, self.k1, self.s1, self.pd1)
        self.conv2 = self._conv_stage(self.ch1, self.ch2, self.k2, self.s2, self.pd2)
        self.conv3 = self._conv_stage(self.ch2, self.ch3, self.k3, self.s3, self.pd3)
        self.conv4 = self._conv_stage(self.ch3, self.ch4, self.k4, self.s4, self.pd4)

        # Defined but not applied in forward().
        self.drop = nn.Dropout2d(self.drop_p)
        self.pool = nn.MaxPool2d(2)

        flat_features = self.ch4 * self.conv4_outshape[0] * self.conv4_outshape[1]
        self.fc1 = nn.Linear(flat_features, self.fc_hidden1)
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.CNN_embed_dim)  # per-frame embedding

    @staticmethod
    def _conv_stage(in_ch, out_ch, kernel, stride, pad):
        """Return one Conv2d -> BatchNorm2d -> ReLU stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=kernel, stride=stride, padding=pad),
            nn.BatchNorm2d(out_ch, momentum=0.01),
            nn.ReLU(inplace=True),
        )

    def _embed_frame(self, frame):
        """Encode one batch of frames into embeddings of size ``CNN_embed_dim``."""
        feat = self.conv4(self.conv3(self.conv2(self.conv1(frame))))
        feat = feat.view(feat.size(0), -1)  # flatten the conv feature maps

        feat = F.relu(self.fc1(feat))
        feat = F.relu(self.fc2(feat))
        feat = F.dropout(feat, p=self.drop_p, training=self.training)
        return self.fc3(feat)

    def forward(self, x_3d):
        """Encode a clip frame by frame.

        Args:
            x_3d: 5-D tensor indexed as (batch, time_step, channel, height, width);
                the first conv layer expects 3 channels.

        Returns:
            Tensor of shape (batch, time_step, CNN_embed_dim).
        """
        per_step = [self._embed_frame(x_3d[:, t, :, :, :]) for t in range(x_3d.size(1))]

        # Stacked as (time, batch, embed); swap the first two dims to get batch first.
        return torch.stack(per_step, dim=0).transpose_(0, 1)

In [None]:
class LSTM(nn.Module):
    """Recurrent decoder: LSTM over the frame embeddings plus a two-layer classifier head."""

    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=50):
        """Create the layers.

        Args:
            CNN_embed_dim: size of each input vector in the sequence.
            h_RNN_layers: number of stacked LSTM layers.
            h_RNN: hidden size of the LSTM.
            h_FC_dim: width of the fully connected layer after the LSTM.
            drop_p: dropout probability applied before the output layer.
            num_classes: number of output scores.
        """
        super().__init__()
        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers
        self.h_RNN = h_RNN
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        # batch_first=True: input and output are laid out as (batch, time_step, features).
        self.LSTM = nn.LSTM(
            input_size=CNN_embed_dim,
            hidden_size=h_RNN,
            num_layers=h_RNN_layers,
            batch_first=True,
        )
        self.fc1 = nn.Linear(h_RNN, h_FC_dim)
        self.fc2 = nn.Linear(h_FC_dim, num_classes)

    def forward(self, x_RNN):
        """Return class scores of shape (batch, num_classes) for a batch of sequences.

        Args:
            x_RNN: tensor of shape (batch, time_step, CNN_embed_dim).
        """
        self.LSTM.flatten_parameters()

        # Passing None starts from a zero hidden/cell state. The final states
        # (each of shape (n_layers, batch, hidden_size)) are not needed here.
        seq_out, _ = self.LSTM(x_RNN, None)

        # Classify from the output at the last time step only.
        last_step = seq_out[:, -1, :]
        hidden = F.relu(self.fc1(last_step))
        hidden = F.dropout(hidden, p=self.drop_p, training=self.training)
        return self.fc2(hidden)

In [None]:
def conv2D_output_size(img_size, padding, kernel_size, stride):
    """Return the (height, width) a 2D convolution produces for an input of ``img_size``.

    All four arguments are 2-element sequences (one entry per spatial axis).
    Each axis is computed as floor((size + 2 * pad - (kernel - 1) - 1) / stride + 1).
    """
    def _axis_extent(axis):
        # Same expression for both axes; only the index differs.
        span = img_size[axis] + 2 * padding[axis] - (kernel_size[axis] - 1) - 1
        return np.floor(span / stride[axis] + 1).astype(int)

    return _axis_extent(0), _axis_extent(1)

# Dataset Loading
- Load the selected frame in the dataset and transform it into a tensor

In [None]:
class Dataset_CRNN(data.Dataset):
    """PyTorch dataset that yields one clip (a stack of frames) and its label per sample.

    Frames are read from ``<data_path>/<folder>/frame<i>.png`` for every index
    ``i`` in ``frames``.
    """

    def __init__(self, data_path, folders, labels, frames, transform=None):
        """Store the dataset description.

        Args:
            data_path: root directory containing one sub-directory per clip.
            folders: clip sub-directory names; one sample per entry.
            labels: integer labels, positionally aligned with ``folders``.
            frames: frame indices to load from every clip.
            transform: callable applied to each PIL image; it must return a
                tensor, because the frames are combined with ``torch.stack``.
        """
        self.data_path = data_path
        self.labels = labels
        self.folders = folders
        self.transform = transform
        self.frames = frames

    def __len__(self):
        """Return the number of clips."""
        return len(self.folders)

    def read_images(self, path, selected_folder, use_transform):
        """Load the selected frames of one clip and stack them along a new first dim."""
        clip = []
        for i in self.frames:
            frame_file = os.path.join(path, selected_folder, 'frame{:d}.png'.format(i))

            # Close the file handle deterministically and force 3 channels:
            # PNGs may be RGBA, palette or greyscale, while the 3-value
            # normalisation and the encoder's first conv layer expect RGB.
            with Image.open(frame_file) as raw:
                image = raw.convert('RGB')

            if use_transform is not None:
                image = use_transform(image)

            clip.append(image)

        return torch.stack(clip, dim=0)

    def __getitem__(self, index):
        """Return ``(x, y)``: the stacked frames of clip ``index`` and its label as a LongTensor of shape (1,)."""
        folder = self.folders[index]

        x = self.read_images(self.data_path, folder, self.transform)  # (input) spatial images
        y = torch.LongTensor([self.labels[index]])                    # int64 class index

        return x, y

# Function Definition
- Train: Put the models in training mode, update the parameters batch by batch, and display progress information
- Validation: Put the models in evaluation mode, compute the loss and accuracy on the validation set, and save the model checkpoints

In [None]:
def train(log_intreval, model, device, train_loader, optimizer, epoch):
    """Train the encoder/decoder pair for one epoch.

    Args:
        log_intreval: print progress every this many batches (the parameter
            name's spelling is kept for backward compatibility).
        model: pair ``(cnn_encoder, rnn_decoder)``; the decoder consumes the
            encoder's output.
        device: device the batches are moved to.
        train_loader: iterable of ``(x, y)`` batches exposing ``.dataset``.
        optimizer: optimizer that updates both models' parameters.
        epoch: zero-based epoch index, used for logging only.

    Returns:
        ``(losses, scores)``: the loss value and the accuracy of every batch.
    """
    # Set model as training mode
    cnn_encoder, rnn_decoder = model
    cnn_encoder.train()
    rnn_decoder.train()

    losses = []
    scores = []
    N_count = 0  # number of training samples seen so far in this epoch

    for batch_idx, (x, y) in enumerate(train_loader):
        # distribute data to device; labels are flattened to shape (batch,)
        x, y = x.to(device), y.to(device).view(-1)

        N_count += x.size(0)

        optimizer.zero_grad()
        output = rnn_decoder(cnn_encoder(x))  # output dim = (batch_size, number of classes)

        loss = F.cross_entropy(output, y)
        losses.append(loss.item())

        # Batch accuracy, computed with tensor ops so that a batch of one
        # sample works too (squeezing it would give a 0-d array).
        y_pred = torch.max(output, 1)[1]
        step_score = (y_pred == y).sum().item() / y.numel()
        scores.append(step_score)

        loss.backward()
        optimizer.step()

        # show information
        if (batch_idx + 1) % log_intreval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accu: {:.2f}%'.format(
                epoch + 1, N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item(), 100 * step_score))

    # Return only after every batch has been processed.
    return losses, scores

In [None]:
def validation(model, device, optimizer, test_loader, epoch_index=None, model_dir=None):
    """Evaluate the encoder/decoder pair and save a checkpoint.

    Args:
        model: pair ``(cnn_encoder, rnn_decoder)``.
        device: device the batches are moved to.
        optimizer: optimizer whose state is saved next to the model weights.
        test_loader: iterable of ``(x, y)`` batches exposing ``.dataset``.
        epoch_index: zero-based epoch number used in the checkpoint file
            names; when omitted, the module-level ``epoch`` variable is used.
        model_dir: directory the checkpoints are written to; when omitted,
            the module-level ``save_model_path`` is used.

    Returns:
        ``(test_loss, test_score)``: mean loss per sample and accuracy.
    """
    # Fall back to the notebook-level globals when the caller does not pass these.
    if epoch_index is None:
        epoch_index = epoch
    if model_dir is None:
        model_dir = save_model_path

    # set model as testing mode
    cnn_encoder, rnn_decoder = model
    cnn_encoder.eval()
    rnn_decoder.eval()

    test_loss = 0
    all_y = []
    all_y_pred = []
    with torch.no_grad():
        for x, y in test_loader:
            # distribute data to device; labels are flattened to shape (batch,)
            x, y = x.to(device), y.to(device).view(-1)

            output = rnn_decoder(cnn_encoder(x))

            # sum up the batch loss; it is averaged over the dataset below
            test_loss += F.cross_entropy(output, y, reduction='sum').item()

            # collect labels and predicted class indices of every batch
            all_y.append(y.cpu())
            all_y_pred.append(output.max(1)[1].cpu())

    test_loss /= len(test_loader.dataset)

    # Accuracy over all samples; tensors stay 1-D, so one sample works too.
    all_y = torch.cat(all_y, dim=0)
    all_y_pred = torch.cat(all_y_pred, dim=0)
    test_score = (all_y_pred == all_y).sum().item() / all_y.numel()

    # show information
    print('\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(len(all_y), test_loss, 100 * test_score))

    # Save the models and optimizer state of this epoch; torch.save does not
    # create missing directories.
    os.makedirs(model_dir, exist_ok=True)
    epoch_tag = epoch_index + 1
    torch.save(cnn_encoder.state_dict(), os.path.join(model_dir, 'cnn_epoch{}.pth'.format(epoch_tag)))
    torch.save(rnn_decoder.state_dict(), os.path.join(model_dir, 'lstm_epoch{}.pth'.format(epoch_tag)))
    torch.save(optimizer.state_dict(), os.path.join(model_dir, 'optimizer_epoch{}.pth'.format(epoch_tag)))
    print("Epoch {} model saved!".format(epoch_tag))

    return test_loss, test_score

# CUDA and Data Processing
- CUDA: Check whether a GPU is available and parallelize the model across multiple GPUs
- Data Processing: Encode the task categories as labels and collect the dataset folder names in a list

In [None]:
# Detect devices
use_cuda = torch.cuda.is_available()                 # check if GPU exists
device = torch.device("cuda" if use_cuda else "cpu") # use CPU or GPU

# Data loading parameters.
# Use the configured batch size and shuffle on every device; previously the CPU
# path got an empty dict (no shuffling) and batch_size was hard-coded to 1.
# Pinned memory only helps host-to-GPU copies, so it follows use_cuda.
params = {'batch_size': batch_size, 'shuffle': True, 'pin_memory': use_cuda}

In [None]:
# load Task actions names (one entry per clip folder)
task_labels = ['JumpForward', 'JumpForward', 'Run', 'Run', 'TurnLeft', 'TurnLeft', 'TurnRight', 'TurnRight']
# task_labels = ['JumpForward', 'Run', 'TurnLeft', 'TurnRight']

# convert labels -> category
le = LabelEncoder()
le.fit(task_labels)

# show how many classes there are
list(le.classes_)

# convert category -> one-hot
action_category = le.transform(task_labels).reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(action_category)

tasks = []
# os.listdir returns entries in arbitrary order, but the folders are paired
# with task_labels purely by position, so sort for a deterministic pairing.
# NOTE(review): this assumes the alphabetical folder order matches the order
# of task_labels -- confirm against the dataset layout.
fnames = sorted(os.listdir(data_path))

all_tasks_names = []
for f in fnames:
    # text between 'v_' and '_g' in the folder name
    loc1 = f.find('v_')
    loc2 = f.find('_g')
    tasks.append(f[(loc1 + 2) : loc2])

    all_tasks_names.append(f)

In [None]:
# all data files
# NOTE(review): labels come from the hard-coded task_labels list, not from the
# folder names, so x_list and y_list are matched purely by position -- confirm
# that the folder listing order corresponds to the order of task_labels.
x_list = all_tasks_names              # all video folder names found under data_path
y_list = le.transform(task_labels)    # integer-encoded label for every entry of task_labels

In [None]:
# Hold out 25% of the clips for validation; the fixed seed keeps the split reproducible.
train_list, test_list, train_label, test_label = train_test_split(
    x_list, y_list, test_size=0.25, random_state=42)

# Per-frame preprocessing: resize, convert to a tensor, then normalize each channel.
transform = transforms.Compose([
    transforms.Resize([img_x, img_y]),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Frame indices that are read from every clip.
selected_frames = np.arange(b_frame, e_frame, s_frame).tolist()

# Both datasets share the frame selection and the preprocessing.
train_set = Dataset_CRNN(data_path, train_list, train_label, selected_frames, transform=transform)
valid_set = Dataset_CRNN(data_path, test_list, test_label, selected_frames, transform=transform)

train_loader = data.DataLoader(train_set, **params)
valid_loader = data.DataLoader(valid_set, **params)

In [None]:
# create model
# Build both networks from the hyperparameters configured above. With the
# constructor defaults the CNN would be sized for 90x120 frames (not the
# img_x x img_y frames the transform produces) and the LSTM would emit 50 classes.
cnn = CNN(img_x=img_x, img_y=img_y, fc_hidden1=CNN_fc_hidden1, fc_hidden2=CNN_fc_hidden2,
          drop_p=dropout_p, CNN_embed_dim=cnn_embed_dim).to(device)
lstm = LSTM(CNN_embed_dim=cnn_embed_dim, h_RNN_layers=RNN_hidden_layers, h_RNN=RNN_hidden_nodes,
            h_FC_dim=RNN_FC_dim, drop_p=dropout_p, num_classes=k).to(device)

# Parallelize model to multiple GPUs
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs!")
    # Rebind cnn/lstm so the training loop actually uses the parallel wrappers.
    # Saved state_dict keys then carry the "module." prefix added by DataParallel.
    cnn = nn.DataParallel(cnn)
    lstm = nn.DataParallel(lstm)
    # names bound by the earlier version of this cell, kept for compatibility
    cnn_encoder, rnn_decoder = cnn, lstm

# one optimizer updates the parameters of both networks
crnn_params = list(cnn.parameters()) + list(lstm.parameters())
optimizer = torch.optim.Adam(crnn_params, lr=learning_rate)

# record training process
epoch_train_losses = []
epoch_train_scores = []
epoch_test_losses = []
epoch_test_scores = []

# Training
- Start training and save results by epoch

In [None]:
# start training
# np.save does not create missing directories, so make sure the results folder exists.
os.makedirs('./outputs', exist_ok=True)

for epoch in range(epochs):
    # train, then evaluate (validation also writes this epoch's checkpoint)
    train_losses, train_scores = train(log_interval, [cnn, lstm], device, train_loader, optimizer, epoch)
    epoch_test_loss, epoch_test_score = validation([cnn, lstm], device, optimizer, valid_loader)

    # save results
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)
    epoch_test_losses.append(epoch_test_loss)
    epoch_test_scores.append(epoch_test_score)

    # Rewrite the full history after every epoch so an interrupted run keeps its results.
    A, B, C, D = np.array(epoch_train_losses), np.array(epoch_train_scores), \
                 np.array(epoch_test_losses), np.array(epoch_test_scores)
    np.save('./outputs/CRNN_epoch_training_losses.npy', A)
    np.save('./outputs/CRNN_epoch_training_scores.npy', B)
    np.save('./outputs/CRNN_epoch_test_loss.npy', C)
    np.save('./outputs/CRNN_epoch_test_score.npy', D)

# GAIL (Todo)