In [1]:
from torchvision import models
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from random import shuffle
import numpy as np

res_model = models.resnet50(pretrained=True)
# print(res_model)

In [2]:
import os
# print(os.listdir('data/data_first_25/ApplyEyeMakeup'))

In [3]:
#import imageio
from PIL import Image
from torchvision import transforms

test_transform = transforms.Compose(
    [transforms.Resize(256),  # 1. Resize smallest side to 256.
     transforms.CenterCrop(224), # 2. Crop center square of 224x224 pixels.
     transforms.ToTensor(), # 3. Convert to pytorch tensor.
     transforms.Normalize(mean = [0.485, 0.456, 0.406],  # normalize.
                          std = [0.229, 0.224, 0.225])
    ])

img_pil = Image.open('data/data_first_25/ApplyEyeMakeup/g01_c01/v_ApplyEyeMakeup_g01_c01_frame0.jpg')

# 1. Forward propagate the image through the CNN.
# Unsqueeze adds a dummy batch dimension needed to pass through the model.
input_img =  test_transform(img_pil).unsqueeze(0)

# print(res_model(input_img))

In [4]:
import random

def get_img_list(cat,d1):
    ret = list()
    for frame in os.listdir('data/data_first_25/{}/{}'.format(cat,d1)):
        img_pil = Image.open('data/data_first_25/{}/{}/{}'.format(cat,d1,frame))
        input_img = test_transform(img_pil).unsqueeze(0)
        ret.append(input_img)
    return ret

def createTrainAndValSet(categories,trainPercentage):
    category_options = os.listdir('data/data_first_25')
    category_names = category_options[:categories]
    train_set = []
    val_set = []
    i=0
    for cat in category_names:
        for d1 in os.listdir('data/data_first_25/{}'.format(cat)):
            r = random.uniform(0,1)
            img_list = get_img_list(cat,d1)
            if r < trainPercentage:
                train_set.append((img_list,i))
            else:
                val_set.append((img_list,i))
        i+=1
    return train_set,val_set


In [29]:
# Source: https://github.com/HHTseng/video-classification/blob/master/ResNetCRNN/functions.py



# 2D CNN encoder using ResNet-152 pretrained
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        resnet = models.resnet50(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        self.fc1 = nn.Linear(resnet.fc.in_features, fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        cnn_embed_seq = []
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.resnet(x_3d[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.fc1(x)
            x = F.relu(x)
            x = self.fc2(x)
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return cnn_embed_seq


class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=50):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(x_RNN, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc1(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = self.fc2(x)

        return x

In [None]:
categories_used = 5
lossFn = nn.CrossEntropyLoss()
batchSize = 1
trainPercentage = 1

train_set, val_set = createTrainAndValSet(categories_used,trainPercentage)

In [None]:
import torch

learningRate = 5e-3
epochs = 50

train_accuracies = []; val_accuracies = []
train_losses = []; val_losses = []


def train_model(categories,lossFn,batchSize,learningRate,epochs):
    cnn_encoder = ResCNNEncoder(CNN_embed_dim=categories).cuda()
    rnn_decoder = DecoderRNN(num_classes=categories).cuda()
    cnn_encoder.train()
    rnn_decoder.train()


    lossFn = lossFn.cuda()
    crnn_params = list(cnn_encoder.fc1.parameters()) + \
                  list(cnn_encoder.fc2.parameters()) + \
                  list(cnn_encoder.fc3.parameters()) #+ list(rnn_decoder.parameters())
    crnn_params = list(cnn_encoder.fc3.parameters())
    optimizer = torch.optim.Adam(crnn_params, lr=learningRate)

    for epoch in range(epochs):
        shuffle(train_set)
        correct = 0
        cum_loss = 0

        i=0
        for video in train_set:
            frame_list,target_cat = video
            frame_list = torch.stack(frame_list, dim=0).transpose(0, 1).cuda()
#             scores = rnn_decoder(cnn_encoder(frame_list))
            scores = cnn_encoder(frame_list)
            scores = scores[0][0].view(1,5)
#             print(scores)
#             print(target_cat)
            
            loss = lossFn(scores,torch.tensor(np.array([target_cat]),dtype=torch.long).cuda())
            max_score, max_label = scores.max(1)
            if max_label == target_cat:
                correct+=1
            cum_loss+=loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (i + 1) % 100 == 0:
                print('Train-epoch %d. Iteration %05d, Avg-Loss: %.4f, Accuracy: %.4f' % 
                    (epoch, i + 1, cum_loss / (i + 1), correct / ((i + 1) * batchSize)))
            i+=1
        train_accuracies.append(correct / len(train_set))
        train_losses.append(cum_loss / (i + 1))   

   
train_model(categories_used,lossFn,batchSize,learningRate,epochs)

Train-epoch 0. Iteration 00100, Avg-Loss: 1.6671, Accuracy: 0.2000
Train-epoch 0. Iteration 00200, Avg-Loss: 1.6688, Accuracy: 0.1950
Train-epoch 0. Iteration 00300, Avg-Loss: 1.6480, Accuracy: 0.2233
Train-epoch 0. Iteration 00400, Avg-Loss: 1.6613, Accuracy: 0.2100
Train-epoch 0. Iteration 00500, Avg-Loss: 1.6609, Accuracy: 0.2060
Train-epoch 0. Iteration 00600, Avg-Loss: 1.6570, Accuracy: 0.2133
Train-epoch 1. Iteration 00100, Avg-Loss: 1.6325, Accuracy: 0.2100
Train-epoch 1. Iteration 00200, Avg-Loss: 1.6307, Accuracy: 0.2400
Train-epoch 1. Iteration 00300, Avg-Loss: 1.6325, Accuracy: 0.2467
Train-epoch 1. Iteration 00400, Avg-Loss: 1.6222, Accuracy: 0.2650
Train-epoch 1. Iteration 00500, Avg-Loss: 1.6264, Accuracy: 0.2520
Train-epoch 1. Iteration 00600, Avg-Loss: 1.6251, Accuracy: 0.2517
Train-epoch 2. Iteration 00100, Avg-Loss: 1.6490, Accuracy: 0.2300
Train-epoch 2. Iteration 00200, Avg-Loss: 1.6538, Accuracy: 0.2450
Train-epoch 2. Iteration 00300, Avg-Loss: 1.6510, Accuracy: 0.

In [None]:
from numba import cuda
print(cuda.gpus[0].name)