# Task 3: Understand human gesture and body language based on your own built dataset and model

## 1. Do a literature search on dataset building and on other deep-learning-based models applied to gesture recognition. Comment on their applications and benefits.

## 2. In the earlier two tasks, you learned how to do gesture classification using the given dataset. Now you need to collect data yourself and build your own dataset. The dataset is not limited to gestures; postures and behaviors are encouraged. Please organize your data in the same format as the given dataset. For good performance, more than 50 samples per class are recommended, and it is better to have more than 3 classes.

## 3. Design your own neural network architecture. The fully connected or convolutional layers used in the first two tasks are acceptable, but you are encouraged to explore other deep learning models and implement them as far as you can.

## 4. Write down the problems you encountered during the experiment, the solutions, and your experiences.

In [1]:
import cv2
import numpy as np
import os
import itertools
import torch.utils.data as utils_data
import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.nn import Flatten, LogSoftmax, Linear, ReLU, LeakyReLU, CrossEntropyLoss, Sequential, Conv1d, Conv2d, MaxPool2d, Module, Softmax, BatchNorm1d, BatchNorm2d, Dropout
from torch.optim import Adam, SGD

In [2]:
# list the entries of the current working directory
os.listdir(os.curdir)


['.ipynb_checkpoints',
 'dataset',
 'dataset.zip',
 'Example.ipynb',
 'Task 1 (3).ipynb',
 'Task 2(anh).ipynb',
 'Task 3.ipynb',
 'task3_images',
 'task3_images.zip',
 'task3_model',
 'trained_models']

In [3]:
# root folder holding the images (from the Stanford40actions dataset)
path = './task3_images'
# preprocessing applied to every image: resize to a fixed 100x100 so all
# samples share one size, then convert to a tensor
preprocess = transforms.Compose([
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
])
dataset = ImageFolder(path, transform=preprocess)
print(dataset)

Dataset ImageFolder
    Number of datapoints: 2939
    Root location: ./task3_images
    StandardTransform
Transform: Compose(
               Resize(size=(100, 100), interpolation=bilinear, max_size=None, antialias=None)
               ToTensor()
           )


In [4]:
class CNNModel(Module):
    """Convolutional image classifier: four conv/pool stages plus an FC head.

    Each stage is CONV => (Leaky)ReLU => 2x2 max-pool. The convolutions are
    unpadded (5x5, 5x5, 3x3, 3x3), so a 3x100x100 input shrinks to a
    64x4x4 feature map, which is what the first fully connected layer is
    sized for. Inputs with a different spatial size are rejected in
    ``forward`` instead of being silently reshaped.

    Args:
        num_classes: number of output classes (size of the last layer).
    """

    # (channels, height, width) of the feature map that feeds ``fc1``
    _FEATURE_SHAPE = (64, 4, 4)

    def __init__(self, num_classes):
        super(CNNModel, self).__init__()
        # first CONV => RELU => POOL stage
        self.conv1 = Conv2d(in_channels=3, out_channels=6, kernel_size=(5, 5))
        self.relu1 = ReLU()
        self.maxpool1 = MaxPool2d(kernel_size=(2, 2), stride=2)

        # second CONV => LeakyRELU => POOL stage
        self.conv2 = Conv2d(in_channels=6, out_channels=16, kernel_size=(5, 5))
        self.lrelu2 = LeakyReLU(0.1)
        self.maxpool2 = MaxPool2d(kernel_size=(2, 2), stride=2)

        # third CONV => RELU => POOL stage
        self.conv3 = Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 3))
        self.relu3 = ReLU()
        self.maxpool3 = MaxPool2d(kernel_size=(2, 2), stride=2)

        # fourth CONV => LeakyRELU => POOL stage
        self.conv4 = Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 3))
        self.lrelu4 = LeakyReLU(0.1)
        self.maxpool4 = MaxPool2d(kernel_size=(2, 2), stride=2)

        # fully connected head: 1024 -> 1024 -> 256 -> 84 -> num_classes
        self.fc1 = Linear(in_features=64*4*4, out_features=1024)
        self.relu5 = ReLU()
        self.fc2 = Linear(in_features=1024, out_features=256)
        self.lrelu6 = LeakyReLU(0.1)
        self.fc3 = Linear(in_features=256, out_features=84)
        self.relu7 = ReLU()
        self.fc4 = Linear(in_features=84, out_features=num_classes)

    def forward(self, x):
        """Return the raw class scores (logits) for a batch of images.

        Args:
            x: image batch whose spatial size reduces to a 64x4x4 feature
                map after the four conv/pool stages (e.g. 3x100x100 images).

        Returns:
            Tensor with one row of ``num_classes`` scores per sample. No
            softmax is applied here.

        Raises:
            RuntimeError: if the convolutional feature map is not 64x4x4.
        """
        # four CONV => activation => POOL stages
        x = self.maxpool1(self.relu1(self.conv1(x)))
        x = self.maxpool2(self.lrelu2(self.conv2(x)))
        x = self.maxpool3(self.relu3(self.conv3(x)))
        x = self.maxpool4(self.lrelu4(self.conv4(x)))

        # Guard the flatten below: view(-1, 1024) would otherwise accept any
        # feature map whose element count is a multiple of 1024 and spread
        # one sample over several rows, corrupting the batch dimension.
        if tuple(x.shape[-3:]) != self._FEATURE_SHAPE:
            raise RuntimeError(
                'CNNModel expects a 64x4x4 feature map before the fully '
                'connected layers (e.g. 3x100x100 inputs), got {}'.format(
                    tuple(x.shape[-3:])))

        # flatten the feature map and run it through the FC head
        x = x.view(-1, 64*4*4)
        x = self.relu5(self.fc1(x))
        x = self.lrelu6(self.fc2(x))
        x = self.relu7(self.fc3(x))

        # final linear layer produces the unnormalised class scores
        output = self.fc4(x)
        return output

In [5]:
# the twelve classes: applauding, blowing bubbles, brushing teeth,
# drinking, holding umbrella, jumping, phoning, reading,
# running, texting, waving, writing
model = CNNModel(num_classes=12)
# run on the GPU whenever one is available
if torch.cuda.is_available():
    model = model.cuda()
    print("cuda activated")
# cross-entropy loss, optimised with Adam (with weight decay)
loss_func = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.0001, weight_decay=0.0001)

In [6]:
# Build the training and testing data loaders from a reproducible split.
dataset_size = len(dataset)
split_ratio = 0.8  # 80% training, 20% testing
train_size = int(split_ratio * dataset_size)
test_size = dataset_size - train_size
# Seed the split so that re-running this cell yields the same partition.
# With an unseeded split, a model trained on an earlier partition would be
# evaluated on images it has already seen after the cell is run again.
split_generator = torch.Generator().manual_seed(0)
train_set, test_set = utils_data.random_split(
    dataset, [train_size, test_size], generator=split_generator)
train_loader = utils_data.DataLoader(dataset=train_set, batch_size=16, shuffle=True)
# Accuracy does not depend on sample order, so the test set is not shuffled.
test_loader = utils_data.DataLoader(dataset=test_set, batch_size=16, shuffle=False)
print('Data is ready!')

Data is ready!


In [8]:
best_accuracy = 0
# Use whichever device the model already lives on, so that both the training
# and the test batches are placed next to the weights.
device = next(model.parameters()).device
# Unlike in task 2, this dataset is larger, so the number of epochs is reduced to 100
for epoch in range(100):
    # ----------train----------
    model.train()
    running_loss = 0.0
    train_acc = 0.0
    for batch_image, batch_label in train_loader:
        batch_image, batch_label = batch_image.to(device), batch_label.to(device)
        batch_output = model(batch_image)
        batch_loss = loss_func(batch_output, batch_label)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        # train accuracy: count the correct predictions in this batch
        _, train_predicted = torch.max(batch_output.detach(), 1)
        train_acc += (train_predicted == batch_label).sum().item()

    # average over samples (accuracy) and over batches (loss)
    train_acc /= train_size
    running_loss /= len(train_loader)

    # ----------test----------
    model.eval()
    test_acc = 0.0
    # no gradients are needed for evaluation
    with torch.no_grad():
        for test_image, test_label in test_loader:
            # test batches must be on the same device as the model
            test_image, test_label = test_image.to(device), test_label.to(device)
            test_output = model(test_image)
            _, predicted = torch.max(test_output, 1)
            test_acc += (predicted == test_label).sum().item()
    test_acc /= test_size

    print('epoch={:d}\ttrain loss={:.6f}\ttrain accuracy={:.3f}\ttest accuracy={:.3f}'.format(
        epoch, running_loss, train_acc, test_acc))

    # keep the weights of the best (or equally good, most recent) epoch
    if test_acc >= best_accuracy:
        torch.save(model.state_dict(), './task3_model/CNN_for_task3_model.pkl')
        best_accuracy = test_acc

epoch=0	train loss=0.094026	train accuracy=0.984	test accuracy=0.231
epoch=1	train loss=0.127681	train accuracy=0.965	test accuracy=0.240
epoch=2	train loss=0.281620	train accuracy=0.918	test accuracy=0.233
epoch=3	train loss=0.188074	train accuracy=0.946	test accuracy=0.228
epoch=4	train loss=0.149138	train accuracy=0.957	test accuracy=0.231
epoch=5	train loss=0.052930	train accuracy=0.994	test accuracy=0.235
epoch=6	train loss=0.038169	train accuracy=0.997	test accuracy=0.241
epoch=7	train loss=0.024408	train accuracy=0.999	test accuracy=0.228
epoch=8	train loss=0.019886	train accuracy=0.999	test accuracy=0.233
epoch=9	train loss=0.017276	train accuracy=1.000	test accuracy=0.233
epoch=10	train loss=0.014347	train accuracy=1.000	test accuracy=0.238
epoch=11	train loss=0.012460	train accuracy=1.000	test accuracy=0.226
epoch=12	train loss=0.010386	train accuracy=1.000	test accuracy=0.248
epoch=13	train loss=0.008383	train accuracy=1.000	test accuracy=0.247
epoch=14	train loss=0.007295	t

KeyboardInterrupt: 