In [1]:
!pip3 install torch torchvision torchaudio # Default to CUDA 10.2 https://pytorch.org/get-started/locally/
!pip install ipywidgets
!pip install sklearn
!pip install matplotlib



## Read Data functions

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import json
from PIL import Image
import os

label_names = ['finger_point', 'fist', 'index_pick', 'middle_pick', 'palm', 'thumb_up', 'victory']

def readData(directory):
    data = []
    img_data = []
    y = []
    y_i = []
    # Data being read here
    # Default directory name ./data_collector/(gesture type)/(image/label)
    for i, folder in enumerate(label_names):
        baseD = os.path.join(directory, folder)
        labelsD = os.path.join(baseD, 'labels')
        imageD = os.path.join(baseD, 'images')

        for filename in os.listdir(labelsD):
            labelFileName = os.path.join(labelsD, filename)
            imageFileName = os.path.join(imageD, filename.split('.')[0] + ".png")

            with open(labelFileName, 'r') as f:
                data.append(json.load(f))
                y.append(folder)
                y_i.append(i)
                f.close()

            img_data.append(Image.open(imageFileName))

    return np.array(data), np.array(img_data), np.array(y), np.array(y_i)


def balance_data(data, img_data, y_i):
    # Balance data classes. 
    y_counts = np.bincount(y_i)
    num_per_y = y_counts.min()
    final_sel = np.ones(len(y_i), dtype = 'bool')
    for i in range(len(label_names)):
        sel = np.array(y_i) == i
        sel[np.where(sel)[0][:num_per_y]] = False
        final_sel[sel] = False

    data_X = data[final_sel]
    data_y = y_i[final_sel]
    img_data_X = img_data[final_sel]
    
    return data_X, img_data_X, data_y

## Read Data


In [2]:
DIR1 = './data_collector'
DIR2 = './test_data'
DIR3 = './skye'

data1, img_data1, y1, y1_i = readData(DIR1)
data1_balanced, img_data1_balanced, y1_i_balanced = balance_data(data1, img_data1, y1_i)

data2, img_data2, y2, y2_i = readData(DIR2)
data2_balanced, img_data2_balanced, y2_i_balanced = balance_data(data2, img_data2, y2_i)

data3, img_data3, y3, y3_i = readData(DIR3)
data3_balanced, img_data3_balanced, y3_i_balanced = balance_data(data3, img_data3, y3_i)

  return np.array(data), np.array(img_data), np.array(y), np.array(y_i)
  return np.array(data), np.array(img_data), np.array(y), np.array(y_i)


In [3]:
total_data = np.concatenate((data1, data2), axis = 0)
total_img_data = np.concatenate((img_data1, img_data2), axis = 0)
total_y = np.concatenate((y1, y2), axis = 0)
total_y_i = np.concatenate((y1_i, y2_i), axis = 0)

total_data_balanced, total_img_data_balanced, total_y_i_balanced = balance_data(total_data, total_img_data, total_y_i)

print(len(total_data))

3896


## Use MLP

In [11]:
def extract_mlp_data(data, y):
    aa = []
    X = []
    for a in data : aa.append(a['multi_hand_landmarks'])
    for d in aa:
        singleData = []
        for landmark in d:
            singleData.append(landmark['x'])
            singleData.append(landmark['y'])
            singleData.append(landmark['z'])
        X.append(singleData)

    X = np.array(X)
    y = np.array(y)
    
    return X, y

# X, y = extract_mlp_data(data1_balanced, y1_i_balanced)
X, y = extract_mlp_data(total_data_balanced, total_y_i_balanced)

In [12]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, )

clf = MLPClassifier(hidden_layer_sizes=(100), max_iter=100000, learning_rate='adaptive').fit(X_train, y_train)

In [13]:
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.9902302861130495
0.9916434540389972


In [16]:
t_X, t_y = extract_mlp_data(data3, y3_i)
print(clf.score(t_X, t_y))

0.7853470437017995


## Use MobileNet v2

In [34]:
from torchvision import transforms
import torch

def preprocess_mobilenet_data(img_data):
    # Resize data. 
    preprocess = transforms.Compose([
        transforms.Resize(256),
        # transforms.CenterCrop(224),
        transforms.ToTensor(),
        # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    preprocessed_img_data = []
    for i in img_data:
        preprocessed_img_data.append(preprocess(i))

    preprocessed_img_data = torch.stack(preprocessed_img_data)
    
    return preprocessed_img_data

preprocessed_img_data = preprocess_mobilenet_data(img_data)
preprocessed_img_data.shape

torch.Size([2083, 3, 256, 455])

In [24]:
# for im in preprocessed_img_data[:10]:
    # plt.imshow(im.permute(1, 2, 0))
    # plt.show()

In [35]:
def balance_data(preprocessed_img_data, y_i):
    # Balance data classes. 
    y_counts = np.bincount(y_i)
    num_per_y = y_counts.min()
    final_sel = np.ones(len(y_i), dtype = 'bool')
    for i in range(len(label_names)):
        sel = np.array(y_i) == i
        sel[np.where(sel)[0][:num_per_y]] = False
        final_sel[sel] = False

    data_X = preprocessed_img_data[final_sel]
    data_y = y_i[final_sel]
    
    return data_X, data_y

y_i = np.array(y_i)
data_X, data_y = balance_data(preprocessed_img_data, y_i)
print(data_X.shape)
print(data_y.shape)

torch.Size([1022, 3, 256, 455])
(1022,)


In [36]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, stratify=data_y, test_size=0.3)

def get_dataloader(X, y):
    dataloader = DataLoader(
        [(X[i], y[i]) for i in range(len(X))], 
        batch_size=16, 
        shuffle=True
    )
    return dataloader

train_dataloader = get_dataloader(X_train, y_train)
print(np.bincount(y_train))

test_dataloader = get_dataloader(X_test, y_test)
print(np.bincount(y_test))

[103 102 102 102 102 102 102]
[43 44 44 44 44 44 44]


In [97]:
import torch.nn as nn
import torchvision.models as models
# resnet18 = models.resnet18(pretrained=True)

def get_model(freeze_extraction_layers = False):
    model = models.mobilenet_v2(pretrained=True)

    model.classifier[1] = nn.Linear(model.last_channel, len(label_names))

    if (freeze_extraction_layers):
        for param in model.parameters():
            param.requires_grad = False
        for layer in model.classifier:
            for param in layer.parameters():
                param.requires_grad = True
                
    if torch.cuda.is_available():
        model.to('cuda')
    return model

model = get_model(freeze_extraction_layers = True)

In [40]:
model.eval()

MobileNetV2(
  (features): Sequential(
    (0): ConvNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=1e-05,

In [102]:
import torch.optim as optim

def train(model, dataloader, lr = 1e-4, num_epoch = 30):
    torch.cuda.empty_cache()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    PRINT_BATCH_NUM = len(dataloader)

    best_acc = 0
    best_loss = 1e100000

    for epoch in range(num_epoch):  # loop over the dataset multiple times

        running_loss = 0.0
        epoch_loss = 0.0
        correct = 0

        for i, data in enumerate(dataloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            if torch.cuda.is_available():
                inputs = inputs.to('cuda')
                labels = labels.to('cuda')

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            correct += (torch.max(outputs.data, 1)[1] == labels).float().sum().item()

            # print statistics
            running_loss += loss.item()
            epoch_loss += loss.item()
            if i % PRINT_BATCH_NUM == (PRINT_BATCH_NUM - 1):    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / PRINT_BATCH_NUM:.3f}')
                running_loss = 0.0

        epoch_acc = correct / len(dataloader.dataset)
        epoch_loss = epoch_loss / len(dataloader.dataset)
        print(f'accuracy: {epoch_acc}')

        if (epoch_loss < best_loss):
            best_loss = epoch_loss
            torch.save(model.state_dict(), f'model_parameters_loss_{best_loss:.6f}.pt')

    return best_acc, best_loss
best_acc, best_loss = train(model, train_dataloader, lr = 1e-4, num_epoch = 5)

[1,    45] loss: 1.977
accuracy: 0.13566433566433567
[2,    45] loss: 1.908
accuracy: 0.24055944055944056
[3,    45] loss: 1.865
accuracy: 0.2573426573426573
[4,    45] loss: 1.818
accuracy: 0.31608391608391606
[5,    45] loss: 1.802
accuracy: 0.3062937062937063


In [94]:
def load_model_state(model, best_loss):
    best_model_name = f'model_parameters_loss_{best_loss:.6f}.pt'
    print(f'loading {best_model_name}')

    model.load_state_dict(torch.load(best_model_name))

def eval_model(model, dataloader, print_graph = False):

    y_pred = None
    with torch.no_grad():
        correct = 0

        running_loss = 0.0
        for i, data in enumerate(dataloader):
            inputs, labels = data
            if torch.cuda.is_available():
                inputs = inputs.to('cuda')
                labels = labels.to('cuda')

            outputs = model(inputs)
            pred = torch.max(outputs.data, 1)[1]
            # loss = criterion(outputs, labels)

            correct += (pred == labels).float().sum().item()
            running_loss += loss.item()

            if (y_pred is None):
                y_pred = pred.unsqueeze(-1).cpu().detach()
            else:
                y_pred = torch.vstack((y_pred, pred.unsqueeze(-1).cpu().detach()))

            if print_graph:
                for j in range(len(inputs)):
                    plt.imshow(inputs[j].cpu().permute(1, 2, 0))
                    plt.show()
                    print(f"Label: {labels[j]}, {label_names[labels[j]]}")
                    print(f"Pred: {pred[j]}, {label_names[pred[j]]}")

        acc = correct / len(dataloader.dataset)

    print(running_loss)
    print(acc)
# for i in np.unique(y_test):
#     print(i)
#     print((y_pred[y_test == i, 0] == i).sum() / np.bincount(y_test)[i])
load_model_state(model, best_loss)
eval_model(model, test_dataloader)

loading model_parameters_loss_0.092446.pt
9.923317432403564
0.5179153094462541


In [None]:
t_y_i = np.array(t_y_i)
t_preprocessed_img_data = preprocess_mobilenet_data(t_img_data)
t_dataloader = get_dataloader(t_preprocessed_img_data, t_y_i)

In [78]:
eval_model(model, t_dataloader)

61.09195411205292
0.1831218974076117


In [85]:
total_y_i = np.array(total_y_i)
total_preprocessed_img_data = preprocess_mobilenet_data(total_img_data)
total_data_X, total_data_y = balance_data(total_preprocessed_img_data, total_y_i)

total_X_train, total_X_test, total_y_train, total_y_test = train_test_split(total_data_X, total_data_y, stratify=total_data_y, test_size=0.3)

total_train_dataloader = get_dataloader(total_X_train, total_y_train)
total_test_dataloader = get_dataloader(total_X_test, total_y_test)

In [116]:
total_model = get_model(freeze_extraction_layers = True)
total_best_acc, total_best_loss = train(total_model, total_train_dataloader, lr = 1e-3, num_epoch = 20)

[1,    79] loss: 1.798
accuracy: 0.2838915470494418
[2,    79] loss: 1.473
accuracy: 0.4800637958532695
[3,    79] loss: 1.318
accuracy: 0.5454545454545454
[4,    79] loss: 1.180
accuracy: 0.6275917065390749
[5,    79] loss: 1.082
accuracy: 0.6547049441786283
[6,    79] loss: 1.024
accuracy: 0.6738437001594896
[7,    79] loss: 0.959
accuracy: 0.6834130781499203
[8,    79] loss: 0.924
accuracy: 0.7105263157894737
[9,    79] loss: 0.868
accuracy: 0.7208931419457735
[10,    79] loss: 0.855
accuracy: 0.7312599681020734
[11,    79] loss: 0.829
accuracy: 0.7304625199362041
[12,    79] loss: 0.792
accuracy: 0.759170653907496
[13,    79] loss: 0.763
accuracy: 0.7535885167464115
[14,    79] loss: 0.769
accuracy: 0.7615629984051037
[15,    79] loss: 0.717
accuracy: 0.7727272727272727
[16,    79] loss: 0.723
accuracy: 0.7703349282296651
[17,    79] loss: 0.696
accuracy: 0.7894736842105263
[18,    79] loss: 0.684
accuracy: 0.7838915470494418
[19,    79] loss: 0.718
accuracy: 0.7583732057416268
[20

In [117]:
load_model_state(total_model, total_best_loss)
eval_model(total_model, total_test_dataloader)

loading model_parameters_loss_0.043033.pt
16.86963963508606
0.7026022304832714
