In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import json

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
!unzip asl-data.zip

Archive:  asl-data.zip
   creating: asl-data/
  inflating: asl-data/05234.csv      
  inflating: asl-data/01385.csv      
  inflating: asl-data/01461.csv      
  inflating: asl-data/03122.csv      
  inflating: asl-data/01991.csv      
  inflating: asl-data/00631.csv      
  inflating: asl-data/04619.csv      
  inflating: asl-data/04712.csv      
  inflating: asl-data/00624.csv      
  inflating: asl-data/02999.csv      
  inflating: asl-data/01460.csv      
  inflating: asl-data/04858.csv      
  inflating: asl-data/04616.csv      
  inflating: asl-data/01384.csv      
  inflating: asl-data/03121.csv      
  inflating: asl-data/00415.csv      
  inflating: asl-data/03001.csv      
  inflating: asl-data/03435.csv      
  inflating: asl-data/03441.csv      
  inflating: asl-data/05088.csv      
  inflating: asl-data/04851.csv      
  inflating: asl-data/05230.csv      
  inflating: asl-data/04708.csv      
  inflating: asl-data/03120.csv      
  inflating: asl-data/04897.csv      
  in

In [12]:
input_size = 98 # 7 landmarks for upper body and 21 for each hand for a total of
                # 49 landmarks * 2 x/y positions for each
sequence_length = 50 # 25 fps, assuming about two seconds per video
num_layers = 2
hidden_size = 128
num_classes = 5 # number of signs
learning_rate = 0.0001
batch_size = 16
num_epochs = 100

In [4]:
class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers, num_classes,
               batch_size):
    super(RNN, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.num_classes = num_classes
    self.batch_size = batch_size

    # RNN takes tensor of shape (batch_size, sequence_length, input_size)
    # (N, 30, 90)
    self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)

    # classifier -- uses final hidden state as input, outputs probability of
    # each class
    self.fc = nn.Linear(self.hidden_size, self.num_classes)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, x):
    # x = (N, 30, 90) = (batch_size, sequence_length, input_size)
    # h_0 = (2, N, 128) = (num_layers, batch_size, hidden_size)
    h_0 = torch.zeros(self.num_layers, x.size(0),
                              self.hidden_size).to(device)

    # get RNN last layer output. last hidden layer is no longer necessary
    output, h_n = self.rnn(x, h_0)

    # output = (batch_size, sequence_length, hidden_size) = (N, 30, 90)
    output = output[:, -1, :] # output of last layer for each batch sequence

    # output = (batch_size, hidden_size)
    output = self.fc(output)
    output = self.softmax(output)

    # output = (batch_size, num_classes)
    return output

In [5]:
class GlossDataset(Dataset):
  # thin go bowling cool before
  def __init__(self, annotations_file, landmark_dir, sequence_length):
    self.landmark_labels = pd.read_csv(annotations_file, dtype={'id': 'object'})
    self.landmark_dir = landmark_dir
    self.sequence_length = sequence_length
    self.gloss_to_int = {
        'book': 0,
        'computer': 1,
        'backpack': 2,
        'medicine': 3,
        'teacher': 4
    }
    self.landmark_labels['gloss'] = self.landmark_labels['gloss'].apply(lambda x : self.gloss_to_int[x])

  def __len__(self):
    return len(self.landmark_labels)

  def __getitem__(self, idx):
    landmark_path = os.path.join(self.landmark_dir, self.landmark_labels.iloc[idx, 0] + '.csv')

    gloss = self.landmark_labels.iloc[idx, 1]
    landmarks = pd.read_csv(landmark_path)
    # pad output to make video long enough
    if landmarks.shape[1] - self.sequence_length > 0:
      delta = landmarks.shape[1] - self.sequence_length
      row = landmarks.iloc[-1]
      for _ in range(delta):
        landmarks.loc[len(landmarks)] = row

    # trim output if it's too long
    landmarks_tensor = torch.tensor(landmarks.iloc[:self.sequence_length].to_numpy().astype('float32'))

    return landmarks_tensor, gloss

In [13]:
model = RNN(input_size, hidden_size, num_layers, num_classes, batch_size).to(device)

gloss_data = GlossDataset('video-metadata.csv', 'asl-data', sequence_length)
train_loader = DataLoader(gloss_data, batch_size=1, shuffle=True)

# Loss and optimizer
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (landmarks, labels) in enumerate(train_loader):
        # origin shape: [N, 1, 10, 98]
        # resized: [N, 50, 94]
        landmarks = landmarks.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(landmarks)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 1.7330
Epoch [2/100], Loss: 1.8109
Epoch [3/100], Loss: 1.8712
Epoch [4/100], Loss: 1.2011
Epoch [5/100], Loss: 1.4160
Epoch [6/100], Loss: 1.8107
Epoch [7/100], Loss: 2.2980
Epoch [8/100], Loss: 0.5961
Epoch [9/100], Loss: 2.1403
Epoch [10/100], Loss: 1.5622
Epoch [11/100], Loss: 1.3812
Epoch [12/100], Loss: 0.9137
Epoch [13/100], Loss: 1.0118
Epoch [14/100], Loss: 1.2199
Epoch [15/100], Loss: 0.4571
Epoch [16/100], Loss: 0.7477
Epoch [17/100], Loss: 0.9566
Epoch [18/100], Loss: 0.6552
Epoch [19/100], Loss: 2.4052
Epoch [20/100], Loss: 1.5797
Epoch [21/100], Loss: 2.3734
Epoch [22/100], Loss: 0.9054
Epoch [23/100], Loss: 0.9686
Epoch [24/100], Loss: 2.4440
Epoch [25/100], Loss: 2.3027
Epoch [26/100], Loss: 0.3329
Epoch [27/100], Loss: 0.3712
Epoch [28/100], Loss: 0.7732
Epoch [29/100], Loss: 2.3477
Epoch [30/100], Loss: 0.5048
Epoch [31/100], Loss: 0.5625
Epoch [32/100], Loss: 0.4706
Epoch [33/100], Loss: 0.8996
Epoch [34/100], Loss: 2.1142
Epoch [35/100], Loss: 1

In [14]:
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for images, labels in train_loader:
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the 10000 test images: {acc} %')

Accuracy of the network on the 10000 test images: 89.47368421052632 %


This model is likely extremely overfit, but the high percentage makes me feel better about myself until I download and process the rest of the data and create a validation set.