In [1]:
video_ids = [2] # range(1, 6)
frame_skip = 240

In [2]:
# Load the JSON metadata for each video and store the paths, hits, and serves
import json

labels = {}
for id in video_ids:
    with open(f'data/json/video{id}.json') as f:
        video_data = json.load(f)

        video_start = video_data['match']['start']
        video_end = video_data['match']['end']
        
        # Create a dictionary to store the labels for each frame
        video_labels = {
            frame: {
                'shot': None,
                'player': None,
                'side': None,
                'type': None,
            }
            for frame in range(video_start, video_end, frame_skip)
        }

        # Add the labels for each hit
        for hit in video_data['hits']:
            hit_start = int(hit['start'])
            hit_end = int(hit['end'])
            hit_label = hit['custom']

            for frame in range(hit_start, hit_end):
                if frame in video_labels.keys():
                    video_labels[frame] = {
                        'shot': 'Hit',
                        'player': hit_label['Player'], #near or far
                        'side': hit_label['Side'],
                        'type': hit_label['Type'],
                    }

        # Add the labels for each serve
        for serve in video_data['serves']:
            serve_start = int(serve['start'])
            serve_end = int(serve['end'])
            serve_label = serve['custom']

            for frame in range(serve_start, serve_end):
                if frame in video_labels.keys():
                    video_labels[frame] = {
                        'shot': 'Serve',
                        'player': serve_label['Player'], #near or far
                        'side': serve_label['Result'],
                        'type': None,
                    }

        labels[id] = video_labels

In [3]:
# Pull frames from the videos
import cv2

frames = {}
for id in video_ids:
    video_path = f'data/videos/video{id}.mp4'
    video = cv2.VideoCapture(video_path)

    frames[id] = []
    for frame_num in range(video_start, video_end, frame_skip):
        print(f'Loading video {id}... ({frame_num-video_start}/{video_end-video_start})', end='\r')
        video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = video.read()
        frames[id].append(frame)

    video.release()
print('Videos loaded. ' + ' '*30)

Videos loaded.                               


In [4]:
import numpy as np
train_data = np.array([
    frame
    for id in video_ids
    for frame in frames[id]
])

train_labels = np.array([
    [label['shot'], label['player'], label['side'], label['type']]
    for label in labels[2].values()
])

In [5]:
from sklearn.preprocessing import LabelEncoder

label_encoders = [
    LabelEncoder().fit(train_labels[:,i])
    for i in range(train_labels.shape[1])
]

train_labels = np.array([
    label_encoders[i].transform(train_labels[:,i])
    for i in range(train_labels.shape[1])
]).T

In [10]:
def train(model, train_data, train_labels, epochs=10, batch_size=1):
    import torch
    import torch.optim as optim
    import torch.nn as nn
    import numpy as np

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        print(f'Epoch {epoch+1}/{epochs}')
        running_loss = 0.0
        for i in range(0, len(train_data), batch_size):
            X = torch.tensor(train_data[i:i+batch_size], dtype=torch.float32).to('mps')
            y = torch.tensor(train_labels[i:i+batch_size], dtype=torch.long).to('mps')

            optimizer.zero_grad()

            outputs = model(X)
            print(outputs.shape, y.shape)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 100 == 99:
                print(f'[{epoch+1}, {i+1}] loss: {running_loss/100:.3f}')
                running_loss = 0.0

    print('Finished training')


In [12]:
# Import the FlowNet model
import torch
model = torch.hub.load('pytorch/vision:v0.6.0', 'alexnet', pretrained=False)

Downloading: "https://github.com/pytorch/vision/zipball/v0.6.0" to /Users/tanaynistala/.cache/torch/hub/v0.6.0.zip


In [13]:
train(model, train_data, train_labels, epochs=1)

Epoch 1/1


RuntimeError: Given groups=1, weight of size [64, 3, 11, 11], expected input[1, 720, 1280, 3] to have 3 channels, but got 720 channels instead