In [1]:
import tensorflow as tf
import numpy as np
import cv2
import os
import tqdm
import heapq
import time
import datetime
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import TensorBoard
from sklearn.preprocessing import LabelBinarizer

Using TensorFlow backend.


In [2]:
# Directory that holds the UCF-101 videos (and, later, the cached .npy features).
BASE_PATH = 'data/UCF-101'

# Glob pattern handed to tf.io.gfile.glob to enumerate the .avi videos.
VIDEOS_PATH = os.path.join(BASE_PATH, '**', '*.avi')

# Maximum number of frames sampled per video; also the padded sequence length.
SEQUENCE_LENGTH = 40

# Step 1: Extract features from videos and cache them in files

### Sample 'SEQUENCE_LENGTH' frames from each video

In [3]:
def frame_generator():
    """Yield preprocessed frames sampled from every video matching VIDEOS_PATH.

    Videos are visited in random order. For each video at most
    SEQUENCE_LENGTH frames are emitted, taking one frame every
    ``max(1, num_frames // SEQUENCE_LENGTH)`` frames. All frames of a given
    video are yielded consecutively.

    Yields:
        tuple: ``(img, video_path)`` where ``img`` is the frame converted to
        RGB, resized to 299x299 and passed through the InceptionV3
        ``preprocess_input``, and ``video_path`` is the path of the source
        video.
    """
    video_paths = tf.io.gfile.glob(VIDEOS_PATH)
    np.random.shuffle(video_paths)
    for video_path in video_paths:
        cap = cv2.VideoCapture(video_path)
        try:
            num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            sample_every_frame = max(1, num_frames // SEQUENCE_LENGTH)
            current_frame = 0

            # Remaining frame budget for this video.
            max_images = SEQUENCE_LENGTH
            while max_images > 0:
                success, frame = cap.read()
                if not success:
                    break

                if current_frame % sample_every_frame == 0:
                    # OpenCV reads in BGR, TensorFlow expects RGB, so we
                    # invert the channel order.
                    frame = frame[:, :, ::-1]
                    img = tf.image.resize(frame, (299, 299))
                    img = tf.keras.applications.inception_v3.preprocess_input(
                        img)
                    max_images -= 1
                    yield img, video_path

                current_frame += 1
        finally:
            # Always free the decoder handle, including when the consumer
            # stops iterating early or a frame fails to process.
            cap.release()

# Wrap the frame generator in a tf.data pipeline: each element is a
# (299, 299, 3) float image plus the scalar path of the video it came from,
# grouped into batches of 16 and prefetched.
dataset = (
    tf.data.Dataset
    .from_generator(
        frame_generator,
        output_types=(tf.float32, tf.string),
        output_shapes=((299, 299, 3), ()),
    )
    .batch(16)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

#### Extract Features from videos

In [4]:
# ImageNet-pretrained InceptionV3 without its classification head.
inception_v3 = tf.keras.applications.InceptionV3(
    weights='imagenet',
    include_top=False,
)
x = inception_v3.output

# Global average pooling collapses the 8 x 8 x 2048 feature map into a single
# 2048-dimensional vector per image; spatial information is not needed here.
pooling_output = tf.keras.layers.GlobalAveragePooling2D()(x)
feature_extraction_model = tf.keras.Model(
    inputs=inception_v3.input,
    outputs=pooling_output,
)

### Extract features and store them in .npy files
#### Extraction takes about 1h20 on an NVIDIA 1080 GPU

In [None]:
def _save_video_features(video_path, features):
    """Save one video's per-frame feature vectors next to the video as .npy.

    Args:
        video_path: Path of the source video as bytes, as produced by the
            string component of the dataset.
        features: List of per-frame feature vectors for that video.
    """
    output_path = video_path.decode().replace('.avi', '.npy')
    np.save(output_path, features)


current_path = None
all_features = []

for img, batch_paths in tqdm.tqdm(dataset):
    batch_features = feature_extraction_model(img)
    # Flatten to (batch, features) so each row is one frame's feature vector.
    batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1))

    for features, path in zip(batch_features.numpy(), batch_paths.numpy()):
        # Frames of one video arrive consecutively, so a change of path means
        # the previous video is complete and can be written out.
        if path != current_path and current_path is not None:
            _save_video_features(current_path, all_features)
            all_features = []

        current_path = path
        all_features.append(features)

# Flush the final video: no later path change triggers its save, so without
# this step its features would be silently dropped.
if current_path is not None and all_features:
    _save_video_features(current_path, all_features)
    all_features = []

# Step 2: Train the LSTM on video features

#### Labels preprocessing

In [5]:
# The 101 action class names; each is matched against the folder name of a
# video when building training labels.
LABELS = [
    'UnevenBars', 'ApplyLipstick', 'TableTennisShot', 'Fencing', 'Mixing',
    'SumoWrestling', 'HulaHoop', 'PommelHorse', 'HorseRiding', 'SkyDiving',
    'BenchPress', 'GolfSwing', 'HeadMassage', 'FrontCrawl', 'Haircut',
    'HandstandWalking', 'Skiing', 'PlayingDaf', 'PlayingSitar', 'FrisbeeCatch',
    'CliffDiving', 'BoxingSpeedBag', 'Kayaking', 'Rafting', 'WritingOnBoard',
    'VolleyballSpiking', 'Archery', 'MoppingFloor', 'JumpRope', 'Lunges',
    'BasketballDunk', 'Surfing', 'SkateBoarding', 'FloorGymnastics', 'Billiards',
    'CuttingInKitchen', 'BlowingCandles', 'PlayingCello', 'JugglingBalls', 'Drumming',
    'ThrowDiscus', 'BaseballPitch', 'SoccerPenalty', 'Hammering', 'BodyWeightSquats',
    'SoccerJuggling', 'CricketShot', 'BandMarching', 'PlayingPiano', 'BreastStroke',
    'ApplyEyeMakeup', 'HighJump', 'IceDancing', 'HandstandPushups', 'RockClimbingIndoor',
    'HammerThrow', 'WallPushups', 'RopeClimbing', 'Basketball', 'Shotput',
    'Nunchucks', 'WalkingWithDog', 'PlayingFlute', 'PlayingDhol', 'PullUps',
    'CricketBowling', 'BabyCrawling', 'Diving', 'TaiChi', 'YoYo',
    'BlowDryHair', 'PushUps', 'ShavingBeard', 'Knitting', 'HorseRace',
    'TrampolineJumping', 'Typing', 'Bowling', 'CleanAndJerk', 'MilitaryParade',
    'FieldHockeyPenalty', 'PlayingViolin', 'Skijet', 'PizzaTossing', 'LongJump',
    'PlayingTabla', 'PlayingGuitar', 'BrushingTeeth', 'PoleVault', 'Punch',
    'ParallelBars', 'Biking', 'BalanceBeam', 'Swing', 'JavelinThrow',
    'Rowing', 'StillRings', 'SalsaSpin', 'TennisSwing', 'JumpingJack',
    'BoxingPunchingBag',
]

# One-hot encoder over the class names, used to turn a folder name into a
# target vector of length len(LABELS).
encoder = LabelBinarizer()
encoder.fit(LABELS)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

#### Defining the model

In [6]:
# Sequence classifier over per-frame feature vectors.
model = tf.keras.Sequential()
# Time steps that are entirely zero (the padding) are masked out.
model.add(tf.keras.layers.Masking(mask_value=0.))
model.add(tf.keras.layers.LSTM(512, dropout=0.5, recurrent_dropout=0.5))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
# One softmax output per action class.
model.add(tf.keras.layers.Dense(len(LABELS), activation='softmax'))

In [7]:
# Multi-class setup with one-hot targets; report both plain accuracy and
# top-k categorical accuracy.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'top_k_categorical_accuracy'],
)

In [8]:
test_file = os.path.join('data', 'testlist01.txt')
train_file = os.path.join('data', 'trainlist01.txt')


def _read_video_list(list_file):
    """Read a split file and return the relative video path of each row.

    Each non-empty row is stripped and cut at the first space, so a trailing
    class index (as found in the train list) is discarded. Blank lines are
    skipped so they never turn into empty paths.

    Args:
        list_file: Path of the split file to read.

    Returns:
        list[str]: One relative video path per non-empty row.
    """
    with open(list_file) as f:
        rows = (row.strip() for row in f)
        return [row.split(' ')[0] for row in rows if row]


# Read both splits through the same helper, using the paths defined above
# instead of repeating them as string literals.
test_list = _read_video_list(test_file)
train_list = _read_video_list(train_file)

def make_generator(file_list):
    """Build a generator function yielding (padded features, one-hot label).

    Args:
        file_list: Video paths relative to BASE_PATH, in the form
            ``<label>/<video>.avi``. The cached features are read from the
            matching ``.npy`` file. The list itself is never modified.

    Returns:
        callable: A zero-argument generator function suitable for
        ``tf.data.Dataset.from_generator``. Every call iterates over the
        files in a fresh random order and yields a
        ``(SEQUENCE_LENGTH, 2048)`` array, zero-padded at the end, together
        with the one-hot encoded label.
    """
    def generator():
        # Shuffle a copy so the caller's list is not reordered as a side
        # effect of iterating the dataset.
        shuffled_files = list(file_list)
        np.random.shuffle(shuffled_files)
        for path in shuffled_files:
            full_path = os.path.join(BASE_PATH, path).replace('.avi', '.npy')

            # The class name is the folder that contains the video.
            label = os.path.basename(os.path.dirname(path))
            # Truncate so a feature file longer than SEQUENCE_LENGTH cannot
            # overflow the padded buffer.
            features = np.load(full_path)[:SEQUENCE_LENGTH]

            padded_sequence = np.zeros((SEQUENCE_LENGTH, 2048))
            padded_sequence[0:len(features)] = features

            transformed_label = encoder.transform([label])
            yield padded_sequence, transformed_label[0]
    return generator

In [9]:
def _make_sequence_dataset(file_list, batch_size=16):
    """Build a batched, prefetched dataset of (features, label) pairs.

    Args:
        file_list: Relative video paths forwarded to ``make_generator``.
        batch_size: Number of sequences per batch.

    Returns:
        tf.data.Dataset: Elements are batches of
        ``(SEQUENCE_LENGTH, 2048)`` float32 feature sequences and
        ``(len(LABELS),)`` int16 one-hot labels.
    """
    return (
        tf.data.Dataset
        .from_generator(
            make_generator(file_list),
            output_types=(tf.float32, tf.int16),
            # Note the trailing comma: the label shape is a 1-tuple.
            output_shapes=((SEQUENCE_LENGTH, 2048), (len(LABELS),)),
        )
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )


train_dataset = _make_sequence_dataset(train_list)
valid_dataset = _make_sequence_dataset(test_list)

In [10]:
# Build the per-run log directory with os.path.join so the path is valid on
# every OS (a hard-coded backslash only acts as a separator on Windows).
log_dir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

# The model is a tf.keras model, so take the callbacks from tf.keras as well
# rather than from the standalone keras package.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir, update_freq=1000, profile_batch=0)
# Only overwrite model.h5 when the monitored validation metric improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "model.h5", verbose=1, save_best_only=True)
# Stop once there has been no improvement for 10 consecutive epochs.
earlystopping = tf.keras.callbacks.EarlyStopping(patience=10, verbose=1)
model.fit(train_dataset, epochs=100, callbacks=[tensorboard_callback, checkpoint, earlystopping], validation_data=valid_dataset)

Epoch 1/100
    597/Unknown - 334s 560ms/step - loss: 3.5795 - accuracy: 0.1670 - top_k_categorical_accuracy: 0.3782
Epoch 00001: val_loss improved from inf to 2.46465, saving model to model.h5
Epoch 2/100
Epoch 00002: val_loss improved from 2.46465 to 1.68674, saving model to model.h5
Epoch 3/100
Epoch 00003: val_loss improved from 1.68674 to 1.50367, saving model to model.h5
Epoch 4/100
Epoch 00004: val_loss improved from 1.50367 to 1.34622, saving model to model.h5
Epoch 5/100
Epoch 00005: val_loss improved from 1.34622 to 1.25088, saving model to model.h5
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.25088
Epoch 7/100
Epoch 00007: val_loss improved from 1.25088 to 1.23299, saving model to model.h5
Epoch 8/100
Epoch 00008: val_loss improved from 1.23299 to 1.21471, saving model to model.h5
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.21471
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.21471
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.

Epoch 21/100
Epoch 00021: val_loss did not improve from 1.13664
Epoch 22/100
Epoch 00022: val_loss did not improve from 1.13664
Epoch 23/100
Epoch 00023: val_loss did not improve from 1.13664
Epoch 24/100
Epoch 00024: val_loss did not improve from 1.13664
Epoch 00024: early stopping


<tensorflow.python.keras.callbacks.History at 0x23ee7aa6d30>

# Step 3: Predict on Video

In [76]:
def load_features(path, sequence_length=None, feature_dim=2048):
    """Load cached per-frame features and shape them as a one-element batch.

    Args:
        path: Path of a ``.npy`` file holding an array of per-frame feature
            vectors.
        sequence_length: Number of time steps in the returned sequence.
            Defaults to the notebook-level SEQUENCE_LENGTH.
        feature_dim: Size of each feature vector.

    Returns:
        numpy.ndarray: Array of shape ``(1, sequence_length, feature_dim)``.
        Shorter inputs are zero-padded at the end; longer inputs are
        truncated to the first ``sequence_length`` frames.
    """
    if sequence_length is None:
        sequence_length = SEQUENCE_LENGTH

    # Truncate first so an over-long feature file cannot overflow the buffer.
    features = np.load(path)[:sequence_length]
    padded_sequence = np.zeros((sequence_length, feature_dim))
    padded_sequence[0:len(features)] = features
    # Add the leading batch axis expected by model.predict.
    return np.array([padded_sequence])

# Score one cached clip with the trained model; `prediction` holds one row of
# class probabilities for the single sequence in the batch.
features = load_features(
    'data/UCF-101/SumoWrestling/v_SumoWrestling_g06_c04.npy'
)
prediction = model.predict(features)

In [77]:
# Print the k most probable classes for the predicted clip, best first.
k = 5
scores = prediction[0]
idxs = heapq.nlargest(k, range(len(scores)), key=lambda j: scores[j])
for i in idxs:
    pct = scores[i]
    label = encoder.classes_[i]
    print(label + ' ' + "{0:.0%}".format(pct))

SumoWrestling 100%
SalsaSpin 0%
Fencing 0%
BandMarching 0%
HulaHoop 0%
