# AVA interaction detector

In [1]:
import os
import time
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, \
    CSVLogger, TensorBoard
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, \
    TimeDistributed, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Nadam
import multiprocessing as mp

## Data

### AVA Action dataset
Dataset structure :

```
AVAAction/
    train/
        videos/
            train1.mp4
            train2.mp4
            ...
        labels/
            train1.csv
            train2.csv
            ...
    val/
        videos/
            val1.mp4
            ...
        labels/
            val1.csv
            ...
``` 

In [2]:
# Both dataset splits live under the same root (path relative to the notebook).
ava_action_root = '../data/AVA/dataset/AVA_action'
train_dir, val_dir = (f'{ava_action_root}/{split}' for split in ('train', 'val'))
# talk to, listen to, watch



### Data handler for Keras

The data handler is defined in the `proto.AVAGenerator` module:

In [3]:
from proto.AVAGenerator import AVAGenerator

It is implemented as a Keras `Sequence`.

Test initialising a sequence : 

In [4]:
# Wall-clock cost of constructing the generator on the training split.
init_started_at = time.time()
testgen = AVAGenerator(dir_path=train_dir, shuffle=False)
init_elapsed = time.time() - init_started_at
print(init_elapsed)


52.60100078582764


The average initialisation time obtained with the train set is about 1 min with 8 cores.

Next we can look at the time taken to obtain a batch from our Sequence and try to estimate
the time it would take for a single worker to sequentially retrieve every batch from the data generator :

In [5]:
# Time the retrieval of one arbitrary batch (index 42) and extrapolate to a
# full epoch, assuming batches are fetched one after another.
tstart = time.time()
xb, yb = testgen[42]  # subscript form; equivalent to testgen.__getitem__(42)
measure = time.time() - tstart
# The stray ")" that followed "sec" in the first line of the report was removed.
print(f"""
Time taken to retrieve a batch = {measure} sec
Number of batches in train = {testgen.nbatches}
Epoch time base estimate : {testgen.nbatches * measure} sec
""")


Time taken to retrieve a batch = 3.7909910678863525 sec)
Number of batches in train = 12898
Epoch time base estimate : 48896.202793598175 sec



In [6]:
# Unbind the throwaway generator; the training cells build their own.
del testgen


This time can be divided by 6 by using 8 workers, as we will see later. Considering the size of the data,
data retrieval is most likely the bottleneck in our model training. Thus, this figure can be taken as a rough
estimate of the time per epoch when training our model with GPU acceleration.


## Models

### Deep LSTM using VGG16 as feature extractor

Model from : https://riptutorial.com/keras/example/29812/vgg-16-cnn-and-lstm-for-video-classification. We will start with VGG16 and see if using a pre-trained model on MomentsInTime is necessary (requires conversion from pytorch to keras). 


In [7]:
def lstm_model(image_shape, sequence_length, classes):
    """Build and compile a VGG16 + stacked-LSTM video classifier.

    Every frame of the input sequence is encoded by a frozen,
    ImageNet-pretrained VGG16 (convolutional part only, followed by global
    average pooling). The per-frame feature vectors are then passed through
    two stacked LSTM layers and a dense head ending in a softmax.

    Args:
        image_shape: ``(height, width, channels)`` of a single frame.
        sequence_length: number of frames in one input sequence.
        classes: number of output classes (units of the softmax layer).

    Returns:
        The compiled Keras ``Model``; one sample has shape
        ``(sequence_length, height, width, channels)``.
    """
    (h, w, c) = image_shape
    sequence_shape = (sequence_length, h, w, c)
    video = Input(shape=sequence_shape)

    # Per-frame feature extractor: VGG16 without its classifier head.
    cnn_base = VGG16(input_shape=image_shape,
                     weights="imagenet",
                     include_top=False)
    cnn_out = GlobalAveragePooling2D()(cnn_base.output)
    cnn = Model(cnn_base.input, cnn_out)
    # Freeze the extractor so that only the LSTM/dense layers are trained.
    cnn.trainable = False

    # Apply the same CNN to every time step of the sequence.
    encoded_frames = TimeDistributed(cnn)(video)
    # First LSTM returns the full sequence so the second one can consume it.
    encoded_sequences = LSTM(64, return_sequences=True)(encoded_frames)
    encoded_sequence = LSTM(64)(encoded_sequences)
    hidden_layer = Dense(256, activation="relu")(encoded_sequence)
    outputs = Dense(classes, activation="softmax")(hidden_layer)
    model = Model(video, outputs)

    optimizer = Nadam(lr=0.002,
                      beta_1=0.9,
                      beta_2=0.999,
                      epsilon=1e-08,
                      schedule_decay=0.004)
    # The sparse loss expects integer class ids as targets, so the accuracy
    # metric must be the sparse variant too: "categorical_accuracy" takes an
    # argmax over y_true, which is only meaningful for one-hot targets.
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["sparse_categorical_accuracy"])
    return model


## Training

### Parameters

In [8]:
# Temporal sampling: frames kept per unit of time and length of one sequence
# (TIMESPAN is presumably in seconds, given that it is multiplied by FPS).
FPS = 5
TIMESPAN = 2
SEQ_LENGTH = FPS * TIMESPAN

# Frame geometry; the composite shapes are derived from the base sizes.
ROWS, COLS = 160, 160
CHANNELS = 3  # rgb
IMAGE_TARGET_SIZE = (ROWS, COLS)
IMAGE_SHAPE = IMAGE_TARGET_SIZE + (CHANNELS,)
SEQUENCE_SHAPE = (SEQ_LENGTH,) + IMAGE_SHAPE  # hack for 5 fps

# Task and run settings.
CLASSES = 2  # interact / not interact
BATCH_SIZE = 3
MODEL_NAME = 'vgg16deeplstm'


To avoid GPU out-of-memory (OOM) errors :

In [9]:
# GPU setup: turn on memory growth for every GPU to avoid out-of-memory errors.
tf_config = tf.compat.v2.config.experimental
gpus = tf_config.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be the same on all GPUs.
        for device in gpus:
            tf_config.set_memory_growth(device, True)
        logical_gpus = tf_config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Raised when memory growth is set after the GPUs were initialised.
        print(err)

1 Physical GPUs, 1 Logical GPUs


In [10]:
# Keyword arguments common to the training and validation generators.
generator_options = dict(
        batch_size=BATCH_SIZE,
        sequence_time_span=TIMESPAN,
        target_img_shape=IMAGE_TARGET_SIZE,
        target_fps=FPS,
        shuffle=True,
)

# Build both generators, taking a timestamp around each construction.
start_train_gen = time.time()
train_gen = AVAGenerator(dir_path=train_dir, **generator_options)
middle_time = time.time()
val_gen = AVAGenerator(dir_path=val_dir, **generator_options)
end_time = time.time()

# Report construction times and the number of batches per epoch.
print(f"""
Dataset init :
    - train : {middle_time - start_train_gen} sc
    - val : {end_time - middle_time} sc
Steps per epoch for :
    - training : {len(train_gen)}
    - validation : {len(val_gen)}
""")



Dataset init :
    - train : 53.30703067779541 sc
    - val : 18.243991374969482 sc
Steps per epoch for :
    - training : 34376
    - validation : 9535



In [11]:
# Use the CLASSES constant from the parameters cell instead of a hard-coded
# literal, so that changing the number of classes is picked up here.
m = lstm_model(IMAGE_SHAPE, SEQ_LENGTH, CLASSES)
m.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 10, 160, 160, 3)] 0         
_________________________________________________________________
time_distributed (TimeDistri (None, 10, 512)           14714688  
_________________________________________________________________
lstm (LSTM)                  (None, 10, 64)            147712    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 256)               16640     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 514       
Total params: 14,912,578
Trainable params: 197,890
Non-trainable params: 14,714,688
_________________________________________

In [12]:

# Output locations. They are created up front so that writing checkpoints and
# logs does not depend on the directories already existing.
checkpoint_dir = os.path.join('data', 'checkpoints')
log_dir = os.path.join('data', 'logs')
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

# Helper: Save the model.
# The doubled braces leave literal {epoch:} / {val_loss:} placeholders in the
# path, which ModelCheckpoint fills in when it saves.
checkpointer = ModelCheckpoint(
        filepath=os.path.join(checkpoint_dir,
                              f'{MODEL_NAME}-.{{epoch:}}-{{val_loss:}}.hdf5'),
        verbose=1,
        save_best_only=True)

# Helper: TensorBoard
tb = TensorBoard(log_dir=os.path.join(log_dir, MODEL_NAME))

# Helper: Stop when we stop learning.
early_stopper = EarlyStopping(patience=5)

# Helper: Save results.
timestamp = time.time()
csv_logger = CSVLogger(
        os.path.join(log_dir, f'{MODEL_NAME}-training-{timestamp}.log'))

# One data-loading worker per CPU core, with a queue twice that size.
w = mp.cpu_count()
m.fit_generator(
        generator=train_gen,
        epochs=1,
        verbose = 2,
        validation_data=val_gen,
        use_multiprocessing=True,
        max_queue_size=2*w,
        workers=w,
        shuffle = False, # custom shuffle already
        # csv_logger was created above but previously never registered, so
        # the CSV training log was not written.
        callbacks=[checkpointer, tb, early_stopper, csv_logger]
)

ResourceExhaustedError: OOM when allocating tensor with shape[30,64,160,160] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Conv2DBackpropInput]