In [1]:
import os
# Select the GPU before TensorFlow is imported below: enumerate devices in
# PCI bus order and expose only device "0" (usually either "0" or "1").
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

import tensorflow as tf

# Need to specify the memory_limit for my local machine for some reason, but didn't seem to be required
# on AWS machine
gpus = tf.config.experimental.list_physical_devices('GPU')

print(gpus)
tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=23400)])

# Check tensorflow version
print("Using Tensorflow %s\n" % (tf.__version__))

# Check to see if graphics card is doing OK memory-wise
!nvidia-smi

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Using Tensorflow 2.0.0

Thu Mar  5 09:16:00 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN RTX           Off  | 00000000:08:00.0 Off |                  N/A |
| 41%   36C    P8     5W / 280W |     12MiB / 24220MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 207...  Off  | 00000000:43:00.0  On |                  N/A |
| 41%   26C    P8     1W / 215W |    625MiB /  7974MiB |     16%      Default |
+-------------------------------+----------------------+----------

In [2]:
from tensorflow.keras.models import load_model
from collections import deque
import numpy as np
import cv2
from tensorflow.keras.models import Model


from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input

# Restore the trained classifier from its saved .h5 file.
MODEL_PATH = 'models/class_only_30_epochs.h5'
model = load_model(MODEL_PATH)

# Print the layer stack and parameter counts of the restored network.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inception_resnet_v2 (Model)  (None, 17, 17, 1536)      54336736  
_________________________________________________________________
ClassConv (Conv2D)           (None, 17, 17, 1024)      14156800  
_________________________________________________________________
GAP (GlobalAveragePooling2D) (None, 1024)              0         
_________________________________________________________________
class (Dense)                (None, 40)                41000     
Total params: 68,534,536
Trainable params: 14,197,800
Non-trainable params: 54,336,736
_________________________________________________________________


In [3]:
# Read the model's input shape as a plain list; entry 1 is used as the image
# side length in pixels.
input_dims = model.input.shape.as_list()
pixels = input_dims[1]
pixels

600

In [4]:
import json

# Load the class index -> activity table stored as JSON.
with open("classes.json") as fp:
    raw_mapping = json.load(fp)

# JSON object keys are strings, so turn them back into integer class indices.
mapping = {int(index): label for index, label in raw_mapping.items()}

# Preview the first 10 entries of the mapping.
for index, label in list(mapping.items())[:10]:
    print(f"{index} : {label}")


0 : applauding
1 : blowing_bubbles
2 : brushing_teeth
3 : cleaning_the_floor
4 : climbing
5 : cooking
6 : cutting_trees
7 : cutting_vegetables
8 : drinking
9 : feeding_a_horse


In [5]:
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input

def predict(image_data, model):
    """Run the model on a batch of frames and return one output row per frame.

    Args:
        image_data: batch of images handed to ``preprocess_input``
            (presumably shaped (frames, height, width, 3) -- confirm with caller).
        model: object exposing ``predict``; it is called once on the whole batch.

    Returns:
        A list holding each row of the model output, in input order.
    """
    # Apply the InceptionResNetV2 input preprocessing to the whole batch.
    scaled_batch = preprocess_input(image_data)

    # Single forward pass for the batch.
    scores = model.predict(scaled_batch)

    # Split along the first axis so callers can extend a flat per-frame list.
    return [scores[row] for row in range(scores.shape[0])]
    

In [6]:


def predict_video(filename, k = 200):
    """Classify a video by averaging the model's per-frame class scores.

    Frames are read with OpenCV, converted from BGR to RGB, resized to
    ``pixels`` x ``pixels`` and passed to ``predict`` in batches of at most
    ``k`` frames. Relies on the notebook globals ``model``, ``pixels`` and
    ``mapping``.

    Memory use was observed to grow when this is called for many videos; the
    evaluation loop compensates by running ``gc.collect()`` periodically.

    Args:
        filename: path of the video file handed to ``cv2.VideoCapture``.
        k: maximum number of frames per prediction batch (must be >= 1).

    Returns:
        ``(activity, top_5)`` where ``activity`` is the label with the highest
        mean score and ``top_5`` lists the five highest-scoring labels in
        ascending score order (the best label comes last).

    Raises:
        ValueError: if ``k`` is smaller than 1, or if no frame could be read
            from ``filename``.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Frames are cast to float32 before being stored, so a float32 buffer holds
    # them unchanged while using half the host memory of numpy's default float64.
    image_data = np.zeros((k, pixels, pixels, 3), dtype="float32")
    all_predictions = []
    count = 0

    vs = cv2.VideoCapture(filename)
    try:
        while True:
            (grabbed, frame) = vs.read()
            if not grabbed:
                break

            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image_data[count] = cv2.resize(frame, (pixels, pixels)).astype("float32")
            count += 1

            # Buffer full: classify this batch and start refilling from slot 0.
            if count == k:
                all_predictions.extend(predict(image_data, model))
                count = 0

        # Classify the trailing, partially filled batch.
        if count > 0:
            all_predictions.extend(predict(image_data[:count], model))
    finally:
        # Release the capture even if reading or prediction raised.
        vs.release()

    if not all_predictions:
        # np.mean over an empty list gives NaN (with a RuntimeWarning), so any
        # label derived from it would be meaningless.
        raise ValueError(f"no frames could be read from {filename!r}")

    averaged = np.mean(all_predictions, axis=0)
    activity = mapping[np.argmax(averaged)]

    # argsort is ascending, so the last five indices are the five best classes.
    top_5_idx = np.argsort(averaged)[-5:]
    top_5 = [mapping[idx] for idx in top_5_idx]

    return activity, top_5
        
    

In [7]:
# Try the video classifier on a single clip from the test set.
sample_video = "videos/test/riding a bike/13.mp4"
activity, top_5 = predict_video(sample_video)
activity

'riding_a_bike'

## Measure the accuracy of video classification

In [20]:
import os
import json

# Outcome buckets, each keyed by "<folder name>/<file name>".
correct = {}
in_top_5 = {}
incorrect = {}

## The category names used in Kinetics-400 are slightly different from the
## ones used in Stanford40; translate folder names to the classifier's labels.
category_mapping = {
    "cleaning_floor": "cleaning_the_floor",
    "drinking_beer": "drinking",
    "pushing_cart": "pushing_a_cart",
    "reading_book": "reading",
    "riding_or_walking_with_horse": "riding_a_horse",
    "texting": "texting_message",
    "using_computer": "using_a_computer",
}

## Simple method to categorize videos one at a time.
## Would be nice to enhance this to run in parallel to reduce runtime
count = 0
import gc
gc.collect()

for root, dirs, files in os.walk("videos/test/"):
    # The folder name (spaces replaced by underscores) is the expected category.
    orig_category = os.path.basename(root).replace(" ", "_")
    category = category_mapping.get(orig_category, orig_category)

    for file in files:
        if not file.endswith(".mp4"):
            continue

        video_key = os.path.join(orig_category, file)
        prediction, top_5 = predict_video(os.path.join(root, file))

        if prediction == category:
            correct[video_key] = prediction
            print(f"correct: {video_key} => {prediction}")
        elif category in top_5:
            in_top_5[video_key] = top_5
            print(f"in top 5: {video_key} => {top_5}")
        else:
            incorrect[video_key] = prediction
            print(f"wrong: {video_key} => {prediction}")

        count += 1

        # Need to run the garbage collector manually; memory seems to fill up
        # very quickly otherwise.
        if count % 20 == 0:
            gc.collect()
            
            
                
            


wrong: drinking/17.mp4 => writing_on_a_board
in top 5: drinking/32.mp4 => ['washing_dishes', 'blowing_bubbles', 'waving_hands', 'drinking', 'applauding']
wrong: drinking/30.mp4 => watching_TV
correct: drinking/38.mp4 => drinking
in top 5: drinking/57.mp4 => ['watching_TV', 'drinking', 'pouring_liquid', 'brushing_teeth', 'writing_on_a_board']
in top 5: drinking/46.mp4 => ['drinking', 'blowing_bubbles', 'brushing_teeth', 'reading', 'writing_on_a_book']
in top 5: drinking/63.mp4 => ['waving_hands', 'pushing_a_cart', 'blowing_bubbles', 'drinking', 'brushing_teeth']
wrong: drinking/8.mp4 => taking_photos
in top 5: drinking/5.mp4 => ['writing_on_a_board', 'playing_guitar', 'running', 'drinking', 'applauding']
in top 5: drinking/69.mp4 => ['pouring_liquid', 'drinking', 'waving_hands', 'writing_on_a_book', 'brushing_teeth']
correct: drinking/77.mp4 => drinking
in top 5: drinking/54.mp4 => ['texting_message', 'brushing_teeth', 'drinking', 'pouring_liquid', 'applauding']
correct: drinking/2.mp4 

correct: texting/34.mp4 => texting_message
correct: texting/28.mp4 => texting_message
in top 5: texting/56.mp4 => ['applauding', 'brushing_teeth', 'waving_hands', 'texting_message', 'smoking']
wrong: texting/90.mp4 => writing_on_a_board
in top 5: texting/19.mp4 => ['looking_through_a_telescope', 'taking_photos', 'phoning', 'texting_message', 'writing_on_a_board']
correct: texting/75.mp4 => texting_message
correct: texting/21.mp4 => texting_message
wrong: texting/10.mp4 => using_a_computer
correct: texting/52.mp4 => texting_message
correct: texting/74.mp4 => texting_message
in top 5: texting/33.mp4 => ['reading', 'smoking', 'waving_hands', 'texting_message', 'watching_TV']
correct: texting/13.mp4 => texting_message
correct: texting/91.mp4 => texting_message
in top 5: texting/6.mp4 => ['smoking', 'watching_TV', 'taking_photos', 'texting_message', 'phoning']
wrong: texting/70.mp4 => waving_hands
wrong: texting/72.mp4 => writing_on_a_board
correct: texting/49.mp4 => texting_message
in top 

correct: washing_dishes/78.mp4 => washing_dishes
in top 5: washing_dishes/87.mp4 => ['washing_dishes', 'reading', 'cutting_vegetables', 'writing_on_a_book', 'looking_through_a_microscope']
in top 5: washing_dishes/3.mp4 => ['drinking', 'pouring_liquid', 'cooking', 'washing_dishes', 'cutting_vegetables']
in top 5: washing_dishes/14.mp4 => ['pouring_liquid', 'writing_on_a_book', 'washing_dishes', 'cutting_vegetables', 'cooking']
in top 5: washing_dishes/76.mp4 => ['pouring_liquid', 'looking_through_a_microscope', 'cutting_vegetables', 'washing_dishes', 'cooking']
correct: washing_dishes/41.mp4 => washing_dishes
correct: washing_dishes/43.mp4 => washing_dishes
wrong: washing_dishes/50.mp4 => looking_through_a_telescope
correct: washing_dishes/60.mp4 => washing_dishes
correct: washing_dishes/89.mp4 => washing_dishes
correct: washing_dishes/15.mp4 => washing_dishes
correct: washing_dishes/64.mp4 => washing_dishes
correct: washing_dishes/48.mp4 => washing_dishes
correct: washing_dishes/61.mp

wrong: walking_the_dog/31.mp4 => applauding
wrong: walking_the_dog/9.mp4 => gardening
correct: walking_the_dog/26.mp4 => walking_the_dog
wrong: walking_the_dog/84.mp4 => running
in top 5: walking_the_dog/51.mp4 => ['fishing', 'jumping', 'pushing_a_cart', 'walking_the_dog', 'running']
in top 5: walking_the_dog/28.mp4 => ['riding_a_bike', 'using_a_computer', 'taking_photos', 'walking_the_dog', 'running']
correct: walking_the_dog/56.mp4 => walking_the_dog
wrong: walking_the_dog/90.mp4 => cleaning_the_floor
correct: walking_the_dog/19.mp4 => walking_the_dog
wrong: walking_the_dog/75.mp4 => taking_photos
in top 5: walking_the_dog/27.mp4 => ['running', 'walking_the_dog', 'taking_photos', 'gardening', 'writing_on_a_board']
correct: walking_the_dog/10.mp4 => walking_the_dog
correct: walking_the_dog/52.mp4 => walking_the_dog
correct: walking_the_dog/74.mp4 => walking_the_dog
wrong: walking_the_dog/33.mp4 => writing_on_a_board
in top 5: walking_the_dog/13.mp4 => ['looking_through_a_telescope', '

in top 5: reading_book/72.mp4 => ['phoning', 'reading', 'applauding', 'writing_on_a_book', 'waving_hands']
correct: reading_book/49.mp4 => reading
correct: reading_book/62.mp4 => reading
wrong: reading_book/40.mp4 => looking_through_a_microscope
in top 5: reading_book/37.mp4 => ['texting_message', 'using_a_computer', 'watching_TV', 'reading', 'writing_on_a_book']
correct: reading_book/47.mp4 => reading
in top 5: reading_book/86.mp4 => ['using_a_computer', 'watching_TV', 'washing_dishes', 'reading', 'writing_on_a_book']
correct: reading_book/4.mp4 => reading
in top 5: reading_book/66.mp4 => ['texting_message', 'applauding', 'using_a_computer', 'reading', 'writing_on_a_book']
in top 5: reading_book/71.mp4 => ['climbing', 'reading', 'washing_dishes', 'jumping', 'brushing_teeth']
correct: reading_book/36.mp4 => reading
wrong: riding_or_walking_with_horse/17.mp4 => shooting_an_arrow
correct: riding_or_walking_with_horse/32.mp4 => riding_a_horse
in top 5: riding_or_walking_with_horse/30.mp4 

in top 5: using_computer/80.mp4 => ['writing_on_a_book', 'looking_through_a_microscope', 'writing_on_a_board', 'using_a_computer', 'watching_TV']
correct: using_computer/55.mp4 => using_a_computer
correct: using_computer/85.mp4 => using_a_computer
in top 5: using_computer/73.mp4 => ['looking_through_a_microscope', 'reading', 'using_a_computer', 'writing_on_a_board', 'writing_on_a_book']
correct: using_computer/78.mp4 => using_a_computer
correct: using_computer/87.mp4 => using_a_computer
correct: using_computer/3.mp4 => using_a_computer
in top 5: using_computer/14.mp4 => ['writing_on_a_book', 'reading', 'writing_on_a_board', 'using_a_computer', 'watching_TV']
correct: using_computer/76.mp4 => using_a_computer
correct: using_computer/41.mp4 => using_a_computer
correct: using_computer/43.mp4 => using_a_computer
correct: using_computer/50.mp4 => using_a_computer
correct: using_computer/60.mp4 => using_a_computer
correct: using_computer/89.mp4 => using_a_computer
correct: using_computer/15.

correct: riding_a_bike/13.mp4 => riding_a_bike
in top 5: riding_a_bike/91.mp4 => ['riding_a_bike', 'looking_through_a_telescope', 'walking_the_dog', 'throwing_frisby', 'running']
correct: riding_a_bike/6.mp4 => riding_a_bike
correct: riding_a_bike/67.mp4 => riding_a_bike
correct: riding_a_bike/72.mp4 => riding_a_bike
correct: riding_a_bike/95.mp4 => riding_a_bike
correct: riding_a_bike/49.mp4 => riding_a_bike
in top 5: riding_a_bike/62.mp4 => ['riding_a_bike', 'running', 'waving_hands', 'reading', 'throwing_frisby']
correct: riding_a_bike/40.mp4 => riding_a_bike
correct: riding_a_bike/93.mp4 => riding_a_bike
correct: riding_a_bike/94.mp4 => riding_a_bike
correct: riding_a_bike/86.mp4 => riding_a_bike
correct: riding_a_bike/4.mp4 => riding_a_bike
correct: riding_a_bike/66.mp4 => riding_a_bike
correct: riding_a_bike/71.mp4 => riding_a_bike
in top 5: riding_a_bike/36.mp4 => ['blowing_bubbles', 'gardening', 'cutting_trees', 'riding_a_bike', 'fishing']
in top 5: brushing_teeth/17.mp4 => ['b

in top 5: drinking_beer/41.mp4 => ['texting_message', 'looking_through_a_telescope', 'drinking', 'taking_photos', 'writing_on_a_board']
correct: drinking_beer/43.mp4 => drinking
in top 5: drinking_beer/50.mp4 => ['playing_violin', 'applauding', 'pouring_liquid', 'drinking', 'writing_on_a_board']
wrong: drinking_beer/60.mp4 => looking_through_a_microscope
wrong: drinking_beer/89.mp4 => applauding
in top 5: drinking_beer/15.mp4 => ['writing_on_a_board', 'taking_photos', 'drinking', 'applauding', 'pouring_liquid']
correct: drinking_beer/64.mp4 => drinking
wrong: drinking_beer/48.mp4 => pouring_liquid
wrong: drinking_beer/61.mp4 => cleaning_the_floor
wrong: drinking_beer/11.mp4 => feeding_a_horse
in top 5: drinking_beer/83.mp4 => ['writing_on_a_board', 'taking_photos', 'waving_hands', 'drinking', 'applauding']
correct: drinking_beer/20.mp4 => drinking
wrong: drinking_beer/59.mp4 => writing_on_a_board
wrong: drinking_beer/7.mp4 => watching_TV
in top 5: drinking_beer/44.mp4 => ['pouring_liqu

correct: cleaning_floor/37.mp4 => cleaning_the_floor
in top 5: cleaning_floor/47.mp4 => ['running', 'waving_hands', 'fishing', 'cleaning_the_floor', 'pushing_a_cart']
correct: cleaning_floor/86.mp4 => cleaning_the_floor
correct: cleaning_floor/4.mp4 => cleaning_the_floor
correct: cleaning_floor/36.mp4 => cleaning_the_floor
wrong: smoking/32.mp4 => writing_on_a_board
in top 5: smoking/30.mp4 => ['drinking', 'texting_message', 'applauding', 'smoking', 'writing_on_a_board']
in top 5: smoking/29.mp4 => ['smoking', 'applauding', 'taking_photos', 'drinking', 'brushing_teeth']
correct: smoking/38.mp4 => smoking
correct: smoking/57.mp4 => smoking
in top 5: smoking/46.mp4 => ['texting_message', 'brushing_teeth', 'writing_on_a_board', 'smoking', 'taking_photos']
in top 5: smoking/63.mp4 => ['brushing_teeth', 'smoking', 'waving_hands', 'applauding', 'watching_TV']
in top 5: smoking/69.mp4 => ['taking_photos', 'smoking', 'writing_on_a_book', 'waving_hands', 'texting_message']
wrong: smoking/54.mp4

correct: pushing_cart/70.mp4 => pushing_a_cart
correct: pushing_cart/67.mp4 => pushing_a_cart
in top 5: pushing_cart/72.mp4 => ['pouring_liquid', 'writing_on_a_board', 'watching_TV', 'pushing_a_cart', 'cleaning_the_floor']
wrong: pushing_cart/49.mp4 => writing_on_a_board
correct: pushing_cart/62.mp4 => pushing_a_cart
wrong: pushing_cart/40.mp4 => phoning
correct: pushing_cart/37.mp4 => pushing_a_cart
correct: pushing_cart/47.mp4 => pushing_a_cart
correct: pushing_cart/86.mp4 => pushing_a_cart
correct: pushing_cart/4.mp4 => pushing_a_cart
correct: pushing_cart/66.mp4 => pushing_a_cart
in top 5: pushing_cart/71.mp4 => ['washing_dishes', 'riding_a_bike', 'pushing_a_cart', 'brushing_teeth', 'cleaning_the_floor']
in top 5: pushing_cart/36.mp4 => ['writing_on_a_board', 'cleaning_the_floor', 'climbing', 'pushing_a_cart', 'waving_hands']
correct: playing_guitar/32.mp4 => playing_guitar
correct: playing_guitar/30.mp4 => playing_guitar
correct: playing_guitar/29.mp4 => playing_guitar
correct: pl

In [21]:
# Number of videos in each bucket, in the order: correct, in top 5, incorrect.
for bucket in (correct, in_top_5, incorrect):
    print(len(bucket))


702
295
181


In [22]:
# Bundle the three outcome buckets so they can be handled as one object.
results = dict(correct=correct, in_top_5=in_top_5, incorrect=incorrect)

In [23]:
# Display the full results dict as the cell output.
results

{'correct': {'drinking/38.mp4': 'drinking',
  'drinking/77.mp4': 'drinking',
  'drinking/2.mp4': 'drinking',
  'drinking/12.mp4': 'drinking',
  'drinking/39.mp4': 'drinking',
  'drinking/79.mp4': 'drinking',
  'drinking/55.mp4': 'drinking',
  'drinking/73.mp4': 'drinking',
  'drinking/76.mp4': 'drinking',
  'drinking/60.mp4': 'drinking',
  'drinking/89.mp4': 'drinking',
  'drinking/15.mp4': 'drinking',
  'drinking/11.mp4': 'drinking',
  'drinking/59.mp4': 'drinking',
  'drinking/45.mp4': 'drinking',
  'drinking/31.mp4': 'drinking',
  'drinking/9.mp4': 'drinking',
  'drinking/84.mp4': 'drinking',
  'drinking/51.mp4': 'drinking',
  'drinking/28.mp4': 'drinking',
  'drinking/75.mp4': 'drinking',
  'drinking/74.mp4': 'drinking',
  'drinking/13.mp4': 'drinking',
  'drinking/6.mp4': 'drinking',
  'drinking/62.mp4': 'drinking',
  'drinking/86.mp4': 'drinking',
  'texting/17.mp4': 'texting_message',
  'texting/46.mp4': 'texting_message',
  'texting/8.mp4': 'texting_message',
  'texting/77.mp4'

In [24]:
# Calculate percentages from the collected results rather than hand-copied
# counts, so the figures always match the evaluation run.
n_correct = len(results["correct"])
n_in_top_5 = len(results["in_top_5"])
n_incorrect = len(results["incorrect"])

total = n_correct + n_in_top_5 + n_incorrect
if total == 0:
    raise ValueError("no classified videos in results; run the evaluation cell first")

# NOTE: `correct` and `in_top_5` are rebound from per-video dicts to percentages
# because the reporting cell reads these names. `results` still holds the
# original dicts, which keeps this cell safe to re-run.
correct = n_correct / total * 100
in_top_5 = (n_correct + n_in_top_5) / total * 100

In [41]:
# Report top-1 and top-5 accuracy; both values are percentages.
print("Classification Accuracy")
print("-----------------------")
print(f"Accuracy is:   {correct:.2f} %")
print(f"Top 5 acc is:  {in_top_5:.2f} %")


Classification Accuracy
-----------------------
Accuracy is:   59.59 %
Top 5 acc is:  84.63 %


In [26]:
# Keep the per-video results on disk in case they need analysing later.
RESULTS_PATH = "results.json"
with open(RESULTS_PATH, "w") as results_file:
    json.dump(results, results_file)