### Colab notebook for activity recognition in videos using pretrained model (model from https://github.com/kenshohara/) for recognizing 400 different activities

In [23]:
# downloading project zip file containing required files - model file, class names text file, sample video for activity recognition 
from google_drive_downloader import GoogleDriveDownloader as gdd 
file_id = '1-2fFtZbQLsF3sSwpAZC_HOHyrCrF6GWz'
gdd.download_file_from_google_drive(file_id=file_id, dest_path = '/content/files_activity_recognition.zip', unzip=True)

# unzipping file 
!unzip files_activity_recognition.zip 

Downloading 1-2fFtZbQLsF3sSwpAZC_HOHyrCrF6GWz into /content/files_activity_recognition.zip... Done.
Unzipping...Done.
Archive:  files_activity_recognition.zip
replace files_activity_recognition/resnet-34_kinetics.onnx? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: files_activity_recognition/resnet-34_kinetics.onnx  
replace files_activity_recognition/class_names_list.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: files_activity_recognition/class_names_list.txt  
  inflating: files_activity_recognition/video_making_pizza_resized.mp4  


In [24]:
# importing required libraries 
import numpy as np 
import cv2 
from time import time 

In [25]:
# defining various input parameters 
filepath_class_names = '/content/files_activity_recognition/class_names_list.txt'                # original source - https://github.com/kenshohara/video-classification-3d-cnn-pytorch/blob/master/class_names_list
filepath_model       = '/content/files_activity_recognition/resnet-34_kinetics.onnx'             # original source - https://github.com/kenshohara/
filepath_in_video    = '/content/files_activity_recognition/video_making_pizza_resized.mp4'      # original source (before resizing)-  https://www.pexels.com/video/a-person-preparing-a-pepperoni-pizza-3196001/
filepath_out_video   = '/content/output.mp4'


In [26]:
# loading class names 
with open(filepath_class_names,'r') as fh :
  class_names = fh.read().strip().split('\n')
print("Printing all class names (total {} classes)".format(len(class_names)))
print(class_names)

Printing all class names (total 400 classes)
['abseiling', 'air drumming', 'answering questions', 'applauding', 'applying cream', 'archery', 'arm wrestling', 'arranging flowers', 'assembling computer', 'auctioning', 'baby waking up', 'baking cookies', 'balloon blowing', 'bandaging', 'barbequing', 'bartending', 'beatboxing', 'bee keeping', 'belly dancing', 'bench pressing', 'bending back', 'bending metal', 'biking through snow', 'blasting sand', 'blowing glass', 'blowing leaves', 'blowing nose', 'blowing out candles', 'bobsledding', 'bookbinding', 'bouncing on trampoline', 'bowling', 'braiding hair', 'breading or breadcrumbing', 'breakdancing', 'brush painting', 'brushing hair', 'brushing teeth', 'building cabinet', 'building shed', 'bungee jumping', 'busking', 'canoeing or kayaking', 'capoeira', 'carrying baby', 'cartwheeling', 'carving pumpkin', 'catching fish', 'catching or throwing baseball', 'catching or throwing frisbee', 'catching or throwing softball', 'celebrating', 'changing o

In [27]:
# loading the model 
model = cv2.dnn.readNet(filepath_model)

In [29]:
# defining function for preprocessing the frames before passing to the neural network model 
def preprocess(frames) :
  model_img_w = 112    # as per model input image width 
  model_img_h = 112    # as per model input image height 
  mean = (114.7748, 107.7354, 99.4750) # as per pre-trained model's mean values for normalization 
  blob = cv2.dnn.blobFromImages( frames, scalefactor=1 , size=(model_img_w, model_img_h), mean=mean, swapRB=True, crop=True) # blob.shape is N x 3 x H x W ( samples, channels(RGB), width , height)
  blob = np.transpose(blob, (1,0,2,3)) # resulting shape is 3 x N x H x W
  blob = np.expand_dims(blob, axis=0) # resulting shape is 1 x 3 x N x H x W # 1 is for the batch dimension, required for input to the model 
  return blob 


In [30]:
# defining helper function to retrieve (image) frames from the video 
def get_video_frames(vcap, num_frames) :
  frames = []
  for i in range(num_frames) :
    grabbed, frame = vcap.read()
    if grabbed == False :
      print("No more frames to read")
      break
    frames.append(frame)
  
  if len(frames) < num_frames :
    return -1
  else :
    return frames 

# defining helper function to write the predicted class and corresponding confidence probability on output frame 
def write_pred_on_frame(frame, pred_class_name) :
  text = pred_class_name
  cv2.rectangle(frame, (0,0), (150,30), (0,0,0), -1)
  cv2.putText(frame, text, (10,20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,255), 1)
  return frame 


In [31]:
# setting up input/read and output/write video streams 
vcap = cv2.VideoCapture(filepath_in_video)
if not vcap.isOpened() :
  print("Error opening video")
else :
  width  = int(vcap.get(3))
  height = int(vcap.get(4))
  fps    = int(vcap.get(5))
  num_frames = int(vcap.get(7))
  duration = num_frames/fps
  print("frame width:{}, height:{}, fps:{}".format(width, height, fps))
  print("Video duration {} seconds".format(duration))

fourcc = cv2.VideoWriter_fourcc(*'XVID') # XVID codec for writing MP4 files
vout = cv2.VideoWriter(filepath_out_video, fourcc, fps, (width, height), 1) # 1 for coloured video  

# performing inference and writing to output video stream 
num_frames = 16 # number of frames passed to model for making single inference . this specification is as per the model used and should not be changed 
start = time()
while True :
  frames = get_video_frames(vcap, num_frames)
  if frames == -1 :
    break 
  
  frames_processed = preprocess(frames)
  model.setInput(frames_processed)
  pred = model.forward() # resulting pred.shape will be (1 , 400) 
  pred_class_idx  = np.argmax(pred)
  pred_class_name = class_names[pred_class_idx]

  for frame in frames :
    output_frame = write_pred_on_frame(frame, pred_class_name)
    vout.write(output_frame)
end = time()
print("Finished processing. Took {} seconds".format(end-start))

vcap.release()
vout.release()

frame width:400, height:224, fps:25
Video duration 18.0 seconds
No more frames to read
Finished processing. Took 18.383758306503296 seconds


In [33]:
#@title Display video inline
from IPython.display import HTML
from base64 import b64encode
import os 

save_path = "/content/output.mp4" # Input video path
compressed_path = "/content/result_compressed.mp4" # Compressed video path

os.system(f"ffmpeg -i {save_path} -vcodec libx264 {compressed_path}")

mp4 = open(compressed_path, "rb").read()
dataURL = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % dataURL)

# source - https://stackoverflow.com/questions/57377185/how-play-mp4-video-in-google-colab
