# AIVE - VIDEO DETECTION CHALLENGE
"""
Home Exercise
Given the Dior - Eau de Parfum commercial, use any existing algorithm to generate a video that shows and tracks the presence of humans within the video by drawing a box around each of them in every frame. Different humans must be drawn with different colors.

Expected Results: 
A video that has the dior commercial, with boxes around each person, and runnable code (in the form of a repo with a good packaging) that can reproduce that video.
"""
# Needed packages/versions
Python 3.7.6 or later (I use Python 3.7.13)

pytube==12.0.0
moviepy==1.0.3
tensorflow==2.4.0
tensorflow-gpu==2.4.0
keras==2.4.3
numpy==1.19.3
pillow==7.0.0
scipy==1.4.1
h5py==2.10.0
matplotlib==3.3.2
opencv-python
keras-resnet==0.2.0
imageai==2.1.6

# Importing required libraries

In [6]:
# Importing required libraries
import os
import pytube
import cv2 as cv
from imageai.Detection import ObjectDetection
from moviepy.editor import VideoFileClip, AudioFileClip
import time
import numpy as np
from scipy.spatial import distance

# Initialize Parameters and Detection models

In [7]:
# Youtube url of the video to process, i.e. the Dior marketing commercial
VIDEO_URL = "https://www.youtube.com/watch?v=h4s0llOpKrU"

# Directory to save the downloaded video
VIDEO_IN_PATH = 'video_in'

# Directory to save the processed output videos (with and without audio)
VIDEO_OUT_PATH = 'video_out'

# Directory to save the audio file
AUDIO_PATH = 'audio'

# Audio file name (without extension)
AUDIO_NAME = 'dior_audio'

# Audio file extension
AUDIO_EXT = '.mp3'

# Detection model path, relative to the current working directory
MODEL_PATH = 'models/resnet50_coco_best_v2.1.0.h5'  # RetinaNet
# MODEL_PATH = 'models/yolo.h5' # Yolo Net
# MODEL_PATH = 'models/yolo-tiny.h5' # Yolo Tiny Net

# Output video name WITHOUT audio (no extension)
OUT_VIDEO_NAME = 'dior_box_NO_music_center'

# Output video name WITH audio (no extension)
OUT_VIDEO_MUSIC_NAME = 'dior_box_music_center'

# Output video extension '.mp4' or '.avi'
OUT_VIDEO_EXT = '.mp4'

# Color list: one 3-channel tuple per bounding box, indexed by box position.
# NOTE(review): these tuples are handed unchanged to cv.rectangle on frames read
# with cv.VideoCapture, so they are presumably interpreted as BGR rather than
# RGB - confirm.
COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 0, 255), (0, 255, 255), (255, 50, 0), (50, 255, 0), (50, 0, 255)]

# ----- Other method to generate Color list at scale -----
# (disabled; it would additionally need `import random` and
#  `from itertools import product`)
# rgb_pixel = list(range(0,255,50))
# random.shuffle(rgb_pixel)
# COLORS = list(product(rgb_pixel,rgb_pixel,rgb_pixel))
# random.shuffle(COLORS)

# Get the current working directory (MODEL_PATH is resolved against it)
execution_path = os.getcwd()

# Create the detector from the ObjectDetection class
detector = ObjectDetection()

# Set RetinaNet as the model type used for detection
detector.setModelTypeAsRetinaNet()

# Point the detector at the RetinaNet weights file and load the model
detector.setModelPath(os.path.join(execution_path, MODEL_PATH))
detector.loadModel()

# Customize Object: Only detect 'person' object
custom_objects = detector.CustomObjects(person=True)



# Build Utils functions

In [8]:
"""
This is the utils file of the 'aive' package. It includes the following functions:
- create_dir
- download_youtube_video
- extract_audio_from_video
- draw_boxes
- convert_to_center
- render_video_detection
- render_video_music
- main
"""
# Function to create a directory to save downloaded video
def create_dir(dir_name):
    """ Create a directory, including missing parent directories, if it
    does not exist yet.

    :param dir_name: path of the directory to create

    :returns: None. The directory exists on disk when the call returns.

    :raises FileExistsError: if `dir_name` exists but is not a directory
    """
    # makedirs(exist_ok=True) replaces the racy `if not exists: mkdir` check
    # and also works for nested paths such as 'out/videos'.
    os.makedirs(dir_name, exist_ok=True)


# Download youtube video from url
def download_youtube_video(url, download_dir):
    """ Download a youtube video from its url.

    :param
        url: url of the youtube video
        download_dir: path to the download directory

    :returns:
        video_path: (str) path to the downloaded video, located in `download_dir`

    :raises
        RuntimeError: if pytube reports no downloadable stream for `url`
    """
    # Load Youtube url into pytube
    youtube = pytube.YouTube(url)

    # Select the stream returned by pytube's get_highest_resolution()
    video = youtube.streams.get_highest_resolution()
    if video is None:
        raise RuntimeError('No downloadable stream found for: {}'.format(url))

    # Start downloading the youtube video. Use the file path reported by pytube
    # instead of os.listdir(download_dir)[0]: listdir order is arbitrary, so any
    # other file already present in the directory could be picked by mistake.
    downloaded_file = video.download(download_dir)

    # Re-anchor the file name on `download_dir` so the returned path keeps the
    # same (possibly relative) form the caller passed in
    video_path = os.path.join(download_dir, os.path.basename(downloaded_file))

    # Print downloaded video path
    print('Video file is saved at: {}'.format(video_path))

    return video_path


# Extract audio from video
def extract_audio_from_video(video_file, audio_path):
    """ Extract audio from video using MoviePy library
    that uses `ffmpeg` under the hood

    :param
        video_file: path to the video file
        audio_path: path (including extension) of the audio file to write

    :returns: None. The audio track is written to `audio_path`.

    :raises
        ValueError: if the video has no audio track
    """
    # Open the video clip
    clip = VideoFileClip(video_file)

    try:
        # A clip without sound exposes `audio` as None; fail with a clear
        # message instead of an AttributeError on None
        if clip.audio is None:
            raise ValueError('No audio track found in: {}'.format(video_file))

        # Save audio file
        clip.audio.write_audiofile(audio_path)
    finally:
        # Always release the reader held by the clip, even when writing fails
        clip.close()

    print('Audio file is saved at: {}'.format(audio_path))


# Function to draw all detected boxes over the video frame
def draw_boxes(image, box_points, colors):
    """ Draw a rectangle for each detected object, in place, on the frame.
    The box at position i is drawn with colors[i]. When there are more boxes
    than colors the palette is reused cyclically, so every detection is still
    drawn instead of being silently skipped.

    :param
        image: video frame under ndarray

        box_points: A list of box coordinate tuples (xmin, ymin, xmax, ymax) for each detection.
                    Example: [(20,30,50,60),(15,35,40,65)]

        colors: A list of 3-channel color tuples, passed unchanged to cv.rectangle.
                If it is empty, nothing is drawn.

    :returns image under ndarray (the same object that was passed in)
    """
    num_colors = len(colors)

    # Without any color there is nothing sensible to draw
    if num_colors == 0:
        return image

    for i, box in enumerate(box_points):
        # Get coordinates of each box; cast to plain int so numpy integer or
        # float coordinates are accepted by OpenCV
        xmin, ymin, xmax, ymax = (int(value) for value in box)

        # Draw box with OpenCV (line thickness 2), cycling through the palette
        cv.rectangle(image, (xmin, ymin), (xmax, ymax), colors[i % num_colors], 2)

    return image

def convert_to_center(box):
    """Return the center point of a bounding box.

    Arguments:
      box: [x1,y1,x2,y2] -> [left,top,right,bottom]

    Returns:
      (x_center, y_center): midpoint of the box on each axis, truncated to int
    """
    left, top, right, bottom = box[0], box[1], box[2], box[3]

    # Width and height of the box are not needed, only its midpoint
    return (int((left + right) / 2), int((top + bottom) / 2))


def _select_person_boxes(detections, image_center, max_people=2):
    """ Choose which boxes to draw for one frame and in which order.

    With fewer than two detections the boxes are returned as they are.
    Otherwise the `max_people` detections with the highest probability are
    kept and ordered by the distance of their center to `image_center`
    (closest first), so the box closest to the center gets the first color.
    """
    box_points = [detection['box_points'] for detection in detections]
    if len(box_points) < 2:
        return box_points

    # Keep the detections that have the highest probabilities
    perc_proba = [detection['percentage_probability'] for detection in detections]
    box_points = [box_points[i] for i in np.argsort(perc_proba)[::-1]]
    box_points = box_points[:max_people]

    # Distance from each box center to the image center
    center_boxes = [convert_to_center(b) for b in box_points]
    dist_center = [distance.euclidean(cb, image_center) for cb in center_boxes]

    # Sort the boxes by that distance in ascending order
    return [box_points[i] for i in np.argsort(dist_center)]


def render_video_detection(video_path, output_path, detector, custom_objects, colors, ext='.mp4'):
    """ Make object detection on each frame extracted from the video.
    Render processed frames into the output video.
    Function actually supports only .mp4 and .avi as output extension. Default is .mp4;
    any extension other than '.avi' produces an '.mp4' file.

    At most two detections are drawn per frame: the two with the highest
    probability, colored by their distance to the image center (the closest
    one gets colors[0]). Colors are assigned per frame; no identity is carried
    over from one frame to the next.

    :param
        video_path: Path to the input video
        output_path: Path to the output video without extension
        detector: detection model
        custom_objects: ex. only 'person'
        colors: a list of colors, one for each bounding box
        ext: video extension
    :returns the detections of the last processed frame (an empty list if no
        frame could be read). The processed video is written to disk.
    :raises IOError: if the input video cannot be opened

    """

    # Set starting time
    start_time = time.time()

    # Capture video with OpenCV
    input_video = cv.VideoCapture(video_path)
    output_video = None

    # Defined up-front so the final `return` is valid even for a video
    # from which no frame can be read
    detections = []
    count = 0

    try:
        if not input_video.isOpened():
            raise IOError('Cannot open input video: {}'.format(video_path))

        # Get video FPS, frame width and frame height
        fps = input_video.get(cv.CAP_PROP_FPS)
        frame_width = int(input_video.get(cv.CAP_PROP_FRAME_WIDTH))
        frame_height = int(input_video.get(cv.CAP_PROP_FRAME_HEIGHT))
        image_center = (frame_width/2, frame_height/2)

        # Set parameter for video encoder, adapting to the desired video extension, i.e. '.mp4' or '.avi'
        if ext == '.avi':
            output_video_filepath = output_path + ext
            fourcc = cv.VideoWriter_fourcc('M', 'J', 'P', 'G')
        else:  # .mp4
            output_video_filepath = output_path + '.mp4'
            fourcc = cv.VideoWriter_fourcc(*'mp4v')

        output_video = cv.VideoWriter(output_video_filepath,
                                      fourcc,
                                      fps,
                                      (frame_width, frame_height)
                                      )

        # read video from the first to the last frame
        while True:
            # read each frame, return status 'success' and video frame
            success, frame = input_video.read()
            if not success:
                break

            # Model prediction: Detect only custom objects, i.e. person.
            _, detections = detector.detectCustomObjectsFromImage(custom_objects=custom_objects,
                                                                  input_image=frame,
                                                                  input_type="array",
                                                                  output_type="array",
                                                                  minimum_percentage_probability=50,
                                                                  display_percentage_probability=False,
                                                                  display_object_name=False)

            # Pick the boxes to draw and their color order for this frame
            box_points = _select_person_boxes(detections, image_center)

            # Draw the rectangles over the frame. With no box the frame is
            # returned unchanged.
            detected_frame_array = draw_boxes(frame, box_points, colors)

            # Write detected frame array into output video
            output_video.write(detected_frame_array)

            # Print # frame being processing
            count += 1
            print("Frame: {} is processed.".format(count))
            print('----------------------')
    finally:
        # Release input and output video, also when detection or writing fails
        input_video.release()
        if output_video is not None:
            output_video.release()

    # Print finish status and the processing time
    print("Your processed video WITHOUT music is saved at: {}".format(output_video_filepath))
    print("Rendering Video-detection is finished in {} minutes.".format(round((time.time() - start_time) / 60, 2)))
    return detections


def render_video_music(video_path, audio_path, combined_video_path):
    """ Attach an audio file to a video and write the combined result to disk.

    :param
        video_path: Path to the processed video with object detection
        audio_path: Path to audio file
        combined_video_path: Path to save final video with audio
    :returns the clip (video with the audio attached) that was written to
        `combined_video_path`

    """

    # Open the processed (silent) video, then the audio track
    silent_video = VideoFileClip(video_path)
    soundtrack = AudioFileClip(audio_path)

    # Build a clip that carries the audio track
    final_clip = silent_video.set_audio(soundtrack)

    # Save the final processed video with music
    final_clip.write_videofile(combined_video_path)

    # Report where the combined video was saved
    message = "Your combined video with music is saved at: {}"
    print(message.format(combined_video_path))

    return final_clip

def main(url_path):
    """
    Run the whole pipeline for one youtube video: download it, extract its audio,
    detect persons on each frame and render the processed video, then combine the
    processed video with the original audio.
    :param url_path: Youtube video url
    :return: None. The processed videos (without and with audio) are written to VIDEO_OUT_PATH
    """
    # ------------- Create Directories -------------  #
    # Download directory, extracted-audio directory, output-video directory
    for directory in (VIDEO_IN_PATH, AUDIO_PATH, VIDEO_OUT_PATH):
        create_dir(directory)

    # ------------- Build Paths -------------  #
    # Extracted audio file
    audio_file = os.path.join(AUDIO_PATH, AUDIO_NAME + AUDIO_EXT)

    # Processed video without music: once without extension (the renderer
    # appends it) and once with extension (to read the file back afterwards)
    silent_video_stem = os.path.join(VIDEO_OUT_PATH, OUT_VIDEO_NAME)
    silent_video_file = os.path.join(VIDEO_OUT_PATH, OUT_VIDEO_NAME + OUT_VIDEO_EXT)

    # Final processed video with its original music
    final_video_file = os.path.join(VIDEO_OUT_PATH, OUT_VIDEO_MUSIC_NAME + OUT_VIDEO_EXT)

    # ------------- Download Video, Extract Audio, Object Detection & Render --------------#
    source_video = download_youtube_video(url_path, VIDEO_IN_PATH)

    extract_audio_from_video(source_video, audio_file)

    # Run the model prediction on every frame and write the silent video
    render_video_detection(source_video, silent_video_stem, detector, custom_objects, COLORS, ext=OUT_VIDEO_EXT)

    # Render the final video with music; the returned clip is not needed here
    render_video_music(silent_video_file, audio_file, final_video_file)



# Run pipeline

In [9]:
# Run the whole pipeline on VIDEO_URL: download, audio extraction,
# person detection and rendering of the final video with audio
main(VIDEO_URL)

Video file is saved at: video_in/MISS DIOR – The new Eau de Parfum.mp4


chunk:  17%|█▋        | 167/993 [00:00<00:00, 1667.36it/s, now=None]

MoviePy - Writing audio in audio/dior_audio.mp3


                                                                    

MoviePy - Done.
Audio file is saved at: audio/dior_audio.mp3
Frame: 1 is processed.
----------------------
Frame: 2 is processed.
----------------------
Frame: 3 is processed.
----------------------
Frame: 4 is processed.
----------------------
Frame: 5 is processed.
----------------------
Frame: 6 is processed.
----------------------
Frame: 7 is processed.
----------------------
Frame: 8 is processed.
----------------------
Frame: 9 is processed.
----------------------
Frame: 10 is processed.
----------------------
Frame: 11 is processed.
----------------------
Frame: 12 is processed.
----------------------
Frame: 13 is processed.
----------------------
Frame: 14 is processed.
----------------------
Frame: 15 is processed.
----------------------
Frame: 16 is processed.
----------------------
Frame: 17 is processed.
----------------------
Frame: 18 is processed.
----------------------
Frame: 19 is processed.
----------------------
Frame: 20 is processed.
----------------------
Frame: 2

Frame: 175 is processed.
----------------------
Frame: 176 is processed.
----------------------
Frame: 177 is processed.
----------------------
Frame: 178 is processed.
----------------------
Frame: 179 is processed.
----------------------
Frame: 180 is processed.
----------------------
Frame: 181 is processed.
----------------------
Frame: 182 is processed.
----------------------
Frame: 183 is processed.
----------------------
Frame: 184 is processed.
----------------------
Frame: 185 is processed.
----------------------
Frame: 186 is processed.
----------------------
Frame: 187 is processed.
----------------------
Frame: 188 is processed.
----------------------
Frame: 189 is processed.
----------------------
Frame: 190 is processed.
----------------------
Frame: 191 is processed.
----------------------
Frame: 192 is processed.
----------------------
Frame: 193 is processed.
----------------------
Frame: 194 is processed.
----------------------
Frame: 195 is processed.
---------------

Frame: 346 is processed.
----------------------
Frame: 347 is processed.
----------------------
Frame: 348 is processed.
----------------------
Frame: 349 is processed.
----------------------
Frame: 350 is processed.
----------------------
Frame: 351 is processed.
----------------------
Frame: 352 is processed.
----------------------
Frame: 353 is processed.
----------------------
Frame: 354 is processed.
----------------------
Frame: 355 is processed.
----------------------
Frame: 356 is processed.
----------------------
Frame: 357 is processed.
----------------------
Frame: 358 is processed.
----------------------
Frame: 359 is processed.
----------------------
Frame: 360 is processed.
----------------------
Frame: 361 is processed.
----------------------
Frame: 362 is processed.
----------------------
Frame: 363 is processed.
----------------------
Frame: 364 is processed.
----------------------
Frame: 365 is processed.
----------------------
Frame: 366 is processed.
---------------

Frame: 517 is processed.
----------------------
Frame: 518 is processed.
----------------------
Frame: 519 is processed.
----------------------
Frame: 520 is processed.
----------------------
Frame: 521 is processed.
----------------------
Frame: 522 is processed.
----------------------
Frame: 523 is processed.
----------------------
Frame: 524 is processed.
----------------------
Frame: 525 is processed.
----------------------
Frame: 526 is processed.
----------------------
Frame: 527 is processed.
----------------------
Frame: 528 is processed.
----------------------
Frame: 529 is processed.
----------------------
Frame: 530 is processed.
----------------------
Frame: 531 is processed.
----------------------
Frame: 532 is processed.
----------------------
Frame: 533 is processed.
----------------------
Frame: 534 is processed.
----------------------
Frame: 535 is processed.
----------------------
Frame: 536 is processed.
----------------------
Frame: 537 is processed.
---------------

Frame: 688 is processed.
----------------------
Frame: 689 is processed.
----------------------
Frame: 690 is processed.
----------------------
Frame: 691 is processed.
----------------------
Frame: 692 is processed.
----------------------
Frame: 693 is processed.
----------------------
Frame: 694 is processed.
----------------------
Frame: 695 is processed.
----------------------
Frame: 696 is processed.
----------------------
Frame: 697 is processed.
----------------------
Frame: 698 is processed.
----------------------
Frame: 699 is processed.
----------------------
Frame: 700 is processed.
----------------------
Frame: 701 is processed.
----------------------
Frame: 702 is processed.
----------------------
Frame: 703 is processed.
----------------------
Frame: 704 is processed.
----------------------
Frame: 705 is processed.
----------------------
Frame: 706 is processed.
----------------------
Frame: 707 is processed.
----------------------
Frame: 708 is processed.
---------------

Frame: 859 is processed.
----------------------
Frame: 860 is processed.
----------------------
Frame: 861 is processed.
----------------------
Frame: 862 is processed.
----------------------
Frame: 863 is processed.
----------------------
Frame: 864 is processed.
----------------------
Frame: 865 is processed.
----------------------
Frame: 866 is processed.
----------------------
Frame: 867 is processed.
----------------------
Frame: 868 is processed.
----------------------
Frame: 869 is processed.
----------------------
Frame: 870 is processed.
----------------------
Frame: 871 is processed.
----------------------
Frame: 872 is processed.
----------------------
Frame: 873 is processed.
----------------------
Frame: 874 is processed.
----------------------
Frame: 875 is processed.
----------------------
Frame: 876 is processed.
----------------------
Frame: 877 is processed.
----------------------
Frame: 878 is processed.
----------------------
Frame: 879 is processed.
---------------

Frame: 1030 is processed.
----------------------
Frame: 1031 is processed.
----------------------
Frame: 1032 is processed.
----------------------
Frame: 1033 is processed.
----------------------
Frame: 1034 is processed.
----------------------
Frame: 1035 is processed.
----------------------
Frame: 1036 is processed.
----------------------
Frame: 1037 is processed.
----------------------
Frame: 1038 is processed.
----------------------
Frame: 1039 is processed.
----------------------
Frame: 1040 is processed.
----------------------
Frame: 1041 is processed.
----------------------
Frame: 1042 is processed.
----------------------
Frame: 1043 is processed.
----------------------
Frame: 1044 is processed.
----------------------
Frame: 1045 is processed.
----------------------
Frame: 1046 is processed.
----------------------
Frame: 1047 is processed.
----------------------
Frame: 1048 is processed.
----------------------
Frame: 1049 is processed.
----------------------
Frame: 1050 is proce

chunk:  18%|█▊        | 174/994 [00:00<00:00, 1725.45it/s, now=None]

Moviepy - Building video video_out/dior_box_music_center.mp4.
MoviePy - Writing audio in dior_box_music_centerTEMP_MPY_wvf_snd.mp3


t:   4%|▎         | 41/1125 [00:00<00:02, 404.03it/s, now=None]     

MoviePy - Done.
Moviepy - Writing video video_out/dior_box_music_center.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready video_out/dior_box_music_center.mp4
Your combined video with music is saved at: video_out/dior_box_music_center.mp4
