In [1]:
import cv2
import numpy as np
from math import sqrt
from keras_ocr.pipeline import Pipeline





# F2R: frame-to-reference tracking

In [2]:
# File name of the output video produced by this (F2R) section.
OUT_VIDEO_NAME = 'output_F2R.avi'

# 8-bit intensity extremes: BLACK is written into frames/masks to blank
# regions out, WHITE is the value compared against the warped text mask.
BLACK = 0
WHITE = 255
# Kernel size passed to cv2.GaussianBlur when softening the augmented-layer mask.
GAUSSIAN_BLUR = (5,5)
# Ratio test factor: a match is kept when its distance is below this fraction
# of the second-best match's distance.
THRESHOLD_GOOD_MATCH = 0.6
# Exclusive grayscale bounds used by textBackgroundMatching to select the
# augmented-layer pixels that get replaced with frame pixels.
BG_INTENSITY_MIN = 50
BG_INTENSITY_MAX = 125
# Warped logo-mask pixels strictly above this value are composited onto the frame.
THRESHOLD_LOGO_MASK = 200

def inits():
    """Open the input video and create the writer, feature detector and matcher.

    Returns:
        tuple: (cap, out, w, h, sift, flann) where ``cap`` is the opened
        cv2.VideoCapture, ``out`` a cv2.VideoWriter targeting OUT_VIDEO_NAME
        with the input's frame rate and size, ``w``/``h`` the frame width and
        height in pixels, ``sift`` a SIFT detector and ``flann`` a
        FLANN-based matcher.

    Raises:
        IOError: if the input video cannot be opened.
    """
    video_path = 'imgs/Multiple View.avi'
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Fail here with a clear message instead of later with w == h == 0
        # and an unrelated cv2 error on the first frame.
        cap.release()
        raise IOError(f"Could not open input video '{video_path}'")
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Use the section's constant rather than a duplicated string literal so the
    # output name is configured in exactly one place.
    out = cv2.VideoWriter(OUT_VIDEO_NAME, cv2.VideoWriter_fourcc(*'DIVX'), fps, (w,h))
    sift = cv2.SIFT_create()
    # algorithm=1 with trees=5 are the FLANN index parameters; checks=50 the search parameters.
    flann = cv2.FlannBasedMatcher(dict(algorithm=1, trees=5), dict(checks=50))
    return cap, out, w, h, sift, flann

def textBackgroundMatching(augmented_layer, frame=None):
    """Replace the text background of the augmented layer with frame pixels.

    Pixels in the bottom half of ``augmented_layer`` whose grayscale intensity
    lies strictly between BG_INTENSITY_MIN and BG_INTENSITY_MAX are overwritten
    with the pixels at the same positions in ``frame``.

    Args:
        augmented_layer: RGB image of the (warped) augmented layer. It is
            modified in place.
        frame: image to copy pixels from; must be indexable by a mask with the
            layer's height and width. When omitted, the notebook-level
            ``curr_frame`` is used, which keeps existing one-argument calls
            working.

    Returns:
        The same ``augmented_layer`` array, after modification.
    """
    if frame is None:
        # Backward compatibility with callers that rely on the global frame.
        frame = curr_frame
    text_gray = cv2.cvtColor(augmented_layer, cv2.COLOR_RGB2GRAY)
    mask = (text_gray > BG_INTENSITY_MIN) & (text_gray < BG_INTENSITY_MAX)
    # Only the bottom half may be matched; the top half is never touched.
    mask[:augmented_layer.shape[0]//2, :] = False
    # Write to the argument, not to a global that merely happens to alias it.
    augmented_layer[mask] = frame[mask]
    return augmented_layer

def splitAlMask(mask):
    """Split a mask into its top-half and bottom-half parts.

    Args:
        mask: 2-D (or higher) array whose first axis is the image height.

    Returns:
        tuple: (logo_mask, text_mask). Both are copies of ``mask``;
        ``logo_mask`` has its bottom half set to BLACK and ``text_mask`` has
        its top half set to BLACK. The input is left unmodified.
    """
    # Derive the split row from the mask itself instead of the global ``h``,
    # so the function does not depend on notebook state.
    half = mask.shape[0] // 2
    logo_mask = mask.copy()
    logo_mask[half:,:] = BLACK
    text_mask = mask.copy()
    text_mask[:half,:] = BLACK
    return logo_mask, text_mask

def findTextCorners(img):
    """Locate the author text in ``img`` with keras-ocr and return its bounding quad.

    Runs the keras-ocr pipeline on the image, keeps the boxes of the words
    'richard', 'hartley', 'and', 'andrew' and 'zisserman', and merges them
    into one quadrilateral by taking per-corner minima/maxima.

    Args:
        img: image passed to ``Pipeline().recognize``.

    Returns:
        float32 array of shape (1, 4, 2) with the merged corners, ordered
        (min x, min y), (max x, min y), (max x, max y), (min x, max y).

    Raises:
        ValueError: if none of the expected words is recognised.
    """
    author_words = {'richard', 'hartley', 'and', 'andrew', 'zisserman'}
    prediction_groups = Pipeline().recognize([img])

    # Each prediction is indexed as (word, box); box[i] is the i-th corner (x, y).
    boxes = [np.asarray(prediction[1], dtype='float32')
             for prediction in prediction_groups[0]
             if prediction[0] in author_words]
    if not boxes:
        # Previously this surfaced as an opaque "min() arg is an empty sequence".
        raise ValueError('No author text was recognised in the image; '
                         'cannot determine text corners.')

    corners = np.stack(boxes)            # (n_words, 4, 2)
    xs, ys = corners[:, :, 0], corners[:, :, 1]
    full_corners = np.array([[xs[:, 0].min(), ys[:, 0].min()],
                             [xs[:, 1].max(), ys[:, 1].min()],
                             [xs[:, 2].max(), ys[:, 2].max()],
                             [xs[:, 3].min(), ys[:, 3].max()]], dtype='float32')

    return full_corners.reshape(1, -1, 2)

def removeAuthorText(img, M, text_corners):
    """Inpaint the author text out of ``img``.

    The text quad is projected with homography ``M``; a thick line joining the
    midpoints of the edge between corners 1-2 and the edge between corners 0-3
    is drawn into a mask, and the masked area is filled with
    Navier-Stokes inpainting.

    Args:
        img: image to clean; its first two dimensions size the mask.
        M: 3x3 transform passed to cv2.perspectiveTransform.
        text_corners: corner array as returned by findTextCorners.

    Returns:
        The inpainted image returned by cv2.inpaint.
    """
    quad = cv2.perspectiveTransform(text_corners, M)[0]
    c0, c1, c2, c3 = quad[0], quad[1], quad[2], quad[3]

    # Midpoints of the two opposite edges (1-2 and 0-3), truncated to ints.
    start_pt = (int((c1[0] + c2[0])/2), int((c1[1] + c2[1])/2))
    end_pt = (int((c0[0] + c3[0])/2), int((c0[1] + c3[1])/2))

    # Line thickness is the length of edge 1-2, so the stroke spans the quad.
    edge_dx = c2[0] - c1[0]
    edge_dy = c2[1] - c1[1]
    stroke = int(sqrt( edge_dx**2 + edge_dy**2 ))

    erase_mask = np.zeros(img.shape[:2], dtype="uint8")
    cv2.line(erase_mask, start_pt, end_pt, 255, stroke+1)

    return cv2.inpaint(img, erase_mask, 7, cv2.INPAINT_NS)


# Initializations
cap, out, w, h, sift, flann = inits()

# Everything below runs inside try/finally so the capture and the writer are
# released (and the output file finalised) even when a step raises.
try:
    # Read first frame, augmented layer and masks
    ret, ref_frame = cap.read()
    if not ret or ref_frame is None:
        raise RuntimeError('Could not read the first frame of the input video.')
    ref_frame = cv2.cvtColor(ref_frame, cv2.COLOR_BGR2RGB)
    frame_h, frame_w = ref_frame.shape[:2]

    object_mask = cv2.imread('imgs/ObjectMask.PNG', cv2.IMREAD_GRAYSCALE)
    al = cv2.imread('imgs/AugmentedLayer.PNG')
    al_mask = cv2.imread('imgs/AugmentedLayerMask.PNG', cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None instead of raising when a file cannot be read.
    for img_path, img_data in (('imgs/ObjectMask.PNG', object_mask),
                               ('imgs/AugmentedLayer.PNG', al),
                               ('imgs/AugmentedLayerMask.PNG', al_mask)):
        if img_data is None:
            raise FileNotFoundError(f"Could not read image '{img_path}'")

    # Crop the layer and its mask to the frame size
    al = cv2.cvtColor(al[:frame_h, :frame_w], cv2.COLOR_BGR2RGB)
    al_mask = al_mask[:frame_h, :frame_w]

    # Blur augmented layer's mask for smoother edges on result
    al_mask = cv2.GaussianBlur(al_mask, GAUSSIAN_BLUR, 0)

    # Mask reference frame
    ref_frame[object_mask == BLACK] = BLACK

    # Find keypoints and compute descriptions in reference frame
    kp_ref, des_ref = sift.detectAndCompute(ref_frame, None)

    # Find corners of text present in both frame and augmented layer and remove it using keras OCR
    org_text_corners = findTextCorners(ref_frame)

    frame_num = 0

    while cap.isOpened():
        ret, curr_frame = cap.read()
        if not ret or curr_frame is None:
            print('End of input video. Exiting...')
            break
        curr_frame = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2RGB)

        # Find keypoints and descriptions in current frame and match with reference frame
        kp_frame, des_frame = sift.detectAndCompute(curr_frame, None)

        # A k=2 search needs at least two descriptors in the current frame.
        good = []
        if des_frame is not None and len(des_frame) >= 2:
            matches = flann.knnMatch(des_ref, des_frame, k=2)
            for pair in matches:
                # Skip entries where the matcher returned fewer than 2 neighbours.
                if len(pair) == 2 and pair[0].distance < THRESHOLD_GOOD_MATCH*pair[1].distance:
                    good.append(pair[0])

        if len(good) < 4:
            print("Not enough good matches. Aborting...")
            break

        # build correspondence arrays of good matches
        src_pts = np.float32([kp_ref[m.queryIdx].pt for m in good ]).reshape(-1,1,2)
        dst_pts = np.float32([kp_frame[m.trainIdx].pt for m in good ]).reshape(-1,1,2)

        # Estimate a robust homography with RANSAC and warp augmented layer and its mask
        M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 2.0)
        if M is None:
            # findHomography yields no matrix when estimation fails; warping
            # with it would raise, so stop cleanly instead.
            print("Homography estimation failed. Aborting...")
            break
        warped_al = cv2.warpPerspective(al, M, (w,h))
        warped_al_mask = cv2.warpPerspective(al_mask, M, (w,h), flags=cv2.INTER_NEAREST)

        # Remove text that is both in frame and in augmented layer
        curr_frame = removeAuthorText(curr_frame, M, org_text_corners)

        # Match text's background in the augmented layer with the one on the book
        warped_al = textBackgroundMatching(warped_al)

        # Split augmented layer's mask into its two parts
        warped_al_logo_mask, warped_al_text_mask = splitAlMask(warped_al_mask)

        # Place both parts of augmented layer on current frame and write to video
        logo_px = warped_al_logo_mask > THRESHOLD_LOGO_MASK
        text_px = warped_al_text_mask == WHITE
        curr_frame[logo_px] = warped_al[logo_px]
        curr_frame[text_px] = warped_al[text_px]

        out.write(cv2.cvtColor(curr_frame, cv2.COLOR_RGB2BGR))

        frame_num += 1
        if frame_num % 45 == 0:
            # NOTE(review): 28 and 15 look like the clip's length in seconds
            # and its frame rate -- confirm before using another input video.
            print(round(28-frame_num/15), "sec of video left.")
finally:
    cap.release()
    out.release()

Looking for C:\Users\tomas\.keras-ocr\craft_mlt_25k.h5

Instructions for updating:
Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.

Looking for C:\Users\tomas\.keras-ocr\crnn_kurapan.h5
25 sec of video left.
22 sec of video left.
19 sec of video left.
16 sec of video left.
13 sec of video left.
10 sec of video left.
7 sec of video left.
4 sec of video left.
1 sec of video left.
End of input video. Exiting...


# F2F: frame-to-frame tracking

In [3]:
# File name of the output video produced by this (F2F) section; read by inits().
OUT_VIDEO_NAME = 'output_F2F.avi'

# 8-bit intensity extremes: BLACK is written into frames/masks to blank
# regions out, WHITE is the value compared against the warped text mask.
BLACK = 0
WHITE = 255
# Kernel size passed to cv2.GaussianBlur when softening the augmented-layer mask.
GAUSSIAN_BLUR = (5,5)
# Ratio test factor: a match is kept when its distance is below this fraction
# of the second-best match's distance.
THRESHOLD_GOOD_MATCH = 0.6
# Exclusive grayscale bounds used by textBackgroundMatching to select the
# augmented-layer pixels that get replaced with frame pixels.
BG_INTENSITY_MIN = 50
BG_INTENSITY_MAX = 125
# Warped logo-mask pixels strictly above this value are composited onto the frame.
THRESHOLD_LOGO_MASK = 200


def inits():
    """Open the input video and create the writer, feature detector and matcher.

    Returns:
        tuple: (cap, out, w, h, sift, flann) where ``cap`` is the opened
        cv2.VideoCapture, ``out`` a cv2.VideoWriter targeting OUT_VIDEO_NAME
        with the input's frame rate and size, ``w``/``h`` the frame width and
        height in pixels, ``sift`` a SIFT detector and ``flann`` a
        FLANN-based matcher.

    Raises:
        IOError: if the input video cannot be opened.
    """
    video_path = 'imgs/Multiple View.avi'
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Fail here with a clear message instead of later with w == h == 0
        # and an unrelated cv2 error on the first frame.
        cap.release()
        raise IOError(f"Could not open input video '{video_path}'")
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    out = cv2.VideoWriter(OUT_VIDEO_NAME, cv2.VideoWriter_fourcc(*'DIVX'), fps, (w,h))
    sift = cv2.SIFT_create()
    # algorithm=1 with trees=5 are the FLANN index parameters; checks=50 the search parameters.
    flann = cv2.FlannBasedMatcher(dict(algorithm=1, trees=5), dict(checks=50))
    return cap, out, w, h, sift, flann

def textBackgroundMatching(augmented_layer, frame=None):
    """Replace the text background of the augmented layer with frame pixels.

    Pixels in the bottom half of ``augmented_layer`` whose grayscale intensity
    lies strictly between BG_INTENSITY_MIN and BG_INTENSITY_MAX are overwritten
    with the pixels at the same positions in ``frame``.

    Args:
        augmented_layer: RGB image of the (warped) augmented layer. It is
            modified in place.
        frame: image to copy pixels from; must be indexable by a mask with the
            layer's height and width. When omitted, the notebook-level
            ``curr_frame`` is used, which keeps existing one-argument calls
            working.

    Returns:
        The same ``augmented_layer`` array, after modification.
    """
    if frame is None:
        # Backward compatibility with callers that rely on the global frame.
        frame = curr_frame
    text_gray = cv2.cvtColor(augmented_layer, cv2.COLOR_RGB2GRAY)
    mask = (text_gray > BG_INTENSITY_MIN) & (text_gray < BG_INTENSITY_MAX)
    # Only the bottom half may be matched; the top half is never touched.
    mask[:augmented_layer.shape[0]//2, :] = False
    # Write to the argument, not to a global that merely happens to alias it.
    augmented_layer[mask] = frame[mask]
    return augmented_layer

def splitAlMask(mask):
    """Split a mask into its top-half and bottom-half parts.

    Args:
        mask: 2-D (or higher) array whose first axis is the image height.

    Returns:
        tuple: (logo_mask, text_mask). Both are copies of ``mask``;
        ``logo_mask`` has its bottom half set to BLACK and ``text_mask`` has
        its top half set to BLACK. The input is left unmodified.
    """
    # Derive the split row from the mask itself instead of the global ``h``,
    # so the function does not depend on notebook state.
    half = mask.shape[0] // 2
    logo_mask = mask.copy()
    logo_mask[half:,:] = BLACK
    text_mask = mask.copy()
    text_mask[:half,:] = BLACK
    return logo_mask, text_mask

def findTextCorners(img):
    """Locate the author text in ``img`` with keras-ocr and return its bounding quad.

    Runs the keras-ocr pipeline on the image, keeps the boxes of the words
    'richard', 'hartley', 'and', 'andrew' and 'zisserman', and merges them
    into one quadrilateral by taking per-corner minima/maxima. The two
    max-y corners are pushed further by 3 and 4 pixels respectively.

    Args:
        img: image passed to ``Pipeline().recognize``.

    Returns:
        float32 array of shape (1, 4, 2) with the merged corners, ordered
        (min x, min y), (max x, min y), (max x, max y + 3), (min x, max y + 4).

    Raises:
        ValueError: if none of the expected words is recognised.
    """
    author_words = {'richard', 'hartley', 'and', 'andrew', 'zisserman'}
    prediction_groups = Pipeline().recognize([img])

    # Each prediction is indexed as (word, box); box[i] is the i-th corner (x, y).
    boxes = [np.asarray(prediction[1], dtype='float32')
             for prediction in prediction_groups[0]
             if prediction[0] in author_words]
    if not boxes:
        # Previously this surfaced as an opaque "min() arg is an empty sequence".
        raise ValueError('No author text was recognised in the image; '
                         'cannot determine text corners.')

    corners = np.stack(boxes)            # (n_words, 4, 2)
    xs, ys = corners[:, :, 0], corners[:, :, 1]
    # NOTE(review): the +3 / +4 offsets look like hand-tuned padding on the
    # max-y edge -- confirm they are still wanted for other inputs.
    full_corners = np.array([[xs[:, 0].min(), ys[:, 0].min()],
                             [xs[:, 1].max(), ys[:, 1].min()],
                             [xs[:, 2].max(), ys[:, 2].max()+3],
                             [xs[:, 3].min(), ys[:, 3].max()+4]], dtype='float32')

    return full_corners.reshape(1, -1, 2)

def removeAuthorText(img, M, text_corners):
    """Inpaint the author text out of ``img``.

    The text quad is projected with homography ``M``; a thick line joining the
    midpoints of the edge between corners 1-2 and the edge between corners 0-3
    is drawn into a mask, and the masked area is filled with
    Navier-Stokes inpainting.

    Args:
        img: image to clean; its first two dimensions size the mask.
        M: 3x3 transform passed to cv2.perspectiveTransform.
        text_corners: corner array as returned by findTextCorners.

    Returns:
        The inpainted image returned by cv2.inpaint.
    """
    quad = cv2.perspectiveTransform(text_corners, M)[0]
    c0, c1, c2, c3 = quad[0], quad[1], quad[2], quad[3]

    # Midpoints of the two opposite edges (1-2 and 0-3), truncated to ints.
    start_pt = (int((c1[0] + c2[0])/2), int((c1[1] + c2[1])/2))
    end_pt = (int((c0[0] + c3[0])/2), int((c0[1] + c3[1])/2))

    # Line thickness is the length of edge 1-2, so the stroke spans the quad.
    edge_dx = c2[0] - c1[0]
    edge_dy = c2[1] - c1[1]
    stroke = int(sqrt( edge_dx**2 + edge_dy**2 ))

    erase_mask = np.zeros(img.shape[:2], dtype="uint8")
    cv2.line(erase_mask, start_pt, end_pt, 255, stroke+1)

    return cv2.inpaint(img, erase_mask, 7, cv2.INPAINT_NS)


# Initializations
cap, out, w, h, sift, flann = inits()

# Everything below runs inside try/finally so the capture and the writer are
# released (and the output file finalised) even when a step raises.
try:
    # Read first frame, augmented layer and masks
    ret, prev_frame = cap.read()
    if not ret or prev_frame is None:
        raise RuntimeError('Could not read the first frame of the input video.')
    prev_frame = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2RGB)
    frame_h, frame_w = prev_frame.shape[:2]

    object_mask = cv2.imread('imgs/ObjectMask.PNG', cv2.IMREAD_GRAYSCALE)
    al = cv2.imread('imgs/AugmentedLayer.PNG')
    al_mask = cv2.imread('imgs/AugmentedLayerMask.PNG', cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None instead of raising when a file cannot be read.
    for img_path, img_data in (('imgs/ObjectMask.PNG', object_mask),
                               ('imgs/AugmentedLayer.PNG', al),
                               ('imgs/AugmentedLayerMask.PNG', al_mask)):
        if img_data is None:
            raise FileNotFoundError(f"Could not read image '{img_path}'")

    # Crop the layer and its mask to the frame size
    al = cv2.cvtColor(al[:frame_h, :frame_w], cv2.COLOR_BGR2RGB)
    al_mask = al_mask[:frame_h, :frame_w]

    # Blur augmented layer's mask for smoother edges on result
    al_mask = cv2.GaussianBlur(al_mask, GAUSSIAN_BLUR, 0)

    # Mask reference frame
    prev_frame[object_mask == BLACK] = BLACK

    # Find keypoints and compute descriptions in reference frame
    kp_prev, des_prev = sift.detectAndCompute(prev_frame, None)

    # Find corners of text present in both frame and augmented layer and remove it using keras OCR
    org_text_corners = findTextCorners(prev_frame)

    # Define the first homography as the identity matrix
    M = np.eye(3)

    frame_num = 0

    while cap.isOpened():
        ret, curr_frame = cap.read()
        if not ret or curr_frame is None:
            print('End of input video. Exiting...')
            break
        curr_frame = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2RGB)

        # Find keypoints and compute descriptions in current frame
        kp_curr, des_curr = sift.detectAndCompute(curr_frame, None)

        # A k=2 search needs at least two descriptors in the current frame.
        good = []
        if des_curr is not None and len(des_curr) >= 2:
            matches = flann.knnMatch(des_prev, des_curr, k=2)
            for pair in matches:
                # Skip entries where the matcher returned fewer than 2 neighbours.
                if len(pair) == 2 and pair[0].distance < THRESHOLD_GOOD_MATCH*pair[1].distance:
                    good.append(pair[0])

        if len(good) < 4:
            print("Not enough good matches. Aborting...")
            break

        # building the correspondence arrays of good matches
        src_pts = np.float32([kp_prev[m.queryIdx].pt for m in good ]).reshape(-1,1,2)
        dst_pts = np.float32([kp_curr[m.trainIdx].pt for m in good ]).reshape(-1,1,2)

        # Estimate a robust homography with RANSAC and multiply it with previous homography
        M_new, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
        if M_new is None:
            # findHomography yields no matrix when estimation fails; chaining
            # it into M would raise, so stop cleanly instead.
            print("Homography estimation failed. Aborting...")
            break
        # M accumulates the frame-to-frame transforms, mapping the first frame to the current one.
        M = M_new @ M

        # Warp augmented layer and masks
        warped_al = cv2.warpPerspective(al, M, (w,h))
        warped_al_mask = cv2.warpPerspective(al_mask, M, (w,h), flags=cv2.INTER_NEAREST)
        warped_object_mask = cv2.warpPerspective(object_mask, M, (w,h))

        # Remove text that is both in frame and in augmented layer
        curr_frame = removeAuthorText(curr_frame, M, org_text_corners)

        # Split augmented layer's mask into its two parts
        warped_al_logo_mask, warped_al_text_mask = splitAlMask(warped_al_mask)

        # Match text's background in the augmented layer with the one on the book
        warped_al = textBackgroundMatching(warped_al)

        # NOTE(review): the masked prev_frame below is never fed to
        # sift.detectAndCompute; the next iteration matches against keypoints
        # of the whole, unmasked current frame. Confirm whether the object mask
        # was meant to restrict those keypoints.
        prev_frame = np.copy(curr_frame)
        prev_frame[warped_object_mask==BLACK] = BLACK

        kp_prev = tuple(kp_curr)
        des_prev = np.copy(des_curr)

        # Place both parts of augmented layer on current frame and write to video
        logo_px = warped_al_logo_mask > THRESHOLD_LOGO_MASK
        text_px = warped_al_text_mask == WHITE
        curr_frame[logo_px] = warped_al[logo_px]
        curr_frame[text_px] = warped_al[text_px]

        out.write(cv2.cvtColor(curr_frame, cv2.COLOR_RGB2BGR))

        frame_num += 1
        if frame_num % 45 == 0:
            # NOTE(review): 28 and 15 look like the clip's length in seconds
            # and its frame rate -- confirm before using another input video.
            print(round(28-frame_num/15), "sec of video left.")
finally:
    cap.release()
    out.release()

Looking for C:\Users\tomas\.keras-ocr\craft_mlt_25k.h5
Looking for C:\Users\tomas\.keras-ocr\crnn_kurapan.h5
25 sec of video left.
22 sec of video left.
19 sec of video left.
16 sec of video left.
13 sec of video left.
10 sec of video left.
7 sec of video left.
4 sec of video left.
1 sec of video left.
End of input video. Exiting...
