# Inference Kernel Demo

This is the kernel I’ve used for my recent submissions. It takes about 5-6 hours on the test set, using only CPU. 

I’ve provided this kernel because a lot of people have problems making submissions. This method works and has never errored out for me. (Although I haven't tried making a submission using the GPU yet -- so no guarantees there.)

It uses BlazeFace for face extraction (see also [my BlazeFace kernel](https://www.kaggle.com/humananalog/starter-blazeface-pytorch)) and ResNeXt50 as the classifier model.

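We take the average prediction over 17 frames from each video. (Why 17? Using more frames makes the kernel slower, but doesn't appear to improve the score much. I used an odd number so we don't always land on even frames.) Note that the code below currently sets `frames_per_video = 43`, so this version of the notebook actually samples 43 frames per video.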

**Please use this kernel only to learn from...** Included is the checkpoint for a ResNeXt50 model that hasn't really been trained very well yet. I'm sure you can improve on it by training your own model!

You could use the included trained weights to get yourself an easy top-50 score on the leaderboard (as of 24 Jan 2020) but it’s nicer to use it as a starting point for your own work. :-)

In [1]:
import os, time
import cv2
import numpy as np
import pandas as pd

import torch

%matplotlib inline

## Get the test videos

In [2]:
# Switch between a local copy of the data (True) and the Kaggle test set (False).
Local = True

if Local:
    data_path = "/data1/data/deepfake/dfdc_train/"
    # The metadata JSON is transposed so that its index holds the video names.
    metadata: pd.DataFrame = pd.read_json(data_path + "metadata_kailu.json").T
    # In local mode the entries of test_videos are already full paths.
    test_videos = sorted([data_path + x for x in metadata.index])
    # predict_on_video_set joins every entry onto test_dir, so the name has to
    # exist in local mode too. os.path.join returns an absolute second
    # argument unchanged, so the full paths above are not altered.
    test_dir = data_path
else:
    test_dir = "/kaggle/input/deepfake-detection-challenge/test_videos/"
    # Here the entries are bare file names relative to test_dir.
    test_videos = sorted([x for x in os.listdir(test_dir) if x.endswith(".mp4")])
len(test_videos)

400

## Create helpers

In [3]:
# Report the PyTorch, CUDA and cuDNN versions of the running environment.
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())

PyTorch version: 1.3.0
CUDA version: 10.0.130
cuDNN version: 7603


In [4]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
# The name `gpu` is kept even for the CPU case because later cells refer to it.
if torch.cuda.is_available():
    gpu = torch.device("cuda:0")
else:
    gpu = torch.device("cpu")
gpu

device(type='cuda', index=0)

In [5]:
import sys

# Make the Kaggle dataset directories importable. Each directory is inserted
# at the front of sys.path, so the one inserted last is searched first.
for dataset_dir in (
    "/kaggle/input/blazeface-pytorch",
    "/kaggle/input/deepfakes-inference-demo",
):
    sys.path.insert(0, dataset_dir)

In [6]:
# Build the BlazeFace face detector (Kaggle mode only).
if not Local:
    from blazeface import BlazeFace
    # Create the detector on the selected device, then load its weights and
    # anchors from the blazeface-pytorch dataset directory.
    facedet = BlazeFace().to(gpu)
    facedet.load_weights("/kaggle/input/blazeface-pytorch/blazeface.pth")
    facedet.load_anchors("/kaggle/input/blazeface-pytorch/anchors.npy")
    # train(False) is called for inference; the result is assigned to `_` so
    # the notebook does not echo it.
    _ = facedet.train(False)

In [7]:
# Set up the frame reader and the face extractor used by predict_on_video.
if not Local:
    from helpers.read_video_1 import VideoReader
    from helpers.face_extract_1 import FaceExtractor

    # Number of frames requested from every video. predict_on_video_set
    # derives its batch size from this value.
    frames_per_video = 43

    video_reader = VideoReader()
    video_read_fn = lambda x: video_reader.read_frames(x, num_frames=frames_per_video)
    face_extractor = FaceExtractor(video_read_fn, facedet)
else:
    # TODO: wire up the local pipeline. The import from model1.utils was left
    # unfinished and which names it should provide is not visible in this
    # notebook, so fail loudly here. Otherwise every later cell would hit
    # undefined names (face_extractor, frames_per_video).
    raise NotImplementedError(
        "Local mode is not configured: define frames_per_video and "
        "face_extractor (e.g. from model1.utils) before running the "
        "prediction cells."
    )

In [8]:
# Side length, in pixels, of the square face images placed in the model batch.
input_size = 224

In [9]:
from torchvision.transforms import Normalize, RandomHorizontalFlip, ToTensor, ToPILImage, Compose

# Per-channel mean and standard deviation used to normalize the model input.
# NOTE(review): these look like the usual ImageNet statistics -- confirm they
# match what the checkpoint was trained with.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
normalize_transform = Normalize(mean, std)
# Always-flip pipeline (p=1) followed by the same normalization. It is not
# referenced by the cells visible in this notebook; predict_on_video flips
# with cv2 instead.
flip = Compose([ToPILImage(), RandomHorizontalFlip(p=1), ToTensor(), normalize_transform])

In [10]:
def isotropically_resize_image(img, size, resample=cv2.INTER_AREA):
    """Resize ``img`` so that its longer side equals ``size``.

    The aspect ratio is preserved (up to integer rounding of the shorter
    side). Square images end up ``size`` x ``size``.

    Args:
        img: Image array; only ``img.shape[:2]`` (height, width) is read here.
        size: Target length in pixels of the longer side.
        resample: Interpolation flag forwarded to ``cv2.resize``.

    Returns:
        The resized image as returned by ``cv2.resize``.
    """
    h, w = img.shape[:2]
    # The shorter side is clamped to at least one pixel: for very elongated
    # inputs the floor division would give 0, and cv2.resize rejects an
    # empty destination size.
    if w > h:
        new_w = size
        new_h = max(1, h * size // w)
    else:
        new_w = max(1, w * size // h)
        new_h = size

    # cv2.resize takes the destination size as (width, height).
    return cv2.resize(img, (new_w, new_h), interpolation=resample)


def make_square_image(img):
    """Pad ``img`` with zeros on the bottom and right until it is square.

    The side length of the result is the larger of the input's height and
    width; the original pixels stay anchored at the top-left corner.
    """
    height, width = img.shape[:2]
    side = max(height, width)
    pad_bottom = side - height
    pad_right = side - width
    # Argument order of copyMakeBorder is top, bottom, left, right.
    return cv2.copyMakeBorder(
        img, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=0
    )

In [11]:
import torch.nn as nn
import torchvision.models as models

class MyResNeXt(models.resnet.ResNet):
    """torchvision ``ResNet`` built from grouped bottleneck blocks with a 1-unit head.

    The backbone uses ``Bottleneck`` blocks in a [3, 4, 6, 3] layout with 32
    groups of width 4 (the notebook text refers to this model as ResNeXt50).
    The final fully connected layer is replaced by ``Linear(2048, 1)``, so the
    network produces one raw output per image.
    """

    def __init__(self, training=True):
        # `training` is accepted for compatibility with callers but is not used.
        backbone_config = {
            "block": models.resnet.Bottleneck,
            "layers": [3, 4, 6, 3],
            "groups": 32,
            "width_per_group": 4,
        }
        super().__init__(**backbone_config)
        # Swap the default classification head for a single-output layer.
        self.fc = nn.Linear(in_features=2048, out_features=1)

In [12]:
# Read the saved weights directly onto the selected device.
state_dict = torch.load(
    "/kaggle/input/deepfakes-inference-demo/resnext.pth",
    map_location=gpu,
)

# Build the classifier on the same device, restore its weights and switch it
# to evaluation mode.
model = MyResNeXt().to(gpu)
model.load_state_dict(state_dict)
_ = model.eval()

# The loaded weights are no longer needed once copied into the model.
del state_dict

## Prediction loop

In [13]:
def predict_on_video(video_path, batch_size):
    """Return the averaged model prediction for a single video.

    Faces are extracted with the global ``face_extractor`` and reduced to one
    face per frame. Each face is resized and zero-padded to ``input_size`` x
    ``input_size`` and stored together with a horizontally flipped copy
    (test-time augmentation). The batch is normalized, run through the global
    ``model``, and the sigmoid outputs of the filled slots are averaged by
    ``give_predict``.

    Args:
        video_path: Path of the video, passed to
            ``face_extractor.process_video``.
        batch_size: Fixed number of image slots in the batch. Images that do
            not fit are dropped and a single warning is printed.

    Returns:
        The value computed by ``give_predict`` over the filled slots, or 0.5
        when no face was found or any exception occurred. Exceptions are
        printed, never raised, so that one bad video cannot abort a run.
    """
    try:
        # Find the faces for N frames in the video.
        faces = face_extractor.process_video(video_path)

        # Only look at one face per frame. The return value is not used;
        # `faces` is presumably filtered in place.
        face_extractor.keep_only_best_face(faces)

        if len(faces) > 0:
            # NOTE: When running on the CPU, the batch size must be fixed
            # or else memory usage will blow up. (Bug in PyTorch?)
            x = np.zeros((batch_size, input_size, input_size, 3), dtype=np.uint8)

            # If we found any faces, prepare them for the model.
            n = 0        # number of batch slots filled so far
            dropped = 0  # images that did not fit into the batch
            for frame_data in faces:
                for face in frame_data["faces"]:
                    # Resize to the model's required input size.
                    # We keep the aspect ratio intact and add zero
                    # padding if necessary.
                    resized_face = isotropically_resize_image(face, input_size)
                    resized_face = make_square_image(resized_face)

                    # Test time augmentation: horizontal flips.
                    # TODO: not sure yet if this helps or not
                    # Both the face and its mirror go through the same bounds
                    # check, so a full batch can never be indexed past its end.
                    for image in (resized_face, cv2.flip(resized_face, 1)):
                        if n < batch_size:
                            x[n] = image
                            n += 1
                        else:
                            dropped += 1

            if dropped > 0:
                print("WARNING: have %d faces but batch size is %d" % (n + dropped, batch_size))

            if n > 0:
                x = torch.tensor(x, device=gpu).float()

                # Preprocess the images: NHWC -> NCHW, scale to [0, 1],
                # then normalize each image.
                x = x.permute((0, 3, 1, 2))

                for i in range(len(x)):
                    x[i] = normalize_transform(x[i] / 255.)

                # Make a prediction, then take the average.
                with torch.no_grad():
                    y_pred = model(x)
                    # reshape(-1) keeps a 1-D tensor even for a batch of one,
                    # where squeeze() would collapse it to 0-D and make the
                    # slice below fail.
                    y_pred = torch.sigmoid(y_pred.reshape(-1)).cpu()

                    # Only the first n slots hold real images; the rest of
                    # the batch is zero padding.
                    return give_predict(y_pred[:n])

    except Exception as e:
        print("Prediction error on video %s: %s" % (video_path, str(e)))

    # Neutral fallback when there is nothing to predict on.
    return 0.5

def give_predict(y):
    """Average a tensor of probabilities after clipping them away from 0 and 1.

    ``y`` must offer ``.numpy()`` (e.g. a CPU torch tensor). Values are
    clipped to ``[5e-8, 1 - 5e-8]`` and their arithmetic mean is returned.
    """
    lower = 5e-8
    upper = 1 - (5e-8)
    clipped = np.clip(y.numpy(), lower, upper)
    return clipped.mean()

# Alternative kept for reference (geometric mean of the clipped values):
#     y = y.numpy().clip(5e-8, 1-(5e-8))
#     num = len(y)
#     return np.exp(np.log(y).sum()/num)
    

In [14]:
from concurrent.futures import ThreadPoolExecutor

def predict_on_video_set(videos, num_workers):
    """Run ``predict_on_video`` over ``videos`` on a thread pool.

    Args:
        videos: Sequence of file names; each is joined onto the global
            ``test_dir`` before prediction.
        num_workers: Maximum number of worker threads.

    Returns:
        A list with one prediction per video, in the same order as ``videos``.
    """
    def run_one(filename):
        full_path = os.path.join(test_dir, filename)
        # Two batch slots per frame: the face and its mirrored copy.
        return predict_on_video(full_path, batch_size=frames_per_video*2)

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        results = pool.map(run_one, videos)

    # Leaving the `with` block waits for all workers; collect the results here.
    return list(results)

## Speed test

The leaderboard submission must finish within 9 hours. With 4000 test videos, that is `9*60*60/4000 = 8.1` seconds per video. So if the average time per video is greater than ~8 seconds, the kernel will be too slow!

In [15]:
speed_test = True  # set to False to skip the timing run in the next cell

In [16]:
# Time the prediction pipeline on a small sample to estimate seconds per video.
if speed_test:
    start_time = time.time()
    # Use the last five entries of the sorted video list as the sample.
    speedtest_videos = test_videos[-5:]
    predictions = predict_on_video_set(speedtest_videos, num_workers=4)
    elapsed = time.time() - start_time
    # The elapsed time is wall-clock time with four worker threads.
    print("Elapsed %f sec. Average per video: %f sec." % (elapsed, elapsed / len(speedtest_videos)))

Elapsed 20.701058 sec. Average per video: 4.140212 sec.


## Make the submission

In [17]:
# Predict on every video in test_videos; this overwrites the speed-test result.
predictions = predict_on_video_set(test_videos, num_workers=4)

In [18]:
# Pair each entry of test_videos with its prediction; test_videos[:] is a
# shallow copy of the list.
submission_df = pd.DataFrame({"filename": test_videos[:], "label": predictions})
# index=False leaves the DataFrame index out of the CSV.
submission_df.to_csv("submission.csv", index=False)

In [19]:
# Show the first rows of the submission as a quick sanity check.
submission_df.head()

Unnamed: 0,filename,label
0,aassnaulhq.mp4,0.577892
1,aayfryxljh.mp4,0.00368
2,acazlolrpz.mp4,0.685165
3,adohdulfwb.mp4,0.065221
4,ahjnxtiamx.mp4,0.933871
