In [1]:
%env JOBLIB_TEMP_FOLDER=/root/ssd/tmp
import operator
from multiprocessing import Pool, TimeoutError
import glob
import os
import cv2

import numpy as np
from tqdm import tqdm_notebook
from catboost import CatBoostClassifier
from skimage.feature import local_binary_pattern as LBP

env: JOBLIB_TEMP_FOLDER=/root/ssd/tmp


In [2]:
# Dataset roots; the loading cells glob `real/*.png` and `spoof/*.png` under each.
TRAIN_PATH, VAL_PATH = '/train/', '/test/'
# Output location.
RESULT_PATH = '/output/'
# Size of the multiprocessing pool used for feature extraction.
POOL_THREADS = 32
# Target (width, height) every cropped image is resized to before feature extraction.
NEW_WIDTH = NEW_HEIGHT = 480

In [5]:
def extract_feature_vector(image, p=18, r=2):
    """Build an LBP-histogram descriptor from the HSV and YCrCb planes of an image.

    The image is converted from BGR to HSV and to YCrCb, every resulting
    channel is encoded with a "uniform" local binary pattern, and the
    per-channel histograms are concatenated into a single 1-D vector.

    Args:
        image: BGR image array accepted by ``cv2.cvtColor``.
        p: Number of LBP sampling points; the histogram uses ``p + 1`` bins.
        r: LBP radius.

    Returns:
        1-D ``numpy.ndarray`` with ``p + 1`` density-normalised histogram
        values per channel, channels stacked in HSV-then-YCrCb order.
    """
    channels = []
    for conversion in (cv2.COLOR_BGR2HSV, cv2.COLOR_BGR2YCrCb):
        channels.extend(cv2.split(cv2.cvtColor(image, conversion)))

    histograms = []
    for channel in channels:
        codes = LBP(channel, p, r, method="uniform")
        # `density=True` replaces the `normed=True` flag, which NumPy deprecated
        # and later removed; with equal-width bins both give the same values.
        # NOTE(review): no `range` is passed, so the bin edges follow the observed
        # min/max of `codes`, and "uniform" LBP can emit p + 2 distinct codes.
        # The bin count is kept at p + 1 so the feature length does not change --
        # confirm whether fixed edges were intended.
        hist, _ = np.histogram(codes, bins=p + 1, density=True)
        histograms.append(hist)
    return np.hstack(histograms)


def process_single(file):
    """Load one image, keep its central region, resize it and extract features.

    A quarter of the width and height is trimmed from each side, and the
    remaining crop is resized to ``(NEW_WIDTH, NEW_HEIGHT)``.

    Args:
        file: Path of the image to read with ``cv2.imread``.

    Returns:
        The vector produced by ``extract_feature_vector``, or ``None`` when the
        file cannot be read as an image.
    """
    image = cv2.imread(file)
    if image is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising; pass that on so extract_features can drop it.
        return None
    h, w = image.shape[:2]
    dw, dh = w // 4, h // 4
    crop = image[dh:h - dh, dw:w - dw]
    return extract_feature_vector(cv2.resize(crop, (NEW_WIDTH, NEW_HEIGHT)))


def extract_features(filelist):
    """Compute feature vectors for many image files using a process pool.

    Args:
        filelist: Sequence of image paths; its length sizes the progress bar.

    Returns:
        List of the non-``None`` results of ``process_single``, in the same
        relative order as ``filelist``. Entries for which ``process_single``
        returned ``None`` are dropped, so the result may be shorter than the input.
    """
    if len(filelist) == 0:
        # Nothing to do; avoid spawning worker processes for an empty batch.
        return []
    # The context manager tears the pool down on exit. Every result is consumed
    # inside the block, so no work is lost and no worker processes are leaked.
    with Pool(POOL_THREADS) as pool:
        results = list(tqdm_notebook(pool.imap(process_single, filelist), total=len(filelist)))
    return [feat for feat in results if feat is not None]

# Load train data

In [6]:
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# row order of the training set reproducible between runs and machines.
train_real_filelist = sorted(glob.glob(os.path.join(TRAIN_PATH, 'real/*.png')))
train_spoof_filelist = sorted(glob.glob(os.path.join(TRAIN_PATH, 'spoof/*.png')))

train_real_features = extract_features(train_real_filelist)
train_spoof_features = extract_features(train_spoof_filelist)

# Label convention: 0 = real, 1 = spoof. Labels are sized from the extracted
# features (not the file lists) because extract_features may drop entries.
train_data = train_real_features + train_spoof_features
train_labels = [0] * len(train_real_features) + [1] * len(train_spoof_features)

# Load test data (used as the validation set during training)

In [7]:
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# row order of the validation set reproducible between runs and machines.
val_real_filelist = sorted(glob.glob(os.path.join(VAL_PATH, 'real/*.png')))
val_spoof_filelist = sorted(glob.glob(os.path.join(VAL_PATH, 'spoof/*.png')))

val_real_features = extract_features(val_real_filelist)
val_spoof_features = extract_features(val_spoof_filelist)

# Label convention: 0 = real, 1 = spoof. Labels are sized from the extracted
# features (not the file lists) because extract_features may drop entries.
val_data = val_real_features + val_spoof_features
val_labels = [0] * len(val_real_features) + [1] * len(val_spoof_features)

# Train classifier

In [11]:
# Model hyper-parameters, collected in one place so they are easy to tune.
model_params = dict(
    learning_rate=0.001,
    iterations=4000,
    eval_metric='BalancedAccuracy',
    custom_metric='BalancedAccuracy',
)
clf = CatBoostClassifier(**model_params)

# The validation features are evaluated during training; with
# use_best_model=True the fitted model is cut back to its best iteration.
validation_set = (np.array(val_data), val_labels)
clf.fit(
    train_data,
    train_labels,
    eval_set=validation_set,
    use_best_model=True,
    verbose=500,
)

0:	learn: 0.8114392	test: 0.7753440	best: 0.7753440 (0)	total: 103ms	remaining: 6m 53s
500:	learn: 0.8638515	test: 0.8881885	best: 0.8941443 (16)	total: 12.2s	remaining: 1m 25s
1000:	learn: 0.8827058	test: 0.8936162	best: 0.8944073 (964)	total: 24.2s	remaining: 1m 12s
1500:	learn: 0.9016633	test: 0.8970883	best: 0.8984288 (1409)	total: 36.4s	remaining: 1m
2000:	learn: 0.9155129	test: 0.8986705	best: 0.8986705 (1915)	total: 48.6s	remaining: 48.6s
2500:	learn: 0.9259165	test: 0.9053072	best: 0.9053072 (2354)	total: 1m	remaining: 36.5s
3000:	learn: 0.9354375	test: 0.9066477	best: 0.9066477 (2868)	total: 1m 13s	remaining: 24.3s
3500:	learn: 0.9473528	test: 0.9066477	best: 0.9066477 (2868)	total: 1m 25s	remaining: 12.1s
3999:	learn: 0.9544982	test: 0.9053072	best: 0.9066477 (2868)	total: 1m 37s	remaining: 0us

bestTest = 0.9066476906
bestIteration = 2868

Shrink model to first 2869 iterations.


<catboost.core.CatBoostClassifier at 0x7f0c106fc080>

# Check scores

In [12]:
filelist = ['real.png', 'spoof.png']
features = extract_features(filelist)
# extract_features filters out None results, so it can return fewer rows than
# files. The scores are later paired with `filelist` by position, so a length
# mismatch would silently attach scores to the wrong names -- fail loudly instead.
if len(features) != len(filelist):
    raise ValueError(
        "expected features for {} files, got {}".format(len(filelist), len(features))
    )
feature_matrix = np.array(features)
predictions = clf.predict(feature_matrix)
probs = np.array(clf.predict_proba(feature_matrix))
# Signed score: P(class 1) - P(class 0); positive leans towards label 1 (spoof).
scores = probs[:, 1] - probs[:, 0]

In [13]:
# Print one "<file>: <score>" line per checked image.
for image_name, score in zip(filelist, scores.tolist()):
    print("{}: {}".format(image_name, score))

real.png: -0.9830079726623919
spoof.png: 0.5861334537205898
