In [1]:
%env JOBLIB_TEMP_FOLDER=/tmp
import operator
from multiprocessing import Pool, TimeoutError
import glob
import os
import cv2

import numpy as np
from tqdm import tqdm_notebook
from catboost import CatBoostClassifier
from skimage.feature import local_binary_pattern as LBP

env: JOBLIB_TEMP_FOLDER=/tmp


In [2]:
# Dataset and output locations (absolute paths).
TRAIN_PATH = '/train/'
VAL_PATH = '/validation/'
TEST_PATH = '/test/'
RESULT_PATH = '/output/'

# Number of worker processes handed to multiprocessing.Pool during
# feature extraction.
POOL_THREADS = 32

# Target size passed to cv2.resize before features are extracted.
NEW_WIDTH = NEW_HEIGHT = 480

In [3]:
def extract_feature_vector(image, p=18, r=2):
    """Build an LBP-histogram descriptor from the HSV and YCrCb planes of an image.

    The image is converted from BGR to HSV and to YCrCb, both results are
    split into single channels, a "uniform" local binary pattern (LBP) map is
    computed for every channel, and each map is summarised by a normalised
    histogram with ``p + 1`` bins.

    Args:
        image: BGR image accepted by ``cv2.cvtColor`` (e.g. from ``cv2.imread``).
        p: Number of neighbour points passed to the LBP operator; also
            determines the number of histogram bins (``p + 1``).
        r: Radius passed to the LBP operator.

    Returns:
        1-D numpy array holding the per-channel histograms concatenated,
        HSV channels first, then YCrCb channels.
    """
    channels = list(cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2HSV))) + \
                list(cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2YCrCb)))
    lbp_features = [LBP(ch, p, r, method="uniform") for ch in channels]
    # `density=True` replaces the `normed=True` keyword, which was deprecated
    # and later removed from np.histogram (passing it raises TypeError on
    # current NumPy). For the equal-width bins used here both give the same
    # values.
    # NOTE(review): uniform LBP presumably yields p + 2 distinct codes while
    # only p + 1 bins are used; the bin count is kept so the feature length
    # does not change -- confirm this is intended.
    hist_features = [np.histogram(lf, bins=p+1, density=True)[0] for lf in lbp_features]
    return np.hstack([hf.ravel() for hf in hist_features])


def process_single(file):
    """Load one image, crop its central region, resize it and extract features.

    A quarter of the width and of the height is trimmed from each side, the
    remaining central region is resized to (NEW_WIDTH, NEW_HEIGHT) and passed
    to ``extract_feature_vector``.

    Args:
        file: Path of the image file to read with ``cv2.imread``.

    Returns:
        The feature vector produced by ``extract_feature_vector``.

    Raises:
        IOError: If ``cv2.imread`` could not read the file.
    """
    image = cv2.imread(file)
    if image is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising; fail here with the offending path rather than
        # with an AttributeError on `.shape` that hides which file is broken.
        raise IOError("Could not read image: {!r}".format(file))
    h, w = image.shape[:2]
    # Margin removed on each side: one quarter of the corresponding dimension.
    dw, dh = w // 4, h // 4
    return extract_feature_vector(cv2.resize(image[dh:h-dh, dw:w-dw, :], (NEW_WIDTH, NEW_HEIGHT)))


def extract_features(filelist):
    """Extract feature vectors for a list of image files using a process pool.

    Args:
        filelist: List of image paths; every path is passed to
            ``process_single`` in a worker process.

    Returns:
        List with the results of ``process_single`` in input order. Results
        that are ``None`` are removed, so the list can be shorter than
        ``filelist``.
    """
    pool = Pool(POOL_THREADS)
    try:
        # imap yields results in input order; list() drains it completely
        # (driving the progress bar) before the pool is shut down.
        feature_list = list(tqdm_notebook(pool.imap(process_single, filelist), total=len(filelist)))
    finally:
        # Always release the workers; otherwise every call leaves
        # POOL_THREADS processes alive for the lifetime of the kernel.
        pool.terminate()
        pool.join()
    return [x for x in feature_list if x is not None]

# Load train data

In [4]:
# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting
# makes the row order of the training set reproducible between runs.
train_real_filelist = sorted(glob.glob(os.path.join(TRAIN_PATH, 'real/*.png')))
train_spoof_filelist = sorted(glob.glob(os.path.join(TRAIN_PATH, 'spoof/*.png')))

train_real_features = extract_features(train_real_filelist)
train_spoof_features = extract_features(train_spoof_filelist)

# Labels: 0 = real, 1 = spoof. The counts are taken from the extracted
# features rather than the file lists because extract_features removes
# None results.
train_data = train_real_features + train_spoof_features
train_labels = [0] * len(train_real_features) + [1] * len(train_spoof_features)





# Load validation data

In [5]:
# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting
# keeps the validation rows in a reproducible order.
val_real_filelist = sorted(glob.glob(os.path.join(VAL_PATH, 'real/*.png')))
val_spoof_filelist = sorted(glob.glob(os.path.join(VAL_PATH, 'spoof/*.png')))

val_real_features = extract_features(val_real_filelist)
val_spoof_features = extract_features(val_spoof_filelist)

# Labels: 0 = real, 1 = spoof. The counts are taken from the extracted
# features rather than the file lists because extract_features removes
# None results.
val_data = val_real_features + val_spoof_features
val_labels = [0] * len(val_real_features) + [1] * len(val_spoof_features)





# Train classifier

In [6]:
# BalancedAccuracy is used as the evaluation metric and is also requested
# as a custom metric.
clf = CatBoostClassifier(
    iterations=4000,
    learning_rate=0.001,
    custom_metric='BalancedAccuracy',
    eval_metric='BalancedAccuracy',
)

# Fit on the training features, evaluate on the validation set, log every
# 500 iterations and keep the best model according to the eval metric.
clf.fit(
    train_data,
    train_labels,
    eval_set=(np.array(val_data), val_labels),
    use_best_model=True,
    verbose=500,
)

0:	learn: 0.6606858	test: 0.6756668	best: 0.6756668 (0)	total: 75.5ms	remaining: 5m 1s
500:	learn: 0.7536266	test: 0.7285629	best: 0.7333755 (474)	total: 10.3s	remaining: 1m 12s
1000:	learn: 0.7783129	test: 0.7562294	best: 0.7591522 (928)	total: 20.3s	remaining: 1m
1500:	learn: 0.8032263	test: 0.7699418	best: 0.7699418 (1492)	total: 30.3s	remaining: 50.5s
2000:	learn: 0.8269687	test: 0.7771936	best: 0.7771936 (1948)	total: 40s	remaining: 39.9s
2500:	learn: 0.8472841	test: 0.7854782	best: 0.7854782 (2423)	total: 49.5s	remaining: 29.7s
3000:	learn: 0.8690379	test: 0.7916313	best: 0.7916313 (2974)	total: 59.5s	remaining: 19.8s
3500:	learn: 0.8869003	test: 0.7996742	best: 0.7996742 (3465)	total: 1m 9s	remaining: 9.86s
3999:	learn: 0.9047627	test: 0.8055855	best: 0.8055855 (3991)	total: 1m 18s	remaining: 0us

bestTest = 0.8055854855
bestIteration = 3991

Shrink model to first 3992 iterations.


<catboost.core.CatBoostClassifier at 0x7f8dc62e5390>

# Check scores

In [7]:
# Sanity check on two sample images; the paths are relative to the current
# working directory.
filelist = ['real.png', 'spoof.png']
features = extract_features(filelist)
predictions = clf.predict(np.array(features))
probs = clf.predict_proba(np.array(features))
# Score = probability column 1 minus probability column 0
# (labels used for training: 1 = spoof, 0 = real).
scores = np.subtract(np.array(probs)[:, 1], np.array(probs)[:, 0])




In [8]:
# Print "<file name>: <score>" for every sample image.
for image_name, image_score in zip(filelist, scores.tolist()):
    print("{}: {}".format(image_name, image_score))

real.png: -0.697713507129686
spoof.png: 0.7841449892948535


# Predict on the test set and write the results

In [9]:
# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting
# gives a deterministic file order for the test set.
test_filelist = sorted(glob.glob(os.path.join(TEST_PATH, '*.png')))
test_features = extract_features(test_filelist)




In [10]:
probs = clf.predict_proba(np.array(test_features))
# Score = probability column 1 minus probability column 0.
scores = np.array(probs)[:, 1] - np.array(probs)[:, 0]

# extract_features removes None results, so there could be fewer scores than
# files; zip() would then silently pair scores with the wrong file names.
if len(scores) != len(test_filelist):
    raise ValueError(
        "Got {} scores for {} test files; cannot match scores to file names".format(
            len(scores), len(test_filelist)))

with open(os.path.join(RESULT_PATH, 'results_baseline.txt'), 'w') as f:
    for score, filepath in zip(scores.tolist(), test_filelist):
        # os.path.basename honours the platform's path separator,
        # unlike splitting on '/'.
        f.write("{}, {}\n".format(os.path.basename(filepath), score))