In [1]:
import pandas as pd
import numpy as np
import keras
from keras.utils.data_utils import get_file
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from os.path import join
import multiprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.stats import skew, kurtosis, tvar, entropy, pearsonr
import keras.backend.tensorflow_backend as K

%matplotlib inline

Using TensorFlow backend.


In [2]:
# DATA_HOME = '../lfw-aligned-cropped/'

DATA_HOME = '../lfw-aligned-with-32-margin-resized/'

In [3]:
PAIRS_PATH = '../pairs.txt'

In [4]:
raw_dataset = pd.read_csv(PAIRS_PATH, nrows=None)
raw_dataset.columns = ['data']

In [5]:
raw_dataset.shape

(6000, 1)

In [6]:
raw_dataset = raw_dataset

In [7]:
def split_raw_data(row):
    fields = str(row[0]).split('\t')
    is_the_same = 0
    path1, path2 = '{}/{}_{:04d}.png', '{}/{}_{:04d}.png'
    if len(fields) == 3:
        is_the_same = 1
        path1 = path1.format(fields[0], fields[0], int(fields[1]))
        path2 = path2.format(fields[0], fields[0], int(fields[2]))
    elif len(fields) == 4:
        path1 = path1.format(fields[0], fields[0], int(fields[1]))
        path2 = path2.format(fields[2], fields[2], int(fields[3]))
    else:
        raise(Exception('invalid fields'))
    
    return pd.Series([path1, path2, is_the_same], ['path1', 'path2', 'is_the_same']) 

In [8]:
dataset = raw_dataset.apply(split_raw_data, axis=1)

In [9]:
dataset.head()

Unnamed: 0,path1,path2,is_the_same
0,Abel_Pacheco/Abel_Pacheco_0001.png,Abel_Pacheco/Abel_Pacheco_0004.png,1
1,Akhmed_Zakayev/Akhmed_Zakayev_0001.png,Akhmed_Zakayev/Akhmed_Zakayev_0003.png,1
2,Akhmed_Zakayev/Akhmed_Zakayev_0002.png,Akhmed_Zakayev/Akhmed_Zakayev_0003.png,1
3,Amber_Tamblyn/Amber_Tamblyn_0001.png,Amber_Tamblyn/Amber_Tamblyn_0002.png,1
4,Anders_Fogh_Rasmussen/Anders_Fogh_Rasmussen_00...,Anders_Fogh_Rasmussen/Anders_Fogh_Rasmussen_00...,1


In [10]:
dataset.shape

(6000, 3)

In [11]:
def path2ImgVec(path):
    x = img_to_array(load_img(join(DATA_HOME, path)))
    return x.reshape((1,) + x.shape)

In [12]:
img_paths = list(set(dataset.path1.tolist()+dataset.path2.tolist()))
len(img_paths)

7701

In [13]:
pool = multiprocessing.Pool(8)
results = pool.map(path2ImgVec, img_paths)
pool.close()
pool.join()

In [14]:
img_vecs = np.vstack(results)
img_vecs.shape

(7701, 55, 47, 3)

In [15]:
input_shape = img_vecs.shape[1:]
img_vecs.shape

(7701, 55, 47, 3)

In [16]:
from keras.models import load_model

# model = load_model('../models/facescrub-faceonly-simple-cnn.model.h5')
# model = load_model('../models/webface-simple-cnn.3348.model.h5')
model = load_model('../models/faceonly-simple-cnn.aligned.margin32.model.h5')

In [17]:
from keras import backend as K
inp = model.input
# outputs = [layer.output for layer in model.layers]
outputs = [model.layers[-2].output]
functor = K.function([inp] + [K.learning_phase()], outputs)

In [18]:
deepids = functor([img_vecs, 1.])[0]

In [19]:
deepids.shape

(7701, 160)

In [20]:
img_deepid_mem = {k:v for k, v in zip(img_paths, deepids)}

In [37]:
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

COL_NAMES = [
    "z_cosine",
#     'z_cityblock',
#     'z_jaccard',
#     "z_canberra",
#     "z_euclidean",
#     "z_minkowski",
#     "z_braycurtis",
    'z_skew1',
    'z_skew2',
    'z_kurtosis1',
    'z_kurtosis2',
    'z_tvar1',
    'z_tvar2',
] + ['is_the_same']

def to_deepid_features(row):
    vec1 = img_deepid_mem[row['path1']]
    vec2 = img_deepid_mem[row['path2']]

    feats = [
        cosine(vec1, vec2),
#         cityblock(vec1, vec2),
#         jaccard(vec1, vec2),
#         canberra(vec1, vec2),
#         euclidean(vec1, vec2),
#         minkowski(vec1, vec2, 3),
#         braycurtis(vec1, vec2),
        skew(vec1),
        skew(vec2),
        kurtosis(vec1),
        kurtosis(vec2),
        tvar(vec1),
        tvar(vec2),
    ]
    
    return pd.Series(feats + [row['is_the_same']], COL_NAMES) 

In [38]:
feat_dataset = dataset.apply(to_deepid_features, axis=1)

In [39]:
feat_dataset.head()

Unnamed: 0,z_cosine,z_skew1,z_skew2,z_kurtosis1,z_kurtosis2,z_tvar1,z_tvar2,is_the_same
0,0.547323,0.754202,1.13198,-0.589935,1.146834,1.583346,1.668785,1.0
1,0.252855,1.145159,1.103116,0.42225,0.285577,2.166655,2.115369,1.0
2,0.415697,1.494683,1.103116,2.55414,0.285577,2.180767,2.115369,1.0
3,0.292701,0.917856,1.419371,-0.118368,1.680075,2.08022,2.46003,1.0
4,0.31982,1.092137,1.201916,0.401829,0.609903,1.585915,2.274228,1.0


In [40]:
col = [c for c in feat_dataset.columns if c[0]=='z']

X = feat_dataset[col].as_matrix()
y = feat_dataset['is_the_same'].as_matrix()

In [41]:
X[0]

array([ 0.54732274,  0.75420225,  1.13198042, -0.5899353 ,  1.14683375,
        1.58334554,  1.66878534])

In [42]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)

In [43]:
import xgboost as xgb
params = {}
params["objective"] = "binary:logistic"
params['eval_metric'] = ['logloss', 'error']
# params["eta"] = 0.02
# params["subsample"] = 0.7
# params["min_child_weight"] = 1
# params["colsample_bytree"] = 0.7
params["max_depth"] = 4
# params["silent"] = 1
params["seed"] = 1632

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 500, watchlist, early_stopping_rounds=50, verbose_eval=100)

[0]	train-logloss:0.552276	train-error:0.159815	valid-logloss:0.559196	valid-error:0.175
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 50 rounds.
Stopping. Best iteration:
[12]	train-logloss:0.330845	train-error:0.145	valid-logloss:0.37787	valid-error:0.161667



In [44]:
d_all = xgb.DMatrix(X, label=y)

xgb.cv(params, d_all, 500, nfold=10, metrics=['error'], shuffle=False, early_stopping_rounds=20, seed=42)

Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.172667,0.010572,0.157852,0.002788
1,0.168833,0.009459,0.15487,0.002208
2,0.166,0.015388,0.153907,0.002427
3,0.164667,0.014602,0.153074,0.002564
4,0.165167,0.015211,0.152944,0.002069
5,0.165833,0.014127,0.15063,0.002743
6,0.1655,0.01352,0.149556,0.002319
7,0.164833,0.013196,0.149556,0.00143
8,0.164,0.012342,0.148963,0.001782
9,0.162666,0.012828,0.147982,0.001512
