In [1]:
import pandas as pd
import numpy as np
import keras
from keras.utils.data_utils import get_file
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from os.path import join
import multiprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.stats import skew, kurtosis, tvar, entropy, pearsonr
import keras.backend.tensorflow_backend as K

%matplotlib inline

Using TensorFlow backend.


In [2]:
DATA_HOME = '../lfw-aligned-cropped/'

In [3]:
PAIRS_PATH = '../pairs.txt'

In [4]:
raw_dataset = pd.read_csv(PAIRS_PATH, nrows=None)
raw_dataset.columns = ['data']

In [5]:
raw_dataset = raw_dataset

In [6]:
def split_raw_data(row):
    fields = str(row[0]).split('\t')
    is_the_same = 0
    path1, path2 = '{}/{}_{:04d}.png', '{}/{}_{:04d}.png'
    if len(fields) == 3:
        is_the_same = 1
        path1 = path1.format(fields[0], fields[0], int(fields[1]))
        path2 = path2.format(fields[0], fields[0], int(fields[2]))
    elif len(fields) == 4:
        path1 = path1.format(fields[0], fields[0], int(fields[1]))
        path2 = path2.format(fields[2], fields[2], int(fields[3]))
    else:
        raise(Exception('invalid fields'))
    
    return pd.Series([path1, path2, is_the_same], ['path1', 'path2', 'is_the_same']) 

In [7]:
dataset = raw_dataset.apply(split_raw_data, axis=1)

In [8]:
dataset.head()

Unnamed: 0,path1,path2,is_the_same
0,Abel_Pacheco/Abel_Pacheco_0001.png,Abel_Pacheco/Abel_Pacheco_0004.png,1
1,Akhmed_Zakayev/Akhmed_Zakayev_0001.png,Akhmed_Zakayev/Akhmed_Zakayev_0003.png,1
2,Akhmed_Zakayev/Akhmed_Zakayev_0002.png,Akhmed_Zakayev/Akhmed_Zakayev_0003.png,1
3,Amber_Tamblyn/Amber_Tamblyn_0001.png,Amber_Tamblyn/Amber_Tamblyn_0002.png,1
4,Anders_Fogh_Rasmussen/Anders_Fogh_Rasmussen_00...,Anders_Fogh_Rasmussen/Anders_Fogh_Rasmussen_00...,1


In [9]:
dataset.shape

(6000, 3)

In [10]:
def path2ImgVec(path):
    x = img_to_array(load_img(join(DATA_HOME, path)))
    return x.reshape((1,) + x.shape)

In [11]:
img_paths = list(set(dataset.path1.tolist()+dataset.path2.tolist()))
len(img_paths)

7701

In [12]:
pool = multiprocessing.Pool(8)
results = pool.map(path2ImgVec, img_paths)
pool.close()
pool.join()

In [13]:
img_vecs = np.vstack(results)
img_vecs.shape

(7701, 55, 47, 3)

In [14]:
input_shape = img_vecs.shape[1:]
img_vecs.shape

(7701, 55, 47, 3)

In [15]:
from keras.models import load_model

# model = load_model('../models/facescrub-faceonly-simple-cnn.model.h5')
# model = load_model('../models/webface-simple-cnn.3348.model.h5')
model = load_model('../models/webface-simple-cnn.aligned.model.h5')

In [16]:
from keras import backend as K
inp = model.input
# outputs = [layer.output for layer in model.layers]
outputs = [model.layers[-2].output]
functor = K.function([inp] + [K.learning_phase()], outputs)

In [17]:
deepids = functor([img_vecs, 1.])[0]

In [18]:
deepids.shape

(7701, 160)

In [19]:
img_deepid_mem = {k:v for k, v in zip(img_paths, deepids)}

In [20]:
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

COL_NAMES = [
    "z_cosine",
#     'z_cityblock',
#     'z_jaccard',
#     "z_canberra",
#     "z_euclidean",
#     "z_minkowski",
#     "z_braycurtis",
    'z_skew1',
    'z_skew2',
    'z_kurtosis1',
    'z_kurtosis2',
#     'z_tvar1',
#     'z_tvar2',
] + ['is_the_same']

def to_deepid_features(row):
    vec1 = img_deepid_mem[row['path1']]
    vec2 = img_deepid_mem[row['path2']]

    feats = [
        cosine(vec1, vec2),
#         cityblock(vec1, vec2),
#         jaccard(vec1, vec2),
#         canberra(vec1, vec2),
#         euclidean(vec1, vec2),
#         minkowski(vec1, vec2, 3),
#         braycurtis(vec1, vec2),
        skew(vec1),
        skew(vec2),
        kurtosis(vec1),
        kurtosis(vec2),
#         tvar(vec1),
#         tvar(vec2),
    ]
    
    return pd.Series(feats + [row['is_the_same']], COL_NAMES) 

In [21]:
feat_dataset = dataset.apply(to_deepid_features, axis=1)

In [22]:
feat_dataset.head()

Unnamed: 0,z_cosine,z_skew1,z_skew2,z_kurtosis1,z_kurtosis2,is_the_same
0,0.37941,0.979269,0.826802,0.269702,-0.360797,1.0
1,0.321641,0.917269,1.148254,-0.29223,0.459073,1.0
2,0.387377,1.106928,1.148254,0.446858,0.459073,1.0
3,0.2767,1.092813,1.137071,0.416162,0.900289,1.0
4,0.430194,1.237766,1.100429,1.605136,0.497464,1.0


In [23]:
col = [c for c in feat_dataset.columns if c[0]=='z']

X = feat_dataset[col].as_matrix()
y = feat_dataset['is_the_same'].as_matrix()

In [24]:
X[0]

array([ 0.3794097 ,  0.97926909,  0.82680219,  0.26970155, -0.36079694])

In [28]:
import xgboost as xgb
params = {}
params["objective"] = "binary:logistic"
params['eval_metric'] = ['logloss', 'error']
# params["eta"] = 0.02
# params["subsample"] = 0.7
# params["min_child_weight"] = 1
# params["colsample_bytree"] = 0.7
params["max_depth"] = 4
# params["silent"] = 1
params["seed"] = 1632

In [25]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)

In [26]:
d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 500, watchlist, early_stopping_rounds=50, verbose_eval=100)

[0]	train-logloss:0.559145	train-error:0.160741	valid-logloss:0.557986	valid-error:0.16
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 50 rounds.
Stopping. Best iteration:
[16]	train-logloss:0.349147	train-error:0.155185	valid-logloss:0.359831	valid-error:0.148333



In [29]:
d_all = xgb.DMatrix(X, label=y)

xgb.cv(params, d_all, 500, nfold=10, metrics=['error'], early_stopping_rounds=20, shuffle=False, seed=42)

Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.169833,0.011266,0.160352,0.001544
1,0.169667,0.00951,0.159111,0.00136
2,0.169333,0.011333,0.158926,0.001225
3,0.167166,0.010248,0.158556,0.001455
4,0.167,0.009741,0.157834,0.001021
5,0.167667,0.009493,0.157963,0.00158
6,0.167,0.009304,0.157796,0.001724
7,0.166833,0.010123,0.157167,0.001828
8,0.167666,0.01044,0.157055,0.002162
9,0.166,0.010546,0.156704,0.001729
