In [1]:
import pandas as pd
import numpy as np
import keras
from keras.utils.data_utils import get_file
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from os.path import join
import multiprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.stats import skew, kurtosis, tvar, entropy, pearsonr
import keras.backend.tensorflow_backend as K

%matplotlib inline

Using TensorFlow backend.


In [2]:
DATA_HOME = '../lfw-aligned-with-32-margin-resized/'

In [3]:
PAIRS_PATH = '../pairs.txt'

In [4]:
raw_dataset = pd.read_csv(PAIRS_PATH, nrows=None)
raw_dataset.columns = ['data']

In [5]:
raw_dataset.shape

(6000, 1)

In [6]:
raw_dataset = raw_dataset

In [7]:
def split_raw_data(row):
    fields = str(row[0]).split('\t')
    is_the_same = 0
    path1, path2 = '{}/{}_{:04d}.png', '{}/{}_{:04d}.png'
    if len(fields) == 3:
        is_the_same = 1
        path1 = path1.format(fields[0], fields[0], int(fields[1]))
        path2 = path2.format(fields[0], fields[0], int(fields[2]))
    elif len(fields) == 4:
        path1 = path1.format(fields[0], fields[0], int(fields[1]))
        path2 = path2.format(fields[2], fields[2], int(fields[3]))
    else:
        raise(Exception('invalid fields'))
    
    return pd.Series([path1, path2, is_the_same], ['path1', 'path2', 'is_the_same']) 

In [8]:
dataset = raw_dataset.apply(split_raw_data, axis=1)

In [9]:
dataset.head()

Unnamed: 0,path1,path2,is_the_same
0,Abel_Pacheco/Abel_Pacheco_0001.png,Abel_Pacheco/Abel_Pacheco_0004.png,1
1,Akhmed_Zakayev/Akhmed_Zakayev_0001.png,Akhmed_Zakayev/Akhmed_Zakayev_0003.png,1
2,Akhmed_Zakayev/Akhmed_Zakayev_0002.png,Akhmed_Zakayev/Akhmed_Zakayev_0003.png,1
3,Amber_Tamblyn/Amber_Tamblyn_0001.png,Amber_Tamblyn/Amber_Tamblyn_0002.png,1
4,Anders_Fogh_Rasmussen/Anders_Fogh_Rasmussen_00...,Anders_Fogh_Rasmussen/Anders_Fogh_Rasmussen_00...,1


In [10]:
dataset.shape

(6000, 3)

In [11]:
from PIL import Image

def path2ImgVec(path):
    x = img_to_array(load_img(join(DATA_HOME, path)))
#     x = img_to_array(load_img(join(DATA_HOME, path)).convert('L'))
    return x.reshape((1,) + x.shape)

def path2ImgVecFlipped(path):
    img = load_img(join(DATA_HOME, path))
#     img = load_img(join(DATA_HOME, path)).convert('L')
    img = img.transpose(Image.FLIP_LEFT_RIGHT)
    x = img_to_array(img)
    return x.reshape((1,) + x.shape)

In [12]:
img_paths = list(set(dataset.path1.tolist()+dataset.path2.tolist()))
len(img_paths)

7701

In [13]:
pool = multiprocessing.Pool(8)
results1 = pool.map(path2ImgVec, img_paths)
pool.close()
pool.join()

In [14]:
pool = multiprocessing.Pool(8)
results2 = pool.map(path2ImgVecFlipped, img_paths)
pool.close()
pool.join()

In [15]:
img_vecs1 = np.vstack(results1)
img_vecs1.shape
img_vecs2 = np.vstack(results2)
img_vecs2.shape

(7701, 55, 47, 3)

In [16]:
input_shape = img_vecs1.shape[1:]
img_vecs1.shape

(7701, 55, 47, 3)

In [17]:
from keras.models import load_model

# model = load_model('../models/facescrub-faceonly-simple-cnn.model.h5')
# 0.250667	0.014855	0.240852	0.003922

# model = load_model('../models/webface-simple-cnn.3348.model.h5')
# 0.258	0.012423	0.247926	0.001509

# model = load_model('../models/webface-simple-cnn.aligned.margin16.model.h5')
# 0.188000	0.009123	0.174352	0.001018

# model = load_model('../models/webface-simple-cnn.aligned.margin32-1.model.h5')
# 0.158667	0.009684	0.149889	0.001223

# model = load_model('../models/webface-facescrub-faceonly-simple-cnn.aligned.model.h5')
# 0.146333	0.015720	0.137333	0.002688

# model = load_model('../models/webface-facescrub-faceonly-simple-cnn.aligned.selected.model.h5')
# 0.149667	0.012601	0.119389	0.002129

# model = load_model('../models/webface-facescrub-faceonly-simple-cnn.aligned.augment.model.h5')

# model = load_model('../models/webface-facescrub-faceonly-simple-cnn.aligned.flipped.model.h5')
# with <6 augmentation
# 0.131	0.010220	0.099037	0.002640

# model = load_model('../models/webface-full-simple-cnn.aligned.margin32.model.h5')
# about 60% data ?
# 0.111667	0.011571	0.099852	0.002124
# All data
# 0.102500	0.011908	0.065889	0.001608

# model = load_model('../models/webface-full-simple-cnn.aligned.margin32.s20.model.h5')
# 0.100333	0.013392	0.082704	0.001788

# with lfw flipped
#*0.095833	0.013627	0.077963	0.000824

# model = load_model('../models/webface-full-simple-cnn.aligned.margin32.grey.model.h5')
# All data
# 0.114000	0.012936	0.094685	0.001828
# All data + flipped
# 0.106500	0.011772	0.081500	0.002122
# 0.105167	0.010735	0.089592	0.001551

# model = load_model('../models/celeba-full-simple-cnn.aligned.margin32.model.h5')
# 0.139167 	0.018488 	0.131982 	0.002264

# model = load_model('../models/celeba-full-simple-cnn.aligned.margin32.flipped.model.h5')
# 0.126500 	0.009957 	0.114055 	0.002217

model = load_model('../models/celeba-full-simple-cnn.aligned.margin32.flipped.b1024.s30.model.h5')
# 0.111500	0.011315	0.097333	0.001769

# model = load_model('../models/webface-full-celeba-simple-cnn.aligned.margin32.grey.model.h5')
# 0.107333	0.013085	0.089500	0.001393

# model = load_model('../models/webface-full-celeba-simple-cnn.aligned.margin32.grey.flipped.s20.model.h5')
# 0.101000	0.013646	0.088574	0.002227
# 0.098667	0.014020	0.082630	0.001580
# 0.098333	0.013333	0.082704	0.001736
# 0.100500	0.009547	0.090111	0.001466
# 0.093667	0.010214	0.065185	0.001320

# with lfw flipped
#*090667	0.007079	0.056815	0.001879

In [18]:
from keras import backend as K
inp = model.input

outputs = [model.layers[-2].output]
functor = K.function([inp] + [K.learning_phase()], outputs)

In [19]:
img_vecs1.shape

(7701, 55, 47, 3)

In [20]:
deepids1 = functor([img_vecs1, 1.])[0]

In [21]:
deepids2 = functor([img_vecs2, 1.])[0]

In [22]:
deepids1.shape

(7701, 160)

In [23]:
img_paths_flipped = ["~"+i for i in img_paths]

In [24]:
deepids = np.vstack([deepids1, deepids2])

In [25]:
deepids.shape

(15402, 160)

In [26]:
img_deepid_mem = {k:v for k, v in zip(img_paths+img_paths_flipped, deepids)}

In [27]:
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

COL_NAMES = [
    "z_cosine1",
    "z_cosine2",
    "z_cosine3",
    "z_cosine4",
#     'z_cityblock',
#     'z_jaccard',
#     "z_canberra",
#     "z_euclidean",
#     "z_minkowski",
#     "z_braycurtis",
    'z_skew1',
    'z_skew2',
    'z_kurtosis1',
    'z_kurtosis2',
    'z_tvar1',
    'z_tvar2',
] + ['is_the_same']

def to_deepid_features(row):
    vec1 = img_deepid_mem[row['path1']]
    vec2 = img_deepid_mem[row['path2']]

    fvec1 = img_deepid_mem["~"+row['path1']]
    fvec2 = img_deepid_mem["~"+row['path2']]
    
    feats = [
        cosine(vec1, vec2),
        cosine(vec1, fvec2),
        cosine(fvec1, vec2),
        cosine(fvec1, fvec2),
#         cityblock(vec1, vec2),
#         jaccard(vec1, vec2),
#         canberra(vec1, vec2),
#         euclidean(vec1, vec2),
#         minkowski(vec1, vec2, 3),
#         braycurtis(vec1, vec2),
        skew(vec1),
        skew(vec2),
        kurtosis(vec1),
        kurtosis(vec2),
        tvar(vec1),
        tvar(vec2),
    ]
    
    return pd.Series(feats + [row['is_the_same']], COL_NAMES) 

In [28]:
feat_dataset = dataset.apply(to_deepid_features, axis=1)

In [29]:
feat_dataset.head()

Unnamed: 0,z_cosine1,z_cosine2,z_cosine3,z_cosine4,z_skew1,z_skew2,z_kurtosis1,z_kurtosis2,z_tvar1,z_tvar2,is_the_same
0,0.416971,0.389608,0.475567,0.429334,0.96648,1.468843,0.23977,1.947711,5.246109,6.331815,1.0
1,0.20431,0.204679,0.182535,0.192055,0.929978,1.037582,0.03542,0.594503,7.052558,6.555378,1.0
2,0.326972,0.3065,0.305355,0.281173,0.892642,1.037582,-0.148579,0.594503,6.535841,6.555378,1.0
3,0.265033,0.26035,0.282445,0.254809,0.8628,0.747986,-0.165919,-0.471943,8.90671,9.162576,1.0
4,0.369209,0.356558,0.409931,0.406881,1.038093,1.246543,0.420399,1.494947,6.321789,9.156434,1.0


In [30]:
col = [c for c in feat_dataset.columns if c[0]=='z']

X = feat_dataset[col].as_matrix()
y = feat_dataset['is_the_same'].as_matrix()

In [31]:
X[0]

array([ 0.41697105,  0.38960834,  0.47556701,  0.4293338 ,  0.96647978,
        1.4688431 ,  0.23977028,  1.94771067,  5.24610948,  6.33181526])

In [32]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)

In [33]:
import xgboost as xgb
params = {}
params["objective"] = "binary:logistic"
params['eval_metric'] = ['logloss', 'error']
params["eta"] = 0.02
# params["subsample"] = 0.7
# params["min_child_weight"] = 1
# params["colsample_bytree"] = 0.7
params["max_depth"] = 4
params["seed"] = 1632

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 500, watchlist, early_stopping_rounds=50, verbose_eval=100)

[0]	train-logloss:0.680114	train-error:0.114259	valid-logloss:0.68047	valid-error:0.121667
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 50 rounds.
Stopping. Best iteration:
[37]	train-logloss:0.411738	train-error:0.108519	valid-logloss:0.421356	valid-error:0.108333



In [34]:
d_all = xgb.DMatrix(X, label=y)

xgb.cv(params, d_all, 500, nfold=10, metrics=['error'], shuffle=False, early_stopping_rounds=50, seed=42)

Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.121167,0.010542,0.11387,0.001581
1,0.121167,0.010383,0.113759,0.001778
2,0.120667,0.009434,0.113444,0.001498
3,0.120667,0.009196,0.113074,0.001939
4,0.120667,0.009809,0.112981,0.00188
5,0.119667,0.009183,0.112778,0.001872
6,0.120167,0.009558,0.112908,0.001714
7,0.119167,0.009106,0.112593,0.001859
8,0.119166,0.010626,0.112278,0.001465
9,0.118833,0.009748,0.112111,0.001608


In [35]:
np.save('../output/celeba-full-simple-cnn.aligned.margin32.flipped.b1024.s30.model.h5.x.npy', X)

In [36]:
np.sum(y)

3000.0

In [37]:
y.shape

(6000,)