# Retrieving facial embeddings and building the index

In [7]:
from pymongo import MongoClient
import pymongo

CONNECTION_STRING = "mongodb://admin:admin@localhost:27018/?authSource=admin"

client = MongoClient(CONNECTION_STRING)
db = client['social_profiling']

profile_collection = db['eval_profiles']
face_collection = db['eval_embeddings']

In [9]:
faces = face_collection.find()
faces = list(faces)

In [10]:
import numpy as np

ids = []
embeddings = []
for face in faces:
    fid = face['_id']
    fid = fid.split('_')[-1]
    embedding = face['values']
    ids.append(fid)
    embeddings.append(embedding)

ids = np.array(ids, dtype='int64')
embeddings = np.array(embeddings, dtype='f')

In [11]:
print(ids.shape)
print(embeddings.shape)

(34796,)
(34796, 128)


In [12]:
import faiss

dimensions = 128    # FaceNet output is 128d vector

metric = 'euclidean' #euclidean, cosine
 
if metric == 'euclidean':
    index = faiss.IndexFlatL2(dimensions)
elif metric == 'cosine':
    index = faiss.IndexFlatIP(dimensions)
    faiss.normalize_L2(embeddings)

index = faiss.IndexIDMap(index)

In [13]:
index.add_with_ids(embeddings, ids)

In [79]:
faiss.write_index(index, "./test.index")

In [14]:
split_path = '../tests/nguoinoitiengtv/split/'
import os
train_path = os.path.join(split_path, 'train')
val_path = os.path.join(split_path, 'val')

In [15]:
import json
f2p_map = []

with open( os.path.join(split_path, 'filename_profile_map.json'), 'r' ) as f:
    f2p_map = json.load(f)

In [16]:
import pandas as pd

df = pd.DataFrame.from_dict(f2p_map, orient='index')

df.columns = ['truth']
df['is_in_val'] = np.nan
df['has_detect_face'] = np.nan
df['prediction'] = np.nan

df

Unnamed: 0,truth,is_in_val,has_detect_face,prediction
son_tung_2f2254a088377df162f65714467920756d42bd2b.jpg,profile_000000,,,
son_tung_5f3fcaef2ee07436a10993d42e2825eb8a29e45b.jpg,profile_000000,,,
son_tung_1bfcf9ab4f2f7d07f4d01a46e790ed8a37b4374c.jpg,profile_000000,,,
son_tung_ba35896e20d1430ca11bd05b4a52c073ae18a152.jpg,profile_000000,,,
son_tung_c90fb40c95e77c2f78e5a9e7d21df11021145e89.jpg,profile_000000,,,
...,...,...,...,...
nguyen_thi_mai_ka_1199de669876527412fbbe810cd9438e529a8cc0.jpg,profile_012524,,,
nguyen_thi_mai_ka_d2ca55075502fcfd2c61f2bd99cd027fe25e14af.jpg,profile_012524,,,
nguyen_thi_mai_ka_3a4456452f42cda05b3e06002535fbdabceaa776.jpg,profile_012524,,,
nguyen_thi_mai_ka_b40d03f5243adc2892d99fa3d15898593e0390c2.jpg,profile_012524,,,


In [17]:
import os

base_path = val_path

val_filenames = []

for root, dirs, files in os.walk(base_path):
    if not dirs:
        for f in files:
            path = os.path.join(root, f)
            val_filenames.append(path)
            
len(val_filenames)

13866

In [18]:
def get_id(prefix, suffix, rjust_fill=6):
    return (str(prefix) + str(suffix).rjust(rjust_fill, '0'))

In [19]:
from deepface.basemodels import Facenet
from deepface.commons import functions
model = Facenet.loadModel()

def get_closest_match(target_img_path, k, index):
    target_img = functions.preprocess_face(img=target_img_path, target_size=(160, 160), detector_backend='mtcnn')
    target_representation = model.predict(target_img)[0,:]

    target_representation = np.array(target_representation, dtype='f')
    target_representation = np.expand_dims(target_representation, axis=0)

    distances, neighbors = index.search(target_representation, k)

    return distances, neighbors

In [20]:
from tqdm import tqdm
import numpy as np

test_result = {}

for path in tqdm(val_filenames[0:100]):
    head, tail = os.path.split(path)
    df.at[tail, 'is_in_val'] = True

    # try:
    #     preprocessed_faces[tail] = functions.preprocess_face(img=path, target_size=(160, 160))
    # except:
    #     df.at[tail, 'has_detect_face'] = False

    # # print(path)
    # # d, n = get_closest_match(path, 1, index)
    # # print(path, d, n)
    try:
        d, n = get_closest_match(path, 20, index)
    except ValueError:
        df.at[tail, 'has_detect_face'] = False
        continue

    matches = face_collection.find(
        { "_id": { "$in": [get_id("embedding_", idnum) for idnum in n[0].tolist()] } },
        { "profile_id": 1 }
        )
    matches = list(matches)

    results = [ [ match['_id'], match['profile_id'], np.float64(distance) ] for match, distance in zip(matches, d[0]) ]

    test_result[tail] = results
            
    # match = face_collection.find_one({ "_id": get_id('embedding_', n[0][0]) }, { "profile_id": 1 })
    # profile_id = match['profile_id']

    # print(f"{d} {n} {profile_id}")
    
    # df.at[tail, 'prediction'] = profile_id
    df.at[tail, 'has_detect_face'] = True

import json
with open(os.path.join(split_path, 'test_result.json'), 'w') as f:
    json.dump(test_result, f)
print(f"Written test result to {os.path.join(split_path, 'test_result.json')}")

  0%|          | 0/100 [00:00<?, ?it/s]2022-02-23 10:40:39.339559: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8302

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.




100%|██████████| 100/100 [01:35<00:00,  1.05it/s]

Written test result to ../tests/nguoinoitiengtv/split/test_result.json





In [99]:
test_result['dita_von_teese_eeeb1401545eb75a52c7d2c846f908c92865bf12.jpg'][0]

('embedding_003408', 'profile_001251', 50.48596954345703)

In [104]:
with open(os.path.join(split_path, 'test_result.json'), 'r') as f:
    import_result = json.load(f)

In [105]:
import_result

{'dita_von_teese_eeeb1401545eb75a52c7d2c846f908c92865bf12.jpg': [['embedding_003408',
   'profile_001251',
   50.48596954345703],
  ['embedding_004047', 'profile_001472', 60.60420227050781],
  ['embedding_004048', 'profile_001472', 64.78118133544922],
  ['embedding_004049', 'profile_001472', 92.24520874023438],
  ['embedding_004050', 'profile_001472', 94.05899810791016],
  ['embedding_004862', 'profile_001750', 98.48477935791016],
  ['embedding_004863', 'profile_001750', 103.31478881835938],
  ['embedding_004864', 'profile_001750', 105.87550354003906],
  ['embedding_007195', 'profile_002543', 106.4572982788086],
  ['embedding_007414', 'profile_002627', 106.69178771972656],
  ['embedding_007913', 'profile_002787', 107.30900573730469],
  ['embedding_014416', 'profile_005036', 109.97169494628906],
  ['embedding_014417', 'profile_005036', 110.31056213378906],
  ['embedding_019986', 'profile_007011', 112.51024627685547],
  ['embedding_021804', 'profile_007658', 112.66697692871094],
  ['embe

In [92]:
df[ df['has_detect_face'] == True ]

Unnamed: 0,truth,is_in_val,has_detect_face,prediction
son_tung_ba35896e20d1430ca11bd05b4a52c073ae18a152.jpg,profile_000000,True,True,profile_000000
hari_won_81c3b0a93343051452695d92dc9ad658f1bf60be.jpg,profile_000001,True,True,profile_000001
hari_won_a84c21610bdec19752b0bf95da9bc7b5e5d3ea9d.jpg,profile_000001,True,True,profile_000001
ho_quang_hieu_e01ffcb608bcd21746da62c9a2543bd3b1a5c3a3.jpg,profile_000002,True,True,profile_002298
ho_quang_hieu_dcc5e659b6bc7738ddf226f54bdbf1e4cd6fce68.jpg,profile_000002,True,True,profile_011015
...,...,...,...,...
ha_kim_tuyen_9a2abb7626ff23d2a578f5170de17feeb09bb165.jpg,,True,True,profile_000618
thu_ha_40215bc9ed6f5789dc628457f2ec5d03e6ee0424.jpg,,True,True,profile_011941
beth_jordan_f81bbc4faf84fb1c10d9a47dab6cc9fe270db561.jpg,,True,True,profile_007683
tina_turner_91ff2bcf8d137031497ac18bd3b55e19292e161d.jpg,,True,True,profile_011324


In [93]:
df.to_csv('./test_result.csv')

In [101]:
df[ df['truth'] == df['prediction'] ]

Unnamed: 0,truth,is_in_val,has_detect_face,prediction
son_tung_ba35896e20d1430ca11bd05b4a52c073ae18a152.jpg,profile_000000,True,True,profile_000000
hari_won_81c3b0a93343051452695d92dc9ad658f1bf60be.jpg,profile_000001,True,True,profile_000001
hari_won_a84c21610bdec19752b0bf95da9bc7b5e5d3ea9d.jpg,profile_000001,True,True,profile_000001
sowon_685583906b95f6b9aa555b96881aa2f80fea5692.jpg,profile_000015,True,True,profile_000015
tobias_schramm_7b000a7857a56eb4fd29644ec12cd17eb82a98f2.jpg,profile_000016,True,True,profile_000016
...,...,...,...,...
linh_trang_e50551b7916d3fae89eea8b88cad72564e538f96.jpg,profile_012505,True,True,profile_012505
ong_hoang_nam_952a11ca7273c499caad34283e6e99283e16a78d.jpg,profile_012508,True,True,profile_012508
ong_hoang_nam_a8570425db9bb6928a402a8aaa2ae69724beaff6.jpg,profile_012508,True,True,profile_012508
le_phan_ngoc_lan_bc2d715c835d1dcf636e93d173aad29a9f49743c.jpg,profile_012521,True,True,profile_012521


In [100]:
df[ df['is_in_val'] == True ]

Unnamed: 0,truth,is_in_val,has_detect_face,prediction
son_tung_ba35896e20d1430ca11bd05b4a52c073ae18a152.jpg,profile_000000,True,True,profile_000000
hari_won_81c3b0a93343051452695d92dc9ad658f1bf60be.jpg,profile_000001,True,True,profile_000001
hari_won_a84c21610bdec19752b0bf95da9bc7b5e5d3ea9d.jpg,profile_000001,True,True,profile_000001
ho_quang_hieu_e01ffcb608bcd21746da62c9a2543bd3b1a5c3a3.jpg,profile_000002,True,True,profile_002298
ho_quang_hieu_dcc5e659b6bc7738ddf226f54bdbf1e4cd6fce68.jpg,profile_000002,True,True,profile_011015
...,...,...,...,...
ha_kim_tuyen_9a2abb7626ff23d2a578f5170de17feeb09bb165.jpg,,True,True,profile_000618
thu_ha_40215bc9ed6f5789dc628457f2ec5d03e6ee0424.jpg,,True,True,profile_011941
beth_jordan_f81bbc4faf84fb1c10d9a47dab6cc9fe270db561.jpg,,True,True,profile_007683
tina_turner_91ff2bcf8d137031497ac18bd3b55e19292e161d.jpg,,True,True,profile_011324
