In [62]:
import numpy as np
import pandas as pd

In [4]:
embedding_vector_length = 5

In [36]:
# Deterministic probe embedding (all ones, five components to match
# embedding_vector_length) so every output below is reproducible.
input_face = np.array([1, 1, 1, 1, 1])
input_face

array([1, 1, 1, 1, 1])

In [37]:
# Hand-made reference embeddings, one per row, chosen to exercise the
# interesting cosine-similarity cases against the all-ones probe.
dataset_faces = np.array(
    [
        [1, 1, 1, 1, 1],  # identical to the probe
        [-1, -1, -1, -1, -1],  # exact opposite direction
        [1, 1, 0, 1, 1],  # one component zeroed out
        [100, 100, 100, 100, 100],  # same direction, 100x the magnitude
    ]
)
dataset_faces

array([[  1,   1,   1,   1,   1],
       [ -1,  -1,  -1,  -1,  -1],
       [  1,   1,   0,   1,   1],
       [100, 100, 100, 100, 100]])

In [38]:
# Numerator of the cosine similarity: dot product of every dataset row with the probe.
dataset_faces.dot(input_face)

array([  5,  -5,   4, 500])

In [39]:
# Euclidean norm of each dataset row (axis=1 reduces across the embedding components).
np.linalg.norm(dataset_faces, axis=1)

array([  2.23606798,   2.23606798,   2.        , 223.60679775])

In [40]:
# Euclidean norm of the probe embedding.
np.linalg.norm(input_face)

2.23606797749979

In [41]:
# Denominator of the cosine similarity: probe norm times each row norm.
np.linalg.norm(input_face) * np.linalg.norm(dataset_faces, axis=1)

array([  5.        ,   5.        ,   4.47213595, 500.        ])

In [42]:
# Cosine similarity = (A . B) / (|A| * |B|), one value per dataset row.
# The result depends only on direction: the row of 100s scores 1.0, exactly
# like the row identical to the probe, and the negated row scores -1.0.
dataset_faces.dot(input_face) / (
    np.linalg.norm(input_face) * np.linalg.norm(dataset_faces, axis=1)
)

array([ 1.        , -1.        ,  0.89442719,  1.        ])

In [47]:
# Compatibility check: the probe's shape (m,) must equal the per-row shape of the (n, m) dataset.
input_face.shape == dataset_faces.shape[1:]

True

In [None]:
# NOTE(review): leftover cell that was never executed; it only evaluates to the
# DataFrame class itself and can be deleted.
pd.DataFrame

In [82]:
from typing import Union


def calculate_similarities_with_db(
    embedding: np.ndarray,
    db_embeddings: Union[np.ndarray, pd.DataFrame],
) -> Union[np.ndarray, pd.Series]:
    """Score one embedding against every embedding stored in a database.

    For each of the n database embeddings the cosine similarity with
    ``embedding`` is computed and then rescaled linearly from [-1, 1] to
    [0, 1], so 1 means "same direction", 0.5 "orthogonal" and 0 "opposite
    direction". Every embedding is a vector of length m.

    Parameters
    ----------
    embedding : array with shape (m,)
        The query embedding.
    db_embeddings : array or dataframe with shape (n, m)
        One stored embedding per row. All columns must be numeric.

    Returns
    -------
    array or series with shape (n,)
        Rescaled cosine similarities as floats, guaranteed to lie in [0, 1].
        An array is returned for array input; for dataframe input the
        result is a series carrying the dataframe's index. A pair in which
        either vector has zero norm has no defined direction and is scored
        as orthogonal (0.5) instead of producing NaN.

    Raises
    ------
    AssertionError
        If ``embedding`` does not have the same length as the database rows.
    """
    assert (
        embedding.shape == db_embeddings.shape[1:]
    ), "Embedding vectors must share the same length."

    # cosine_sim = (A dot B) / (norm(A) * norm(B))
    dot_products = db_embeddings.dot(embedding)
    norm_embedding = np.linalg.norm(embedding)
    norm_db_embedding = np.linalg.norm(db_embeddings, axis=1)
    norm_products = norm_embedding * norm_db_embedding

    # A zero norm would make the division 0 / 0 = NaN (and np.argmax would
    # then pick that NaN as the "best" match). The matching dot product is
    # necessarily 0, so dividing by 1 instead yields a cosine of 0.
    safe_norm_products = np.where(norm_products == 0, 1.0, norm_products)
    cosine_sim = dot_products / safe_norm_products

    # change cosine sim range from [-1,1] to [0,1]
    cosine_sim = (cosine_sim + 1) / 2

    # Floating-point rounding can overshoot the bounds by a few ulps; clip so
    # the documented [0, 1] range always holds.
    return cosine_sim.astype(float).clip(0.0, 1.0)

In [74]:
# Score the probe against the plain ndarray dataset (returns an ndarray).
ans = calculate_similarities_with_db(input_face, dataset_faces)

In [75]:
ans

array([1.00000000e+00, 1.11022302e-16, 9.47213595e-01, 1.00000000e+00])

In [76]:
# Score of the negated row: mathematically 0, but floating-point rounding
# leaves a tiny positive residue (about 1.1e-16).
ans[1]

1.1102230246251565e-16

In [68]:
# Wrap the example embeddings in a DataFrame and label each row with a
# subject id, appended as the last column.
db_ex = pd.DataFrame(dataset_faces)
db_ex["subject_id"] = list("ABCD")
db_ex

Unnamed: 0,0,1,2,3,4,subject_id
0,1,1,1,1,1,A
1,-1,-1,-1,-1,-1,B
2,1,1,0,1,1,C
3,100,100,100,100,100,D


In [69]:
# Drop the trailing subject_id column so only the numeric embedding columns remain.
db_ex.iloc[:, :-1]

Unnamed: 0,0,1,2,3,4
0,1,1,1,1,1
1,-1,-1,-1,-1,-1
2,1,1,0,1,1
3,100,100,100,100,100


In [79]:
# Same scoring with DataFrame input: the result is a Series indexed like db_ex.
ans2 = calculate_similarities_with_db(input_face, db_ex.iloc[:, :-1])
ans2

0    1.000000e+00
1    1.110223e-16
2    9.472136e-01
3    1.000000e+00
dtype: float64

In [80]:
# Position of the highest score.
# NOTE(review): rows 0 and 3 both display as 1.0 above, yet this returned 3 -
# presumably a last-bit floating-point difference; do not rely on tie order.
ans2.argmax()

3

In [78]:
db_ex.iloc[:, :-1].shape

(4, 5)

In [81]:
type(db_ex)

pandas.core.frame.DataFrame

In [83]:
isinstance(db_ex, pd.DataFrame)

True

In [110]:
import os


def add_embedding_to_db(
    db_path: str,
    subject_id: str,
    embedding: np.ndarray,
):
    """Append one labelled embedding to the CSV database at ``db_path``.

    The CSV has a ``subject_id`` column followed by one column per embedding
    component (named 0, 1, ..., m-1). If the file does not exist it is created
    with that header; otherwise a single row is appended and the rows already
    stored are left untouched, so existing ids such as "007" are never
    re-parsed and rewritten as numbers.

    Parameters
    ----------
    db_path : str
        Path of the CSV file to create or extend.
    subject_id : str
        Label stored in the first column of the new row.
    embedding : array with shape (m,)
        Embedding vector; it is converted to float before being written.

    Raises
    ------
    ValueError
        If ``embedding`` is not one-dimensional.
    AssertionError
        If the file exists and its rows hold a different number of embedding
        components than ``embedding``.
    """
    embedding = np.asarray(embedding, dtype=float)  # ensure data type
    if embedding.ndim != 1:
        raise ValueError(
            f"Embedding must be a 1-D vector, got shape {embedding.shape}."
        )

    # Build the row as a typed one-row frame (string id + float components)
    # instead of squeezing the floats through a numpy string array.
    row = pd.DataFrame(
        [[subject_id, *embedding.tolist()]],
        columns=["subject_id", *range(embedding.shape[0])],
    )

    if not os.path.exists(db_path):
        row.to_csv(db_path, index=False)
        return

    # Only the header is needed to validate the vector length.
    stored_length = pd.read_csv(db_path, nrows=0).shape[1] - 1
    assert stored_length == embedding.shape[0], (
        f"Embedding vector ({embedding.shape[0]},) does not share "
        + f"the same length with others in db ({stored_length},)."
    )

    # A file edited by hand may lack the final newline; appending directly
    # would then glue the new row onto the last existing one.
    with open(db_path, "rb") as existing:
        existing.seek(-1, os.SEEK_END)
        ends_with_newline = existing.read(1) in (b"\n", b"\r")

    # newline="" lets pandas control the line terminator itself.
    with open(db_path, "a", newline="") as db_file:
        if not ends_with_newline:
            db_file.write("\n")
        row.to_csv(db_file, header=False, index=False)

In [94]:
# Smoke test: store a random 3-component embedding for subject "A".
# The values come from an unseeded RNG, so the file contents differ per run.
db_path = "test.csv"

add_embedding_to_db(
    db_path,
    "A",
    np.random.randn(3)
)

In [112]:
# Append another random embedding, this time for subject "D".
# NOTE(review): the CSV displayed further down also contains subjects "B" and
# "C", so this cell was presumably re-run with edited ids; those runs are not
# reproducible from the notebook as saved.
db_path = "test.csv"

add_embedding_to_db(
    db_path,
    "D",
    np.random.randn(3)
)

In [117]:
# arr2 is the random 3-component query embedding scored against test.csv below.
# NOTE(review): arr1 is not referenced by any other cell visible here.
arr1 = np.array(["Hello"])
arr2 = np.random.randn(3)

In [116]:
# Load the CSV database: subject_id in the first column, embedding components after it.
db_embeddings = pd.read_csv("test.csv")
db_embeddings

Unnamed: 0,subject_id,0,1,2
0,A,-0.529212,-0.724797,-0.802108
1,B,-0.17751,-0.549399,-0.434037
2,C,-0.90267,-0.209591,0.079867
3,D,-1.24255,-0.125968,-0.894645


In [118]:
# Score the query against every stored embedding; iloc[:, 1:] skips the subject_id column.
calculate_similarities_with_db(arr2, db_embeddings.iloc[:,1:])

0    0.974342
1    0.938045
2    0.875394
3    0.933357
dtype: float64

In [121]:
# Show every subject id next to its similarity score for the query arr2.
pd.concat(
    [
        db_embeddings["subject_id"],
        calculate_similarities_with_db(arr2, db_embeddings.iloc[:, 1:]),
    ],
    axis=1,
)

Unnamed: 0,subject_id,0
0,A,0.974342
1,B,0.938045
2,C,0.875394
3,D,0.933357


In [None]:
def recognize_face(face_img: np.ndarray, db_path: str):
    """Score a face against every embedding stored in a CSV database.

    Work in progress: the detection/embedding step is still a placeholder, so
    ``face_img`` is used directly as the embedding, and nothing is returned yet.

    Parameters
    ----------
    face_img : array
        Currently treated as the embedding itself, so its length must equal
        the number of embedding columns in the database.
    db_path : str
        Path of the CSV database to read. Its first column is skipped and the
        remaining columns are used as the stored embeddings.
    """
    # Detect and embed here.
    embedding = face_img

    # Read the database the caller asked for rather than a fixed file name.
    db_embeddings = pd.read_csv(db_path)
    # TODO: turn these scores into a recognition result and return it.
    similarities = calculate_similarities_with_db(embedding, db_embeddings.iloc[:, 1:])
    
