In [3]:
from glob import glob
import random
import json
import numpy as np
import os


# Locations of the MELD train split used throughout this notebook.
VISUAL_FEATURES_PATH = '/home/tk/datasets/MELD/visual-features/MELD-visual-features/train/'
VIDS_DIR = "/home/tk/datasets/MELD/MELD.Raw/train/train_splits/"
DATASET_PATH = '/home/tk/datasets/MELD/visual-features/MELD-visual-features/datasets.json'

# Annotation file; later cells index it by split name.
with open(DATASET_PATH, 'r') as stream:
    datasets = json.load(stream)

# Build {file stem -> unpickled feature object}, one entry per .npy file.
# Each file holds a pickled Python object wrapped in a 0-d array, hence
# allow_pickle=True and .item().
# NOTE(review): allow_pickle executes pickle code on load -- only use this
# on files you produced yourself.
visual_features = {}
for feature_path in glob(os.path.join(VISUAL_FEATURES_PATH, '*.npy')):
    file_stem = os.path.basename(feature_path).split('.npy')[0]
    visual_features[file_stem] = np.load(feature_path, allow_pickle=True).item()

print(len(visual_features))

9988


In [4]:
from tqdm.notebook import tqdm
# Findings whose last 'bbox' element (presumably the detector confidence --
# TODO confirm) is below this threshold are discarded.
FACE_PROB = 0.975
# Only frames whose number is a multiple of this value are used.
EVERY_N_FRAME = 16
# Utterances spoken by anyone else are skipped entirely.
ACTORS = ['Chandler', 'Joey', 'Monica', 'Phoebe', 'Rachel', 'Ross']
DATASET_chosen = 'train'
dataset_chosen = datasets[DATASET_chosen]

speakers_mentioned = set()

# This is gonna help us to find back to the source frame and video:
# running face index -> {'diautt': utterance id, 'frame': frame number}
idx2source = {}
embeddings_all = []
bboxes_all = []
landmarks_all = []

count = 0
for diautt, annot in tqdm(dataset_chosen.items()):
    # There is one face annotated in the entire video.
    # We are not even sure if the face is actually there or not.
    # Even though the face is there, we are not sure which frame number it is.
    try:
        speaker = annot['Speaker']
        if speaker not in ACTORS:
            continue

        for framenum, list_of_findings in visual_features[diautt].items():
            if framenum % EVERY_N_FRAME != 0:
                continue
            for finding in list_of_findings:
                bbox = finding['bbox']
                if bbox[-1] < FACE_PROB:
                    continue

                # Read every field BEFORE appending anything, so that a
                # missing key cannot leave the parallel lists with
                # different lengths.
                embedding = finding['embedding']
                landmark = finding['landmark']

                embeddings_all.append(embedding)
                bboxes_all.append(bbox)
                landmarks_all.append(landmark)
                idx2source[count] = {'diautt': diautt, 'frame': framenum}
                count += 1
                speakers_mentioned.add(speaker)
    except KeyError as e:
        # Best effort: report the missing key (e.g. an utterance without a
        # feature file) and move on to the next utterance.
        print(f"{e} doesn't exist")
        continue

assert len(embeddings_all) == len(bboxes_all) == len(landmarks_all) == \
        len(idx2source)

speakers_mentioned = sorted(speakers_mentioned)

print(f"Out of the {len(dataset_chosen)} number of videos (utterances),")
print(f"There are in total of {len(speakers_mentioned)} unique speakers mentioned")
print()
print(speakers_mentioned)
print()
print(f"and {len(embeddings_all)} faces detected")

# np.save does not create parent directories.
os.makedirs('./DEBUG', exist_ok=True)
np.save('./DEBUG/embeddings-all.npy', embeddings_all)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9989.0), HTML(value='')))

'dia125_utt3' doesn't exist

Out of the 9989 number of videos (utterances),
There are in total of 6 unique speakers mentioned

['Chandler', 'Joey', 'Monica', 'Phoebe', 'Rachel', 'Ross']

and 71883 faces detected


In [3]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


# One row per detected face.
X = np.stack(embeddings_all)

# #############################################################################
# Cluster the face embeddings with DBSCAN.
# DBSCAN is run with its default (euclidean) metric here.
# TODO: find a way to replace it with angle distance.
# eps and min_samples are hyper parameters that have to be tuned by hand;
# the values below (eps=0.8, min_samples=100) are the ones currently in use.
db = DBSCAN(eps=0.8, min_samples=100, n_jobs=-1).fit(X)
labels = db.labels_

# Boolean mask that is True for the core samples of any cluster.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# DBSCAN marks noise with label -1; it is not counted as a cluster.
n_noise_ = int((labels == -1).sum())
n_clusters_ = len(set(labels)) - (1 if n_noise_ > 0 else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))
print(f"Number of faces that are clustered: {len(embeddings_all) - n_noise_}")
print()

# Size of every cluster (noise included).
label_num, counts = np.unique(labels, return_counts=True)
for l, c in zip(label_num, counts):
    print(f" label {l} \t has {c} counts")

np.save('./DEBUG/embeddings-clusters.npy', labels)

Estimated number of clusters: 8
Estimated number of noise points: 28721
Silhouette Coefficient: 0.122
Number of faces that are clustered: 43162

 label -1 	 has 28721 counts
 label 0 	 has 6833 counts
 label 1 	 has 2471 counts
 label 2 	 has 7505 counts
 label 3 	 has 6275 counts
 label 4 	 has 5144 counts
 label 5 	 has 6134 counts
 label 6 	 has 8548 counts
 label 7 	 has 252 counts


In [7]:
import numpy as np
import shutil
from tqdm.notebook import tqdm
import cv2
import av
import random
from cltl_face_all.face_alignment import FaceDetection

# Write one aligned face crop per detection into ./DEBUG/faces/<cluster label>/
# so that the clusters can be inspected by eye.
labels = np.load("./DEBUG/embeddings-clusters.npy")

# Start from an empty output directory.
shutil.rmtree('./DEBUG/faces/', ignore_errors=True)

assert len(embeddings_all) == len(bboxes_all) == len(landmarks_all) == \
        len(idx2source) == len(labels)


def _read_frame_rgb(video_path, frame_index):
    """Return frame number `frame_index` of the first video stream.

    The frame is returned as a numpy array built from `frame.to_image()`.
    Returns None when the video has no frame with that index.  The container
    is closed on exit, and only the requested frame is converted to an image.
    """
    with av.open(video_path) as container:
        for idx, frame in enumerate(container.decode(video=0)):
            if idx == frame_index:
                return np.array(frame.to_image())
    return None


# Flatten the parallel lists into one record per detected face.
list_all = []

indices = list(idx2source.keys())
for idx in tqdm(indices):
    source_ = idx2source[idx]
    list_all.append({'label': labels[idx],
                     'embedding': embeddings_all[idx],
                     'bbox': bboxes_all[idx],
                     'landmark': landmarks_all[idx],
                     'diautt': source_['diautt'],
                     'frame': source_['frame']})

assert len(list_all) == len(labels)

# NOTE(review): presumably shuffled so that an interrupted run still leaves
# a mixed sample of every cluster on disk -- confirm.
random.shuffle(list_all)

fd = FaceDetection(device='cpu', face_detector='sfd')

for finding in tqdm(list_all):
    label_ = finding['label']
    bbox_ = finding['bbox']
    landmark_ = finding['landmark']
    diautt_ = finding['diautt']
    frame_num = finding['frame']
    video_path = os.path.join(VIDS_DIR, diautt_) + '.mp4'

    os.makedirs(os.path.join('./DEBUG/faces', str(label_)), exist_ok=True)

    if not os.path.isfile(video_path):
        continue

    img = _read_frame_rgb(video_path, frame_num)
    if img is None:
        # The video is shorter than the recorded frame number; cropping any
        # other frame with this bbox would store a wrong face.
        print(f"{diautt_}: frame {frame_num} not found, skipping")
        continue

    # crop_and_align is called with a batch of one image, plus one-element
    # lists holding this face's bbox and landmark.
    batch = img[np.newaxis, ...]
    face = fd.crop_and_align(batch, [bbox_[np.newaxis, ...]], [landmark_[np.newaxis, ...]])
    face = np.squeeze(face)

    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    bbox_tag = '_'.join(str(coord) for coord in bbox_.astype(int).tolist()[:4])
    img_write_path = os.path.join('./DEBUG/faces',
                                  str(label_),
                                  f"{diautt_}_frame{frame_num}_{bbox_tag}.jpg")

    # The crop is treated as RGB here, while cv2.imwrite expects BGR.
    cv2.imwrite(img_write_path, cv2.cvtColor(face, cv2.COLOR_RGB2BGR))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=71883.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=71883.0), HTML(value='')))




KeyboardInterrupt: 

In [8]:
import numpy as np
import shutil
from tqdm.notebook import tqdm
import cv2
import av
import random
from cltl_face_all.face_alignment import FaceDetection

# Reload the cached cluster labels and face embeddings from disk.
labels, embeddings_all = (
    np.load("./DEBUG/embeddings-clusters.npy"),
    np.load("./DEBUG/embeddings-all.npy"),
)

# There must be exactly one cluster label per embedding.
assert len(embeddings_all) == len(labels)

In [9]:
# Cluster label -> identity, obtained by manually going through the face
# crops of every cluster:
#
#   -1  Random faces
#    0  Chandler
#    1  Random faces
#    2  Joey
#    3  Rachel
#    4  Monica
#    5  Phoebe
#    6  Ross
#    7  Noise

# One bucket of embeddings per main actor.
to_keep = {actor: [] for actor in ACTORS}

# Only the clusters that correspond to a main actor; all others are dropped.
label2name = {
    0: 'Chandler',
    2: 'Joey',
    3: 'Rachel',
    4: 'Monica',
    5: 'Phoebe',
    6: 'Ross'}

assert len(to_keep) == len(label2name)
# Equal lengths alone would not catch a misspelt or duplicated name, which
# would otherwise surface later as a KeyError or an empty bucket.
assert set(label2name.values()) == set(to_keep)

for l, e in zip(labels, embeddings_all):
    # Direct dict membership instead of rebuilding a key list per embedding.
    if l in label2name:
        to_keep[label2name[l]].append(e)

In [10]:
# Report how many embeddings were kept for each actor.
for actor_name, actor_embeddings in to_keep.items():
    n_kept = len(actor_embeddings)
    print(actor_name, n_kept)

Chandler 6833
Joey 7505
Monica 5144
Phoebe 6134
Rachel 6275
Ross 8548


In [11]:
# One reference vector per actor: the sum of all kept embeddings, rescaled
# to unit L2 norm.
final_vectors = {}
for name, list_of_embs in to_keep.items():
    summed = np.sum(list_of_embs, axis=0)
    unit_vec = summed / np.linalg.norm(summed)
    # Sanity output: shape, norm (should be ~1.0) and dtype.
    print(name, unit_vec.shape, np.linalg.norm(unit_vec), unit_vec.dtype)
    final_vectors[name] = unit_vec

# The whole dict goes into a single .npy file.
np.save('DEBUG/friends-embeddings.npy', final_vectors)

Chandler (512,) 1.0 float32
Joey (512,) 1.0000001 float32
Monica (512,) 1.0 float32
Phoebe (512,) 1.0000001 float32
Rachel (512,) 1.0 float32
Ross (512,) 1.0 float32


In [12]:
import os

# Also store every actor's vector on its own: DEBUG/friends/<name>/<name>.npy
for name, final_vector in final_vectors.items():
    actor_dir = f'DEBUG/friends/{name}'
    os.makedirs(actor_dir, exist_ok=True)
    np.save(f'DEBUG/friends/{name}/{name}.npy', final_vector)