In [1]:
from glob import glob
import random
import json
import numpy as np
import os


# Hard-coded locations: the folder of per-video feature files (*.npy), the
# folder of video clips, and the JSON file describing the datasets.
VISUAL_FEATURES_PATH = '/home/tk/datasets/MELD/visual-features/smaller-dataset/all/'
VIDS_DIR = "/home/tk/datasets/MELD/MELD.Raw/train/train_splits/"
DATASET_PATH = '/home/tk/datasets/MELD/visual-features/smaller-dataset/datasets.json'

# Map each feature file's name (the part before '.npy') to the object stored
# inside it.
# NOTE(review): allow_pickle=True unpickles arbitrary objects on load, so this
# must only ever be pointed at trusted files.
visual_features = {}
for feature_file in glob(os.path.join(VISUAL_FEATURES_PATH, '*.npy')):
    feature_key = os.path.basename(feature_file).split('.npy')[0]
    visual_features[feature_key] = np.load(feature_file, allow_pickle=True).item()

# Only the 'large' entry of the dataset description is kept.
with open(DATASET_PATH, 'r') as stream:
    datasets = json.load(stream)['large']

In [2]:
# Detections whose last bbox entry is below this threshold are dropped
# (presumably that entry is the detector's face probability -- TODO confirm).
FACE_PROB = 0.975
# Only frames whose number is a multiple of this value are considered.
EVERY_N_FRAME = 4
# SPEAKERS_OF_INTEREST = ['Chandler', 'Joey', 'Monica', 'Phoebe', 'Rachel', 'Ross']
DATASET_chosen = 'train'
dataset_chosen = datasets[DATASET_chosen]

# Speakers of the utterances that contributed at least one kept face.
speakers_seen = set()

# This is gonna help us to find back to the source frame and video
idx2source = {}
# Parallel lists: position i in each of them describes the same detection,
# and idx2source[i] tells which video / frame it came from.
embeddings_all = []
bboxes_all = []
landmarks_all = []

count = 0
for diautt, annot in dataset_chosen.items():
    # There is one face annotated in the entire video.
    # We are not even sure if the face is actually there or not.
    # Even though the face is there, we are not sure which frame number it is.

    # if annot['Speaker'] not in SPEAKERS_OF_INTEREST:
    #     continue

    for framenum, list_of_findings in visual_features[diautt].items():
        if framenum % EVERY_N_FRAME != 0:
            continue
        for finding in list_of_findings:
            if finding['bbox'][-1] < FACE_PROB:
                continue

            embeddings_all.append(finding['embedding'])
            bboxes_all.append(finding['bbox'])
            landmarks_all.append(finding['landmark'])
            idx2source[count] = {'diautt': diautt, 'frame': framenum}
            count += 1
            speakers_seen.add(annot['Speaker'])

assert len(embeddings_all) == len(bboxes_all) == len(landmarks_all) == \
        len(idx2source)

speakers_mentioned = sorted(speakers_seen)

print(f"Out of the {len(dataset_chosen)} number of videos (utterances),")
print(f"There are in total of {len(speakers_mentioned)} unique speakers mentioned")
print()
print(speakers_mentioned)
print()
print(f"and {len(embeddings_all)} faces detected")

# np.save does not create missing parent directories.
os.makedirs('./DEBUG', exist_ok=True)
np.save('./DEBUG/embeddings-all.npy', embeddings_all)

Out of the 584 number of videos (utterances),
There are in total of 31 unique speakers mentioned

['Ben', 'Chandler', 'Charlie', 'Chip', 'Danny', 'Dr. Green', 'Dr. Johnson', 'Dr. Ledbetter', 'Dr. Rhodes', 'Hoshi', 'Joey', 'Julie', 'Katie', 'Leslie', 'Marc', 'Mike', 'Mischa', 'Mona', "Mona's Date", 'Monica', 'Pete', 'Phoebe', 'Rachel', 'Receptionist', 'Richard', 'Rick', 'Ross', 'Student', 'The Assistant Director', 'The Director', 'Tom']

and 19645 faces detected


In [6]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


X = np.stack(embeddings_all)

# #############################################################################
# Compute DBSCAN
# DBSCAN uses euclidean distance between the data points.
# TODO: find a way to replace it with angle distance.
# eps and min_samples are hyper parameters that you have to tune.
DBSCAN_EPS = 0.8
DBSCAN_MIN_SAMPLES = 20

db = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
# DBSCAN marks noise points with the label -1.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - (1 if -1 in labels else 0)
n_noise_ = int(np.count_nonzero(labels == -1))

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# silhouette_score raises ValueError unless there are between 2 and
# n_samples - 1 distinct labels; guard so the labels below still get saved.
# Note that the score is computed over all points, so the noise label -1 is
# treated like one more cluster.
if 2 <= len(distinct_labels) <= len(labels) - 1:
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels))
else:
    print("Silhouette Coefficient: undefined "
          f"({len(distinct_labels)} distinct label(s) for {len(labels)} samples)")
print(f"Number of faces that are clustered: {len(embeddings_all) - n_noise_}")
print()

(label_num, counts) = np.unique(labels, return_counts=True)

for cluster_label, cluster_size in zip(label_num, counts):
    print(f" label {cluster_label} \t has {cluster_size} counts")

# np.save does not create missing parent directories.
os.makedirs('./DEBUG', exist_ok=True)
np.save('./DEBUG/embeddings-clusters.npy', labels)

Estimated number of clusters: 57
Estimated number of noise points: 5350
Silhouette Coefficient: 0.129
Number of faces that are clustered: 14295

 label -1 	 has 5350 counts
 label 0 	 has 1014 counts
 label 1 	 has 1821 counts
 label 2 	 has 1318 counts
 label 3 	 has 2013 counts
 label 4 	 has 2470 counts
 label 5 	 has 1728 counts
 label 6 	 has 126 counts
 label 7 	 has 20 counts
 label 8 	 has 21 counts
 label 9 	 has 121 counts
 label 10 	 has 24 counts
 label 11 	 has 1393 counts
 label 12 	 has 72 counts
 label 13 	 has 23 counts
 label 14 	 has 89 counts
 label 15 	 has 155 counts
 label 16 	 has 24 counts
 label 17 	 has 20 counts
 label 18 	 has 25 counts
 label 19 	 has 57 counts
 label 20 	 has 72 counts
 label 21 	 has 27 counts
 label 22 	 has 61 counts
 label 23 	 has 27 counts
 label 24 	 has 21 counts
 label 25 	 has 20 counts
 label 26 	 has 22 counts
 label 27 	 has 48 counts
 label 28 	 has 49 counts
 label 29 	 has 30 counts
 label 30 	 has 32 counts
 label 31 	 ha

In [7]:
# Folder containing the video clips; '<diautt>.mp4' is appended to it further
# down in this cell. Same value as the VIDS_DIR assigned in the first cell.
VIDS_DIR = "/home/tk/datasets/MELD/MELD.Raw/train/train_splits/"
# Only referenced by the commented-out per-label cap inside the loop below.
NUM_MAX_VID_PER_LABEL = 25

import numpy as np
import shutil
from tqdm.notebook import tqdm
import cv2
import av
import random
from cltl_face_all.face_alignment import FaceDetection

labels = np.load("./DEBUG/embeddings-clusters.npy")

# Remove the crops written by a previous run; the per-label folders are
# recreated inside the loop below.
shutil.rmtree('./DEBUG/faces/', ignore_errors=True)

assert len(embeddings_all) == len(bboxes_all) == len(landmarks_all) == \
        len(idx2source) == len(labels)

# One record per detected face, combining its cluster label with the
# detection data and the video / frame it came from.
indices = list(idx2source.keys())
list_all = [
    {'label': labels[idx],
     'embedding': embeddings_all[idx],
     'bbox': bboxes_all[idx],
     'landmark': landmarks_all[idx],
     'diautt': idx2source[idx]['diautt'],
     'frame': idx2source[idx]['frame']}
    for idx in indices
]

assert len(list_all) == len(labels)

random.shuffle(list_all)


fd = FaceDetection(device='cpu', face_detector='sfd')

# Number of face crops written so far for every cluster label.
labels_processed = {l: 0 for l in set(labels)}

for finding in tqdm(list_all):
    label_ = finding['label']
    bbox_ = finding['bbox']
    landmark_ = finding['landmark']
    diautt_ = finding['diautt']
    frame_num = finding['frame']
    video_path = os.path.join(VIDS_DIR, diautt_) + '.mp4'

    os.makedirs(os.path.join('./DEBUG/faces', str(label_)), exist_ok=True)

    # if labels_processed[label_] > NUM_MAX_VID_PER_LABEL:
    #     continue

    assert os.path.isfile(video_path)

    # Decode up to the wanted frame and convert only that one to an array.
    # The context manager closes the container again; without it one open
    # container is left behind per processed face.
    img = None
    with av.open(video_path) as container:
        for idx, frame in enumerate(container.decode(video=0)):
            if idx == frame_num:
                img = np.array(frame.to_image())
                break

    if img is None:
        # The video has fewer frames than the recorded frame number. Cropping
        # any other frame with this bbox would give a wrong face, so skip it.
        print(f"frame {frame_num} not found in {video_path}; skipping")
        continue

    # crop_and_align is called with a batch of one image plus a one-element
    # list of bboxes and of landmarks, each with a leading axis added.
    batch = img[np.newaxis, ...]
    face = fd.crop_and_align(batch, [bbox_[np.newaxis, ...]], [landmark_[np.newaxis, ...]])
    face = np.squeeze(face)

    # The first four bbox values, truncated to integers, become part of the
    # file name. `np.int` no longer exists in current NumPy; the builtin `int`
    # is the equivalent dtype argument.
    bbox_tag = '_'.join(str(coord) for coord in bbox_.astype(int).tolist()[:4])
    img_write_path = os.path.join('./DEBUG/faces',
                                  str(label_),
                                  f"{diautt_}_frame{frame_num}_{bbox_tag}.jpg")

    # cv2.imwrite expects BGR channel order; the frame was obtained as RGB.
    cv2.imwrite(img_write_path, cv2.cvtColor(face, cv2.COLOR_RGB2BGR))
    labels_processed[label_] += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19645.0), HTML(value='')))




In [3]:
import numpy as np
import shutil
from tqdm.notebook import tqdm
import cv2
import av
import random
from cltl_face_all.face_alignment import FaceDetection

# Read the cluster labels and the face embeddings back from the files under
# ./DEBUG (labels first, then embeddings).
labels, embeddings_all = (
    np.load(cached_file)
    for cached_file in ("./DEBUG/embeddings-clusters.npy",
                        "./DEBUG/embeddings-all.npy")
)

# There has to be exactly one label per embedding.
assert len(embeddings_all) == len(labels)

In [13]:
# I got below after going through all of them
# (cluster label, then the name assigned to that cluster)

# 1	Rachel
# 2	Chandler
# 3	Joey
# 4	Ross
# 5	Phoebe
# 9	Ben
# 11	Monica
# 12	Leslie
# 13	Monica
# 14	Mischa
# 19	Richard
# 21	Danny
# 22	Tom
# 28	Katie
# 30	Dr. Ledbetter
# 31	Dr. Green
# 32	Mona
# 42	Hoshi
# 44	Pete
# 48	Receptionist
# 51	Charlie
# 53	Rick

# NOTE(review): the notes above list cluster 13 as Monica as well, but 13 has
# no entry in label2name, so the faces of that cluster are not used. Confirm
# whether this is intentional. Adding `13: 'Monica'` would also require
# relaxing the assert below, which demands exactly one cluster per name.
label2name = {
    1: 'Rachel',
    2: 'Chandler',
    3: 'Joey',
    4: 'Ross',
    5: 'Phoebe',
    9: 'Ben',
    11: 'Monica',
    12: 'Leslie',
    14: 'Mischa',
    19: 'Richard',
    21: 'Danny',
    22: 'Tom',
    28: 'Katie',
    30: 'Dr. Ledbetter',
    31: 'Dr. Green',
    32: 'Mona',
    42: 'Hoshi',
    44: 'Pete',
    48: 'Receptionist',
    51: 'Charlie',
    53: 'Rick',
}

# name -> list of embeddings collected for that name. Derived from label2name
# so the two can no longer drift apart; the key order is the order of the
# mapping above.
to_keep = {name: [] for name in label2name.values()}
assert len(to_keep) == len(label2name)

# Keep only the embeddings whose cluster label has a name assigned. Labels
# without an entry (including the DBSCAN noise label -1) are skipped.
for cluster_label, embedding in zip(labels, embeddings_all):
    if cluster_label in label2name:
        to_keep[label2name[cluster_label]].append(embedding)

In [14]:
# Print how many embeddings were collected for every name.
for speaker_name, speaker_embeddings in to_keep.items():
    print(speaker_name, len(speaker_embeddings))

Rachel 1821
Chandler 1318
Joey 2013
Ross 2470
Phoebe 1728
Ben 121
Monica 1393
Leslie 72
Mischa 89
Richard 57
Danny 27
Tom 61
Katie 49
Dr. Ledbetter 32
Dr. Green 159
Mona 51
Hoshi 36
Pete 23
Receptionist 114
Charlie 27
Rick 23


In [15]:
# For every name: add up all collected embeddings and scale the sum to unit
# length. The result is stored under the name in final_vectors.
final_vectors = {}
for name, list_of_embs in to_keep.items():
    if len(list_of_embs) == 0:
        # Nothing to sum; the division below would otherwise yield NaN.
        print(f"{name}: no embeddings collected, skipping")
        continue

    sum_of_vecs = np.sum(list_of_embs, axis=0)
    norm = np.linalg.norm(sum_of_vecs)
    if norm == 0:
        # A zero vector has no direction and cannot be normalised.
        print(f"{name}: embeddings sum to the zero vector, skipping")
        continue

    sum_of_vecs = sum_of_vecs / norm
    print(name, sum_of_vecs.shape, np.linalg.norm(sum_of_vecs), sum_of_vecs.dtype)
    final_vectors[name] = sum_of_vecs

# np.save does not create missing parent directories. A dict is stored as a
# pickled object array, so it has to be read back with
# np.load(..., allow_pickle=True).item().
os.makedirs('DEBUG', exist_ok=True)
np.save('DEBUG/friends-embeddings.npy', final_vectors)

Rachel (512,) 1.0000001 float32
Chandler (512,) 0.99999994 float32
Joey (512,) 0.9999999 float32
Ross (512,) 1.0000001 float32
Phoebe (512,) 1.0 float32
Ben (512,) 1.0 float32
Monica (512,) 0.99999994 float32
Leslie (512,) 1.0 float32
Mischa (512,) 1.0 float32
Richard (512,) 1.0 float32
Danny (512,) 1.0 float32
Tom (512,) 1.0000001 float32
Katie (512,) 0.9999999 float32
Dr. Ledbetter (512,) 0.99999994 float32
Dr. Green (512,) 1.0000001 float32
Mona (512,) 0.9999998 float32
Hoshi (512,) 1.0000001 float32
Pete (512,) 0.99999994 float32
Receptionist (512,) 1.0 float32
Charlie (512,) 1.0 float32
Rick (512,) 1.0 float32


In [21]:
import os

# Store every vector in a folder of its own: DEBUG/friends/<name>/<name>.npy
for speaker, vector in final_vectors.items():
    target_dir = f'DEBUG/friends/{speaker}'
    target_file = f'DEBUG/friends/{speaker}/{speaker}.npy'
    os.makedirs(target_dir, exist_ok=True)
    np.save(target_file, vector)