<a href="https://colab.research.google.com/github/cltl/ma-communicative-robots/blob/master/multimodal/visual-features-extraction-colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Mount Google Drive at /content/drive (Colab only). Later cells copy their
# results into drive/MyDrive/ relative to /content.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Clone the repo, install the package, download data, and extract them.

Don't run `pip install -r requirements.txt` from the original installation guide. Those requirements are written for Python 3.7, but Colab still uses Python 3.6. We'll install the necessary packages here on the fly instead.

In [9]:
%cd /content
!git clone https://github.com/leolani/cltl-face-all
!cd cltl-face-all/ && pip install .
!pip install omegaconf==2.0.5
!pip install tqdm av

!git clone https://github.com/cltl/ma-communicative-robots.git

# Download the annotations and stuff
!wget https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/train_sent_emo.csv
!wget https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/dev_sent_emo.csv
!wget https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/test_sent_emo.csv

/content
fatal: destination path 'cltl-face-all' already exists and is not an empty directory.
Processing /content/cltl-face-all
Building wheels for collected packages: cltl-face-all
  Building wheel for cltl-face-all (setup.py) ... [?25l[?25hdone
  Created wheel for cltl-face-all: filename=cltl_face_all-0.0.0-cp36-none-any.whl size=53234 sha256=2f10ea0bd718e0c46e00460c01f3ff9573647b828afb31f58e51ac3b88480771
  Stored in directory: /root/.cache/pip/wheels/8d/71/f3/2db0b17856a0b395aaa04c8562716e0350cfc712cdd4b9947e
Successfully built cltl-face-all
Installing collected packages: cltl-face-all
  Found existing installation: cltl-face-all 0.0.0
    Uninstalling cltl-face-all-0.0.0:
      Successfully uninstalled cltl-face-all-0.0.0
Successfully installed cltl-face-all-0.0.0
fatal: destination path 'ma-communicative-robots' already exists and is not an empty directory.
--2020-11-23 12:40:20--  https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/train_sent_emo.csv
Resolving

# Assess the data

Every video listed in the dataset JSON files should be present in the video directory.

In [10]:
import os
from glob import glob
import csv
import json


# Only the train_sent_emo is relevant to us since the smaller datasets
# are subset of the original train dataset.
annotation_path = 'train_sent_emo.csv'
        
vids_dir = "ma-communicative-robots/multimodal/smaller-dataset/"

datasets = {}
datasets['small'] = "ma-communicative-robots/multimodal/dataset-small.json"
datasets['medium'] = "ma-communicative-robots/multimodal/dataset-medium.json"
datasets['large'] = "ma-communicative-robots/multimodal/dataset-large.json"

for datasize in ['small', 'medium', 'large']:
    with open(datasets[datasize], 'r') as stream:
        datasets[datasize] = json.load(stream)

with open(annotation_path) as f:
    reader = csv.reader(f)
    annotations = list(reader)


# See if we have all of the videos
for datasize in ['small', 'medium', 'large']:
    for datatype in ['train', 'dev', 'test']:
        diautt_ = datasets[datasize][datatype]
        for diautt in diautt_:
            assert os.path.isfile(os.path.join(vids_dir, diautt))

# Find the corresponding speaker / emotion / sentiment from the annotations

vid2anno = {}
for row in annotations[1:]:
    SrNo, Utterance, Speaker, Emotion, Sentiment, Dialogue_ID,\
        Utterance_ID, Season, Episode, StartTime, EndTime = row

    if f"dia{Dialogue_ID}_utt{Utterance_ID}.mp4" not in str(os.listdir(os.path.join(vids_dir))):
        continue
    vid2anno[f"dia{Dialogue_ID}_utt{Utterance_ID}.mp4"] = \
        {'speaker':Speaker, 
         'emotion': Emotion, 
         'sentiment': Sentiment,
         'Utterance': Utterance}

for datasize in ['small', 'medium', 'large']:
    for datatype in ['train', 'dev', 'test']:
        diautt_ = datasets[datasize][datatype]
        for idx, diautt in enumerate(diautt_):
            
            datasets[datasize][datatype][idx] = \
                (diautt, vid2anno[diautt])

with open('datasets.json', 'w', encoding='utf8') as json_file:
    json.dump(datasets, json_file, ensure_ascii=False)

!cp datasets.json drive/MyDrive/

# Let's check out a couple of videos to see if our labeling is correct.

In [11]:
import random
from base64 import b64encode

from IPython.display import HTML

# Pick one random (file name, annotation) pair from the small training split
# and print it so the labels can be compared with the video shown below.
chosen = random.choice(datasets['small']['train'])
print(chosen)

vidpath = os.path.join(vids_dir, chosen[0])
print(vidpath)

# Read the clip inside a `with` block so the file handle is closed right away
# instead of being left open until garbage collection.
with open(vidpath, 'rb') as video_file:
    mp4 = video_file.read()

# Embed the video bytes as a base64 data URL so the player needs no file server.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

# Must stay the last expression of the cell so the notebook renders the player.
HTML("""
<video width=1280 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

('dia226_utt6.mp4', {'speaker': 'Rachel', 'emotion': 'neutral', 'sentiment': 'neutral', 'Utterance': "Oh, come on! He's glad that I came, he doesn't want me to go anywhere, balls flying all over the place!"})
ma-communicative-robots/multimodal/smaller-dataset/dia226_utt6.mp4


# Instantiate the three classes we need.

In [12]:
# The three components of the face pipeline, all from the cltl-face-all package
# installed earlier in this notebook.
from cltl_face_all.face_alignment import FaceDetection
from cltl_face_all.arcface import ArcFace
from cltl_face_all.agegender import AgeGender

# The face detector is created on the GPU with the 'sfd' detector; the
# age/gender and ArcFace models are created on the CPU.
# NOTE(review): the reason for the GPU/CPU split is not visible here — confirm.
fd = FaceDetection(device='cuda', face_detector='sfd')
ag = AgeGender(device='cpu')
af = ArcFace(device='cpu')

[*] load ckpt from /usr/local/lib/python3.6/dist-packages/cltl_face_all/arcface/./pretrained_models/arc_res50/e_8_b_40000.ckpt


# Run over the video frames

In [None]:
from tqdm.notebook import tqdm
import av
import numpy as np
import os
import shutil

!rm visual-features.zip
shutil.rmtree('visual-features', ignore_errors=True)
os.makedirs('visual-features', exist_ok=True)

for vidpath in tqdm(glob(os.path.join(vids_dir, '*.mp4'))):
    print(vidpath)

    diautt = os.path.basename(vidpath)
    to_dump = {}

    container = av.open(vidpath)

    for frame in container.decode(video=0):
        idx = frame.index
        to_dump[idx] = []
        numpy_RGB = np.array(frame.to_image())
        batch = numpy_RGB[np.newaxis, ...]
        bboxes = fd.detect_faces(batch)
        # print(f"number of faces detected in the frame {idx} is {len(bboxes[0])}")

        if len(bboxes[0]) == 0:
            continue

        landmarks = fd.detect_landmarks(batch, bboxes)

        faces = fd.crop_and_align(batch, bboxes, landmarks)
        faces = np.concatenate(faces, axis=0)
        embeddings = af.predict(faces)
        ages, genders = ag.predict(faces)

        # print(len(bboxes[0]), len(landmarks[0]), len(ages), len(genders), len(embeddings), len(faces))
        for bb, lm, a, g, emb in zip(bboxes[0], landmarks[0], ages, genders, embeddings):
            x1, y1, x2, y2, prob = bb

            to_append = {'bbox': bb, 
                        'landmark': lm, 
                        'age': a,
                        'gender': g,
                        'embedding': emb}

            to_dump[idx].append(to_append)
        # print(len(to_dump), idx+1)

    np.save(os.path.join('visual-features', f"{diautt.replace('.mp4', '.npy')}" ), to_dump)

!zip -r visual-features.zip visual-features
!cp visual-features.zip drive/MyDrive/

rm: cannot remove 'visual-features.zip': No such file or directory


HBox(children=(FloatProgress(value=0.0, max=789.0), HTML(value='')))

ma-communicative-robots/multimodal/smaller-dataset/dia335_utt3.mp4


  r, _, _, _ = lstsq(X, U)


ma-communicative-robots/multimodal/smaller-dataset/dia111_utt16.mp4
ma-communicative-robots/multimodal/smaller-dataset/dia869_utt14.mp4
ma-communicative-robots/multimodal/smaller-dataset/dia652_utt4.mp4
ma-communicative-robots/multimodal/smaller-dataset/dia413_utt3.mp4
ma-communicative-robots/multimodal/smaller-dataset/dia523_utt0.mp4
