## Set the paths

Download the visual features and the raw videos if you haven't already,
then set the paths below.

In [None]:
# Download the visual features:
# !gdown --id 1jS2ufbIovxg8umkZM5UKzsvtSp4UJJyt

# Directory holding the pre-computed visual features of each split.
VISUAL_FEATURES_DIR = {
    'train': "/home/tk/datasets/MELD/visual-features/MELD-visual-features/train/",
    'dev': "/home/tk/datasets/MELD/visual-features/MELD-visual-features/dev/",
    'test': "/home/tk/datasets/MELD/visual-features/MELD-visual-features/test/",
}

# Download the raw videos:
# !wget http://web.eecs.umich.edu/~mihalcea/downloads/MELD.Raw.tar.gz

# Directory holding the raw utterance videos of each split.
VIDS_DIR = {
    'train': "/home/tk/datasets/MELD/MELD.Raw/train/train_splits/",
    'dev': "/home/tk/datasets/MELD/MELD.Raw/dev/dev_splits_complete/",
    'test': "/home/tk/datasets/MELD/MELD.Raw/test/output_repeated_splits_test",
}

# The annotation file ships with the visual-features download.
ANNOTATION_PATH = "/home/tk/datasets/MELD/visual-features/MELD-visual-features/datasets.json"

# 'angle' is the largest angular distance (radians) at which a face is still
# matched to a main actor.
THRESHOLDS = {'face': 0.9, 'angle': 1.15}

# Root directory of everything this notebook writes.
SAVE_AT = './DEBUG'

# Time between two sampled video frames, in seconds.
IMAGE_INTERVAL = 0.5

## Read pre-computed visual features and annotations

In [None]:
import json
import numpy as np
import os
import json
import av
import cv2
import random
from glob import glob
import pickle
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import random

# Per-split annotations, parsed from the JSON file at ANNOTATION_PATH.
with open(ANNOTATION_PATH, 'r') as stream:
    datasets = json.load(stream)

# Step 1: list every .npy feature file found in each split's directory.
visual_features = {
    split: glob(os.path.join(VISUAL_FEATURES_DIR[split], '*.npy'))
    for split in ['train', 'dev', 'test']
}

# Step 2: index those files by name (file name up to the first '.npy'),
# so that visual_features[split][name] is the path of that feature file.
visual_features = {
    split: {
        os.path.basename(npy_path).partition('.npy')[0]: npy_path
        for npy_path in tqdm(visual_features[split])
    }
    for split in tqdm(['train', 'dev', 'test'])
}

# Indexed below as friends_time[season][episode].
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.
with open('friends-time/friends-time.pkl', 'rb') as stream:
    friends_time = pickle.load(stream)

## Define helper functions

In [None]:
import os
import av
import numpy as np
import cv2
from tqdm.notebook import tqdm
import time
import csv
from glob import glob
import numpy as np
import shutil
import uuid

import signal
from contextlib import contextmanager

class TimeoutException(Exception):
    """Raised inside a ``time_limit`` block when its time budget runs out."""


@contextmanager
def time_limit(seconds):
    """Abort the enclosed block with TimeoutException after `seconds` seconds.

    Implemented with SIGALRM, so it only works on platforms that provide that
    signal and only when entered from the main thread (a restriction of
    ``signal.signal``).

    Args:
        seconds: whole number of seconds passed to ``signal.alarm``.

    Raises:
        TimeoutException: inside the ``with`` body once the alarm fires.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # Remember whatever handler was installed so it can be put back afterwards;
    # otherwise this handler would stay active for the rest of the process.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    try:
        signal.alarm(seconds)
        yield
    finally:
        # Cancel a still-pending alarm before touching the handler.
        signal.alarm(0)
        # getsignal/signal report None for handlers not installed from Python;
        # such a value cannot be passed back to signal.signal().
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


# Reference arrays of the main actors, keyed by file name (up to '.npy').
# Faces found in the videos are compared against these.
MAIN_ACTORS = {}
for path in glob('../main-actors/*.npy'):
    name = os.path.basename(path).partition('.npy')[0]
    MAIN_ACTORS[name] = np.load(path)

def calc_angle_distance(emb1, emb2):
    """Return the angle, in radians, between two embeddings.

    The product ``emb1 @ emb2.T`` is clipped to [-1, 1] before taking the
    arc cosine, so values slightly outside that range do not yield NaN.
    """
    similarity = emb1 @ emb2.T
    bounded = np.clip(similarity, -1, 1)
    return np.arccos(bounded)

def get_unique_dias(list_of_diautts):
    """Return the sorted, de-duplicated prefixes (text before the first '_')."""
    prefixes = {diautt.split('_')[0] for diautt in list_of_diautts}
    return sorted(prefixes)

def get_time_unix_ms(time_string, season, episode):
    """Convert an in-episode timestamp to unix time in milliseconds.

    Args:
        time_string: offset formatted as ``H:M:S,ms`` (colon-separated, with a
            comma before the milliseconds).
        season, episode: keys into the global ``friends_time`` table, whose
            entry is the datetime the offset is added to.

    Returns:
        int: milliseconds since the epoch. ``time.mktime`` interprets the
        datetime as local time.
    """
    hh, mm, ss_and_ms = time_string.split(':')
    ss, ms = ss_and_ms.split(',')
    offset = timedelta(hours=int(hh), minutes=int(mm), seconds=int(ss))
    ms = int(ms)

    moment = friends_time[season][episode] + offset
    epoch_seconds = time.mktime(moment.timetuple())

    return int(epoch_seconds * 1000 + ms)

import multiprocessing
from joblib import Parallel, delayed

# CPU count reported by multiprocessing. In the code visible here it is only
# referenced by the commented-out batch_a_dict call in the final cell.
NUM_CORES = multiprocessing.cpu_count()

class Utterance():
    """One annotated utterance plus its pre-computed visual features.

    The object can extract the utterance's audio track with ffmpeg
    (`write_to_audio`) and sample frames from its video, saving them as JPEGs
    and collecting one record per frame with detected faces in
    ``self.video_info`` (`run_on_video`).
    """

    def __init__(self, diautt, annotation, visual_feature_path, video_path, audio_path, image_interval):
        """Store the annotation fields and load the visual features.

        Args:
            diautt: identifier of the utterance; used in image file names.
            annotation: mapping with the keys 'SrNo', 'Utterance', 'Speaker',
                'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID',
                'Season', 'Episode', 'StartTime' and 'EndTime'.
            visual_feature_path: .npy file holding a pickled object that is
                indexed by frame number in `run_on_video`.
            video_path: path of the utterance video; must be an existing file.
            audio_path: where `write_to_audio` writes the audio track.
            image_interval: seconds between two sampled frames.

        Raises:
            FileNotFoundError: if `video_path` is not an existing file.
        """
        self.diautt = diautt
        self.srno = annotation['SrNo']
        self.utterance = annotation['Utterance']
        self.speaker = annotation['Speaker']
        self.emotion = annotation['Emotion']
        self.sentiment = annotation['Sentiment']
        self.dialogue_id = annotation['Dialogue_ID']
        self.utterance_id = annotation['Utterance_ID']
        self.season = annotation['Season']
        self.episode = annotation['Episode']
        self.starttime = get_time_unix_ms(annotation['StartTime'], self.season, self.episode)
        self.endtime = get_time_unix_ms(annotation['EndTime'], self.season, self.episode)

        # NOTE(review): allow_pickle=True unpickles the file content; only
        # load feature files from a trusted source.
        self.visual_features = np.load(visual_feature_path, allow_pickle=True).item()
        self.video_path = video_path
        # An explicit check instead of `assert`, which is stripped under -O.
        if not os.path.isfile(self.video_path):
            raise FileNotFoundError(f"video file not found: {self.video_path}")
        self.audio_path = audio_path
        self.image_interval = image_interval

    @staticmethod
    def _sampling_stride(image_interval, fps):
        """Return how many frames to advance between samples (at least 1)."""
        # round() can yield 0 for a low fps or a tiny interval, which would
        # make the modulo in run_on_video divide by zero.
        return max(1, round(image_interval * fps))

    def write_to_audio(self):
        """Extract the audio track of the video into `self.audio_path`.

        Runs ``ffmpeg -i <video> -q:a 0 -map a <audio>`` with stdin, stdout
        and stderr connected to the null device. This is best effort: a
        non-zero exit status is ignored, and a failure to launch ffmpeg is
        only printed.
        """
        import subprocess

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        command = ['ffmpeg', '-i', self.video_path,
                   '-q:a', '0', '-map', 'a', self.audio_path]
        try:
            subprocess.run(command,
                           stdin=subprocess.DEVNULL,
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL,
                           check=False)
        except OSError as err:
            print(f"cannot run ffmpeg on {self.video_path}: {err}")

    def run_on_video(self, dia_dir):
        """Sample frames from the video and describe the faces found in them.

        Every sampled frame is written to ``<dia_dir>/image/`` as a JPEG, also
        when no features exist for it. For sampled frames that do have
        features, a record with one mention per face is appended to
        ``self.video_info``.

        Args:
            dia_dir: output directory; its 'image' sub-directory must exist.
        """
        self.video_info = []

        # The context manager closes the container even if decoding fails.
        with av.open(self.video_path) as container:
            fps = float(container.streams.video[0].average_rate)
            mspf = round(1 / fps * 1000)  # milliseconds per frame
            stride = self._sampling_stride(self.image_interval, fps)

            for idx, frame in enumerate(container.decode(video=0)):
                if idx % stride != 0:
                    continue

                numpy_RGB = np.array(frame.to_image())
                numpy_BGR = cv2.cvtColor(numpy_RGB, cv2.COLOR_RGB2BGR)
                frame_time = idx * mspf + self.starttime
                img_path = os.path.join(dia_dir, 'image',
                                        self.diautt + f'_frame{idx}_{frame_time}.jpg')
                cv2.imwrite(img_path, numpy_BGR)

                features = self.visual_features[idx]
                if not features:
                    continue

                container_id = str(uuid.uuid4())
                height, width = numpy_BGR.shape[0], numpy_BGR.shape[1]
                frame_info = {
                    'files': [os.path.join('image', os.path.basename(img_path))],
                    'id': container_id,
                    'mentions': [],
                    'modality': 'image',
                    'ruler': {'bounds': [0, 0, width, height],
                              'container_id': container_id,
                              'type': 'MultiIndex'},
                    'time': {'container_id': container_id,
                             'start': frame_time,
                             'end': frame_time + mspf,
                             'type': 'TemporalRuler'},
                    'type': 'ImageSignal',
                }

                for feat in features:
                    frame_info['mentions'].append(
                        self._build_mention(feat, frame_time, container_id))

                self.video_info.append(frame_info)

    def _build_mention(self, feat, frame_time, container_id):
        """Build the mention record (annotations + bounding box) of one face."""
        age = round(float(feat['age']), 3)
        gender = round(float(feat['gender']), 3)
        raw_bbox = feat['bbox']
        # The first four bbox entries are used as the box, the last one as
        # the face probability.
        bbox = [int(round(bb)) for bb in raw_bbox[:4]]
        faceprob = round(float(raw_bbox[-1]), 3)
        embedding = feat['embedding']

        # NOTE(review): the reshaped array is discarded, so this line only
        # fails when the embedding does not hold 512 values. Left as-is so
        # that matching behaves as before; confirm whether assignment was meant.
        embedding.reshape(1, 512)

        dists = {key: calc_angle_distance(embedding, val)
                 for key, val in MAIN_ACTORS.items()}
        # Without reference embeddings nobody can be identified; min() would
        # raise on an empty dict.
        face_candidate = min(dists, key=dists.get) if dists else 'unidentified'
        if dists and dists[face_candidate] > THRESHOLDS['angle']:
            face_candidate = 'unidentified'

        annotations = []
        # The utterance's emotion label is attached only to the speaker's face.
        if face_candidate == self.speaker:
            annotations.append({'source': 'human',
                                'timestamp': frame_time,
                                'type': 'emotion',
                                'value': self.emotion.upper()})

        # A 'display' annotation carrying face_candidate used to be emitted
        # here; it is currently disabled.

        annotations.append({'source': 'machine',
                            'timestamp': frame_time,
                            'type': 'person',
                            'value': {'name': face_candidate,
                                      'age': age,
                                      'gender': gender,
                                      'faceprob': faceprob}})

        segment = [{'bounds': bbox,
                    'container_id': container_id,
                    'type': 'MultiIndex'}]
        return {'annotations': annotations,
                'id': str(uuid.uuid4()),
                'segment': segment}

## Run on the videos

In [None]:
def _write_chat_csv(csv_path, chat_rows):
    """Write (speaker, utterance, time, emotion) rows as CSV to `csv_path`."""
    with open(csv_path, 'w', encoding='utf-8') as stream:
        stream.write('speaker,utterance,time,emotion\n')
        for speaker, utterance, starttime, emotion in chat_rows:
            # The utterance is always quoted; embedded double quotes are
            # doubled so the row stays parseable.
            quoted = '"' + utterance.replace('"', '""') + '"'
            stream.write(f'{speaker},{quoted},{starttime},{emotion}\n')


def run_thread(DATASET, dia_diautts):
    """Process every dialogue of one split and write its outputs.

    For each dialogue the output directory ``SAVE_AT/DATASET/<dia>`` is
    recreated from scratch and filled with:
      * ``audio/<diautt>.mp3`` - audio track of each utterance,
      * ``image/`` - frames sampled from each utterance video,
      * ``text/<dia>.csv`` - speaker, utterance, start time and emotion,
      * ``image.json`` - the per-frame records collected from the utterances.

    Utterances that cannot be instantiated are reported and skipped.

    Args:
        DATASET: split name, used to index datasets, visual_features, VIDS_DIR.
        dia_diautts: mapping from dialogue name to its utterance keys.
    """
    for dia, diautts in tqdm(dia_diautts.items()):
        dia_dir = os.path.join(SAVE_AT, DATASET, dia)
        # Start from a clean directory so stale files never mix with new ones.
        shutil.rmtree(dia_dir, ignore_errors=True)
        for subdir in ('image', 'text', 'audio'):
            os.makedirs(os.path.join(dia_dir, subdir), exist_ok=True)

        chat_gmrc = []
        image_gmrc = []
        for diautt in diautts:
            try:
                u = Utterance(diautt=diautt,
                              annotation=datasets[DATASET][diautt],
                              visual_feature_path=visual_features[DATASET][diautt],
                              video_path=os.path.join(VIDS_DIR[DATASET], diautt) + '.mp4',
                              audio_path=os.path.join(dia_dir, 'audio', diautt + '.mp3'),
                              image_interval=IMAGE_INTERVAL)
            except Exception as e:
                # Deliberately broad: any missing file or annotation just
                # skips this utterance instead of stopping the whole run.
                print(f"cannot instantiate an utterance object: {e}")
                continue

            chat_gmrc.append([u.speaker, u.utterance, u.starttime, u.emotion])

            # Audio extraction is capped at 15 seconds; a timeout is reported
            # and processing continues with the video.
            try:
                with time_limit(15):
                    u.write_to_audio()
            except TimeoutException as e:
                print(f"Timed out!: {e}")

            u.run_on_video(dia_dir=dia_dir)
            image_gmrc.extend(u.video_info)

        _write_chat_csv(os.path.join(dia_dir, 'text', f'{dia}.csv'), chat_gmrc)

        with open(os.path.join(dia_dir, 'image.json'), 'w') as stream:
            json.dump(image_gmrc, stream)

def batch_a_dict(dict_to_batch, num_batches):
    """Split a dict into `num_batches` dicts of near-equal size.

    The keys are shuffled first, then dealt out round-robin, so batch sizes
    differ by at most one. When there are fewer keys than batches, the
    surplus batches are empty.

    Args:
        dict_to_batch: the mapping to split; it is not modified.
        num_batches: number of batches to produce, at least 1.

    Returns:
        list[dict]: exactly `num_batches` dicts that together hold every
        item of `dict_to_batch` once.

    Raises:
        ValueError: if `num_batches` is smaller than 1.
    """
    if num_batches < 1:
        raise ValueError(f"num_batches must be at least 1, got {num_batches}")

    keys = list(dict_to_batch)
    random.shuffle(keys)

    return [{key: dict_to_batch[key] for key in keys[start::num_batches]}
            for start in range(num_batches)]

for DATASET in tqdm(['train', 'dev', 'test']):
    # Group the utterance keys by dialogue (the text before the first '_')
    # in a single pass. Grouping on the exact prefix, rather than testing
    # whether "<dia>_" occurs anywhere in the key, cannot assign an utterance
    # to the wrong dialogue and avoids rescanning every key per dialogue.
    all_diautts = list(datasets[DATASET])
    dia_diautts = {dia: [] for dia in get_unique_dias(all_diautts)}
    for diautt in all_diautts:
        dia_diautts[diautt.split('_')[0]].append(diautt)

    # TODO: fix the below line.
    # dia_diautts = batch_a_dict(dia_diautts, NUM_CORES)

    run_thread(DATASET, dia_diautts)