## Set the paths

Download the visual features, and raw videos if you haven't yet.
And then set the paths

In [1]:
# download visaul features
# !gdown --id 1jS2ufbIovxg8umkZM5UKzsvtSp4UJJyt

VISUAL_FEATURES_DIR = {}
VISUAL_FEATURES_DIR['train'] = "/home/tk/datasets/MELD/visual-features/MELD-visual-features/train/"
VISUAL_FEATURES_DIR['dev'] = "/home/tk/datasets/MELD/visual-features/MELD-visual-features/dev/"
VISUAL_FEATURES_DIR['test'] = "/home/tk/datasets/MELD/visual-features/MELD-visual-features/test/"

# download the raw videos
# !wget http://web.eecs.umich.edu/~mihalcea/downloads/MELD.Raw.tar.gz
VIDS_DIR = {}
VIDS_DIR['train'] = "/home/tk/datasets/MELD/MELD.Raw/train/train_splits/"
VIDS_DIR['dev'] = "/home/tk/datasets/MELD/MELD.Raw/dev/dev_splits_complete/"
VIDS_DIR['test'] = "/home/tk/datasets/MELD/MELD.Raw/test/output_repeated_splits_test"

# This comes with VISUAL_FEATURES when you downloaded them.
ANNOTATION_PATH = "/home/tk/datasets/MELD/visual-features/MELD-visual-features/datasets.json"

THRESHOLDS = {'face': 0.8, 'angle': 1.15}

SAVE_AT = './DEBUG'

# in seconds
IMAGE_INTERVAL = 0.5

## Read pre-computed visual features and annotations

In [2]:
import json
import numpy as np
import os
import json
import av
import cv2
import random
from glob import glob
import pickle
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

with open(ANNOTATION_PATH, 'r') as stream:
    datasets = json.load(stream)

visual_features = {DATASET: glob(os.path.join(VISUAL_FEATURES_DIR[DATASET], '*.npy'))
                   for DATASET in ['train', 'dev', 'test']}

visual_features = {DATASET: {os.path.basename(vf).split('.npy')[0] : vf 
                   for vf in tqdm(visual_features[DATASET])}
                   for DATASET in tqdm(['train', 'dev', 'test'])}

with open('friends-time/friends-time.pkl', 'rb') as stream:
    friends_time = pickle.load(stream)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9988.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1108.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2610.0), HTML(value='')))





## Run on the images

In [3]:
import os
import av
import numpy as np
import cv2
from tqdm.notebook import tqdm
import time
import csv
from glob import glob
import numpy as np
import shutil
import uuid


MAIN_ACTORS = {}
for path in glob('../main-actors/*.npy'):
    name = path.split('/')[-1].split('.npy')[0]
    MAIN_ACTORS[name] = np.load(path)

def calc_angle_distance(emb1, emb2):
    """Calculate the angle (radian) distance between the embeddings."""
    return np.arccos(np.clip((emb1 @ emb2.T), -1, 1))

def get_unique_dias(list_of_diautts):
    return sorted(list(set([diautt.split('_')[0] for diautt in list_of_diautts])))

def get_time_unix_ms(time_string):
    hours, minutes, seconds = time_string.split(':')
    seconds, milliseconds = seconds.split(',')
    hours, minutes, seconds, milliseconds = int(hours), int(minutes), int(seconds), int(milliseconds)
    time_datetime = friends_time[season][episode] + timedelta(hours=hours, minutes=minutes, seconds=seconds)
    time_unix = time.mktime(time_datetime.timetuple())
    time_unix_ms = int(time_unix*1000 + milliseconds)    

    return time_unix_ms

In [12]:
for DATASET in tqdm(['train', 'dev', 'test']):
    diautts_ = list(datasets[DATASET].keys())
    
    dias = get_unique_dias(diautts_)
    diautts_ = {dia: [diautt for diautt in diautts_ if dia + '_' in diautt] for dia in dias}

    for dia, diautts in tqdm(diautts_.items()):
        shutil.rmtree(os.path.join(SAVE_AT, DATASET, dia), ignore_errors=True)
        os.makedirs(os.path.join(SAVE_AT, DATASET, dia, 'image'), exist_ok=True)
        os.makedirs(os.path.join(SAVE_AT, DATASET, dia, 'text'), exist_ok=True)
        os.makedirs(os.path.join(SAVE_AT, DATASET, dia, 'audio'), exist_ok=True)

        image_gmrc = []
        chat = []
        for diautt in diautts:
            try:
                annot = datasets[DATASET][diautt] 
                vis = visual_features[DATASET][diautt]
            except KeyError as e:
                print(e)
                continue
            vis = np.load(vis, allow_pickle=True).item()
            vidpath = os.path.join(VIDS_DIR[DATASET], diautt) + '.mp4'

            season = annot['Season']
            episode = annot['Episode']
            emotion = annot['Emotion']
            sentiment = annot['Sentiment']
            utterance = annot['Utterance']
            speaker = annot['Speaker']

            starttime = annot['StartTime']
            endtime = annot['EndTime']

            time_unix_ms_start = get_time_unix_ms(starttime)
            time_unix_ms_end = get_time_unix_ms(endtime)
            chat.append([speaker, utterance, time_unix_ms_start])

            aud = diautt + '.mp3'
            !ffmpeg -i $vidpath -q:a 0 -map a $SAVE_AT/$DATASET/$dia/audio/$aud

            container = av.open(vidpath)
            fps = float(container.streams.video[0].average_rate)
            spf = 1/fps 
            mspf = round(spf * 1000)
            for idx, frame in enumerate(container.decode(video=0)):
                if idx % round(IMAGE_INTERVAL * fps) != 0:
                    continue

                numpy_RGB = np.array(frame.to_image())
                numpy_BGR = cv2.cvtColor(numpy_RGB, cv2.COLOR_RGB2BGR)
                img_time = idx*mspf + time_unix_ms_start
                impath = os.path.join(SAVE_AT, DATASET, dia, 'image', 
                                       diautt + f'_frame{str(idx).zfill(5)}_{str(img_time)}.jpg')
                cv2.imwrite(impath, numpy_BGR)

                features = vis[idx]

                if not features:
                    continue

                to_append = {}
                to_append['files'] = [os.path.join('image', os.path.basename(impath))]
                container_id = str(uuid.uuid4())
                to_append['id'] = container_id
                to_append['mentions'] = []



                for k, feat in enumerate(features):
                    age = round(float(feat['age']), 3)
                    gender = round(float(feat['gender']), 3)
                    bbox = feat['bbox']
                    bbox, faceprob = [int(round(bb)) for bb in bbox[:4]], float(bbox[-1])
                    faceprob = round(faceprob, 3)
                    embedding = feat['embedding']
                    landmark = feat['landmark']

                    annotations = []

                    embedding.reshape(1, 512)
                    dists = {key: calc_angle_distance(embedding, val) for key, val \
                                in MAIN_ACTORS.items()}
                    face_candidate = min(dists, key=dists.get)

                    if dists[face_candidate] > THRESHOLDS['angle']:
                        face_candidate = None

                    # if face_candidate is not None:
                    # annotations.append(
                    #     {
                    #     'source': 'human',
                    #     'timestamp': round(time.time()*1000),
                    #     'type': 'emotion',
                    #     'value': emotion.upper()
                    #     }
                    # )

                    annotations.append(
                        {
                            'source': 'machine',
                            'timestamp': round(time.time()*1000),
                            'type': 'person',
                            'value': 
                                {'name': face_candidate,
                                    'age': age,
                                    'gender': gender,
                                    'faceprob': faceprob}
                        }
                    )
                    
                    mention_id = str(uuid.uuid4())
                    segment = [
                        {
                            'bounds': bbox,
                            'container_id': container_id,
                            'type': 'MultiIndex'
                            
                        }
                    ]
                    to_append['mentions'].append(
                        {
                            'annotations': annotations,
                            'id': mention_id,
                            'segment': segment
                        }
                    )
                    to_append['modality'] = 'image'
                    to_append['ruler'] = {
                        'bounds': [0, 0, numpy_BGR.shape[1], numpy_BGR.shape[0]],
                        'container_id': container_id,
                        'type': 'MultiIndex'
                    }
                    to_append['time'] = {
                        'container_id': container_id,
                        'start': time_unix_ms_start,
                        'end': time_unix_ms_end,
                        'type': 'TemporalRuler',
                    }
                    to_append['type'] = 'ImageSignal'

                    image_gmrc.append(to_append)

        with open(os.path.join(SAVE_AT, DATASET, dia, 'text', f'{dia}.csv'), 'w') as stream:
            stream.write('speaker,utterance,time,emotion\n')

            for line in chat:
                speaker, utterance, time_unix_ms_start = line
                stream.write(speaker)
                stream.write(',')
                stream.write(f"\"{utterance}\"")
                stream.write(',')
                stream.write(str(time_unix_ms_start))
                stream.write(',')
                stream.write(emotion)
                stream.write('\n')    

        with open(os.path.join(SAVE_AT, DATASET, dia, 'image.json'), 'w') as stream:
            json.dump(image_gmrc, stream)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1038.0), HTML(value='')))

name    : SubtitleHandler
Stream mapping:
  Stream #0:1 -> #0:0 (aac (native) -> mp3 (libmp3lame))
Press [q] to stop, [?] for help
Output #0, mp3, to './DEBUG/train/dia10/audio/dia10_utt5.mp3':
  Metadata:
    major_brand     : isom
    minor_version   : 512
    compatible_brands: isomiso2avc1mp41
    TSSE            : Lavf58.45.100
    Chapter #0:0: start 0.000000, end 2.251000
    Metadata:
      TIT2            : 00:11:56.132
    Stream #0:0(eng): Audio: mp3 (libmp3lame), 48000 Hz, stereo, fltp (default)
    Metadata:
      handler_name    : SoundHandler
      encoder         : Lavc58.91.100 libmp3lame
size=      58kB time=00:00:02.28 bitrate= 209.9kbits/s speed=67.2x    
video:0kB audio:58kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.696688%
ffmpeg version 4.3.1-4ubuntu1 Copyright (c) 2000-2020 the FFmpeg developers
  built with gcc 10 (Ubuntu 10.2.0-9ubuntu2)
  configuration: --prefix=/usr --extra-version=4ubuntu1 --toolchain=hardened --libdir=/usr/lib/x8

KeyboardInterrupt: 