In [1]:
import pandas as pd
import numpy as np
import math
import os
import glob
import struct
from tqdm import tqdm

In [2]:
path = './data/aria-noxi/001_2016-03-17_Paris/smile.novice.system.csv'
pathe = './data/aria-noxi/001_2016-03-17_Paris/engagement.novice.gold.csv'
bin_file = './data/aria-noxi/001_2016-03-17_Paris/novice.au.stream~'
file_name = './test.csv'
root = './data/aria-noxi'
save_folder = './processed_data'
# every sample folder should have these 9 files
list_of_files = ['engagement.csv','expert.au.stream','expert.face.stream','expert.head.stream','expert.skel.stream','expert.video[crop].mp4','novice.au.stream','novice.face.stream','novice.head.stream','novice.skel.stream','expert.video[crop].mp4']
debug = False
list_of_pattern_dim = [('*au.stream~', 17), ('*head.stream~', 3), ('*face.stream~', 4041), ('*skel.stream~', 350)]
modalities = ['au', 'head', 'face', 'skel']

In [3]:
def search_directories(path):
    return [f for f in os.listdir(root) if not os.path.isfile(os.path.join(root, f))]

def time_to_frame(time):
    return int(np.rint(time * 100 / 4))

def get_save_path(path, prefix = 'processed_'):
    head, tail = os.path.split(path)
    tail = prefix + tail
    return os.path.join(head, tail)

def search_pattern(root, pattern):
    all_path = []
    samples = search_directories(root)
    for sample in samples:
        path_list = glob.glob(os.path.join(root, sample, pattern))
        path_list.sort()
        all_path.extend(path_list)
    return all_path


In [4]:
def get_stream_save_path(path, save_folder):
    head, tail = os.path.split(path)
    _, sample = os.path.split(head)
    tmp = tail.split('.')
    ret = sample
    for i in tmp[:-1]:
        ret += '_' + i
    ret += '.csv'
    return os.path.join(save_folder, ret)

In [5]:
# check all files for every sample (not the contents of files)
def validate_integrity(root):
    flag = True
    samples = search_directories(root)
    for sample in samples:
        for file in list_of_files:
            if os.path.isfile(os.path.join(root, sample, file)) is not True:
                print(f'{sample} is missing {file}')
                flag = False
    return flag

def binary_to_csv(path, save_path, dim, byte_num = 4):
#     with open(path, 'rb') as f:
#         val = f.read()
#         print(len(val))
    with open(path, 'rb') as f:
        feat_list = []
        while(True):
            val = f.read(dim * byte_num)
            if len(val) == 0:
                break;
            elif len(val) < (dim * byte_num):
                print('incomplete feature sample (less than one sample bytes read)', len(val), path)
                break;
            feat_set = []
            for i in range(0, dim * byte_num, 4):
                num = struct.unpack('f', val[i:i+4])[0]
                feat_set.append(num)

            feat_list.append(feat_set)
        df = pd.DataFrame(feat_list)
#         print(df)
        df.to_csv(save_path, index = False, encoding = 'utf-8')

In [6]:
def check_len(root, list_of_pattern_dim = [('*au.stream~', 17), ('*head.stream~', 3), ('*face.stream~', 4041), ('*skel.stream~', 350)], debug = False):
    samples = search_directories(root)
    tmp = 0
    flag = True
    for sample in tqdm(samples):
        
        for pattern, dim in list_of_pattern_dim:
            stream_list = glob.glob(os.path.join(root, sample, pattern))
            for stream_ in stream_list:
                if tmp == 0:
                    tmp = os.path.getsize(stream_) / dim
                else:
                    if tmp != os.path.getsize(stream_) / dim:
                        print(f'{sample} streams frame dismatch')
                        flag = False
        tmp = 0
        # print(stream_list)
        # print([os.path.getsize(f) for f in stream_list])
        if debug:
            break
        return flag
    
#clean up processed binary stream
def clean_up(save_folder = './processed_data'):
    all_csv = glob.glob(os.path.join(save_folder, '*au.csv'))
    all_csv.extend(glob.glob(os.path.join(save_folder, '*face.csv')))
    all_csv.extend(glob.glob(os.path.join(save_folder, '*head.csv')))
    all_csv.extend(glob.glob(os.path.join(save_folder, '*skel.csv')))
    for path in all_csv:
        os.remove(path)
        
def binary2csv(root, pattern, dim, debug = False, save_folder = './processed_data'):
    bin_list = search_pattern(root, pattern)
    dim = dim
    for bin_ in tqdm(bin_list):

        save_path = get_stream_save_path(bin_, save_folder)
        binary_to_csv(bin_, save_path, dim)

        if debug:
            break



In [7]:
validate_integrity(root)

True

In [8]:
binary_to_csv(bin_file, file_name, 17)

In [9]:
# clean_up()
for pattern, dim in list_of_pattern_dim:
    binary2csv(root, pattern, dim, debug = debug)

100%|██████████| 162/162 [01:47<00:00,  1.50it/s]
100%|██████████| 162/162 [00:24<00:00,  6.56it/s]
100%|██████████| 162/162 [7:46:00<00:00, 172.60s/it]  
100%|██████████| 162/162 [25:20<00:00,  9.39s/it]


In [10]:
check_len(root, list_of_pattern_dim, debug = debug)

  0%|          | 0/81 [00:00<?, ?it/s]


True

In [11]:
# def check_disturbance():
#     all_csv = glob.glob(os.path.join(save_folder, '*au.csv'))
#     all_csv.extend(glob.glob(os.path.join(save_folder, '*face.csv')))
#     all_csv.extend(glob.glob(os.path.join(save_folder, '*head.csv')))
#     all_csv.extend(glob.glob(os.path.join(save_folder, '*skel.csv')))