In [2]:
import pandas as pd
import numpy as np
import math
import os
import glob
import struct
from tqdm import tqdm

In [18]:
path = './data/aria-noxi/001_2016-03-17_Paris/smile.novice.system.csv'
pathe = './data/aria-noxi/001_2016-03-17_Paris/engagement.novice.gold.csv'
bin_file = './data/aria-noxi/001_2016-03-17_Paris/novice.au.stream~'
file_name = './test.csv'
root = './data/aria-noxi'
save_folder = './processed_csv'

In [17]:
def time_to_frame(time):
    return int(np.rint(time * 100 / 4))

def get_save_path(path, prefix = 'processed_'):
    head, tail = os.path.split(path)
    tail = prefix + tail
    return os.path.join(head, tail)

def get_stream_save_path(path, save_folder):
    head, tail = os.path.split(path)
    _, sample = os.path.split(head)
    tmp = tail.split('.')
    ret = sample
    for i in tmp[:-1]:
        ret += '_' + i
    ret += '.csv'
    return os.path.join(save_folder, ret)


def search_directories(path):
    return [f for f in os.listdir(root) if not os.path.isfile(os.path.join(root, f))]

# root/sample/csv
def search_engagement_csv(root):
    all_csv = []
    samples = search_directories(root)
    for sample in samples:
        csv_list = glob.glob(os.path.join(root, sample, 'engage*.csv'))
#         print(len(csv_list), sample)
        all_csv.extend(csv_list)
    return all_csv

def search_feat_csv(root, feat_name):
    all_csv = []
    glob_string = feat_name + '*.csv'
    samples = search_directories(root)
    for sample in samples:
        engage_list = glob.glob(os.path.join(root, sample, 'engage*.csv'))
        feat_list = glob.glob(os.path.join(root, sample, glob_string))
        tuple_list = list(zip(feat_list, engage_list))

        all_csv.extend(tuple_list)
    
    return all_csv

def search_noxi(root, path_name_pattern):
    all_path = []
    samples = search_directories(root)
    for sample in samples:
        path_list = glob.glob(os.path.join(root, sample, path_name_pattern))
        path_list.sort()
        all_path.extend(path_list)
    return all_path

def str_to_num(input_str, minimum = 0, maximum = 1, default = 0.5):
    tmp = float(input_str)
    if tmp > maximum:
        tmp = maximum
    if tmp < minimum:
        tmp = minimum
    if math.isnan(tmp):
        tmp = default
    return tmp


def extract_engagement(path):
    save_path = get_save_path(path)
    df = pd.read_csv(path, header = None, names = ['engagement'])
    for index, row in df.iterrows(): 
        items = row[0].split(';')
        row[0] = str_to_num(items[1])
    df.to_csv(save_path, index = False, encoding= 'utf-8')
    
def duration_to_frame(feat_path, engage_path, feat_name, conf = True):
    save_path = get_save_path(feat_path)
    length = len(pd.read_csv(engage_path, header = None, names = ['engagement']).index)
#     print(length)
    df = pd.read_csv(feat_path, header = None, names = [feat_name])
#     print(df.index)
    feat_list = []
    index = 0
    for idx, row in df.iterrows(): 
            items = row[0].split(';')
            beg_ = time_to_frame(float(items[0]))
            end_ = time_to_frame(float(items[1]))
            if conf == True:
                val = str_to_num(items[3])
            else:
                val = 1
            if beg_ != index:
                filler = [0] * (beg_ - index)
                feat_list.extend(filler)
            values = [val] * (end_ - beg_ + 1) # plus beg_
            feat_list.extend(values)
            index = end_ + 1


    if length > index:
        filler = [0] * (length - index)
        feat_list.extend(filler)  
    feat_df = pd.DataFrame(feat_list, columns = [feat_name])
    feat_df.to_csv(save_path, index = False, encoding = 'utf-8')
    
def feat_merge(root, save_folder):
    samples = search_directories(root)
    
    for sample in samples:
        csv_list = glob.glob(os.path.join(root, sample, 'processed_*.csv'))
        csv_list.sort() # 0eng exp 1eng nov 2hnod exp 3hnod novice 4hshak exp 5hshak nov 6smil exp 7smil nov
        df_list = []
        for csv in csv_list:
            df_list.append(pd.read_csv(csv))
        mer_csv = df_list[0]
        for df in df_list[1:]:
            mer_csv = mer_csv.merge(df, left_index = True, right_index = True, suffixes = ('_expert','_novice'))
        mer_csv.to_csv(os.path.join(save_folder, sample+'.csv'), index = False, encoding = 'utf-8')


def binary_to_csv(path, save_path, dim, byte_num = 4):
#     with open(path, 'rb') as f:
#         val = f.read()
#         print(len(val))
    with open(path, 'rb') as f:
        feat_list = []
        while(True):
            val = f.read(dim * byte_num)
            if len(val) == 0:
                break;
            elif len(val) < (dim * byte_num):
                print('incomplete feature sample (less than one sample bytes read)', len(val), path)
                break;
            feat_set = []
            for i in range(0, dim * byte_num, 4):
                num = struct.unpack('f', val[i:i+4])[0]
                feat_set.append(num)

            feat_list.append(feat_set)
        df = pd.DataFrame(feat_list)
#         print(df)
        df.to_csv(save_path, index = False, encoding = 'utf-8')

In [4]:
# convert engagement
all_csv = search_engagement_csv(root)
for csv_file in tqdm(all_csv):
    extract_engagement(csv_file)

100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [02:36<00:00,  1.70s/it]


In [5]:
# convert smile etc
modalities = ['smile','headshake','headnod']
for modality in modalities:
    all_csv = search_feat_csv(root, modality)
    for csv_pair in tqdm(all_csv):
        duration_to_frame(csv_pair[0], csv_pair[1], feat_name = modality)

100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [00:06<00:00, 14.92it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [00:05<00:00, 17.14it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [00:05<00:00, 16.05it/s]


In [6]:
smile_list = [0,0,1,1,0,0,1,1]
pd.DataFrame(smile_list, columns = ['smile'])

Unnamed: 0,smile
0,0
1,0
2,1
3,1
4,0
5,0
6,1
7,1


In [7]:
#clean up
# all_csv = []
# samples = search_directories(root)
# for sample in samples:
#     csv_list = glob.glob(os.path.join(root, sample, 'processed*.csv'))
#     all_csv.extend(csv_list)

# for path in all_csv:
#     os.remove(path)

In [28]:
# data combination
p1 = './data/aria-noxi/001_2016-03-17_Paris/processed_engagement.novice.gold.csv'
p2 = './data/aria-noxi/001_2016-03-17_Paris/processed_smile.novice.system.csv'
p3 = './data/aria-noxi/001_2016-03-17_Paris/processed_headshake.novice.system.csv'
p4 = './data/aria-noxi/001_2016-03-17_Paris/processed_headnod.novice.system.csv'
# df1 = pd.read_csv(pe, header = None, names = ['index', 'engagement'])
df1 = pd.read_csv(p1)
df2 = pd.read_csv(p2)
df3 = pd.read_csv(p3)
df4 = pd.read_csv(p4)
df1.merge(df2, left_index= True, right_index =True)
df1.merge(df3, left_index= True, right_index =True)
df1.merge(df4, left_index= True, right_index =True)
# pd.concat([df1,df2,df3])
# df = df1.join(df2).join(df3).join(df4)
# df.to_csv('test.csv', index = False, encoding = 'utf-8')

Unnamed: 0,engagement,headnod
0,0.437500,0.0
1,0.437500,0.0
2,0.441763,0.0
3,0.441763,0.0
4,0.441763,0.0
...,...,...
26640,0.500000,0.0
26641,0.500000,0.0
26642,0.500000,0.0
26643,0.500000,0.0


In [39]:
feat_merge(root, save_folder)

In [147]:
#clean up processed stream
all_csv = glob.glob(os.path.join(save_folder, '*au.csv'))
all_csv.extend(glob.glob(os.path.join(save_folder, '*face.csv')))
all_csv.extend(glob.glob(os.path.join(save_folder, '*head.csv')))
all_csv.extend(glob.glob(os.path.join(save_folder, '*skel.csv')))
for path in all_csv:
    os.remove(path)

In [149]:
#process au feature for dataframe ready csv
au_list = search_noxi(root, '*au.stream~')
dim = 17
for au in tqdm(au_list):
    save_path = get_stream_save_path(au, save_folder)
    binary_to_csv(au, save_path, dim)


100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [01:21<00:00,  1.14it/s]


In [150]:
#process head feature for dataframe ready csv
list_ = search_noxi(root, '*head.stream~')
dim = 3
for l in tqdm(list_):
    save_path = get_stream_save_path(l, save_folder)
    binary_to_csv(l, save_path, dim)


100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [00:20<00:00,  4.53it/s]


In [151]:
#process face feature for dataframe ready csv
list_ = search_noxi(root, '*face.stream~')
dim = 4041
for l in tqdm(list_):
    save_path = get_stream_save_path(l, save_folder)
    binary_to_csv(l, save_path, dim)

100%|███████████████████████████████████████████████████████████████████████████████| 92/92 [6:08:06<00:00, 240.07s/it]


In [152]:
#process skel feature for dataframe ready csv
list_ = search_noxi(root, '*skel.stream~')
dim = 350
for l in tqdm(list_):
    save_path = get_stream_save_path(l, save_folder)
    binary_to_csv(l, save_path, dim)

100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [21:52<00:00, 14.27s/it]


In [124]:
binary_to_csv(face_bin, dim, byte_num)
# dim * byte_num

430366500
incomplete feature sample (less than one sample bytes read) 4 26897907 ./data/aria-noxi/001_2016-03-17_Paris/novice.face.stream~


In [140]:
# face_bin = './data/aria-noxi/001_2016-03-17_Paris/novice.face.stream~'
# dim = 4041
# byte_num = 4
# with open(face_bin, 'rb') as f:
#     val = f.read()
#     print(len(val))
#     while(True):
#         val = f.read(dim * byte_num)
#         if len(val) == 0:
#             break;
#         elif len(val) < (dim * byte_num):
#             print('incomplete feature sample (less than one sample bytes read)', len(val))
#             break;
#         feat_set = []
#         for i in range(0, dim * byte_num, byte_num):
#             num = struct.unpack('f', val[i:i+4])[0]
#             feat_set.append(num)

#         feat_list.append(feat_set)
        
# def binary_to_csv(path, save_path, dim, byte_num = 4):
#     with open(path, 'rb') as f:
#         val = f.read()
#         print(len(val))
#     with open(path, 'rb') as f:
#         feat_list = []
#         counter = 0
#         while(True):
#             counter +=1
#             val = f.read(dim * byte_num)
#             if len(val) == 0:
#                 break;
#             elif len(val) < (dim * byte_num):
#                 print('incomplete feature sample (less than one sample bytes read)', len(val), counter, path)
#                 break;
#             feat_set = []
#             for i in range(0, dim * byte_num, 4):
#                 num = struct.unpack('f', val[i:i+4])[0]
#                 feat_set.append(num)

#             feat_list.append(feat_set)
# #         df.to_csv(save_path, index = False, encoding = 'utf-8')

430366500


In [72]:
#verify downloaded stream file size
au_pattern = '*au.stream~'
crop_vid_pattern = '*crop].mp4'
def search_noxi(root, path_name_pattern):
    all_stream = []
    samples = search_directories(root)
    for sample in samples:
        stream_list = glob.glob(os.path.join(root, sample, path_name_pattern))
        stream_list.sort()
#         print(sample)
#         print([os.path.getsize(f) for f in stream_list])
        all_stream.extend(stream_list)
    return all_stream

search_noxi(root, '*.stream~')
# search_noxi(root, au_pattern)

001_2016-03-17_Paris
[1810500, 430366500, 319500, 37275000, 1810500, 430366500, 319500, 37275000]
002_2016-03-17_Paris
[3303100, 785166300, 582900, 68005000, 3303100, 785166300, 582900, 68005000]
004_2016-03-18_Paris
[1999200, 475221600, 352800, 41160000, 1999200, 475221600, 352800, 41160000]
005_2016-03-18_Paris
[1861500, 442489500, 328500, 38325000, 1861500, 442489500, 328500, 38325000]
006_2016-03-18_Paris
[2623100, 623526300, 462900, 54005000, 2623100, 623526300, 462900, 54005000]
007_2016-03-21_Paris
[3692400, 877705200, 651600, 76020000, 3692400, 877705200, 651600, 76020000]
009_2016-03-25_Paris
[1536800, 365306400, 271200, 31640000, 1536800, 365306400, 271200, 31640000]
010_2016-03-25_Paris
[2403800, 571397400, 424200, 49490000, 2403800, 571397400, 424200, 49490000]
011_2016-03-25_Paris
[2448000, 581904000, 432000, 50400000, 2448000, 581904000, 432000, 50400000]
012_2016-03-25_Paris
[1995800, 474413400, 352200, 41090000, 1995800, 474413400, 352200, 41090000]
013_2016-03-30_Paris

['./data/aria-noxi\\001_2016-03-17_Paris\\expert.au.stream~',
 './data/aria-noxi\\001_2016-03-17_Paris\\expert.face.stream~',
 './data/aria-noxi\\001_2016-03-17_Paris\\expert.head.stream~',
 './data/aria-noxi\\001_2016-03-17_Paris\\expert.skel.stream~',
 './data/aria-noxi\\001_2016-03-17_Paris\\novice.au.stream~',
 './data/aria-noxi\\001_2016-03-17_Paris\\novice.face.stream~',
 './data/aria-noxi\\001_2016-03-17_Paris\\novice.head.stream~',
 './data/aria-noxi\\001_2016-03-17_Paris\\novice.skel.stream~',
 './data/aria-noxi\\002_2016-03-17_Paris\\expert.au.stream~',
 './data/aria-noxi\\002_2016-03-17_Paris\\expert.face.stream~',
 './data/aria-noxi\\002_2016-03-17_Paris\\expert.head.stream~',
 './data/aria-noxi\\002_2016-03-17_Paris\\expert.skel.stream~',
 './data/aria-noxi\\002_2016-03-17_Paris\\novice.au.stream~',
 './data/aria-noxi\\002_2016-03-17_Paris\\novice.face.stream~',
 './data/aria-noxi\\002_2016-03-17_Paris\\novice.head.stream~',
 './data/aria-noxi\\002_2016-03-17_Paris\\novice

In [9]:
save_path = './config/sample_name.csv'
samples = search_directories(root)
df = pd.DataFrame(samples, columns =['sample'])
df.to_csv(save_path, index = False, encoding= 'utf-8')

In [10]:
print('Length of {Name:s} is {num:d}'.format(Name='abc', num = 10))

Length of abc is 10


In [4]:
test = './processed_csv/001_2016-03-17_Paris_expert_face.csv'
df = pd.read_csv(test)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4031,4032,4033,4034,4035,4036,4037,4038,4039,4040
0,0.083900,0.057016,1.652853,0.083455,0.054458,1.646217,0.082210,0.048699,1.630389,0.081713,...,1.599310,0.111176,0.081827,1.599064,0.110942,0.080452,1.597526,0.111505,0.080209,1.596675
1,0.083628,0.056258,1.652084,0.083201,0.053772,1.645434,0.082014,0.048146,1.629583,0.081541,...,1.598189,0.110911,0.081735,1.597862,0.110713,0.080392,1.596262,0.111298,0.080194,1.595386
2,0.083846,0.056507,1.654070,0.083428,0.054006,1.647415,0.082271,0.048366,1.631545,0.081827,...,1.600795,0.111871,0.082302,1.600553,0.111677,0.080937,1.599019,0.112256,0.080719,1.598184
3,0.084699,0.057081,1.655531,0.084396,0.054536,1.648951,0.083418,0.048663,1.633284,0.083071,...,1.603188,0.112925,0.082899,1.602782,0.112753,0.081543,1.601358,0.113343,0.081337,1.600577
4,0.083469,0.056930,1.655526,0.083367,0.054366,1.648960,0.082839,0.048420,1.633321,0.082791,...,1.603110,0.112436,0.082999,1.602652,0.112298,0.081639,1.601216,0.112909,0.081447,1.600443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26620,-0.140917,0.046651,1.628607,-0.139473,0.043564,1.623012,-0.136069,0.035122,1.610030,-0.133983,...,1.575668,-0.112279,0.072554,1.573112,-0.111379,0.071398,1.572133,-0.110299,0.071591,1.571596
26621,-0.143212,0.044734,1.629322,-0.141868,0.041584,1.623737,-0.138772,0.032993,1.610786,-0.136857,...,1.576892,-0.113663,0.070982,1.574304,-0.112851,0.069857,1.573425,-0.111833,0.070074,1.572912
26622,0.000003,-0.052398,-0.055524,0.000003,-0.051136,-0.062607,0.000002,-0.047756,-0.079285,0.000002,...,-0.091054,0.024901,0.002225,-0.090460,0.025026,0.002118,-0.092751,0.025650,0.002695,-0.093640
26623,0.000003,-0.052398,-0.055524,0.000003,-0.051136,-0.062607,0.000002,-0.047756,-0.079285,0.000002,...,-0.091054,0.024901,0.002225,-0.090460,0.025026,0.002118,-0.092751,0.025650,0.002695,-0.093640


In [9]:
n = df.to_numpy()
n.save()

array([[ 8.39002952e-02,  5.70155755e-02,  1.65285254e+00, ...,
         1.11504562e-01,  8.02092031e-02,  1.59667516e+00],
       [ 8.36281404e-02,  5.62581718e-02,  1.65208375e+00, ...,
         1.11298047e-01,  8.01944509e-02,  1.59538567e+00],
       [ 8.38460773e-02,  5.65071031e-02,  1.65407002e+00, ...,
         1.12255946e-01,  8.07186812e-02,  1.59818387e+00],
       ...,
       [ 2.91734864e-06, -5.23982607e-02, -5.55244759e-02, ...,
         2.56504286e-02,  2.69530481e-03, -9.36401114e-02],
       [ 2.91734864e-06, -5.23982607e-02, -5.55244759e-02, ...,
         2.56504286e-02,  2.69530481e-03, -9.36401114e-02],
       [ 2.91734864e-06, -5.23982607e-02, -5.55244759e-02, ...,
         2.56504286e-02,  2.69530481e-03, -9.36401114e-02]])

In [14]:
test_save = 'test_npz.npy'
np.save(test_save, n)

In [15]:
n1 = np.load(test_save)
n1

array([[ 8.39002952e-02,  5.70155755e-02,  1.65285254e+00, ...,
         1.11504562e-01,  8.02092031e-02,  1.59667516e+00],
       [ 8.36281404e-02,  5.62581718e-02,  1.65208375e+00, ...,
         1.11298047e-01,  8.01944509e-02,  1.59538567e+00],
       [ 8.38460773e-02,  5.65071031e-02,  1.65407002e+00, ...,
         1.12255946e-01,  8.07186812e-02,  1.59818387e+00],
       ...,
       [ 2.91734864e-06, -5.23982607e-02, -5.55244759e-02, ...,
         2.56504286e-02,  2.69530481e-03, -9.36401114e-02],
       [ 2.91734864e-06, -5.23982607e-02, -5.55244759e-02, ...,
         2.56504286e-02,  2.69530481e-03, -9.36401114e-02],
       [ 2.91734864e-06, -5.23982607e-02, -5.55244759e-02, ...,
         2.56504286e-02,  2.69530481e-03, -9.36401114e-02]])

In [22]:
feats = ['au', 'face', 'head', 'skel']
samples = search_directories(root)
for s in samples:
    print('Converting {:s}'.format(s))
    for f in feats:
        expert_affix = '_expert_' + f + '.csv'
        exp_save = s + '_expert_' + f
        novice_affix = '_novice_' + f + '.csv'
        nov_save = s + '_novice_' + f
        tmp = pd.read_csv(os.path.join(save_folder, s + expert_affix))
        np.save(os.path.join(save_folder, exp_save), tmp)
        tmp = pd.read_csv(os.path.join(save_folder, s + novice_affix))
        np.save(os.path.join(save_folder, nov_save), tmp)


Converting 001_2016-03-17_Paris
Converting 002_2016-03-17_Paris
Converting 004_2016-03-18_Paris
Converting 005_2016-03-18_Paris
Converting 006_2016-03-18_Paris
Converting 007_2016-03-21_Paris
Converting 009_2016-03-25_Paris
Converting 010_2016-03-25_Paris
Converting 011_2016-03-25_Paris
Converting 012_2016-03-25_Paris
Converting 013_2016-03-30_Paris
Converting 014_2016-04-01_Paris
Converting 015_2016-04-05_Paris
Converting 016_2016-04-05_Paris
Converting 017_2016-04-05_Paris
Converting 026_2016-04-06_Nottingham
Converting 027_2016-04-06_Nottingham
Converting 028_2016-04-06_Nottingham
Converting 029_2016-04-06_Nottingham
Converting 030_2016-04-06_Nottingham
Converting 031_2016-04-06_Nottingham
Converting 032_2016-04-07_Nottingham
Converting 033_2016-04-07_Nottingham
Converting 034_2016-04-07_Nottingham
Converting 035_2016-04-07_Nottingham
Converting 036_2016-04-07_Nottingham
Converting 039_2016-04-07_Nottingham
Converting 065_2016-04-14_Nottingham
Converting 066_2016-05-23_Augsburg
Conv

In [27]:
nov_save = '001_2016-03-17_Paris_novice_face.npy'
tmp = np.load(os.path.join(save_folder, nov_save))
tmp[:3,:]

array([[0.06311326, 0.00945116, 1.58805943, ..., 0.08768398, 0.05345267,
        1.53908885],
       [0.06304747, 0.00904121, 1.58676469, ..., 0.08741674, 0.053238  ,
        1.53822803],
       [0.06364778, 0.00906276, 1.58806849, ..., 0.08792129, 0.05293601,
        1.53917217]])