In [None]:
"""
Prepare avatar dataset for tuning ImageBindLora model
"""

In [21]:
import os
import pickle
import re
import subprocess

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import pairwise_distances
from tqdm import tqdm

## Prepare clips dataframe

In [22]:
def preproc_text_labels(ds):
    """Derive the text labels used for tuning from the clip metadata.

    Adds a ``gender`` column looked up from ``folder``, normalizes ``vcls``
    (underscores become spaces, a leading ``talk `` becomes ``talking ``,
    ``pause`` becomes ``talk pause``, ``yes``/``ok`` become ``agreement``)
    and builds ``sentence`` as ``"<gender> <vcls>"``.

    Args:
        ds: DataFrame with string columns ``folder`` and ``vcls``.
            It is modified in place.

    Returns:
        The same DataFrame object with ``gender``, ``vcls`` and ``sentence`` set.

    Raises:
        KeyError: if ``folder`` contains a value that has no gender mapping.
    """
    gender_map = {
        'Arvid': 'man',
        'Bizdev': 'man',
        'C9175': 'man',
        'C9176': 'woman',
        'C9180': 'woman',
        'C9183': 'man',
        'C9186': 'man',
        'C9187': 'woman',
        'C9188': 'man',
        'Jessica': 'woman',
        'Mia': 'woman',
        'Nick': 'man',
        'Vlad': 'man',
        'Yao': 'man',
    }

    # Fail with a readable message instead of a bare KeyError from inside apply().
    unknown = [folder for folder in ds['folder'].unique() if folder not in gender_map]
    if unknown:
        raise KeyError(f"no gender mapping for folder(s): {unknown}")

    def _normalize(label):
        # 'talk_neutral' -> 'talk neutral' -> 'talking neutral'; a bare 'talk' is kept.
        return label.replace('_', ' ').strip().replace('talk ', 'talking ')

    ds['gender'] = ds['folder'].map(gender_map)
    ds['vcls'] = ds['vcls'].apply(_normalize)

    # Applied after the 'talk ' rewrite on purpose, so 'talk pause' is not
    # turned into 'talking pause'.
    ds.loc[ds['vcls'] == 'pause', 'vcls'] = 'talk pause'
    ds.loc[ds['vcls'].isin(['yes', 'ok']), 'vcls'] = 'agreement'

    # Column-wise concatenation; the former row-wise apply used positional
    # r[0]/r[1] on a label-indexed row, which pandas deprecates (FutureWarning).
    ds['sentence'] = ds['gender'] + ' ' + ds['vcls']
    return ds

In [23]:
base = '/home/vash/apps/avatar/avatar_studio/db'
df = pd.read_csv('/home/vash/apps/avatar/gesture_clustering/avatar_ds.csv')

# Some clips are broken and have to be left out of the dataset: only the paths
# stored in this pickle become rows; the CSV metadata is joined onto them.
with open('/home/vash/apps/avatar/original_imagebind/ImageBind/avatar_original.pickl', 'rb') as labels_file:
    labels = pickle.load(labels_file)

ds = pd.DataFrame({'path': labels}).merge(df, how='left', on='path')
ds = preproc_text_labels(ds)
ds.head()

  return f"{r[0]} {r[1]}"


Unnamed: 0,path,folder,video_type,duration,vcls,gender,sentence
0,C9180/videos/talk01.mp4,C9180,talk01,4.56,talk,woman,woman talk
1,C9180/videos/talk02.mp4,C9180,talk02,3.8,talk,woman,woman talk
2,C9180/videos/talk03.mp4,C9180,talk03,5.16,talk,woman,woman talk
3,C9180/videos/talk04.mp4,C9180,talk04,2.08,talk,woman,woman talk
4,C9180/videos/talk05.mp4,C9180,talk05,2.4,talk,woman,woman talk


In [24]:
# Number of clips per generated sentence label (class balance check).
ds.sentence.value_counts()

sentence
man talk                       86
woman talk                     56
man talking neutral            41
man talking happy              25
man general                    17
man talking persuasive         14
man talking interrogative      13
woman talking neutral          10
man talking sorry               8
woman general                   8
man affirmative                 6
woman affirmative               6
woman talking happy             5
man talk pause                  5
man negative                    5
woman talking sorry             3
woman negative                  3
woman talking interrogative     3
woman talk pause                2
woman talking persuasive        2
woman hands                     1
woman cross                     1
man agreement                   1
man helpless                    1
woman shy                       1
woman dance                     1
woman agreement                 1
man sorry                       1
Name: count, dtype: int64

## Prepare key points for clips

In [25]:
# Load the per-frame key point records of the avatar dataset.
points_path = '/home/vash/apps/avatar/mmpose/wholebody_points_scaled.pckl'

with open(points_path, 'rb') as points_file:
    points = pickle.load(points_file)

In [26]:
### Preprocessing for the ImageBind model: key points are fed like the IMU input

# Index of each selected key point in a frame's 'kpts' list -> readable name.
key_points_label_map = {
    0: 'nose',
    1: 'left_eye',
    2: 'right_eye',
    3: 'left_ear',
    4: 'right_ear',
    5: 'left_shoulder',
    6: 'right_shoulder',
    7: 'left_elbow',
    8: 'right_elbow',
    9: 'left_wrist',
    10: 'right_wrist',
    42: 'right_eyebrow',
    47: 'left_eyebrow',
    71: 'right_mouth',
    74: 'center_mouth',
    77: 'left_mouth',
}

# The ids are exactly the keys of the map, in declaration order.
key_point_ids = list(key_points_label_map)

In [27]:
scale_ratio = 2

# One record per (frame, selected key point).
records = []
for frame in points:
    frame_video = frame['video']
    frame_idx = frame['frame_id']
    frame_height = frame['height']
    frame_width = frame['width']
    for kp_id in key_point_ids:
        kp_x, kp_y = frame['kpts'][kp_id]
        records.append({
            'video': frame_video,
            'frame_id': frame_idx,
            'point': key_points_label_map[kp_id],
            'height': frame_height,
            'width': frame_width,
            'x': kp_x,
            'y': kp_y,
            'scale': scale_ratio,
        })

points_df = pd.DataFrame(records)

# Normalize coordinates: divide by the frame size reduced by scale_ratio.
points_df['x'] = points_df['x'].div(points_df['width'].div(scale_ratio))
points_df['y'] = points_df['y'].div(points_df['height'].div(scale_ratio))

points_df.head()

Unnamed: 0,video,frame_id,point,height,width,x,y,scale
0,C9180/videos/talk01.mp4,0,nose,1080,1080,0.517443,0.241745,2
1,C9180/videos/talk01.mp4,0,left_eye,1080,1080,0.568861,0.208983,2
2,C9180/videos/talk01.mp4,0,right_eye,1080,1080,0.469996,0.205415,2
3,C9180/videos/talk01.mp4,0,left_ear,1080,1080,0.624771,0.27056,2
4,C9180/videos/talk01.mp4,0,right_ear,1080,1080,0.413557,0.260498,2


In [28]:
# For every (video, point) track, attach the coordinates of the same point on
# the next frame as shift_x / shift_y. A single grouped shift replaces the
# nested per-video / per-point Python loops and also works on an empty frame
# (pd.concat([]) used to raise).
ordered = points_df.sort_values(['video', 'point', 'frame_id'])
next_xy = ordered.groupby(['video', 'point'])[['x', 'y']].shift(-1)

# The last frame of each track has no successor, so its NaN row is dropped.
points_df = ordered.assign(shift_x=next_xy['x'], shift_y=next_xy['y']).dropna()
points_df.head()

Unnamed: 0,video,frame_id,point,height,width,x,y,scale,shift_x,shift_y
508190,Arvid/videos/talk_neutral_01.mp4,0,center_mouth,1080,1500,0.494803,0.278039,2,0.495537,0.280426
508206,Arvid/videos/talk_neutral_01.mp4,1,center_mouth,1080,1500,0.495537,0.280426,2,0.496552,0.282748
508222,Arvid/videos/talk_neutral_01.mp4,2,center_mouth,1080,1500,0.496552,0.282748,2,0.502023,0.286497
508238,Arvid/videos/talk_neutral_01.mp4,3,center_mouth,1080,1500,0.502023,0.286497,2,0.503274,0.286479
508254,Arvid/videos/talk_neutral_01.mp4,4,center_mouth,1080,1500,0.503274,0.286479,2,0.507573,0.284855


In [29]:
video_keypoints_dst = {}

for video, video_df in points_df.groupby('video'):
    frame_deltas = []
    for frame_id, frame_df in video_df.groupby('frame_id'):
        # Fixed (alphabetical) key point order so columns line up across frames.
        frame_df = frame_df.sort_values('point')
        xy_now = frame_df[['x', 'y']].values
        xy_next = frame_df[['shift_x', 'shift_y']].values

        # Pairwise distances between key points on the current and the next frame.
        dist_now = pairwise_distances(xy_now, xy_now)
        dist_next = pairwise_distances(xy_next, xy_next)

        # Frame-to-frame change of every pairwise distance.
        delta = dist_next - dist_now

        # Strict upper triangle (no diagonal): with the 16 selected key points
        # this is a vector of shape (120,).
        upper = np.triu_indices_from(delta, k=1)
        frame_deltas.append(delta[upper])

    # One row per frame.
    video_keypoints_dst[video] = np.vstack(frame_deltas)

In [30]:
def points_to_imu(m, n_rows=100, n_channels=6):
    """Pack a per-frame feature matrix into an IMU-like tensor for ImageBind.

    The IMU input for the ImageBind model is 6 x 2000. Only the first
    ``n_rows`` frames are used (100 frames = the first 4 seconds at 25 fps);
    shorter clips are zero-padded at the end. The columns are split into
    ``n_channels`` contiguous groups of equal width and each group is
    flattened row by row into one channel. With the defaults and 120 columns
    this gives 6 channels of 100 * 20 = 2000 values; no column is dropped.

    Args:
        m: 2-D numpy array of shape (n_frames, n_features); ``n_features``
            must be divisible by ``n_channels``.
        n_rows: number of frames kept (or padded to).
        n_channels: number of output channels.

    Returns:
        float32 torch tensor of shape
        (n_channels, n_rows * n_features // n_channels).

    Raises:
        ValueError: if ``m`` is not 2-D or its columns cannot be split evenly.
    """
    if m.ndim != 2:
        raise ValueError(f"expected a 2-D array, got {m.ndim} dimension(s)")
    n_cols = m.shape[1]
    if n_cols % n_channels:
        raise ValueError(
            f"{n_cols} columns cannot be split evenly into {n_channels} channels"
        )
    width = n_cols // n_channels

    clip = m[:n_rows]  # truncate long clips
    b = torch.from_numpy(clip.astype('float32'))

    missing = n_rows - len(clip)
    if missing > 0:
        # Pad rows at the bottom only: (left, right, top, bottom) for the last two dims.
        b = torch.nn.functional.pad(b, (0, 0, 0, missing), mode='constant', value=0)

    channels = [b[:, c * width:(c + 1) * width].flatten() for c in range(n_channels)]
    return torch.stack(channels)

In [31]:
# Smoke test: convert the first video only and add a leading batch dimension.
first_video = next(iter(video_keypoints_dst))
imu = points_to_imu(video_keypoints_dst[first_video])

imu_batch = imu.unsqueeze(0)
imu_batch.shape

torch.Size([1, 6, 2000])

## Create dataset

In [32]:
# Resize all videos with scale 0.5
# Prepare a csv with the normalized key point distances (used like the IMU modality)
# Dataset structure: "Actor": "clips", "telemetry"

In [33]:
ds_base = '/home/vash/apps/avatar/imu_ds'

# Clips for which ffmpeg returned a non-zero exit code.
failed_clips = []

for i in tqdm(video_keypoints_dst):
    # Keys look like '<actor>/videos/<file>.mp4'.
    actor = i.split('/')[0]
    video = i.split('/')[-1]
    actor_path = os.path.join(ds_base, actor)
    actor_clips = os.path.join(actor_path, 'clips')
    actor_telemetry = os.path.join(actor_path, 'telemetry')
    # makedirs creates the intermediate actor directory as well.
    os.makedirs(actor_clips, exist_ok=True)
    os.makedirs(actor_telemetry, exist_ok=True)

    m = video_keypoints_dst[i]
    imu = points_to_imu(m)
    torch.save(imu, os.path.join(actor_telemetry, video.replace('.mp4', '.pt')))

    video_path = os.path.join(base, i)
    out_video_path = os.path.join(actor_clips, video)

    # Argument list instead of an interpolated shell line: paths with spaces or
    # shell metacharacters are passed through verbatim. '-y' overwrites an
    # existing clip so that re-running the cell cannot stall on ffmpeg's
    # overwrite prompt.
    result = subprocess.run(
        [
            'ffmpeg', '-hide_banner', '-loglevel', 'error', '-y',
            '-i', video_path,
            # Half the size, rounded up to an even number of pixels.
            '-vf', 'scale=ceil(iw/4)*2:ceil(ih/4)*2',
            out_video_path,
        ],
        stdin=subprocess.DEVNULL,
        capture_output=True,
        text=True,
        check=False,
    )
    if result.stderr:
        tqdm.write(result.stderr.rstrip())
    if result.returncode != 0:
        # Best effort, as before: report and keep going.
        failed_clips.append(i)

if failed_clips:
    print(f"ffmpeg failed for {len(failed_clips)} clip(s): {failed_clips}")


100%|██████████| 326/326 [05:47<00:00,  1.07s/it]


In [34]:
# Prepare csv for avatar tuning dataset

ds['sentence'] = ds['sentence'].str.replace(' ', '_')
ds['clip'] = ds['path'].str.replace('videos', 'clips')
ds['telemetry'] = ds['path'].str.replace('videos', 'telemetry')
ds['telemetry'] = ds['telemetry'].str.replace('mp4', 'pt')
ds.head()

Unnamed: 0,path,folder,video_type,duration,vcls,gender,sentence,clip,telemetry
0,C9180/videos/talk01.mp4,C9180,talk01,4.56,talk,woman,woman_talk,C9180/clips/talk01.mp4,C9180/telemetry/talk01.pt
1,C9180/videos/talk02.mp4,C9180,talk02,3.8,talk,woman,woman_talk,C9180/clips/talk02.mp4,C9180/telemetry/talk02.pt
2,C9180/videos/talk03.mp4,C9180,talk03,5.16,talk,woman,woman_talk,C9180/clips/talk03.mp4,C9180/telemetry/talk03.pt
3,C9180/videos/talk04.mp4,C9180,talk04,2.08,talk,woman,woman_talk,C9180/clips/talk04.mp4,C9180/telemetry/talk04.pt
4,C9180/videos/talk05.mp4,C9180,talk05,2.4,talk,woman,woman_talk,C9180/clips/talk05.mp4,C9180/telemetry/talk05.pt


In [35]:
# Write the tuning table without the index column; the relative file name
# resolves against the current working directory.
ds.to_csv('avatar_tune_ds.csv', index=False)

In [36]:
# Preview the first rows of the table that was just written.
ds.head()

Unnamed: 0,path,folder,video_type,duration,vcls,gender,sentence,clip,telemetry
0,C9180/videos/talk01.mp4,C9180,talk01,4.56,talk,woman,woman_talk,C9180/clips/talk01.mp4,C9180/telemetry/talk01.pt
1,C9180/videos/talk02.mp4,C9180,talk02,3.8,talk,woman,woman_talk,C9180/clips/talk02.mp4,C9180/telemetry/talk02.pt
2,C9180/videos/talk03.mp4,C9180,talk03,5.16,talk,woman,woman_talk,C9180/clips/talk03.mp4,C9180/telemetry/talk03.pt
3,C9180/videos/talk04.mp4,C9180,talk04,2.08,talk,woman,woman_talk,C9180/clips/talk04.mp4,C9180/telemetry/talk04.pt
4,C9180/videos/talk05.mp4,C9180,talk05,2.4,talk,woman,woman_talk,C9180/clips/talk05.mp4,C9180/telemetry/talk05.pt


In [17]:
# NOTE(review): this cell and the two below carry execution counts 17-20, lower
# than the cells above (21-36), so they were not run in notebook order. The file
# is read from a directory other than ds_base; presumably it was written by a
# different export - confirm before relying on the recorded output.
a = torch.load('/home/vash/apps/avatar/ImageBindLora_tuning/ImageBind-LoRA/.datasets/avatar/Arvid/telemetry/talk_neutral_05.pt')

In [19]:
# Inspect the type of the loaded telemetry object.
type(a)

numpy.ndarray

In [20]:
# Same check as a boolean: is the loaded object a numpy array?
isinstance(a,np.ndarray)

True