In [10]:
from pythonosc import udp_client
import time
import sounddevice as sd
import torch
from dataloaders.beat import CustomDataset
from dataloaders.build_vocab import Vocab
import pickle
import numpy as np

# Load the pickled CaMN configuration object. The context manager guarantees
# the file handle is closed even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code; only point this at a
# config file you produced yourself or otherwise trust.
with open("camn_config.obj", 'rb') as camn_config_file:
    args = pickle.load(camn_config_file)
args.batch_size = 16


def _load_norm_stats(rep_name, prefix):
    """Load the (mean, std) normalisation statistics of one modality.

    Args:
        rep_name: Representation sub-directory located under
            ``args.root_path + args.mean_pose_path``.
        prefix: File-name prefix shared by ``<prefix>_mean.npy`` and
            ``<prefix>_std.npy``.

    Returns:
        A ``(mean, std)`` tuple of float32 torch tensors.
    """
    stats_dir = args.root_path + args.mean_pose_path + f"{rep_name}/"
    mean = torch.from_numpy(np.load(stats_dir + f"{prefix}_mean.npy")).float()
    std = torch.from_numpy(np.load(stats_dir + f"{prefix}_std.npy")).float()
    return mean, std


# Per-modality statistics; the pose pair is used later to de-normalise the
# model output.
mean_facial, std_facial = _load_norm_stats(args.facial_rep, "json")
mean_audio, std_audio = _load_norm_stats(args.audio_rep, "npy")
mean_pose, std_pose = _load_norm_stats(args.pose_rep, "bvh")

In [27]:
# Build the "test" split and wrap it in a loader that yields one sample per
# batch, in dataset order, keeping the final batch. The batch size is fixed at
# 1 here rather than read from args.batch_size.
test_data = CustomDataset(args, "test")
test_loader = torch.utils.data.DataLoader(dataset=test_data, batch_size=1, shuffle=False, drop_last=False)

In [29]:
# Zero-based index of the test batch to run inference on.
SAMPLE_INDEX = 5

# Step through the loader until the requested batch is reached. `its` and
# `data` deliberately outlive the loop: later cells read the batch from `data`
# and use `its` to pick the matching output file name.
# NOTE: if the loader yields fewer than SAMPLE_INDEX + 1 batches, the loop ends
# on the last available batch instead of raising.
for its, data in enumerate(test_loader):
    if its == SAMPLE_INDEX:
        break

In [30]:
# Pull each modality out of the selected batch dictionary.
# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because a later cell reads it.
pose, audio, facial, id, word, emo = (
    data[key] for key in ("pose", "audio", "facial", "id", "word", "emo")
)

In [31]:
 # load in model
import os
from utils.other_tools import load_checkpoints
from models.camn import CaMN
# Instantiate CaMN from the loaded config and restore its weights from the
# checkpoint at args.root_path + args.test_ckpt.
camn_model = CaMN(args)
load_checkpoints(camn_model, args.root_path + args.test_ckpt, args.g_name)

# Move the model to the GPU, then call eval() on it (presumably the usual
# torch.nn.Module evaluation-mode switch; CaMN is defined outside this file).
camn_model = camn_model.cuda()
camn_model = camn_model.eval()

# NOTE(review): model_path is not used by the checkpoint load above, which
# reads args.test_ckpt instead. Kept in case a later cell refers to it.
model_path = os.path.join(args.root_path, 'datasets/beat_cache/beat_4english_15_141/weights/camn.bin')

[32m2024-06-12 04:10:14.212[0m | [1mINFO    [0m | [36mutils.other_tools[0m:[36mload_checkpoints[0m:[36m96[0m - [1mload self-pretrained checkpoints for CaMN[0m


In [32]:
# Number of leading ground-truth frames copied into the seed sequence.
pre_frames = 4

# Move the batch tensors onto the GPU; the audio is additionally flattened to
# a single row.
tar_pose, in_facial, in_id, in_emo = (
    tensor.cuda() for tensor in (pose, facial, id, emo)
)
in_audio = audio.cuda().reshape(1, -1)

# Seed sequence: same leading dimensions as tar_pose with one extra trailing
# channel, zero everywhere except the first `pre_frames` frames.
pre_pose = tar_pose.new_zeros(
    (*tar_pose.shape[:2], tar_pose.shape[2] + 1)
).cuda()
# Copy the first frames of the target pose into every channel but the last...
pre_pose[:, :pre_frames, :-1] = tar_pose[:, :pre_frames]
# ...and set the extra last channel to 1 on those same frames (presumably a
# flag marking seeded frames; confirm against the model definition).
pre_pose[:, :pre_frames, -1] = 1

In [33]:
# Run CaMN on the prepared batch. This is inference only, so autograd is
# switched off to avoid building a computation graph and holding on to
# intermediate activations in GPU memory.
with torch.no_grad():
    out_dir_vec = camn_model(
        pre_seq=pre_pose,
        in_audio=in_audio,
        in_facial=in_facial,
        in_id=in_id,
        in_emo=in_emo,
    )

In [34]:
# De-normalise the prediction on the CPU: flatten to rows of args.pose_dims
# values, scale by std_pose, shift by mean_pose, then copy into a NumPy array.
out_final = np.array(
    out_dir_vec.detach().cpu().reshape(-1, args.pose_dims).mul(std_pose).add(mean_pose)
)

In [35]:
# Display the shape of the de-normalised output array as a quick sanity check.
out_final.shape

(960, 141)

In [36]:
# List the entries of the "<pose_rep>_vis/" directory under the test data
# path, in sorted order.
test_demo = f"{args.root_path}{args.test_data_path}{args.pose_rep}_vis/"
test_seq_list = sorted(os.listdir(test_demo))

In [37]:
# Write the de-normalised output, one row of `out_final` per line with
# space-separated values, to result_pose/result_raw_<name>, where <name> is the
# entry of test_seq_list at index `its`.
os.makedirs("result_pose", exist_ok=True)  # the output directory may not exist yet
with open(f"result_pose/result_raw_{test_seq_list[its]}", 'w') as f_real:
    for frame in out_final:
        line_data = np.array2string(frame, max_line_width=np.inf, precision=6, suppress_small=False, separator=' ')
        # Strip only the enclosing "[" and "]". Slicing with [1:-2] also cut
        # the last character of the final value whenever that value was not
        # right-padded with a space, silently corrupting it.
        f_real.write(line_data[1:-1] + '\n')