### Data

In [1]:
import sys
import os

path = os.path.abspath(os.path.join('..'))
if path not in sys.path:
    sys.path.append(path)

In [2]:
from diffusion.data_loaders.backflip_dataset import BackflipMotionDataset
dataset = BackflipMotionDataset("/home/kenji/Fyp/DeepMimic_mujoco/diffusion/data/motions/humanoid3d_backflip.txt")
len(dataset), dataset[0].shape

(29, torch.Size([29, 69]))

### Model

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)

        self.register_buffer('pe', pe)

    def forward(self, x):
        # not used in the final model
        x = x + self.pe[:x.shape[0], :]
        return self.dropout(x)


class TimestepEmbedder(nn.Module):
    def __init__(self, latent_dim, sequence_pos_encoder):
        super().__init__()
        self.latent_dim = latent_dim
        self.sequence_pos_encoder = sequence_pos_encoder

        time_embed_dim = self.latent_dim
        self.time_embed = nn.Sequential(
            nn.Linear(self.latent_dim, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim),
        )

    def forward(self, timesteps):
        return self.time_embed(self.sequence_pos_encoder.pe[timesteps])
    
class MotionTransformer(nn.Module):
    def __init__(self, nfeats, latent_dim=256, ff_size=1024, num_layers=8, num_heads=4, dropout=0.1, activation="gelu"):
        super(MotionTransformer, self).__init__()
        
        self.nfeats = nfeats
        self.latent_dim = latent_dim
        self.ff_size = ff_size  
        self.dropout = dropout

        self.inputEmbedding = nn.Linear(self.nfeats, self.latent_dim)
        self.sequence_pos_encoder = PositionalEncoding(self.latent_dim, self.dropout)
        self.embed_timestep = TimestepEmbedder(self.latent_dim, self.sequence_pos_encoder)

        # Transformer Encoder
        encoder_layers = nn.TransformerEncoderLayer(d_model=self.latent_dim, nhead=num_heads, 
                                                    dim_feedforward=ff_size, dropout=dropout, activation=activation, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        # Output Linear Layer
        self.outputEmbedding = nn.Linear(self.latent_dim, self.nfeats)

    def forward(self, x: torch.Tensor, timesteps, y=None):
        """
        x: [batch_size, max_frames, n_feats], denoted x_t in the paper
        timesteps: [batch_size] (int)
        """
        # x: [batch_size, seq_len, nfeats]
        emb = self.embed_timestep(timesteps)  # [bs, seq_len, time_embed_dim]
        print("Emb", emb.shape)

        # Input process
        x = x.float()
        print(x.dtype, x.shape)
        x = self.inputEmbedding(x)  # [bs, seq_len, d]
        print("Input Embedding", x.shape)

        # Transformer Encoder
        # adding the timestep embed
        xseq = torch.cat((emb, x), axis=1)  # [bs, n_frames+1, d]
        print("Concat x and zkx", xseq.shape)

        xseq = self.sequence_pos_encoder(xseq)  # [bs, n_frames+1, d]
        print("Sequence Pos Encoder", xseq.shape)
        
        output = self.transformer_encoder(xseq)[:, 1:, :]  # , src_key_padding_mask=~maskseq)  # [bs, n_frames, d]
        print("Transformer Encoder", output.shape)

        # Output Linear
        output = self.outputEmbedding(output)  # [bs, n_frames, n_feats]
        print("Output Embedding", output.shape)

        return x

In [4]:
dataset[0].shape

torch.Size([29, 69])

In [5]:
import torch.optim as optim
from torch.utils.data import DataLoader

batch_size = 16
dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=True,
        num_workers=8, drop_last=True)

nfeats = dataset[0].shape[1]

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MotionTransformer(nfeats=nfeats, latent_dim=32, ff_size=128, num_layers=8, num_heads=4, dropout=0.1, activation="gelu").to(device)

In [6]:
for it, batch in enumerate(dataloader):
    batch = batch.to(device)
    print(batch.shape)

    model(batch, torch.arange(batch_size))
    break

torch.Size([16, 29, 69])
Emb torch.Size([16, 1, 32])
torch.float32 torch.Size([16, 29, 69])
Input Embedding torch.Size([16, 29, 32])
Concat x and zkx torch.Size([16, 30, 32])
Sequence Pos Encoder torch.Size([16, 30, 32])
Transformer Encoder torch.Size([16, 29, 32])
Output Embedding torch.Size([16, 29, 69])


In [7]:
from diffusion.diffusion import gaussian_diffusion as gd
from diffusion.diffusion.respace import SpacedDiffusion, space_timesteps

def create_gaussian_diffusion(
        diffusion_steps, # number eg 1000
        noise_schedule, # can be 'linear', 'cosine'
        sigma_small, # default True
        lambda_vel, lambda_rcxyz, lambda_fc # for geometric loss, we don't have fc, default 1 for rest
        ):
    # default params
    predict_xstart = True  # we always predict x_start (a.k.a. x0), that's our deal!
    steps = diffusion_steps
    scale_beta = 1.  # no scaling
    timestep_respacing = ''  # can be used for ddim sampling, we don't use it.
    learn_sigma = False
    rescale_timesteps = False

    betas = gd.get_named_beta_schedule(noise_schedule, steps, scale_beta)
    loss_type = gd.LossType.MSE

    if not timestep_respacing:
        timestep_respacing = [steps]

    return SpacedDiffusion(
        use_timesteps=space_timesteps(steps, timestep_respacing),
        betas=betas,
        model_mean_type=(
            gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
        ),
        model_var_type=(
            (
                gd.ModelVarType.FIXED_LARGE
                if not sigma_small
                else gd.ModelVarType.FIXED_SMALL
            )
            if not learn_sigma
            else gd.ModelVarType.LEARNED_RANGE
        ),
        loss_type=loss_type,
        rescale_timesteps=rescale_timesteps,
        lambda_vel=lambda_vel,
        lambda_rcxyz=lambda_rcxyz,
        lambda_fc=lambda_fc,
    )

In [8]:
model = MotionTransformer(nfeats=nfeats, latent_dim=32, ff_size=128, num_layers=8, num_heads=4, dropout=0.1, activation="gelu").to(device)
diffusion = create_gaussian_diffusion(diffusion_steps=1000, noise_schedule="cosine", sigma_small=True, lambda_vel=1, lambda_rcxyz=1, lambda_fc=1)

In [9]:
class DefaultArgs:
    def __init__(self, save_dir, model_path, eval_model_path):
        # Base options
        self.cuda = True
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.seed = 10
        self.batch_size = 64

        # Diffusion options
        self.noise_schedule = 'cosine'
        self.diffusion_steps = 1000
        self.sigma_small = True

        # Model options
        self.arch = 'trans_enc'
        self.emb_trans_dec = False
        self.layers = 8
        self.latent_dim = 512
        self.cond_mask_prob = 0.1
        self.lambda_rcxyz = 0.0
        self.lambda_vel = 0.0
        self.lambda_fc = 0.0
        self.unconstrained = False  # This is inferred from the 'action' parameter

        # Data options
        self.dataset = 'humanml'
        self.data_dir = ""

        # Training options
        self.save_dir = save_dir
        self.overwrite = False
        self.train_platform_type = 'NoPlatform'
        self.lr = 1e-4
        self.weight_decay = 0.0
        self.lr_anneal_steps = 0
        self.eval_batch_size = 32
        self.eval_split = 'test'
        self.eval_during_training = False
        self.eval_rep_times = 3
        self.eval_num_samples = 1000
        self.log_interval = 1000
        self.save_interval = 50000
        self.num_steps = 600000
        self.num_frames = 60
        self.resume_checkpoint = ""

        # Sampling options
        self.model_path = model_path
        self.output_dir = ''
        self.num_samples = 10
        self.num_repetitions = 3
        self.guidance_param = 2.5

        # Generate options
        self.motion_length = 6.0
        self.input_text = ''
        self.action_file = ''
        self.text_prompt = ''
        self.action_name = ''

        # Edit options
        self.edit_mode = 'in_between'
        self.text_condition = ''
        self.prefix_end = 0.25
        self.suffix_start = 0.75

        # Evaluation options
        self.eval_model_path = eval_model_path
        self.eval_mode = 'wo_mm'
        self.eval_guidance_param = 2.5


In [10]:
args = DefaultArgs(save_dir="/home/kenji/Fyp/DeepMimic_mujoco/diffusion/logs/", model_path="/home/kenji/Fyp/DeepMimic_mujoco/diffusion/logs/model.pt", eval_model_path="/home/kenji/Fyp/DeepMimic_mujoco/diffusion/logs/model.pt")
args.device

'cuda:0'

In [11]:
from train.training_loop import TrainLoop
TrainLoop(args, None , model, diffusion, dataloader).run_loop()


Starting epoch 0


  0%|          | 0/1 [00:00<?, ?it/s]

Emb torch.Size([16, 1, 32])
torch.float32 torch.Size([16, 29, 69])
Input Embedding torch.Size([16, 29, 32])
Concat x and zkx torch.Size([16, 30, 32])
Sequence Pos Encoder torch.Size([16, 30, 32])
Transformer Encoder torch.Size([16, 29, 32])
Output Embedding torch.Size([16, 29, 69])
x_start.shape torch.Size([16, 29, 69])
model_output.shape torch.Size([16, 29, 32])
target.shape torch.Size([16, 29, 69])





AssertionError: 