# Train Transformer for the Lang2motion task

In [1]:
// for local development
%install-location /notebooks/language2motion.gt/swift-install
%install-swiftpm-flags -c release
%install '.package(path: "/notebooks/language2motion.gt")' Datasets TranslationModels TextModels ModelSupport SummaryWriter MotionModels

[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
Initializing Swift...
Installation complete!


In [2]:
// for colab
// %install-location $cwd/swift-install
// %install-swiftpm-flags -c release
// %install '.package(url: "https://github.com/wojtekcz/language2motion.git", .branch("master"))' Datasets TranslationModels TextModels ModelSupport SummaryWriter MotionModels

In [3]:
import TensorFlow
import TextModels
import TranslationModels
import Foundation
import ModelSupport
import Datasets
import SummaryWriter
import MotionModels

## What's the GPU?

In [4]:
import Foundation

/// Runs `command` through `/bin/bash -c` and returns everything it wrote to standard output.
///
/// - Parameter command: Shell source handed verbatim to bash. Never interpolate untrusted
///   input into it.
/// - Returns: The captured standard output. Bytes that are not valid UTF-8 are replaced with
///   U+FFFD rather than trapping. If bash cannot be started, a one-line diagnostic is returned.
func shell(_ command: String) -> String {
    let task = Process()
    let pipe = Pipe()

    task.executableURL = URL(fileURLWithPath: "/bin/bash")
    task.arguments = ["-c", command]
    task.standardOutput = pipe

    // `run()` throws on launch failure, so the problem can be reported to the caller.
    do {
        try task.run()
    } catch {
        return "shell: failed to launch /bin/bash: \(error)\n"
    }

    // Drain the pipe before waiting so a chatty child cannot block on a full pipe buffer.
    let data = pipe.fileHandleForReading.readDataToEndOfFile()
    task.waitUntilExit()

    // Lossy decoding never fails, unlike `String(data:encoding:)!`.
    return String(decoding: data, as: UTF8.self)
}

/// Executes `command` via `shell(_:)` and echoes the captured output.
func sh(_ command: String) {
    let output = shell(command)
    print(output)
}

// sh("""
// export PATH="$PATH:/opt/bin:/swift/toolchain/usr/bin"
// export LD_LIBRARY_PATH="/usr/lib64-nvidia:$LD_LIBRARY_PATH"
// nvidia-smi
// """)

## Download data

In [5]:
// Dataset variant to work with; `.mini` is the one selected here.
let datasetSize: DatasetSize = .mini
// Base name of the dataset files. Other cells append the extension directly ("tgz", "plist"),
// so `rawValue` presumably ends with "." — TODO confirm against `DatasetSize`.
let dataset_name = "motion_dataset_v3.10Hz.\(datasetSize.rawValue)"

In [6]:
// sh("mkdir -p /content/data/motion_images/")
// sh("""
// cd /content/data/
// wget -nv --show-progress -N https://github.com/wojtekcz/language2motion/releases/download/v0.3.0/\(dataset_name)tgz
// wget -nv -N https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/labels_ds_v2.csv
// wget -nv -N https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/vocab.txt
// tar xzvf \(dataset_name)tgz --skip-old-files
// """)

## Set training params

In [55]:
// Name of this run; used below to build the summary-writer log directory.
let runName = "run_1"
// Batch size handed to the dataset loader.
let batchSize = 1
// let batchSize = 150
// Length limits passed to the text processor and to the sampling/plotting helpers.
let maxTextSequenceLength =  20
let maxMotionLength =  100
// Number of epochs taken from `dataset.trainingEpochs` by the training loop.
let nEpochs = 5
// Learning rate for the Adam optimizer.
let learningRate: Float = 5e-4

// Echo the configuration so it is recorded in the notebook output.
print("runName: \(runName)")
print("batchSize: \(batchSize)")
print("maxTextSequenceLength: \(maxTextSequenceLength)")
print("maxMotionLength: \(maxMotionLength)")
print("nEpochs: \(nEpochs)")
print("learningRate: \(learningRate)")

// Data directory: the commented-out path is the Colab one, the active path is the local checkout.
// let dataURL = URL(fileURLWithPath: "/content/data/")
let dataURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/")
// Motion dataset (plist) and language labels (csv) inside the data directory.
let motionDatasetURL = dataURL.appendingPathComponent("\(dataset_name)plist")
let langDatasetURL = dataURL.appendingPathComponent("labels_ds_v2.csv")

runName: run_1
batchSize: 1
maxTextSequenceLength: 20
maxMotionLength: 100
nEpochs: 5
learningRate: 0.0005


## Select eager or X10 backend

In [8]:
// Backend selection: swap the two lines below to switch between the X10 (XLA) and eager devices.
// let device = Device.defaultXLA
let device = Device.defaultTFEager
// Print the chosen device so the notebook output records it.
print(device)

Device(kind: .CPU, ordinal: 0, backend: .TF_EAGER)


## X10 warm-up

In [9]:
// let eagerTensor1 = Tensor([0.0, 1.0, 2.0])
// let eagerTensor2 = Tensor([1.5, 2.5, 3.5])
// let eagerTensorSum = eagerTensor1 + eagerTensor2
// print(eagerTensorSum)
// print(eagerTensor1.device)
// let x10Tensor2 = Tensor([1.5, 2.5, 3.5], on: Device.defaultXLA)
// print(x10Tensor2.device)

## Instantiate model

In [56]:
/// Instantiate the text processor.
// The vocabulary is loaded with plain `try`, like the dataset-loading cell, so a missing or
// malformed vocab file surfaces as a thrown error instead of trapping.
let vocabularyURL = dataURL.appendingPathComponent("vocab.txt")
let vocabulary: Vocabulary = try Vocabulary(fromFile: vocabularyURL)
// Case-insensitive BERT tokenizer with no limit on the length of a single token.
let tokenizer: Tokenizer = BERTTokenizer(vocabulary: vocabulary, caseSensitive: false, unknownToken: "[UNK]", maxTokenLength: nil)
// Converts dataset examples into `LangMotionBatch` values using the length limits set earlier.
let textProcessor = TextProcessor2(vocabulary: vocabulary, tokenizer: tokenizer, maxTextSequenceLength: maxTextSequenceLength, maxMotionLength: maxMotionLength)

/// Model hyperparameters.
let vocabSize = vocabulary.count
let nbJoints = 47 // TODO: get value from dataset
let layerCount = 6
let modelSize = 256
let feedForwardSize = 1024
let headCount = 8
let dropoutProbability = 0.1
let nbMixtures = 20

/// Transformer that maps text tokens to motion features.
var transformer = LangMotionTransformer(
    vocabSize: vocabSize,
    nbJoints: nbJoints,
    layerCount: layerCount,
    modelSize: modelSize,
    feedForwardSize: feedForwardSize,
    headCount: headCount,
    dropoutProbability: dropoutProbability
)

// TODO: integrate MotionGaussianMixtureModel with Generator
/// Mixture-model head; it is created after the transformer, as in the original cell.
var mixtureModel = MotionGaussianMixtureModel(
    inputSize: nbJoints,
    nbJoints: nbJoints,
    nbMixtures: nbMixtures
)
// mixtureModel.move(to: device)

/// Combined model, moved to the selected device as a whole.
var model = LangMotionModel(transformer: transformer, mixtureModel: mixtureModel)
model.move(to: device)

## Load dataset

In [57]:
print("\nLoading dataset...")

// Every raw example is turned into a single-example batch by the text processor.
var dataset = try Lang2Motion(
    motionDatasetURL: motionDatasetURL,
    langDatasetURL: langDatasetURL,
    batchSize: batchSize
) { (example: Lang2Motion.Example) -> LangMotionBatch in
    textProcessor.preprocess(example: example)
}

print("Dataset acquired.")


Loading dataset...
MotionDataset2(motionSamples: 1030)
keeping 834 annotatated motions
keeping 834 longer motions, with minimum 10 frames
Scaling motions...
Motions scaled.
Dataset acquired.


## Test model with one batch

In [12]:
/// Prints the type of `batch` plus the shapes (and a few values) of its source and target fields.
func printBatch(_ batch: LangMotionBatch) {
    let summaryLines: [String] = [
        "type: \(type(of:batch))",
        "sampleID: shape \(batch.sampleID.shape), value \(batch.sampleID)",

        // Text side of the batch.
        "source",
        "  tokenIds.shape: \(batch.tokenIds.shape)",
        "  mask.shape: \(batch.mask.shape)",
        "  tokenCount: shape \(batch.tokenCount.shape), value \(batch.tokenCount)",

        // Motion side of the batch.
        "target",
        "  targetMotionFrames.shape: \(batch.targetMotionFrames.shape)",
        "  targetMask.shape: \(batch.targetMask.shape)",
        "  targetTruth.shape: \(batch.targetTruth.shape)",
        "  origMotionFramesCount: shape \(batch.origMotionFramesCount.shape), value \(batch.origMotionFramesCount)",
    ]
    for line in summaryLines {
        print(line)
    }
}

In [13]:
/// one example to single batch
// print("\nSingle batch")
// print("============")
// let example = dataset.trainExamples[0]
// print("example.sentence: \"\(example.sentence)\"")

// let singleBatch = textProcessor.preprocess(example: example)
// printBatch(singleBatch)

In [14]:
/// get a batch
// print("\nOne batch:")
// print("=========")
// var epochIterator = dataset.trainingEpochs.enumerated().makeIterator()
// let epoch = epochIterator.next()
// let batches = Array(epoch!.1)
// let batch: LangMotionBatch = batches[0]
// printBatch(batch)

In [15]:
/// run one batch
// print("\nRun one batch:")
// print("==============")
// let deviceBatch = LangMotionBatch(copying: batch, to: device)
// let batch_generated = model.generate(input: deviceBatch)
// print("batch_generated.shape: \(batch_generated.shape)")

## Set up decoding

In [16]:
/// Generates a motion for `sentence` with the global `model` and saves it as a PNG image.
///
/// The sentence is paired with a placeholder motion taken from the first training example,
/// passed through `model.generate`, sampled with `performNormalMixtureSampling`, descaled with
/// `dataset.scaler` and handed to `motionToImg`, targeting `motion_images/<prefix>.png`
/// under `dataURL`.
///
/// The learning phase is switched to `.inference` for the duration of the call and restored
/// to its previous value on return.
///
/// - Parameters:
///   - sentence: Text description to decode.
///   - prefix: File-name stem of the output image; also used in the image description.
public func greedyDecodeMotion(sentence: String, prefix: String = "prefix") {
    // FIXME: for generation don't supply motion in a batch, maybe neutral motion frame only
    guard let placeholderExample = dataset.trainExamples.first else {
        // Nothing to borrow a motion sample from; bail out instead of trapping on index 0.
        print("greedyDecodeMotion: no training examples available, skipping decode.")
        return
    }
    let randomMotionSample = placeholderExample.motionSample
    let example = Lang2Motion.Example(sampleID: -1, sentence: sentence, motionSample: randomMotionSample)
    print("sentence: \"\(sentence)\"")

    let singleBatch = textProcessor.preprocess(example: example)
    printBatch(singleBatch)

    print("\nDecode single batch:")
    print("====================")
    // Decode in inference mode, but hand the caller's learning phase back on every exit path.
    let previousLearningPhase = Context.local.learningPhase
    Context.local.learningPhase = .inference
    defer { Context.local.learningPhase = previousLearningPhase }

    let single_generated = model.generate(input: LangMotionBatch(copying: singleBatch, to: device)).squeezingShape(at: 0)
    print("generated.shape: \(single_generated.shape)")

    let (motion, log_probs, done) = performNormalMixtureSampling(
        preds: single_generated, nb_joints: nbJoints, nb_mixtures: nbMixtures, maxMotionLength: maxMotionLength)

    let descaled_motion = dataset.scaler.inverse_transform(motion)

    print("motion.shape: \(motion.shape)")
    print("log_probs.count: \(log_probs.count)")
    print("done.shape: \(done.shape)")
    print("done: \(done)")
    // print("log_probs: \(log_probs)")
    // print("descaled_motion: \(descaled_motion)")

    let imageURL = dataURL.appendingPathComponent("motion_images/\(prefix).png")
    // The output directory is not created anywhere else in this notebook, so make sure it exists.
    let imageDirectoryURL = imageURL.deletingLastPathComponent()
    do {
        try FileManager.default.createDirectory(at: imageDirectoryURL, withIntermediateDirectories: true)
    } catch {
        print("Warning: could not create \(imageDirectoryURL.path): \(error)")
    }
    motionToImg(url: imageURL, motion: descaled_motion, motionFlag: done, padTo: maxMotionLength, descr: "\(prefix), \(example.sentence)")
    print("Saved image: \(imageURL.path)")
}

## Optimizer

In [49]:
// Adam optimizer for `model`, using the learning rate from the training-params cell.
var optimizer = Adam(for: model, learningRate: learningRate)
// Replace it with a copy placed on the selected device.
optimizer = Adam(copying: optimizer, to: device)

// Summaries for this run are written under tboard/Lang2motion/<runName> inside the data directory.
let logdirURL = dataURL.appendingPathComponent("tboard/Lang2motion/\(runName)", isDirectory: true)
// flushMillis is 30*1000 — presumably a 30-second flush interval; confirm against SummaryWriter.
let summaryWriter = SummaryWriter(logdir: logdirURL, flushMillis: 30*1000)

## Training helpers

In [50]:
// Loss configuration shared by `update`, `validate` and the debugging cells.
// The mixture regularizer is switched off here (type "None", weight 0.0).
let args = LossArgs(
        nb_joints: nbJoints,
        nb_mixtures: nbMixtures,
        mixture_regularizer_type: "None",  // ["cv", "l2", "None"]
        mixture_regularizer: 0.0
)

/// Runs one optimization step on `batch` in training mode.
///
/// - Parameters:
///   - model: Model to differentiate and update in place.
///   - optimizer: Optimizer that applies the gradient.
///   - batch: Batch providing the model input and `targetTruth`.
/// - Returns: The surrogate loss summed over all entries and divided by the product of its
///   first two dimensions, as a scalar.
func update(model: inout LangMotionModel, using optimizer: inout Adam<LangMotionModel>, for batch: LangMotionBatch) -> Float {
    let target = batch.targetTruth
    return withLearningPhase(.training) { () -> Float in
        let (meanLoss, gradient) = valueWithGradient(at: model) { candidate -> Tensor<Float> in
            let prediction = candidate.generate(input: batch)
            let perItemLoss = normalMixtureSurrogateLoss(y_true: target, y_pred: prediction, args: args)
            let itemCount: Float = Float(perItemLoss.shape[0] * perItemLoss.shape[1])
            // NaN-masking experiment, kept for reference:
            // let ones = Tensor<Float>(ones: loss.shape)
            // let nans = loss.isNaN
            // let loss_notNaN = loss.replacing(with:ones, where:nans)
            // let avg_loss = loss_notNaN.sum() / n_items
            return perItemLoss.sum() / itemCount
        }
        optimizer.update(&model, along: gradient)
        LazyTensorBarrier()
        return meanLoss.scalarized()
    }
}

/// Evaluates `model` on `batch` in inference mode, without touching the weights.
///
/// - Returns: The validation loss: the surrogate loss summed over all entries and divided by
///   the product of its first two dimensions.
func validate(model: inout LangMotionModel, for batch: LangMotionBatch) -> Float {
    let target = batch.targetTruth
    let meanLoss = withLearningPhase(.inference) { () -> Float in
        let prediction = model.generate(input: batch)
        let perItemLoss = normalMixtureSurrogateLoss(y_true: target, y_pred: prediction, args: args)
        let itemCount: Float = Float(perItemLoss.shape[0] * perItemLoss.shape[1])
        return (perItemLoss.sum() / itemCount).scalarized()
    }
    LazyTensorBarrier()
    return meanLoss
}

## Debug surrogate loss nans

In [29]:
extension Bool {
    /// `1` for `true`, `0` for `false`.
    var intValue: Int {
        if self {
            return 1
        }
        return 0
    }
}

In [58]:
/// get a batch
print("\nOne batch:")
print("=========")
// Take the first (index, epoch) pair from the training-epochs sequence.
var epochIterator = dataset.trainingEpochs.enumerated().makeIterator()
let epoch = epochIterator.next()
// Materialize that epoch's batches into an array for indexing below.
// NOTE(review): the force-unwrap traps if `trainingEpochs` yields nothing.
let batches = Array(epoch!.1)

In [61]:
batches.count

668


In [62]:
// Pick the second batch of the epoch as the debugging sample and print its shapes.
let eagerBatch: LangMotionBatch = batches[1]
printBatch(eagerBatch)    

type: LangMotionBatch
sampleID: shape [1], value [2098]
source
  tokenIds.shape: [1, 20]
  mask.shape: [1, 1, 20]
  tokenCount: shape [1], value [9]
target
  targetMotionFrames.shape: [1, 99, 47]
  targetMask.shape: [1, 99, 99]
  targetTruth.shape: [1, 99, 47]
  origMotionFramesCount: shape [1], value [49]


In [63]:
// Copy the sample batch to the selected device and recompute the loss outside of `update`.
let batch = LangMotionBatch(copying: eagerBatch, to: device)
let y_true = batch.targetTruth
let y_pred = model.generate(input: batch)
let loss = normalMixtureSurrogateLoss(y_true: y_true, y_pred: y_pred, args: args)

// Total number of loss entries, then how many of them are NaN.
let nans: Tensor<Bool> = loss.isNaN
print(nans.scalars.count)
print(nans.scalars.filter { $0 }.count)

// Raw average versus the average with every NaN entry replaced by 1.
let n_items: Float = Float(loss.shape[0] * loss.shape[1])
let ones = Tensor<Float>(ones: loss.shape)
let loss_notNaN = loss.replacing(with: ones, where: nans)
let avg_loss_notNaN = loss_notNaN.sum() / n_items
let avg_loss = loss.sum() / n_items

print("current loss: \(avg_loss.scalarized())")
print("avg_loss_notNaN: \(avg_loss_notNaN.scalarized())")
print(loss)

99
47
current loss: nan(0x1fffff)
avg_loss_notNaN: 343.77454
[[nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff),
  nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff),
  nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff),
  nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff),
  nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff),
  nan(0x1fffff), nan(0x1fffff),      656.5909, nan(0x1fffff), nan(0x1fffff), nan(0x1fffff),
  nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff),
  nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff), nan(0x1fffff),
      678.62244,     679.30273,     665.77325,     663.08453,      640.1204,     660.07465,
       669.6528,      672.4334,      655.6682,     685.94696,      683.8687,     653.43085,
          668.1,   

In [65]:
y_pred.shape

▿ [1, 99, 1901]
  ▿ dimensions : 3 elements
    - 0 : 1
    - 1 : 99
    - 2 : 1901


In [67]:
// Count the NaN entries in the raw model output.
let y_pred_nans: Tensor<Bool> = y_pred.isNaN
print(y_pred_nans.scalars.filter { $0 }.count)

0


In [69]:
(y_pred.min(), y_pred.max())

▿ 2 elements
  - .0 : -26.429714
  - .1 : 28.42573


In [None]:
#

In [85]:
// Step-by-step recomputation of the mixture part of the surrogate loss for one batch,
// used to locate where the NaNs come from.
let batch = LangMotionBatch(copying: eagerBatch, to: device)
let y_true = batch.targetTruth
let y_pred = model.generate(input: batch)
// let loss = normalMixtureSurrogateLoss2(y_true: y_true, y_pred: y_pred, args: args)

// Small constant added to variances and inside `log` to avoid division by zero / log(0).
let TINY: Float = 1e-8
// Full-precision constant instead of the truncated literal 3.1415.
let pi: Float = Float.pi
let nb_mixtures = args.nb_mixtures
let nb_joints = args.nb_joints

// Slicing of the last axis of `y_pred`, as used below:
//   [0, J*M)            means
//   [J*M, 2*J*M)        variances
//   [2*J*M, 2*J*M + M)  mixture weights
//   last element        stop value
// with J = nb_joints and M = nb_mixtures.
let all_means = y_pred[0..., 0..., 0..<nb_joints * nb_mixtures]
let all_variances = y_pred[0..., 0..., nb_joints *
                       nb_mixtures..<2 * nb_joints * nb_mixtures] + TINY
let weights = y_pred[0..., 0..., 2 * nb_joints * nb_mixtures..<2 *
                 nb_joints * nb_mixtures + nb_mixtures]
let stop = y_pred[0..., 0..., -1]
let y_true_motion = y_true[0..., 0..., 0..<nb_joints]
// NOTE(review): `y_true` prints with a last dimension of 47 while nb_joints is 47, so index -1
// appears to pick the last joint column rather than a stop flag — confirm against the data pipeline.
let y_true_stop = y_true[0..., 0..., -1]

// Accumulates, per (batch, frame), the weighted log-density over all mixtures.
var log_mixture_pdf: Tensor<Float> = Tensor<Float>(zeros: [weights.shape[0], weights.shape[1]]) 
for mixture_idx in 0..<nb_mixtures {
    let start_idx = mixture_idx * nb_joints
    let means = all_means[0..., 0..., start_idx..<start_idx + nb_joints]
    let variances = all_variances[0..., 0..., start_idx..<start_idx + nb_joints]
    let diff = y_true_motion - means
    // Gaussian density per joint: 1 / sqrt(2*pi*var) * exp(-diff^2 / (2*var)).
    let pdf1 = 1.0 / sqrt(variances * 2.0 * pi)
    let pdf2a = diff.squared()
    let pdf2 = exp(-(pdf2a) / (2.0 * variances))
    let pdf = pdf1 * pdf2
    // Sum the per-joint log-densities over the joint axis and scale by this mixture's weight.
    let weighted_pdf = weights[0..., 0..., mixture_idx] * 
        log(pdf + TINY).sum(alongAxes:2).squeezingShape(at: 2)
    log_mixture_pdf = log_mixture_pdf + weighted_pdf
}

In [86]:
// Complements (1 - x) of the true and predicted stop values, used for the Bernoulli term below.
let b_pdf1 = Float(1.0) - y_true_stop
let b_pdf2 = Float(1.0) - stop

In [91]:
y_true.shape

▿ [1, 99, 47]
  ▿ dimensions : 3 elements
    - 0 : 1
    - 1 : 99
    - 2 : 47


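**y_true_stop is wrong!** `y_true` has shape `[1, 99, 47]`, i.e. exactly `nb_joints` columns, so `y_true[0..., 0..., -1]` selects the last joint value rather than a stop flag. The resulting negative `bernoulli_pdf` entries make `log(bernoulli_pdf + TINY)` NaN.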

In [92]:
stop

[[0.99999666,        1.0,        1.0, 0.99999917,        1.0,  0.9999975,        1.0,        1.0,
         1.0, 0.99999905,        1.0, 0.99999475, 0.99999464, 0.99993336, 0.99999785,  0.9999944,
   0.9999958, 0.99998975,  0.9999926,  0.9999951, 0.99979967, 0.99996066,  0.9999993, 0.99999106,
    0.996846,  0.9993149,  0.9953934, 0.99829835,   0.992919, 0.99993384,  0.9990939, 0.98803073,
  0.99945265,  0.9999987, 0.99999905,  0.9999789,  0.9999734,  0.9996495,  0.9999944, 0.99999225,
   0.9997938,  0.9994863,   0.998094,  0.9999795,  0.9993005, 0.99979454,    0.99984,  0.9997893,
   0.9972362, 0.99997413,  0.9999975,  0.9999994,        1.0, 0.99998724,        1.0, 0.99999964,
         1.0,  0.9999968,        1.0,        1.0,        1.0, 0.99999964,  0.9999949, 0.99999917,
  0.99998176,  0.9990533,  0.9999901,  0.9999995, 0.99999976,        1.0,  0.9999852,  0.9999912,
  0.99989223,  0.9999993,  0.9999993,  0.9999641,  0.9999989,  0.9999987,   0.999995, 0.99999046,
   0.9999994,       

In [88]:
y_true_stop

[[  -0.715116,  -0.7146881,    -0.72092,   -0.715113, -0.71725553,  -0.7217584, -0.70832956,
  -0.69332194,  -0.6852078,  -0.6752627,  -0.6556557,   -0.627033, -0.61432016,  -0.6184848,
  -0.62803423, -0.65271044,  -0.6736506,  -0.7001925,  -0.7042824, -0.70342845, -0.69615895,
  -0.67181414, -0.66043687,  -0.6760727,  -0.7107362,   -0.731925, -0.73600185,  -0.7578355,
  -0.76840144,   -0.771513,  -0.7752077,  -0.7857186, -0.79690474,  -0.8012518,  -0.8003428,
   -0.8037902,  -0.8257884,  -0.8682184, -0.92883575,    -0.96354,  -0.9806364,  -0.9590457,
  -0.94773036,  -0.9456557, -0.92478853,  -0.8717901,  -0.7746654,  -0.7614319,         0.0,
          0.0,         0.0,         0.0,         0.0,         0.0,         0.0,         0.0,
          0.0,         0.0,         0.0,         0.0,         0.0,         0.0,         0.0,
          0.0,         0.0,         0.0,         0.0,         0.0,         0.0,         0.0,
          0.0,         0.0,         0.0,         0.0,         0.0,    

In [89]:
stop

[[0.99999666,        1.0,        1.0, 0.99999917,        1.0,  0.9999975,        1.0,        1.0,
         1.0, 0.99999905,        1.0, 0.99999475, 0.99999464, 0.99993336, 0.99999785,  0.9999944,
   0.9999958, 0.99998975,  0.9999926,  0.9999951, 0.99979967, 0.99996066,  0.9999993, 0.99999106,
    0.996846,  0.9993149,  0.9953934, 0.99829835,   0.992919, 0.99993384,  0.9990939, 0.98803073,
  0.99945265,  0.9999987, 0.99999905,  0.9999789,  0.9999734,  0.9996495,  0.9999944, 0.99999225,
   0.9997938,  0.9994863,   0.998094,  0.9999795,  0.9993005, 0.99979454,    0.99984,  0.9997893,
   0.9972362, 0.99997413,  0.9999975,  0.9999994,        1.0, 0.99998724,        1.0, 0.99999964,
         1.0,  0.9999968,        1.0,        1.0,        1.0, 0.99999964,  0.9999949, 0.99999917,
  0.99998176,  0.9990533,  0.9999901,  0.9999995, 0.99999976,        1.0,  0.9999852,  0.9999912,
  0.99989223,  0.9999993,  0.9999993,  0.9999641,  0.9999989,  0.9999987,   0.999995, 0.99999046,
   0.9999994,       

In [87]:
// Bernoulli likelihood of the stop value: t * p + (1 - t) * (1 - p), with t = y_true_stop, p = stop.
// NOTE(review): the printed values are negative for the leading frames, so the commented-out
// log below would be taken of negative numbers there.
let bernoulli_pdf = y_true_stop * stop + b_pdf1 * b_pdf2
print("bernoulli_pdf: \(bernoulli_pdf)")
// let log_bernoulli_pdf = log(bernoulli_pdf + TINY)
// print(log_bernoulli_pdf)

bernoulli_pdf: [[   -0.7151079,    -0.7146881,      -0.72092,   -0.71511096,   -0.71725553,   -0.72175235,
    -0.70832956,   -0.69332194,    -0.6852078,    -0.6752604,    -0.6556557,    -0.6270212,
    -0.61430824,    -0.6183358,   -0.62802935,   -0.65269756,   -0.67364085,   -0.70016795,
    -0.70426464,   -0.70341665,   -0.69567966,     -0.671722,    -0.6604352,   -0.67605174,
     -0.7030989,     -0.730237,   -0.72461426,   -0.75355476,   -0.75043845,    -0.7713448,
     -0.7728967,    -0.7549404,     -0.795485,    -0.8012484,    -0.8003403,   -0.80373514,
     -0.8257179,    -0.8672594,    -0.9288198,   -0.96351737,    -0.9800259,    -0.9575468,
     -0.9422116,   -0.94559646,   -0.92279524,   -0.87122643,    -0.7742576,   -0.76090026,
   0.0027638078, 2.5868416e-05,  2.503395e-06, 5.9604645e-07,           0.0, 1.2755394e-05,
            0.0, 3.5762787e-07,           0.0, 3.2186508e-06,           0.0,           0.0,
            0.0, 3.5762787e-07, 5.1259995e-06,   8.34465e-07, 1.8

In [82]:
// var mixture_reg: Float = 0.0
// if args.mixture_regularizer_type == "cv" {
//     // We want to use (std / mean)^2 = std^2 / mean^2 = var / mean^2.
//     mixture_reg = weights.variance().scalarized() / 
//         weights.mean().squared().scalarized()
// } else if args.mixture_regularizer_type == "l2" {
//     mixture_reg = weights.squared().sum().scalarized()
// } else {
//     mixture_reg = 0.0
// }

// print(log_mixture_pdf)

// let loss = -(log_mixture_pdf + log_bernoulli_pdf) +
//     args.mixture_regularizer * mixture_reg

// print("loss[\(loss.shape)]: \(loss)")

[[-nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),
  -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),
  -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),
  -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),
  -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),
  -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),
  -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),
  -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),
  -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),
  -nan(0x1fffff), -nan(0x1fffff), -nan(0x1fffff),     -10.571357,     -18.420681,
      -18.420681,     -10.684453,     -14.136742,      -9.608956,     -13.852512,
      -14.816195,     -14.816195,     -18.420681,     -12.302594,     -14.816195,
      -18.420681

In [75]:
// Total number of loss entries, then how many of them are NaN.
let nans: Tensor<Bool> = loss.isNaN
print(nans.scalars.count)
print(nans.scalars.filter { $0 }.count)

// Raw average versus the average with every NaN entry replaced by 1.
let n_items: Float = Float(loss.shape[0] * loss.shape[1])
let ones = Tensor<Float>(ones: loss.shape)
let loss_notNaN = loss.replacing(with: ones, where: nans)
let avg_loss_notNaN = loss_notNaN.sum() / n_items
let avg_loss = loss.sum() / n_items

print("current loss: \(avg_loss.scalarized())")
print("avg_loss_notNaN: \(avg_loss_notNaN.scalarized())")

99
48
current loss: nan(0x1fffff)
avg_loss_notNaN: 337.681


## Training loop

In [None]:
// let nEpochs = 2

In [14]:
print("\nTraining Transformer for the Lang2motion task!")
var trainingStepCount = 0
let print_every = 10
time() {
    LazyTensorBarrier()
    for (epoch, epochBatches) in dataset.trainingEpochs.prefix(nEpochs).enumerated() {
        let epochNumber = epoch + 1
        let isFirstEpoch = epoch == 0
        print("[Epoch \(epochNumber)]")

        // Training pass.
        Context.local.learningPhase = .training
        var trainingLossSum: Float = 0
        var trainingBatchCount = 0
        if isFirstEpoch {
            print("epochBatches.count: \(epochBatches.count)")
        }

        for eagerBatch in epochBatches {
            // Log the first five steps and then every `print_every`-th step.
            let shouldLog = trainingStepCount < 5 || trainingStepCount % print_every == 0
            if shouldLog {
                print("==> step \(trainingStepCount)")
            }
            let batch = LangMotionBatch(copying: eagerBatch, to: device)
            let loss: Float = update(model: &model, using: &optimizer, for: batch)
            if shouldLog {
                print("current loss at step \(trainingStepCount): \(loss)")
            }
            trainingLossSum += loss
            trainingBatchCount += 1
            // The per-step summary is the running mean of the epoch so far.
            let runningMeanLoss = trainingLossSum / Float(trainingBatchCount)
            summaryWriter.writeScalarSummary(tag: "TrainingLoss", step: trainingStepCount, value: runningMeanLoss)
            trainingStepCount += 1
        }

        let meanTrainingLoss = trainingLossSum / Float(trainingBatchCount)
        print("Training loss: \(meanTrainingLoss)")
        summaryWriter.writeScalarSummary(tag: "EpochTrainingLoss", step: epochNumber, value: meanTrainingLoss)

        // Validation pass.
        if isFirstEpoch {
            print("dataset.validationBatches.count: \(dataset.validationBatches.count)")
        }
        Context.local.learningPhase = .inference
        var validationLossSum: Float = 0
        var validationBatchCount = 0
        var totalGuessCount = 0

        for eagerBatch in dataset.validationBatches {
            let batch = LangMotionBatch(copying: eagerBatch, to: device)
            let loss: Float = validate(model: &model, for: batch)

            validationLossSum += loss
            validationBatchCount += 1
            totalGuessCount += batch.targetMotionFrames.shape[0]
        }

        let meanValidationLoss = validationLossSum / Float(validationBatchCount)
        print("totalGuessCount: \(totalGuessCount) Eval loss: \(meanValidationLoss)")
        summaryWriter.writeScalarSummary(tag: "EpochTestLoss", step: epochNumber, value: meanValidationLoss)

        // Render a sample decode after every epoch.
        greedyDecodeMotion(sentence: "human is walking", prefix: "epoch_\(epochNumber)")
    }
    summaryWriter.flush()
}

print("\nFinished training.")


Training Transformer for the Lang2motion task!
[Epoch 1]
epochBatches.count: 161
==> step 0
current loss at step 0: nan(0x1fffff)
==> step 1


: 

## Generate motion

In [None]:
// TODO: show motion inline
// Decode one sentence with the current model; the image file name is built from the "foo9" prefix.
greedyDecodeMotion(sentence: "human is walking", prefix: "foo9")