# Train Transformer for the Motion2lang task

## Get sources

In [1]:
// for colab
%install-location $cwd/swift-install
%install-swiftpm-flags -c release
%install '.package(url: "https://github.com/wojtekcz/language2motion.git", .branch("colab-tpu-error"))' Datasets TranslationModels TextModels ModelSupport SummaryWriter MotionModels

Installing packages:
	.package(url: "https://github.com/wojtekcz/language2motion.git", .branch("colab-tpu-error"))
		Datasets
		TranslationModels
		TextModels
		ModelSupport
		SummaryWriter
		MotionModels
With SwiftPM flags: ['-c', 'release']
Working in: /tmp/tmp5l0emmpu/swift-install
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
Initializing Swift...
Installation complete!


In [2]:
import TensorFlow
import TextModels
import TranslationModels
import Foundation
import ModelSupport
import Datasets
import SummaryWriter
import MotionModels

## Download data

In [3]:
// Base name of the dataset: it is interpolated into the download URL / archive name
// ("<dataset_name>.tgz") and into the plist path ("<dataset_name>.plist") in later cells.
// The commented-out line selects the variant without the ".mini" suffix.
// let dataset_name = "motion_dataset_v3.norm.10Hz"
let dataset_name = "motion_dataset_v3.norm.10Hz.mini"

In [4]:
import Foundation

/// Runs `command` through `/bin/bash -c` and returns everything it wrote to standard output.
///
/// Only standard output is captured; this function does not redirect standard error.
/// Bytes that are not valid UTF-8 are replaced with U+FFFD instead of crashing.
///
/// - Parameter command: shell script text passed verbatim to `bash -c`.
/// - Returns: the captured standard output (empty string when nothing was written).
func shell(_ command: String) -> String {
    let task = Process()
    let pipe = Pipe()

    task.standardOutput = pipe
    task.arguments = ["-c", command]
    task.launchPath = "/bin/bash"
    task.launch()

    // Drain the pipe first, then wait, so the child has fully terminated
    // (and been reaped) before the function returns.
    let data = pipe.fileHandleForReading.readDataToEndOfFile()
    task.waitUntilExit()

    // Lossy decoding: tool output (e.g. progress bars, file names) is not guaranteed
    // to be valid UTF-8, and a force-unwrap here would abort the whole notebook kernel.
    return String(decoding: data, as: UTF8.self)
}

/// Convenience wrapper for notebook cells: runs `command` via `shell(_:)`
/// and prints whatever that call returned.
func sh(_ command: String) {
    let captured = shell(command)
    print(captured)
}

In [7]:
// Create the data directory, then (in a single bash invocation) download the dataset
// archive, the labels CSV and the vocabulary file into it and extract the archive there.
// `dataset_name` is interpolated into both the download URL and the tar command.
sh("mkdir -p /content/data/")
sh("""
cd /content/data/
wget -nv --show-progress -N https://github.com/wojtekcz/language2motion/releases/download/v0.2.0/\(dataset_name).tgz
wget -nv -N https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/labels_ds_v2.csv
wget -nv -N https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/vocab.txt
tar xzvf \(dataset_name).tgz --skip-old-files
""")


2020-07-15 14:49:51 URL:https://github-production-release-asset-2e65be.s3.amazonaws.com/258798747/24bb8980-c505-11ea-86e7-b8b6236db096?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200715%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200715T144950Z&X-Amz-Expires=300&X-Amz-Signature=d9677b7e7f162127efcd90b99cabda5764e53c3e58fc2d4b7c7cf5f66ddd24ac&X-Amz-SignedHeaders=host&actor_id=0&repo_id=258798747&response-content-disposition=attachment%3B%20filename%3Dmotion_dataset_v3.norm.10Hz.mini.tgz&response-content-type=application%2Foctet-stream [80600141/80600141] -> "motion_dataset_v3.norm.10Hz.mini.tgz" [1]
2020-07-15 14:49:52 URL:https://github-production-release-asset-2e65be.s3.amazonaws.com/258798747/16bcbb80-95dd-11ea-8fed-886381d8bee7?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200715%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200715T144952Z&X-Amz-Expires=300&X-Amz-Signature=114a3ff7536abe3789375c1be23b98bcc6eb3454bd69f238cb

## Set training params

In [5]:
// Hyperparameters and paths shared by the cells below.
// runName is used later as the last component of the summary-writer log directory.
let runName = "run_1"
// NOTE(review): the sample batch printed later has 120 rows of 50 frames, so batchSize
// looks like a per-batch frame budget rather than an example count — confirm in Motion2Lang.
let batchSize = 6000
// let batchSize = 3000
// Passed to both the TextProcessor and the Motion2Lang dataset.
let maxSequenceLength =  50
// Number of epochs taken from dataset.trainingEpochs in the training loop.
let nEpochs = 10
// Learning rate handed to the Adam optimizer.
let learningRate: Float = 5e-4

// Echo the configuration so it is recorded in the notebook output.
print("runName: \(runName)")
print("batchSize: \(batchSize)")
print("maxSequenceLength: \(maxSequenceLength)")
print("nEpochs: \(nEpochs)")
print("learningRate: \(learningRate)")

// Locations of the files downloaded into /content/data/ by the download cell.
let dataURL = URL(fileURLWithPath: "/content/data/")
let motionDatasetURL = dataURL.appendingPathComponent("\(dataset_name).plist")
let langDatasetURL = dataURL.appendingPathComponent("labels_ds_v2.csv")

runName: run_1
batchSize: 6000
maxSequenceLength: 50
nEpochs: 10
learningRate: 0.0005


## Select eager or X10 backend

In [6]:
// Device that the model, the optimizer and every training/validation batch are moved to
// in the cells below. Swap the commented line in to use the eager backend instead of XLA.
let device = Device.defaultXLA
// let device = Device.defaultTFEager
print(device)

Device(kind: .TPU, ordinal: 0, backend: .XLA)


## X10 warm-up

In [10]:
// let eagerTensor1 = Tensor([0.0, 1.0, 2.0])
// let eagerTensor2 = Tensor([1.5, 2.5, 3.5])
// let eagerTensorSum = eagerTensor1 + eagerTensor2
// print(eagerTensorSum)
// print(eagerTensor1.device)
// let x10Tensor2 = Tensor([1.5, 2.5, 3.5], on: Device.defaultXLA)
// print(x10Tensor2.device)

## Instantiate model

In [7]:
// Text processing pipeline: vocabulary file -> BERT tokenizer -> TextProcessor.
let vocabularyURL = dataURL.appendingPathComponent("vocab.txt")
// Plain `try` (as the dataset-loading cell already does) so that a missing or unreadable
// vocab.txt is reported as an error for this cell instead of aborting the process via `try!`.
let vocabulary: Vocabulary = try Vocabulary(fromFile: vocabularyURL)
let tokenizer: Tokenizer = BERTTokenizer(vocabulary: vocabulary, caseSensitive: false, unknownToken: "[UNK]", maxTokenLength: nil)
let textProcessor = TextProcessor(vocabulary: vocabulary, tokenizer: tokenizer, maxSequenceLength: maxSequenceLength)

// Model hyperparameters.
// Source and target vocabulary sizes are both taken from the same vocabulary.
let sourceVocabSize = vocabulary.count
let inputSize = 48 // TODO: get value from dataset
let targetVocabSize = vocabulary.count
let layerCount: Int = 6
let modelSize: Int = 256
let feedForwardSize: Int = 1024
let headCount: Int = 8
let dropoutProbability: Double = 0.1

// `var` because the model is moved between devices and updated in place by the optimizer.
var model = MotionLangTransformer(
    sourceVocabSize: sourceVocabSize,
    inputSize: inputSize,
    targetVocabSize: targetVocabSize,
    layerCount: layerCount,
    modelSize: modelSize,
    feedForwardSize: feedForwardSize,
    headCount: headCount,
    dropoutProbability: dropoutProbability
)

// Place the model on the device selected in the backend cell.
model.move(to: device)

## Load dataset

In [8]:
// Build the Motion2Lang dataset from the motion plist and the labels CSV.
print("\nLoading dataset...")

// The trailing closure turns each Motion2Lang.Example into a MotionLangBatch
// by running it through the text processor created in the model cell.
// `try` at top level: a loading failure surfaces as an error for this cell.
var dataset = try Motion2Lang(
    motionDatasetURL: motionDatasetURL,
    langDatasetURL: langDatasetURL,
    maxSequenceLength: maxSequenceLength,
    batchSize: batchSize
) { (example: Motion2Lang.Example) -> MotionLangBatch in    
    let singleBatch = textProcessor.preprocess(example: example)
    return singleBatch
}

print("Dataset acquired.")


Loading dataset...
MotionDataset(motionSamples: 4860)
keeping 3740 annotatated motions
keeping 3718 longer motions, with minimum 10 frames
Dataset acquired.


## Test model with one batch

In [9]:
// get a batch
// Takes the first epoch from dataset.trainingEpochs, materializes its batches into an
// array and prints the shapes of the first batch's tensors as a sanity check.
print("\nOne batch (MotionLangBatch):")
var epochIterator = dataset.trainingEpochs.enumerated().makeIterator()
let epoch = epochIterator.next()
// `epoch!` traps if the iterator yielded nothing; `.1` is the batch sequence of the
// (index, batches) pair produced by enumerated().
let batches = Array(epoch!.1)
// `batch` is reused by the "run one batch" cell below.
let batch: MotionLangBatch = batches[0]
print("type: \(type(of:batch))")
print("motionFrames.shape: \(batch.motionFrames.shape)")
// print("motionFlag.shape: \(batch.motionFlag.shape)")
print("mask.shape: \(batch.mask.shape)")
print("origMotionFramesCount.shape: \(batch.origMotionFramesCount.shape)")
print("origMotionFramesCount: \(batch.origMotionFramesCount)")
print("targetTokenIds.shape: \(batch.targetTokenIds.shape)")
print("targetMask.shape: \(batch.targetMask.shape)")
print("targetTruth.shape: \(batch.targetTruth.shape)")


One batch (MotionLangBatch):
type: MotionLangBatch
motionFrames.shape: [120, 50, 48]
mask.shape: [120, 1, 50]
origMotionFramesCount.shape: [120]
origMotionFramesCount: [ 37,  83,  58,  49,  46,  51,  97,  50,  38,  48,  54,  60,  99,  55,  58,  65,  53,  11,  36,  77,  52,  52,  27,  68,  60,  47,
  61,  48, 103,  47,  59,  50,  51,  58,  49, 272, 258,  40,  40,  46,  76,  10,  38,  65, 104,  70,  65, 189,  73,  47,  68,  72,
  78, 176,  53,  48,  69,  58,  40,  56, 129,  41,  95,  47,  59,  53,  48,  36,  95, 126, 109,  68,  50,  40, 126,  34,  83,  90,
  44,  48,  50,  60,  54, 153, 209,  52,  53,  85,  58, 159,  67,  66,  83,  41, 117,  60, 119,  10,  53,  59,  65,  50,  58,  57,
  68,  45,  76, 203,  59,  35,  57,  90,  46,  70,  83,  55,  57,  54,  54, 265]
targetTokenIds.shape: [120, 49]
targetMask.shape: [120, 49, 49]
targetTruth.shape: [120, 49]


In [10]:
// run one batch
print("\nRun one batch:")
print("==============")
let deviceBatch = MotionLangBatch(copying: batch, to: device)
let output = model(deviceBatch)
print("output.shape: \(output.shape)")


Run one batch:
output.shape: [120, 49, 256]


## Optimizer

In [11]:
// Create the Adam optimizer for the model, then replace it with a copy placed on `device`.
var optimizer = Adam(for: model, learningRate: learningRate)
optimizer = Adam(copying: optimizer, to: device)

// Scalar summaries are written under <dataURL>/tboard/Motion2lang/<runName>.
// NOTE(review): flushMillis is presumably the flush interval in milliseconds (30 s) — confirm.
let logdirURL = dataURL.appendingPathComponent("tboard/Motion2lang/\(runName)", isDirectory: true)
let summaryWriter = SummaryWriter(logdir: logdirURL, flushMillis: 30*1000)

## Training helpers

In [12]:
/// Runs one optimization step on `model` for a single batch.
///
/// The softmax cross-entropy between `model.generate(input:)` (reshaped to one row per
/// target position) and the flattened `batch.targetTruth` is differentiated with respect
/// to the model, and the optimizer applies the resulting gradient in place.
///
/// - Parameters:
///   - model: the model to update (mutated in place).
///   - optimizer: the optimizer whose state is advanced by this step.
///   - batch: the batch to train on.
/// - Returns: the scalar loss for this batch.
func update(model: inout MotionLangTransformer, using optimizer: inout Adam<MotionLangTransformer>, for batch: MotionLangBatch) -> Float {
    let truth = batch.targetTruth
    let flatLabels = truth.reshaped(to: [-1])
    // Number of rows after flattening: first dimension times last dimension of targetTruth.
    let rowCount = truth.shape.first! * truth.shape.last!
    return withLearningPhase(.training) { () -> Float in
        let (lossValue, gradients) = valueWithGradient(at: model) { candidate -> Tensor<Float> in
            let flatLogits = candidate.generate(input: batch).reshaped(to: [rowCount, -1])
            return softmaxCrossEntropy(logits: flatLogits, labels: flatLabels)
        }
        optimizer.update(&model, along: gradients)
        // Barrier is issued after the optimizer update and before the loss is read back.
        LazyTensorBarrier()
        return lossValue.scalarized()
    }
}

/// Computes the validation loss of `model` on one batch without updating the model.
///
/// The loss is the softmax cross-entropy between `model.generate(input:)` (reshaped to
/// one row per target position) and the flattened `batch.targetTruth`, evaluated in the
/// `.inference` learning phase.
///
/// - Returns: the scalar validation loss for this batch.
func validate(model: inout MotionLangTransformer, for batch: MotionLangBatch) -> Float {
    let truth = batch.targetTruth
    let flatLabels = truth.reshaped(to: [-1])
    let rowCount = truth.shape.first! * truth.shape.last!
    let lossValue = withLearningPhase(.inference) { () -> Float in
        let flatLogits = model.generate(input: batch).reshaped(to: [rowCount, -1])
        let crossEntropy = softmaxCrossEntropy(logits: flatLogits, labels: flatLabels)
        return crossEntropy.scalarized()
    }
    LazyTensorBarrier()
    return lossValue
}

## Set up decoding

In [13]:
/// Greedy decoding: encodes `input` once, then extends a token sequence one token at a
/// time, always appending the arg-max of `model.generate` for the last decoded position.
///
/// - Parameters:
///   - model: model providing `encode`, `decode` and `generate`.
///   - input: batch whose motion fields are reused for every decoding step.
///   - maxLength: number of tokens appended after the start symbol.
///   - startSymbol: token id the sequence starts with.
/// - Returns: a `[1, maxLength + 1]` tensor of token ids, start symbol included.
func greedyDecode(model: MotionLangTransformer, input: MotionLangBatch, maxLength: Int, startSymbol: Int32) -> Tensor<Int32> {
    let encoded = model.encode(input: input)
    // No device is passed explicitly for the tensors created in this function.
    var generated = Tensor(repeating: startSymbol, shape: [1, 1])
    (0..<maxLength).forEach { _ in
        // The target mask is rebuilt every step for the current sequence length.
        let stepMask = Tensor<Float>(subsequentMask(size: generated.shape[1]))
        let stepInput = MotionLangBatch(
            motionFrames: input.motionFrames,
            mask: input.mask,
            origMotionFramesCount: input.origMotionFramesCount,
            targetTokenIds: generated,
            targetMask: stepMask,
            targetTruth: input.targetTruth)
        let decoded = model.decode(input: stepInput, memory: encoded)
        // Only the last position of the decoder output is scored.
        let scores = model.generate(input: decoded[0..., -1])
        let bestToken = Int32(scores.argmax().scalarized())
        let appended = Tensor(repeating: bestToken, shape: [1, 1])
        generated = Tensor(concatenating: [generated, appended], alongAxis: 1)
    }
    return generated
}

In [14]:
/// Decodes one dataset sample with `greedyDecode` and prints the reference sentence
/// next to the decoded text.
///
/// Decoding runs on the eager device: the batch is copied there and the global `model`
/// is moved there for the duration of the call, then moved back to `device`.
/// If `sample_id` is not present in the dataset, a message is printed and nothing is decoded.
///
/// - Parameters:
///   - sample_id: key into `dataset.motionSampleDict` and `dataset.langRecsDict`.
///   - verbose: when `true`, also prints the example id, its last timestep and the
///     shape of its motion frames array.
func greedyDecodeSample(_ sample_id: Int, verbose: Bool = false) {
    // get example
    // Unknown ids (e.g. an id that only exists in a different dataset variant) are
    // reported instead of crashing the kernel on a force-unwrap.
    guard let ms = dataset.motionSampleDict[sample_id],
          let langRec = dataset.langRecsDict[sample_id] else {
        print("Sample \(sample_id) not found in dataset; skipping decoding.")
        return
    }
    let example = Motion2Lang.getExample(motionSample: ms, langRec: langRec)
    if verbose {
        print("example.id: \(example.id)")
        if let lastTimestep = example.motionSample.timestepsArray.last {
            print("  motionSample.timestepsArray.last: \(lastTimestep)")
        }
        print("  motionSample.motionFramesArray.shape: \(example.motionSample.motionFramesArray.shape)")
    }
    let singleExampleBatch = textProcessor.preprocess(example: example)
    var source = Motion2Lang.reduceDataBatches([singleExampleBatch])
    print("\nDecoding one sample:") // on eager device
    print("  targetSentence: \"\(example.targetSentence)\"")

    source = MotionLangBatch(copying: source, to: Device.defaultTFEager)
    model.move(to: Device.defaultTFEager)
    // Scope the inference phase to the decode call (as `update`/`validate` do for their
    // phases) so the caller's learning phase is not left modified afterwards.
    let out = withLearningPhase(.inference) {
        greedyDecode(model: model, input: source, maxLength: 50, startSymbol: textProcessor.bosId)
    }
    // TODO: only show one [PAD]
    let outputStr = textProcessor.decode(tensor: out)
    print("  decoded: \"\(outputStr)\"")
    // Restore the model to the training device selected in the backend cell.
    model.move(to: device)
}

In [15]:
// get example
let example = dataset.trainExamples[0]
greedyDecodeSample(Int(example.id)!, verbose: true)

example.id: 2078
  motionSample.timestepsArray.last: 5.8
  motionSample.motionFramesArray.shape: [59, 48]

Decoding one sample:
  targetSentence: "a person knees on the floor"
  decoded: "[CLS] adjusting brodie brodie brodie ##llah ##llah ##llah ##llah ##llah ##llah ##llah ##llah ##llah brodie brodie ##llah ##llah ##llah ##llah ##llah ##llah ##llah ##llah ##llah ##llah ##llah brodie brodie brodie brodie ##llah ##llah ##llah brodie brodie brodie brodie brodie brodie brodie avon ##llah ##llah ##llah ##llah ##llah ##llah ##llah ##llah ##llah"


## Training loop

In [None]:
// let nEpochs = 2

In [16]:
// Main training loop: for each of the first `nEpochs` epochs, train on every batch,
// evaluate on the validation batches, log both losses and decode one sample.
print("\nTraining Transformer for the Motion2lang task!")
// Global step counter; it keeps increasing across epochs and is used as the
// summary step for the per-batch "TrainingLoss" scalar.
var trainingStepCount = 0
// Progress is printed for the first 5 steps and then every `print_every` steps.
let print_every = 10
// NOTE(review): `time` is not defined in this notebook — presumably a timing helper
// from one of the imported packages; confirm.
time() {
    LazyTensorBarrier()
    for (epoch, epochBatches) in dataset.trainingEpochs.prefix(nEpochs).enumerated() {
        print("[Epoch \(epoch + 1)]")
        Context.local.learningPhase = .training
        // Per-epoch accumulators for the running mean training loss.
        var trainingLossSum: Float = 0
        var trainingBatchCount = 0
        if epoch == 0 {
            print("epochBatches.count: \(epochBatches.count)")
        }

        for eagerBatch in epochBatches {
            if (trainingStepCount < 5 || trainingStepCount % print_every == 0) {
                print("==> step \(trainingStepCount)")
            }
            // Batches are copied onto the training device before each step.
            let batch = MotionLangBatch(copying: eagerBatch, to: device)
            let loss: Float = update(model: &model, using: &optimizer, for: batch)
            if (trainingStepCount < 5 || trainingStepCount % print_every == 0) {
                print("current loss at step \(trainingStepCount): \(loss)")
            }
            trainingLossSum += loss
            trainingBatchCount += 1
            // Logged value is the mean loss over the batches seen so far in this epoch,
            // not the loss of the current batch alone.
            summaryWriter.writeScalarSummary(tag: "TrainingLoss", step: trainingStepCount, value: trainingLossSum / Float(trainingBatchCount))
            trainingStepCount += 1
        }
        print(
            """
            Training loss: \(trainingLossSum / Float(trainingBatchCount))
            """
        )
        // Epoch-level summaries use a 1-based epoch number as the step.
        summaryWriter.writeScalarSummary(tag: "EpochTrainingLoss", step: epoch+1, value: trainingLossSum / Float(trainingBatchCount))

        if epoch == 0 {
            print("dataset.validationBatches.count: \(dataset.validationBatches.count)")
        }
        // Validation pass over all validation batches.
        Context.local.learningPhase = .inference
        var devLossSum: Float = 0
        var devBatchCount = 0
        // Sum of the first dimension of motionFrames over all validation batches.
        var totalGuessCount = 0

        for eagerBatch in dataset.validationBatches {
            let batch = MotionLangBatch(copying: eagerBatch, to: device)
            let loss: Float = validate(model: &model, for: batch)
            let valBatchSize = batch.motionFrames.shape[0]

            devLossSum += loss
            devBatchCount += 1
            totalGuessCount += valBatchSize
        }

        print(
            """
            totalGuessCount: \(totalGuessCount) \
            Eval loss: \(devLossSum / Float(devBatchCount))
            """
        )
        summaryWriter.writeScalarSummary(tag: "EpochTestLoss", step: epoch+1, value: devLossSum / Float(devBatchCount))
        // Decode the same example (selected in an earlier cell) after every epoch.
        greedyDecodeSample(Int(example.id)!)
    }
    // Make sure all pending summaries are written once training ends.
    summaryWriter.flush()
}

print("\nFinished training.")


Training Transformer for the Motion2lang task!
[Epoch 1]
epochBatches.count: 24
==> step 0
2020-07-15 14:57:51.319334: F tensorflow/compiler/xla/xla_client/xla_util.cc:90] Invalid argument: From /job:tpu_worker/replica:0/task:0:
Computation requires more parameters (333) than supported (limit 237).
	 [[{{node XRTCompile}}]]


: ignored

## Generate motion description

In [None]:
// Decode a specific sample by id with the current model and print details about it.
// The id must be a key of both dataset.motionSampleDict and dataset.langRecsDict.
let sample_id = 2410
greedyDecodeSample(sample_id, verbose: true)