# Train Transformer for the Motion2lang task

In [1]:
// for colab
%install-location $cwd/swift-install
%install-swiftpm-flags -c release
%install '.package(url: "https://github.com/wojtekcz/language2motion.git", .branch("koszalin-dl-9"))' Datasets TranslationModels TextModels ModelSupport SummaryWriter MotionModels

Installing packages:
	.package(url: "https://github.com/wojtekcz/language2motion.git", .branch("koszalin-dl-9"))
		Datasets
		TranslationModels
		TextModels
		ModelSupport
		SummaryWriter
		MotionModels
With SwiftPM flags: ['-c', 'release']
Working in: /tmp/tmpm6t5380f/swift-install
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
Initializing Swift...
Installation complete!


In [2]:
// for local development
// %install-location /notebooks/language2motion.gt/swift-install
// %install-swiftpm-flags -c release
// %install '.package(path: "/notebooks/language2motion.gt")' Datasets TranslationModels TextModels ModelSupport SummaryWriter MotionModels

In [2]:
import TensorFlow
import TextModels
import TranslationModels
import Foundation
import ModelSupport
import Datasets
import SummaryWriter
import MotionModels

In [3]:
import Foundation

/// Runs `command` through `/bin/bash -c` and returns what it wrote to standard output.
///
/// Only standard output is attached to the pipe; standard error is not captured here.
///
/// - Parameter command: Shell source handed verbatim to `bash -c`.
/// - Returns: The captured standard output as text. Byte sequences that are not valid
///   UTF-8 are replaced with U+FFFD rather than crashing the caller.
func shell(_ command: String) -> String {
    let task = Process()
    let pipe = Pipe()

    task.standardOutput = pipe
    task.arguments = ["-c", command]
    task.launchPath = "/bin/bash"
    task.launch()

    // Blocks until the write end of the pipe is closed.
    let data = pipe.fileHandleForReading.readDataToEndOfFile()

    // `String(data:encoding:)` returns nil for invalid UTF-8, so force-unwrapping it
    // would trap on binary or mis-encoded output; the decoding initializer cannot fail.
    return String(decoding: data, as: UTF8.self)
}

/// Convenience wrapper: executes `command` with `shell(_:)` and prints the captured output.
///
/// - Parameter command: Shell source to run.
func sh(_ command: String) {
    let output = shell(command)
    print(output)
}

// Print the GPU status: append the listed tool directories to PATH, prepend the NVIDIA
// library directory to LD_LIBRARY_PATH, then run `nvidia-smi` in that environment.
sh("""
export PATH="$PATH:/opt/bin:/swift/toolchain/usr/bin"
export LD_LIBRARY_PATH="/usr/lib64-nvidia:$LD_LIBRARY_PATH"
nvidia-smi
""")

Sat Jul  4 13:54:48 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [5]:
// sh("mkdir -p /content/data/")
// sh("""
// cd /content/data/
// wget https://github.com/wojtekcz/language2motion/releases/download/v0.2.0/motion_dataset_v3.norm.10Hz.tgz
// wget https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/labels_ds_v2.csv
// wget https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/vocab.txt
// wget https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/labels_ds_v2.balanced.515.csv
// tar xzvf motion_dataset_v3.norm.10Hz.tgz
// """)

## Set training params

In [4]:
// Name of this run; used below to build the TensorBoard log directory.
let runName = "run_3"
// NOTE(review): presumably a per-batch token budget rather than an example count
// (6000 / maxSequenceLength = 120 sequences) — confirm against Motion2Lang.
let batchSize = 6000
// let batchSize = 3000
// Passed to both the text processor and the dataset as the sequence-length limit.
let maxSequenceLength =  50
// Number of epochs taken from `dataset.trainingEpochs` by the training loop.
let nEpochs = 3
// let learningRate: Float = 2e-5
// Learning rate handed to the Adam optimizer.
let learningRate: Float = 5e-4

// Echo the configuration so it is recorded in the cell output.
print("runName: \(runName)")
print("batchSize: \(batchSize)")
print("maxSequenceLength: \(maxSequenceLength)")
print("nEpochs: \(nEpochs)")
print("learningRate: \(learningRate)")

// Locations of the motion dataset and the language annotations under the data directory.
let dataURL = URL(fileURLWithPath: "/content/data/")
let motionDatasetURL = dataURL.appendingPathComponent("motion_dataset_v3.norm.10Hz.plist")
let langDatasetURL = dataURL.appendingPathComponent("labels_ds_v2.csv")

runName: run_3
batchSize: 6000
maxSequenceLength: 50
nEpochs: 3
learningRate: 0.0005


## Select eager or X10 backend

In [5]:
// Device used for the model, optimizer and batches in the rest of the notebook.
// Swap the two lines below to run on the eager backend instead of XLA.
let device = Device.defaultXLA
// let device = Device.defaultTFEager
print(device)

2020-07-04 13:54:49.121626: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-07-04 13:54:49.204459: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-04 13:54:49.205104: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.73GiB deviceMemoryBandwidth: 298.08GiB/s
2020-07-04 13:54:49.227735: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-07-04 13:54:51.866140: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-07-04 13:54:55.843780: I tensorflow/stream_executor/platform/default/d

## X10 warmup

In [6]:
// Small smoke test: add two tensors created without an explicit device and print
// the result together with the device the first tensor ended up on.
let eagerTensor1 = Tensor([0.0, 1.0, 2.0])
let eagerTensor2 = Tensor([1.5, 2.5, 3.5])
let eagerTensorSum = eagerTensor1 + eagerTensor2
print(eagerTensorSum)
print(eagerTensor1.device)
// Create one tensor explicitly on the default XLA device and print its device.
// NOTE(review): presumably this forces the XLA backend to initialize before training — confirm.
let x10Tensor2 = Tensor([1.5, 2.5, 3.5], on: Device.defaultXLA)
print(x10Tensor2.device)

2020-07-04 13:55:27.506213: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-07-04 13:55:27.508023: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.73GiB deviceMemoryBandwidth: 298.08GiB/s
2020-07-04 13:55:27.508108: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-07-04 13:55:27.508130: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-07-04 13:55:27.508149: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-07-04 13:55:27.508166: I tensorflow/stream_executor/platform/

## Instantiate model

In [7]:
// instantiate text processor
// Text processing pipeline: vocabulary file -> BERT tokenizer -> TextProcessor.
let vocabularyURL = dataURL.appendingPathComponent("vocab.txt")
// Plain `try` lets a missing or unreadable vocabulary file surface as a regular thrown
// error; `try!` would trap the whole process instead.
let vocabulary: Vocabulary = try Vocabulary(fromFile: vocabularyURL)
let tokenizer: Tokenizer = BERTTokenizer(
    vocabulary: vocabulary,
    caseSensitive: false,
    unknownToken: "[UNK]",
    maxTokenLength: nil
)
let textProcessor = TextProcessor(
    vocabulary: vocabulary,
    tokenizer: tokenizer,
    maxSequenceLength: maxSequenceLength
)

// Model hyper-parameters. Source and target vocabulary sizes both come from the
// same vocabulary.
let sourceVocabSize = vocabulary.count
let inputSize = 48 // TODO: get value from dataset
let targetVocabSize = vocabulary.count
let layerCount: Int = 6
let modelSize: Int = 256
let feedForwardSize: Int = 1024
let headCount: Int = 8
let dropoutProbability: Double = 0.1

var model = MotionLangTransformer(
    sourceVocabSize: sourceVocabSize,
    inputSize: inputSize,
    targetVocabSize: targetVocabSize,
    layerCount: layerCount,
    modelSize: modelSize,
    feedForwardSize: feedForwardSize,
    headCount: headCount,
    dropoutProbability: dropoutProbability
)

// Place the freshly created model on the selected device.
model.move(to: device)

## Load dataset

In [8]:
print("\nLoading dataset...")

// Build the dataset; the trailing closure converts every raw example into a
// MotionLangBatch by running it through the text processor.
var dataset = try Motion2Lang(
    motionDatasetURL: motionDatasetURL,
    langDatasetURL: langDatasetURL,
    maxSequenceLength: maxSequenceLength,
    batchSize: batchSize
) { (example: Motion2Lang.Example) -> MotionLangBatch in
    return textProcessor.preprocess(example: example)
}

print("Dataset acquired.")


Loading dataset...
MotionDataset(motionSamples: 39102)
keeping 30120 annotatated motions
keeping 29970 longer motions, with minimum 10 frames
Dataset acquired.


## Check model on a batch

In [11]:
// get a batch
// print("\nOne batch (MotionLangBatch):")
// var epochIterator = dataset.trainingEpochs.enumerated().makeIterator()
// let epoch = epochIterator.next()
// let batches = Array(epoch!.1)
// let batch: MotionLangBatch = batches[0]
// print("type: \(type(of:batch))")
// print("motionFrames.shape: \(batch.motionFrames.shape)")
// // print("motionFlag.shape: \(batch.motionFlag.shape)")
// print("mask.shape: \(batch.mask.shape)")
// print("origMotionFramesCount.shape: \(batch.origMotionFramesCount.shape)")
// print("origMotionFramesCount: \(batch.origMotionFramesCount)")
// print("targetTokenIds.shape: \(batch.targetTokenIds.shape)")
// print("targetMask.shape: \(batch.targetMask.shape)")
// print("targetTruth.shape: \(batch.targetTruth.shape)")

In [12]:
// run one batch
// print("\nRun one batch:")
// print("==============")
// let deviceBatch = MotionLangBatch(copying: batch, to: device)
// let output = model(deviceBatch)
// print("output.shape: \(output.shape)")

## Optimizer

In [9]:
// Create Adam for the model, then re-create it as a copy placed on the selected device.
var optimizer = Adam(for: model, learningRate: learningRate)
optimizer = Adam(copying: optimizer, to: device)

// Scalar summaries are written under <dataURL>/tboard/Motion2lang/<runName>.
let logdirURL = dataURL.appendingPathComponent("tboard/Motion2lang/\(runName)", isDirectory: true)
let summaryWriter = SummaryWriter(logdir: logdirURL, flushMillis: 30*1000)

## Training helpers

In [10]:
/// Performs one optimization step on `batch` and returns the resulting training loss.
///
/// - Parameters:
///   - model: Transformer being trained; updated in place by the optimizer.
///   - optimizer: Adam optimizer applied to `model`.
///   - batch: Batch whose `targetTruth` provides the labels.
/// - Returns: The softmax cross-entropy loss for this batch as a scalar.
///
/// NOTE(review): the loss is computed over every position of `targetTruth`; nothing
/// in this function excludes padding positions — confirm that this is intended.
func update(model: inout MotionLangTransformer, using optimizer: inout Adam<MotionLangTransformer>, for batch: MotionLangBatch) -> Float {
    // Flatten the labels so they line up with the flattened logits below.
    let labels = batch.targetTruth.reshaped(to: [-1])
    // Number of logit rows: first dimension times last dimension of `targetTruth`.
    let resultSize = batch.targetTruth.shape.last! * batch.targetTruth.shape.first!
    return withLearningPhase(.training) { () -> Float in
        let (loss, grad) = valueWithGradient(at: model) { (model) -> Tensor<Float> in
            let logits = model.generate(input: batch).reshaped(to: [resultSize, -1])
            return softmaxCrossEntropy(logits: logits, labels: labels)
        }
        optimizer.update(&model, along: grad)
        // Barrier is issued after the parameter update and before the loss is read back.
        LazyTensorBarrier()
        return loss.scalarized()
    }
}

/// Computes the validation loss of `model` on `batch` in inference mode.
///
/// - Parameters:
///   - model: Transformer to evaluate. Declared `inout` for call-site compatibility;
///     this function does not assign to it.
///   - batch: Batch whose `targetTruth` provides the labels.
/// - Returns: The softmax cross-entropy loss for this batch as a scalar.
func validate(model: inout MotionLangTransformer, for batch: MotionLangBatch) -> Float {
    // Flatten the labels so they line up with the flattened logits below.
    let labels = batch.targetTruth.reshaped(to: [-1])
    // Number of logit rows: first dimension times last dimension of `targetTruth`.
    let resultSize = batch.targetTruth.shape.last! * batch.targetTruth.shape.first!
    let result = withLearningPhase(.inference) { () -> Float in
        let logits = model.generate(input: batch).reshaped(to: [resultSize, -1])
        return softmaxCrossEntropy(logits: logits, labels: labels).scalarized()
    }
    LazyTensorBarrier()
    return result
}

## Set up decoding

In [11]:
/// Greedily decodes a token sequence for `input`, one token per iteration.
///
/// The source is encoded once. The decoder is then run `maxLength` times, each time
/// appending the arg-max token of the last position to the running sequence.
/// There is no early stop: the loop always runs `maxLength` iterations.
///
/// - Parameters:
///   - model: Trained transformer used for encoding, decoding and generation.
///   - input: Source batch; its motion fields are reused for every decoder call.
///   - maxLength: Number of tokens to generate after the start symbol.
///   - startSymbol: Token id that seeds the sequence.
/// - Returns: A `[1, maxLength + 1]` tensor holding the start symbol followed by the
///   generated token ids.
func greedyDecode(model: MotionLangTransformer, input: MotionLangBatch, maxLength: Int, startSymbol: Int32) -> Tensor<Int32> {
    let encoderMemory = model.encode(input: input)
    // Running output sequence, seeded with the start symbol.
    var generated = Tensor(repeating: startSymbol, shape: [1, 1])
    for _ in 0..<maxLength {
        // Mask sized to the tokens produced so far.
        let causalMask = Tensor<Float>(subsequentMask(size: generated.shape[1]))
        let stepBatch = MotionLangBatch(
            motionFrames: input.motionFrames,
            mask: input.mask,
            origMotionFramesCount: input.origMotionFramesCount,
            targetTokenIds: generated,
            targetMask: causalMask,
            targetTruth: input.targetTruth
        )
        let decoded = model.decode(input: stepBatch, memory: encoderMemory)
        // Only the last position is fed to the generator.
        let scores = model.generate(input: decoded[0..., -1])
        let bestToken = Int32(scores.argmax().scalarized())
        let bestTokenTensor = Tensor(repeating: bestToken, shape: [1, 1])
        generated = Tensor(concatenating: [generated, bestTokenTensor], alongAxis: 1)
    }
    return generated
}

In [12]:
// get example
// Take the first training example and print a few of its fields for reference.
let example = dataset.trainExamples[0]
print("example.id: \(example.id)")
print("example.motionSample.timestepsArray.last: \(example.motionSample.timestepsArray.last!)")
print("example.motionSample.motionFramesArray.shape: \(example.motionSample.motionFramesArray.shape)")
print("example.targetSentence: \(example.targetSentence)")

// Preprocess the example and reduce the one-element list into a single batch.
// `source` is a `var` because later cells reassign it when copying it between devices.
let singleExampleBatch = textProcessor.preprocess(example: example)
var source = Motion2Lang.reduceDataBatches([singleExampleBatch])

example.id: 2069
example.motionSample.timestepsArray.last: 3.9
example.motionSample.motionFramesArray.shape: [40, 48]
example.targetSentence: Subject starts on its knees and stands up.


In [13]:
// Decode the reference token ids back to text for comparison with the model output.
var outputStr = textProcessor.decode(tensor: source.targetTokenIds)
print("decode(source.targetTokenIds): \(outputStr)")

// Run greedy decoding in inference mode with both the batch and the model moved to
// the eager device, then move the model back to the training device.
// NOTE(review): presumably required because greedyDecode builds its token tensor
// without an explicit device — confirm.
Context.local.learningPhase = .inference
source = MotionLangBatch(copying: source, to: Device.defaultTFEager)
model.move(to: Device.defaultTFEager)
let out = greedyDecode(model: model, input: source, maxLength: 50, startSymbol: textProcessor.bosId)
outputStr = textProcessor.decode(tensor: out)
print("greedyDecode(): \"\(outputStr)\"")
model.move(to: device)

decode(source.targetTokenIds): [CLS] subject starts on its knees and stands up .
2020-07-04 13:56:59.633709: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
greedyDecode(): "[CLS] mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora mora relief relief mora mora mora mora mora mora mora mora mora mora mora mora"


## Training loop

In [None]:
print("\nTraining Transformer for the Motion2lang task!")
// Global step counter; it keeps increasing across epochs.
var trainingStepCount = 0
// NOTE(review): `time` is not defined in this notebook — presumably a helper from one of
// the imported modules that measures how long the closure takes; confirm.
time() {
    LazyTensorBarrier()
    // Train for the first `nEpochs` epochs produced by the dataset.
    for (epoch, epochBatches) in dataset.trainingEpochs.prefix(nEpochs).enumerated() {
        print("[Epoch \(epoch + 1)]")
        Context.local.learningPhase = .training
        // Per-epoch accumulators for the mean training loss.
        var trainingLossSum: Float = 0
        var trainingBatchCount = 0
        if epoch == 0 {
            print("epochBatches.count: \(epochBatches.count)")
        }

        for eagerBatch in epochBatches {
            // Verbose progress output for the first five steps only.
            if (trainingStepCount < 5) {
                print("==> step \(trainingStepCount)")
            }
            // Copy the batch to the training device before the optimization step.
            let batch = MotionLangBatch(copying: eagerBatch, to: device)
            let loss: Float = update(model: &model, using: &optimizer, for: batch)
            if (trainingStepCount < 5) {
                print("current loss at step \(trainingStepCount): \(loss)")
            }
            trainingLossSum += loss
            trainingBatchCount += 1
            // The logged value is the running mean over the current epoch, not the single-step loss.
            summaryWriter.writeScalarSummary(tag: "TrainingLoss", step: trainingStepCount, value: trainingLossSum / Float(trainingBatchCount))
            trainingStepCount += 1
        }
        print(
            """
            Training loss: \(trainingLossSum / Float(trainingBatchCount))
            """
        )
        summaryWriter.writeScalarSummary(tag: "EpochTrainingLoss", step: epoch+1, value: trainingLossSum / Float(trainingBatchCount))

        if epoch == 0 {
            print("dataset.validationBatches.count: \(dataset.validationBatches.count)")
        }
        // Validation pass: accumulate the loss over all validation batches.
        Context.local.learningPhase = .inference
        var devLossSum: Float = 0
        var devBatchCount = 0
        // Sum of the first dimension of `motionFrames` over all validation batches.
        var totalGuessCount = 0

        for eagerBatch in dataset.validationBatches {
            let batch = MotionLangBatch(copying: eagerBatch, to: device)
            let loss: Float = validate(model: &model, for: batch)
            let valBatchSize = batch.motionFrames.shape[0]

            devLossSum += loss
            devBatchCount += 1
            totalGuessCount += valBatchSize
        }

        print(
            """
            totalGuessCount: \(totalGuessCount) \
            Eval loss: \(devLossSum / Float(devBatchCount))
            """
        )
        summaryWriter.writeScalarSummary(tag: "EpochTestLoss", step: epoch+1, value: devLossSum / Float(devBatchCount))

        // Qualitative check after each epoch: decode the sample batch with the model
        // temporarily moved to the eager device, then move the model back for training.
        print("\nEncoding/decoding one example") // on eager device
        Context.local.learningPhase = .inference
        source = MotionLangBatch(copying: source, to: Device.defaultTFEager)
        model.move(to: Device.defaultTFEager)
        let out = greedyDecode(model: model, input: source, maxLength: 50, startSymbol: textProcessor.bosId)
        outputStr = textProcessor.decode(tensor: out)
        print("greedyDecode(): \"\(outputStr)\"")
        model.move(to: device)
    }
    // Flush pending summaries once all epochs are done.
    summaryWriter.flush()
}


print("\nFinished training.")


Training Transformer for the Motion2lang task!
2020-07-04 13:57:02.366909: W tensorflow/compiler/jit/xla_device.cc:398] XLA_GPU and XLA_CPU devices are deprecated and will be removed in subsequent releases. Instead, use either @tf.function(experimental_compile=True) for must-compile semantics, or run with TF_XLA_FLAGS=--tf_xla_auto_jit=2 for auto-clustering best-effort compilation.
[Epoch 1]
epochBatches.count: 199
==> step 0
current loss at step 0: 11.6007
==> step 1
current loss at step 1: 10.313914
==> step 2
current loss at step 2: 9.06369
==> step 3
current loss at step 3: 7.8494
==> step 4
current loss at step 4: 6.7248254
Training loss: 1.4837726
dataset.validationBatches.count: 50
totalGuessCount: 5993 Eval loss: 0.9534913

Encoding/decoding one example
greedyDecode(): "[CLS] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

## Decoding

In [None]:
// // get example
// let example = dataset.trainExamples[0]
// print("example.id: \(example.id)")
// print("example.motionSample.timestepsArray.last: \(example.motionSample.timestepsArray.last!)")
// print("example.motionSample.motionFramesArray.shape: \(example.motionSample.motionFramesArray.shape)")
// print("example.targetSentence: \(example.targetSentence)")

// let singleExampleBatch = textProcessor.preprocess(example: example)
// var source = Motion2Lang.reduceDataBatches([singleExampleBatch])

In [None]:
// // encode/decode one example
// print("\nEncoding/decoding one example") // on eager device
// // Context.local.learningPhase = .inference
// source = MotionLangBatch(copying: source, to: Device.defaultTFEager)
// model.move(to: Device.defaultTFEager)
// let out = greedyDecode(model: model, input: source, maxLength: 50, startSymbol: textProcessor.bosId)
// outputStr = textProcessor.decode(tensor: out)
// print("greedyDecode(): \"\(outputStr)\"")
// model.move(to: device)