# Train Transformer for the Motion2lang task

In [1]:
// For Colab: the %install* lines below are notebook-kernel directives, not Swift statements.
// They set the install location, request a release build (-c release) and install the
// listed modules from the "koszalin-dl-9" branch of the language2motion repository.
%install-location $cwd/swift-install
%install-swiftpm-flags -c release
%install '.package(url: "https://github.com/wojtekcz/language2motion.git", .branch("koszalin-dl-9"))' Datasets TranslationModels TextModels ModelSupport SummaryWriter MotionModels

Installing packages:
	.package(url: "https://github.com/wojtekcz/language2motion.git", .branch("koszalin-dl-9"))
		Datasets
		TranslationModels
		TextModels
		ModelSupport
		SummaryWriter
		MotionModels
With SwiftPM flags: ['-c', 'release']
Working in: /tmp/tmpaobsxl0z/swift-install
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
Initializing Swift...
Installation complete!


In [None]:
// for local development
// %install-location /notebooks/language2motion.gt/swift-install
// %install-swiftpm-flags -c release
// %install '.package(path: "/notebooks/language2motion.gt")' Datasets TranslationModels TextModels ModelSupport SummaryWriter MotionModels

In [2]:
import TensorFlow
import TextModels
import TranslationModels
import Foundation
import ModelSupport
import Datasets
import SummaryWriter
import MotionModels

In [3]:
import Foundation

/// Runs `command` through `/bin/bash -c` and returns what it wrote to standard output.
///
/// Only standard output is piped back; standard error is not redirected here, so it is
/// not part of the returned string.
///
/// - Parameter command: Shell source text handed to bash unchanged.
/// - Returns: The captured standard output decoded as UTF-8. Byte sequences that are not
///   valid UTF-8 are replaced with U+FFFD instead of crashing the caller.
func shell(_ command: String) -> String {
    let task = Process()
    let pipe = Pipe()

    task.standardOutput = pipe
    task.arguments = ["-c", command]
    task.launchPath = "/bin/bash"
    task.launch()

    // Reading to end-of-file blocks until the command closes its standard output.
    let data = pipe.fileHandleForReading.readDataToEndOfFile()
    // Lossy decoding: the previous `String(data:encoding:)!` trapped on non-UTF-8 output.
    return String(decoding: data, as: UTF8.self)
}

/// Executes `command` with `shell(_:)` and prints the captured standard output.
///
/// - Parameter command: Shell source text to run.
func sh(_ command: String) {
    let output = shell(command)
    print(output)
}

// Extend PATH and LD_LIBRARY_PATH for this one bash invocation, then run nvidia-smi
// and print its report.
sh("""
export PATH="$PATH:/opt/bin:/swift/toolchain/usr/bin"
export LD_LIBRARY_PATH="/usr/lib64-nvidia:$LD_LIBRARY_PATH"
nvidia-smi
""")

Sat Jul  4 12:54:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    11W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [5]:
// Create the data directory, download the motion dataset archive, the label CSVs and
// the vocabulary from the project's GitHub releases into it, then unpack the archive.
// NOTE(review): the script lines are not chained with `&&`, so a failing `cd` would not
// stop the downloads; re-running the cell also repeats every download — consider
// making this idempotent.
sh("mkdir -p /content/data/")
sh("""
cd /content/data/
wget https://github.com/wojtekcz/language2motion/releases/download/v0.2.0/motion_dataset_v3.norm.10Hz.tgz
wget https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/labels_ds_v2.csv
wget https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/vocab.txt
wget https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/labels_ds_v2.balanced.515.csv
tar xzvf motion_dataset_v3.norm.10Hz.tgz
""")


--2020-07-04 12:44:29--  https://github.com/wojtekcz/language2motion/releases/download/v0.2.0/motion_dataset_v3.norm.10Hz.tgz
Resolving github.com (github.com)... 13.229.188.59
Connecting to github.com (github.com)|13.229.188.59|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/258798747/0c5c8700-b172-11ea-97ff-87f806ccfe78?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200704%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200704T124430Z&X-Amz-Expires=300&X-Amz-Signature=c4307296ce1df305662419e72d7233921f1b248890094567308944e65402e37e&X-Amz-SignedHeaders=host&actor_id=0&repo_id=258798747&response-content-disposition=attachment%3B%20filename%3Dmotion_dataset_v3.norm.10Hz.tgz&response-content-type=application%2Foctet-stream [following]
--2020-07-04 12:44:30--  https://github-production-release-asset-2e65be.s3.amazonaws.com/258798747/0c5c8700-b172-11ea-97ff-87f806ccfe

## Set training params

In [4]:
// Hyperparameters for this run. The commented-out lines are alternative values.
let runName = "run_2" // becomes the last component of the TensorBoard log directory
// NOTE(review): the run reports 199 training batches for ~30k examples, so batchSize is
// presumably a per-batch token/frame budget rather than an example count — confirm in Motion2Lang.
let batchSize = 6000
// let batchSize = 3000
let maxSequenceLength =  50 // passed to both the text processor and the dataset
let nEpochs = 40 // number of epochs taken from dataset.trainingEpochs
// let learningRate: Float = 2e-5
let learningRate: Float = 5e-4 // learning rate handed to Adam

// Echo the settings so they are recorded in the cell output.
print("runName: \(runName)")
print("batchSize: \(batchSize)")
print("maxSequenceLength: \(maxSequenceLength)")
print("nEpochs: \(nEpochs)")
print("learningRate: \(learningRate)")

// Locations of the files fetched into /content/data/ by the download cell.
let dataURL = URL(fileURLWithPath: "/content/data/")
let motionDatasetURL = dataURL.appendingPathComponent("motion_dataset_v3.norm.10Hz.plist")
let langDatasetURL = dataURL.appendingPathComponent("labels_ds_v2.csv")

runName: run_2
batchSize: 6000
maxSequenceLength: 50
nEpochs: 40
learningRate: 0.0005


## Select eager or X10 backend

In [6]:
// Device the model, optimizer and batches are placed on: the default XLA (X10) device.
// Swap the active and commented-out lines to use the TF eager backend instead.
let device = Device.defaultXLA
// let device = Device.defaultTFEager
print(device)

Device(kind: .GPU, ordinal: 0, backend: .XLA)


## X10 warmup

In [5]:
// Sanity check of both backends: add two tensors created without an explicit device
// and print the sum together with the device they live on.
let eagerTensor1 = Tensor([0.0, 1.0, 2.0])
let eagerTensor2 = Tensor([1.5, 2.5, 3.5])
let eagerTensorSum = eagerTensor1 + eagerTensor2
print(eagerTensorSum)
print(eagerTensor1.device)
// Create one tensor explicitly on the default XLA device and print where it was placed.
let x10Tensor2 = Tensor([1.5, 2.5, 3.5], on: Device.defaultXLA)
print(x10Tensor2.device)

[1.5, 3.5, 5.5]
Device(kind: .CPU, ordinal: 0, backend: .TF_EAGER)
Device(kind: .GPU, ordinal: 0, backend: .XLA)


## Instantiate model

In [7]:
// Instantiate the text processor: vocabulary file -> BERT tokenizer -> TextProcessor.
let vocabularyURL = dataURL.appendingPathComponent("vocab.txt")
// `try!` stops the notebook immediately if the vocabulary file cannot be loaded.
let vocabulary: Vocabulary = try! Vocabulary(fromFile: vocabularyURL)
let tokenizer: Tokenizer = BERTTokenizer(vocabulary: vocabulary, caseSensitive: false, unknownToken: "[UNK]", maxTokenLength: nil)
let textProcessor = TextProcessor(vocabulary: vocabulary, tokenizer: tokenizer, maxSequenceLength: maxSequenceLength)

// Instantiate the model. Source and target vocabulary sizes both come from the same vocabulary.
let sourceVocabSize = vocabulary.count
// Matches the second dimension of motionFramesArray printed for the example ([67, 48]).
let inputSize = 48 // TODO: get value from dataset
let targetVocabSize = vocabulary.count
let layerCount: Int = 6
let modelSize: Int = 256
let feedForwardSize: Int = 1024
let headCount: Int = 8
let dropoutProbability: Double = 0.1

var model = MotionLangTransformer(
    sourceVocabSize: sourceVocabSize, 
    inputSize: inputSize,
    targetVocabSize: targetVocabSize,
    layerCount: layerCount, 
    modelSize: modelSize, 
    feedForwardSize: feedForwardSize, 
    headCount: headCount, 
    dropoutProbability: dropoutProbability
)

// Place the freshly created model on the selected training device.
model.move(to: device)

## Load dataset

In [8]:
print("\nLoading dataset...")

// Build the motion-to-language dataset. The trailing closure turns every example into a
// MotionLangBatch by running it through the text processor.
var dataset = try Motion2Lang(
    motionDatasetURL: motionDatasetURL,
    langDatasetURL: langDatasetURL,
    maxSequenceLength: maxSequenceLength,
    batchSize: batchSize
) { (example: Motion2Lang.Example) -> MotionLangBatch in
    return textProcessor.preprocess(example: example)
}

print("Dataset acquired.")


Loading dataset...
MotionDataset(motionSamples: 39102)
keeping 30120 annotatated motions
keeping 29970 longer motions, with minimum 10 frames
Dataset acquired.


## Check model on a batch

In [None]:
// get a batch
// print("\nOne batch (MotionLangBatch):")
// var epochIterator = dataset.trainingEpochs.enumerated().makeIterator()
// let epoch = epochIterator.next()
// let batches = Array(epoch!.1)
// let batch: MotionLangBatch = batches[0]
// print("type: \(type(of:batch))")
// print("motionFrames.shape: \(batch.motionFrames.shape)")
// // print("motionFlag.shape: \(batch.motionFlag.shape)")
// print("mask.shape: \(batch.mask.shape)")
// print("origMotionFramesCount.shape: \(batch.origMotionFramesCount.shape)")
// print("origMotionFramesCount: \(batch.origMotionFramesCount)")
// print("targetTokenIds.shape: \(batch.targetTokenIds.shape)")
// print("targetMask.shape: \(batch.targetMask.shape)")
// print("targetTruth.shape: \(batch.targetTruth.shape)")

In [None]:
// run one batch
// print("\nRun one batch:")
// print("==============")
// let deviceBatch = MotionLangBatch(copying: batch, to: device)
// let output = model(deviceBatch)
// print("output.shape: \(output.shape)")

## Optimizer

In [9]:
// Create an Adam optimizer for the model, then replace it with a copy placed on the
// training device.
var optimizer = Adam(for: model, learningRate: learningRate)
optimizer = Adam(copying: optimizer, to: device)

// Scalar summaries are written under <dataURL>/tboard/Motion2lang/<runName>.
let logdirURL = dataURL.appendingPathComponent("tboard/Motion2lang/\(runName)", isDirectory: true)
// flushMillis is 30*1000 — presumably a 30-second flush interval; confirm against SummaryWriter.
let summaryWriter = SummaryWriter(logdir: logdirURL, flushMillis: 30*1000)

## Training helpers

In [17]:
/// Runs one optimization step on `batch` and returns the training loss.
///
/// - Parameters:
///   - model: Transformer being trained; updated in place by the optimizer.
///   - optimizer: Adam optimizer applied to `model`; passed `inout` and used for the update.
///   - batch: Batch to train on.
/// - Returns: The softmax cross-entropy between the model's generated logits and
///   `batch.targetTruth`, as a host `Float`.
///
/// NOTE(review): no padding mask is applied to the loss; if `targetTruth` contains pad
/// tokens they are scored like any other position — confirm this is intended.
func update(model: inout MotionLangTransformer, using optimizer: inout Adam<MotionLangTransformer>, for batch: MotionLangBatch) -> Float {
    // Flatten the targets into one label per position.
    let labels = batch.targetTruth.reshaped(to: [-1])
    // Number of label positions; assumes targetTruth is rank 2 (first * last dimension) — TODO confirm.
    let resultSize = batch.targetTruth.shape.last! * batch.targetTruth.shape.first!
    let result = withLearningPhase(.training) { () -> Float in
        let (loss, grad) = valueWithGradient(at: model) {
            (model) -> Tensor<Float> in
            // One row of logits per label position.
            let logits = model.generate(input: batch).reshaped(to: [resultSize, -1])
            return softmaxCrossEntropy(logits: logits, labels: labels)
        }
        optimizer.update(&model, along: grad)
        // Presumably cuts the lazy-tensor trace here so the step is executed — confirm.
        LazyTensorBarrier()
        return loss.scalarized()
    }
    return result
}

/// Computes the validation loss for `batch` without updating the model.
///
/// - Parameters:
///   - model: Transformer to evaluate. Declared `inout` for call-site compatibility; this
///     function does not assign to it.
///   - batch: Batch to evaluate.
/// - Returns: The softmax cross-entropy between the model's generated logits and
///   `batch.targetTruth`, computed in the inference learning phase, as a host `Float`.
func validate(model: inout MotionLangTransformer, for batch: MotionLangBatch) -> Float {
    // Flatten the targets into one label per position.
    let labels = batch.targetTruth.reshaped(to: [-1])
    // Number of label positions; assumes targetTruth is rank 2 (first * last dimension) — TODO confirm.
    let resultSize = batch.targetTruth.shape.last! * batch.targetTruth.shape.first!
    let result = withLearningPhase(.inference) { () -> Float in
        let logits = model.generate(input: batch).reshaped(to: [resultSize, -1])
        return softmaxCrossEntropy(logits: logits, labels: labels).scalarized()
    }
    LazyTensorBarrier()
    return result
}

## Set up decoding

In [14]:
/// Greedily decodes a token sequence for `input`, appending one token per step.
///
/// The input is encoded once; each step re-runs the decoder on everything produced so
/// far and appends the arg-max token. The loop always runs `maxLength` steps — there is
/// no early stop on an end-of-sequence token.
///
/// - Parameters:
///   - model: Transformer used for encoding, decoding and generating token scores.
///   - input: Batch supplying the motion frames, mask and frame counts to decode from.
///   - maxLength: Number of tokens to generate after the start symbol.
///   - startSymbol: Token id placed at the beginning of the sequence.
/// - Returns: A `[1, maxLength + 1]` tensor holding the start symbol followed by the
///   generated token ids.
func greedyDecode(model: MotionLangTransformer, input: MotionLangBatch, maxLength: Int, startSymbol: Int32) -> Tensor<Int32> {
    let memory = model.encode(input: input)
    // Sequence decoded so far, shape [1, currentLength]; created without an explicit device.
    var tokenIds = Tensor(repeating: startSymbol, shape: [1,1])
    for _ in 0..<maxLength {
        // Mask sized to the current sequence length.
        let stepMask = Tensor<Float>(subsequentMask(size: tokenIds.shape[1]))
        let decoderInput = MotionLangBatch(
            motionFrames: input.motionFrames,
            mask: input.mask,
            origMotionFramesCount: input.origMotionFramesCount,
            targetTokenIds: tokenIds,
            targetMask: stepMask,
            targetTruth: input.targetTruth)
        let decoded = model.decode(input: decoderInput, memory: memory)
        // Score only the last decoded position and take the highest-scoring token.
        let scores = model.generate(input: decoded[0...,-1])
        let nextToken = Int32(scores.argmax().scalarized())
        let nextColumn = Tensor(repeating: nextToken, shape: [1,1])
        tokenIds = Tensor(concatenating: [tokenIds, nextColumn], alongAxis: 1)
    }
    return tokenIds
}

In [12]:
// Pick the first training example and print its id, last timestep, frame-array shape
// and target sentence.
let example = dataset.trainExamples[0]
print("example.id: \(example.id)")
print("example.motionSample.timestepsArray.last: \(example.motionSample.timestepsArray.last!)")
print("example.motionSample.motionFramesArray.shape: \(example.motionSample.motionFramesArray.shape)")
print("example.targetSentence: \(example.targetSentence)")

// Preprocess it and wrap it with reduceDataBatches; `source` is the batch reused for
// every decoding check later in the notebook.
let singleExampleBatch = textProcessor.preprocess(example: example)
var source = Motion2Lang.reduceDataBatches([singleExampleBatch])

example.id: 743
example.motionSample.timestepsArray.last: 6.6
example.motionSample.motionFramesArray.shape: [67, 48]
example.targetSentence: A person walks forward.


In [15]:
// Show the reference sentence by decoding the example's target token ids back to text.
var outputStr = textProcessor.decode(tensor: source.targetTokenIds)
print("decode(source.targetTokenIds): \(outputStr)")

// Greedy-decode the example before training, on the TF eager device: both the batch
// and the model are moved there first.
Context.local.learningPhase = .inference
source = MotionLangBatch(copying: source, to: Device.defaultTFEager)
model.move(to: Device.defaultTFEager)
// NOTE(review): maxLength is hard-coded to 50, the same value as maxSequenceLength — consider reusing it.
let out = greedyDecode(model: model, input: source, maxLength: 50, startSymbol: textProcessor.bosId)
outputStr = textProcessor.decode(tensor: out)
print("greedyDecode(): \"\(outputStr)\"")
// Put the model back on the training device.
model.move(to: device)

decode(source.targetTokenIds): [CLS] a person walks forward .
greedyDecode(): "[CLS] stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław stanisław raged stanisław stanisław stanisław"


## Training loop

In [None]:
// Main training loop: for each of the first nEpochs epochs, train on every batch,
// evaluate on the validation batches, then greedy-decode the sample `source` batch.
print("\nTraining Transformer for the Motion2lang task!")
// Global step counter across all epochs; used as the x-axis of the "TrainingLoss" summary.
var trainingStepCount = 0
time() {
    LazyTensorBarrier()
    for (epoch, epochBatches) in dataset.trainingEpochs.prefix(nEpochs).enumerated() {
        print("[Epoch \(epoch + 1)]")
        Context.local.learningPhase = .training
        var trainingLossSum: Float = 0
        var trainingBatchCount = 0
        if epoch == 0 {
            print("epochBatches.count: \(epochBatches.count)")
        }

        // --- training pass ---
        for eagerBatch in epochBatches {
            // Verbose per-step output only for the first five steps of the whole run.
            if (trainingStepCount < 5) {
                print("==> step \(trainingStepCount)")
            }
            // Copy the batch to the training device before the update step.
            let batch = MotionLangBatch(copying: eagerBatch, to: device)
            let loss: Float = update(model: &model, using: &optimizer, for: batch)
            if (trainingStepCount < 5) {
                print("current loss at step \(trainingStepCount): \(loss)")
            }
            trainingLossSum += loss
            trainingBatchCount += 1
            // Logs the running mean loss of the current epoch, not the raw per-step loss.
            summaryWriter.writeScalarSummary(tag: "TrainingLoss", step: trainingStepCount, value: trainingLossSum / Float(trainingBatchCount))
            trainingStepCount += 1
        }
        // Mean training loss over the epoch's batches.
        print(
            """
            Training loss: \(trainingLossSum / Float(trainingBatchCount))
            """
        )
        summaryWriter.writeScalarSummary(tag: "EpochTrainingLoss", step: epoch+1, value: trainingLossSum / Float(trainingBatchCount))

        // --- validation pass ---
        if epoch == 0 {
            print("dataset.validationBatches.count: \(dataset.validationBatches.count)")
        }
        Context.local.learningPhase = .inference
        var devLossSum: Float = 0
        var devBatchCount = 0
        // Sum of the first dimension of motionFrames over all validation batches.
        var totalGuessCount = 0

        for eagerBatch in dataset.validationBatches {
            let batch = MotionLangBatch(copying: eagerBatch, to: device)
            let loss: Float = validate(model: &model, for: batch)
            let valBatchSize = batch.motionFrames.shape[0]

            devLossSum += loss
            devBatchCount += 1
            totalGuessCount += valBatchSize
        }

        // Mean validation loss over the validation batches.
        print(
            """
            totalGuessCount: \(totalGuessCount) \
            Eval loss: \(devLossSum / Float(devBatchCount))
            """
        )
        // The "EpochTestLoss" tag carries the validation loss computed above.
        summaryWriter.writeScalarSummary(tag: "EpochTestLoss", step: epoch+1, value: devLossSum / Float(devBatchCount))

        // --- sample decode ---
        // The model and the sample batch are moved to the TF eager device for greedy
        // decoding; the model is moved back to the training device afterwards.
        print("\nEncoding/decoding one example") // on eager device
        Context.local.learningPhase = .inference
        source = MotionLangBatch(copying: source, to: Device.defaultTFEager)
        model.move(to: Device.defaultTFEager)
        let out = greedyDecode(model: model, input: source, maxLength: 50, startSymbol: textProcessor.bosId)
        outputStr = textProcessor.decode(tensor: out)
        print("greedyDecode(): \"\(outputStr)\"")
        model.move(to: device)
    }
    // Write out any summaries still buffered once all epochs are done.
    summaryWriter.flush()
}


print("\nFinished training.")


Training Transformer for the Motion2lang task!
[Epoch 1]
epochBatches.count: 199
==> step 0
current loss at step 0: 11.059542
==> step 1
current loss at step 1: 9.656056
==> step 2
current loss at step 2: 8.300174
==> step 3
current loss at step 3: 6.951364
==> step 4
current loss at step 4: 5.714941
Training loss: 1.4272894
dataset.validationBatches.count: 50
totalGuessCount: 5996 Eval loss: 0.91727996

Encoding/decoding one example
greedyDecode(): "[CLS] person person person walks [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"
[Epoch 2]


## Decoding

In [None]:
// encode/decode one example
// print("\nEncoding/decoding one example")
// Context.local.learningPhase = .inference
// source = MotionLangBatch(copying: source, to: device)
// model.move(to: Device.defaultTFEager)
// let out = greedyDecode(model: model, input: source, maxLength: 50, startSymbol: textProcessor.bosId)
// outputStr = textProcessor.decode(tensor: out)
// print("greedyDecode(), outputStr: \(outputStr)")