# Train Transformer for the Lang2motion task

In [1]:
// for colab
// Notebook install directives: build the listed language2motion products in release mode
// (`-c release`) from the repository's master branch into $cwd/swift-install.
%install-location $cwd/swift-install
%install-swiftpm-flags -c release
%install '.package(url: "https://github.com/wojtekcz/language2motion.git", .branch("master"))' Datasets TranslationModels TextModels ModelSupport SummaryWriter LangMotionModels TrainingLoop

Installing packages:
	.package(url: "https://github.com/wojtekcz/language2motion.git", .branch("master"))
		Datasets
		TranslationModels
		TextModels
		ModelSupport
		SummaryWriter
		LangMotionModels
		TrainingLoop
With SwiftPM flags: ['-c', 'release']
Working in: /tmp/tmpl1fn_jub/swift-install
Fetching https://github.com/wojtekcz/language2motion.git
Fetching https://github.com/apple/swift-protobuf.git
Cloning https://github.com/wojtekcz/language2motion.git
Resolving https://github.com/wojtekcz/language2motion.git at master
Cloning https://github.com/apple/swift-protobuf.git
Resolving https://github.com/apple/swift-protobuf.git at 1.12.0
[1/5] Compiling SummaryWriter SummaryWriter.swift
[2/6] Compiling Batcher Backend.swift
[3/6] Compiling STBImage stb_image_write.c
[4/6] Compiling STBImage stb_image.c
[5/6] Compiling SwiftProtobuf AnyMessageStorage.swift
[6/7] Compiling ModelSupport BijectiveDictionary.swift
[7/9] Compiling TrainingLoop LossFunctions.swift
[8/10] Compiling Checkpoints

## What's the GPU?

In [2]:
import Foundation

/// Runs `command` through `/bin/bash -c` and returns what it wrote to standard output.
///
/// Only standard output is captured; standard error is not redirected by this function.
///
/// - Parameter command: Shell source passed verbatim to bash.
/// - Returns: The captured output decoded as UTF-8. Invalid byte sequences are replaced
///   with U+FFFD instead of crashing the notebook kernel.
func shell(_ command: String) -> String {
    let task = Process()
    let pipe = Pipe()

    task.standardOutput = pipe
    task.arguments = ["-c", command]
    task.launchPath = "/bin/bash"
    task.launch()

    // Blocks until the write end of the pipe is closed.
    let data = pipe.fileHandleForReading.readDataToEndOfFile()
    // Lossy decoding: command output is not guaranteed to be valid UTF-8, and
    // force-unwrapping `String(data:encoding:)` would trap on such output.
    return String(decoding: data, as: UTF8.self)
}

/// Runs `command` via `shell(_:)` and prints the captured standard output.
///
/// - Parameter command: Shell source handed to `shell(_:)` unchanged.
func sh(_ command: String) {
    let output = shell(command)
    print(output)
}

// Print the GPU status table with `nvidia-smi`. PATH and LD_LIBRARY_PATH are extended
// first, presumably so the tool and the NVIDIA libraries under /usr/lib64-nvidia resolve.
sh("""
export PATH="$PATH:/opt/bin:/swift/toolchain/usr/bin"
export LD_LIBRARY_PATH="/usr/lib64-nvidia:$LD_LIBRARY_PATH"
nvidia-smi
""")

Fri Sep  4 15:37:01 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Run Colab SSH server

In [None]:
// Run the Colab SSH server setup script.
// After it finishes, interrupt cell execution.
// NOTE(review): the script is fetched from the repository's master branch and piped
// straight into bash; inspect it or pin a commit if that is a concern.
sh("bash <(curl -s https://raw.githubusercontent.com/wojtekcz/language2motion/master/notebooks/Colab/swift_colab_ssh_server.sh)")

In [None]:
// List processes whose command line mentions "ssh".
sh("ps ax|grep ssh")

In [None]:
// The pid below is hard-coded; replace it with the ssh pid reported by `ps ax|grep ssh`.
sh("kill -9 2760")  // enter ssh pid to kill the tunnel

## Imports

In [4]:
import TensorFlow
import TextModels
import TranslationModels
import Foundation
import FoundationXML
import ModelSupport
import Datasets
import SummaryWriter
import LangMotionModels
import TrainingLoop
import PythonKit
import x10_optimizers_optimizer

In [5]:
import PythonKit

// Pull in the notebook's IPython display helper and switch matplotlib to the "inline" backend.
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

('inline', 'module://ipykernel.pylab.backend_inline')


In [None]:
// sh("ps ax")

In [None]:
// sh("kill 2150")

In [None]:
// sh("ls -la /content/data/runs/Lang2motion/run_19/checkpoints")
// sh("ls -la /content/checkpoints/")

In [None]:
// sh("mv /content/data/runs/Lang2motion/run_17/checkpoints/* /content/data/runs")

## Download data

In [9]:
// Dataset variant to train on.
let datasetSize: DatasetSize = .multi_full
// File stem of the dataset archive. Other cells append "tgz"/"plist" directly after it,
// so `rawValue` presumably ends with a "." (TODO confirm against DatasetSize).
let dataset_name = "motion_dataset_v3.10Hz.\(datasetSize.rawValue)"
// Names this run's directory under runs/Lang2motion/ (logs and checkpoints).
let runName = "run_27"

In [7]:
// Download the dataset archive and vocab.txt into /content/data/ and unpack the archive.
// `wget -N` (timestamping) and `tar --skip-old-files` keep files that are already present,
// so the cell can be re-run.
sh("""
mkdir -p /content/data/
cd /content/data/
wget -nv --show-progress -N https://github.com/wojtekcz/language2motion/releases/download/v0.3.0/\(dataset_name)tgz
wget -nv -N https://github.com/wojtekcz/language2motion/releases/download/v0.1.0/vocab.txt
tar xzvf \(dataset_name)tgz --skip-old-files
""")

2020-09-04 15:38:47 URL:https://github-production-release-asset-2e65be.s3.amazonaws.com/258798747/48622c00-d288-11ea-8d95-40d568bbf42a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200904%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200904T153832Z&X-Amz-Expires=300&X-Amz-Signature=a2476915b1411ec53bfe20a1f77aa5fcc3a33fdf0a978d8e6b48ecffa6040bc8&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=258798747&response-content-disposition=attachment%3B%20filename%3Dmotion_dataset_v3.10Hz.multi.tgz&response-content-type=application%2Foctet-stream [699830419/699830419] -> "motion_dataset_v3.10Hz.multi.tgz" [1]
2020-09-04 15:38:48 URL:https://github-production-release-asset-2e65be.s3.amazonaws.com/258798747/d61a0480-a6a7-11ea-9a3e-8c42fc2775cc?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200904%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200904T153848Z&X-Amz-Expires=300&X-Amz-Signature=2bd1f50cd75f8e356be2d2003037e4715e63df8b95d17a6d

In [17]:
// Rename the unpacked plist to the ".multi.plist" name, presumably so it matches the
// file name that `motionDatasetURL` is built from (TODO confirm).
sh("""
cd /content/data/
mv motion_dataset_v3.10Hz.plist motion_dataset_v3.10Hz.multi.plist
""")




In [10]:
// Create this run's checkpoint directory, download the run_16 epoch-35 model archive
// and unpack it into that directory.
sh("""
cd /content/data/
mkdir -p runs/Lang2motion/\(runName)/checkpoints
wget -nv --show-progress -N https://github.com/wojtekcz/language2motion/releases/download/v0.3.0/run_16.model.e35.tgz
tar xzvf run_16.model.e35.tgz --skip-old-files --no-same-owner -C runs/Lang2motion/\(runName)/checkpoints
""")

model.e35.data-00000-of-00001
model.e35.index



In [None]:
// sh("ln -s /content/data/runs/Lang2motion/\(runName)/checkpoints /content/checkpoints")

## Set training params

In [11]:
// Training hyperparameters; the commented-out lines are alternative values.
// let batchSize = 4
let batchSize = 150
// Upper bounds passed to text and motion preprocessing when the dataset is loaded.
let maxTextSequenceLength =  20
let maxMotionLength =  100
let nEpochs = 30
// let peakLearningRate: Float = 5e-4
// let peakLearningRate: Float = 2e-4
// let peakLearningRate: Float = 2e-5
let peakLearningRate: Float = 5e-5

// Used only to size the learning-rate decay slope (together with nEpochs).
// NOTE(review): the training progress output in this notebook shows 383 batches per
// epoch; confirm the `*2` factor is intended.
let stepsPerEpoch = 383*2 // function of training set size and batching configuration

// Adam coefficients and optional bias correction, used by the optimizer and by
// learningRateUpdater.
let beta1: Float = 0.9
let beta2: Float = 0.999
let useBiasCorrection = false
let weightDecayRate: Float = 0.00

// Echo the configuration so it is recorded in the cell output.
print("runName: \(runName)")
print("batchSize: \(batchSize)")
print("maxTextSequenceLength: \(maxTextSequenceLength)")
print("maxMotionLength: \(maxMotionLength)")
print("nEpochs: \(nEpochs)")
print("peakLearningRate: \(peakLearningRate)")
print("datasetSize: \(datasetSize)")

runName: run_27
batchSize: 150
maxTextSequenceLength: 20
maxMotionLength: 100
nEpochs: 30
peakLearningRate: 5e-05
datasetSize: multi_full


In [12]:
// Root directory holding the dataset, vocabulary and run outputs.
let dataURL = URL(fileURLWithPath: "/content/data/")
// Dataset plist; "plist" is appended directly after `rawValue`, which presumably
// supplies the separating "." (TODO confirm against DatasetSize).
let motionDatasetURL = dataURL.appendingPathComponent("motion_dataset_v3.10Hz.\(datasetSize.rawValue)plist")

// Per-run log directory and its checkpoints subdirectory; created if missing.
let logdirURL = dataURL.appendingPathComponent("runs/Lang2motion/\(runName)", isDirectory: true)
let checkpointURL = logdirURL.appendingPathComponent("checkpoints", isDirectory: true)
try! FileManager().createDirectory(at: checkpointURL, withIntermediateDirectories: true)

## Select eager or X10 backend

In [13]:
// Backend selection: swap which of the two lines is commented out to use X10/XLA
// instead of TF eager.
// let device = Device.defaultXLA
let device = Device.defaultTFEager
print(device)

Device(kind: .CPU, ordinal: 0, backend: .TF_EAGER)


## Instantiate model

In [14]:
/// instantiate text processor
// The vocabulary is read from vocab.txt in the data directory and shared by the
// tokenizer and the text processor.
print("instantiate text processor")
let vocabularyURL = dataURL.appendingPathComponent("vocab.txt")
let vocabulary: Vocabulary = try! Vocabulary(fromFile: vocabularyURL)
let tokenizer: Tokenizer = BERTTokenizer(vocabulary: vocabulary, caseSensitive: false, unknownToken: "[UNK]", maxTokenLength: nil)
let textProcessor = TextProcessor(vocabulary: vocabulary, tokenizer: tokenizer)

/// instantiate model
// Model hyperparameters. The loss arguments defined later reuse nbJoints and nbMixtures
// from this config.
print("instantiate model")
let config = LangMotionTransformerConfig(
    vocabSize: vocabulary.count,
    nbJoints: 47, // TODO: get value from dataset
    nbMixtures: 20,
    layerCount: 6,
    modelSize: 256,
    feedForwardSize: 1024,
    headCount: 8,
    dropoutProbability:  0.1,
    sentenceMaxPositionalLength: 100,
    motionMaxPositionalLength: 500
)

// Epoch to resume from; 0 means training from scratch. It also offsets the epoch
// numbers used for checkpoint names and epoch statistics in the training loop.
var start_epoch = 0

/// create new model
// var model = LangMotionTransformer(config: config)

/// load model checkpoint
// start_epoch must match an existing "model.e<start_epoch>" checkpoint in checkpointURL;
// `try!` aborts the cell if loading fails.
print("checkpointURL: \(checkpointURL.path)")
start_epoch = 35
var model = try! LangMotionTransformer(checkpoint: checkpointURL, config: config, name: "model.e\(start_epoch)")

instantiate text processor
instantiate model
checkpointURL: /content/data/runs/Lang2motion/run_27/checkpoints
Loading model "model.e35" from "/content/data/runs/Lang2motion/run_27/checkpoints"...


## Load dataset

In [None]:
// Evaluate the dataset path so it can be checked before loading.
motionDatasetURL.path

In [18]:
print("\nLoading dataset...")

// Build the Lang2Motion dataset from the plist at `motionDatasetURL`.
// The trailing closure turns one MotionSample into a single-example LangMotionBatch:
// the first annotation is tokenized as the sentence, and the motion is split into the
// model's motion input and the training target.
// The length limit passed to the dataset uses the same `maxMotionLength` constant as the
// preprocessing closure, so the two cannot drift apart when the constant is changed.
var dataset = try Lang2Motion(
    motionDatasetURL: motionDatasetURL,
    batchSize: batchSize,
    minMotionLength: 10,
    maxMotionLength: maxMotionLength,
    trainTestSplit: 1.0,
    demultiplyMotions: false,
    device: device
) { (motionSample: MotionSample) -> LangMotionBatch in
    let sentence = textProcessor.preprocess(sentence: motionSample.annotations[0], maxTextSequenceLength: maxTextSequenceLength)
    let (motionPart, target) = LangMotionBatch.preprocessTargetMotion(sampleID: motionSample.sampleID, motion: motionSample.motion, maxMotionLength: maxMotionLength)
    let source = LangMotionBatch.Source(sentence: sentence, motionPart: motionPart)
    let singleBatch = LangMotionBatch(data: source, label: target)
    return singleBatch
}

print("Dataset acquired.")


Loading dataset...
MotionDataset(motionSamples: 39728)
Keeping 30404 annotated motions.
Keeping 30209 longer motions, with minimum 10 frames.
Keeping 25560 shorter motions, with maximum 100 frames.
Scaling motions...
Motions scaled.
Having 57573 annotations with motions.
Dataset acquired.


## Optimizer

In [19]:
// Optimizer over all model parameters, using weight-decayed Adam with the
// hyperparameters from the training-params cell.
var optimizer = x10_optimizers_optimizer.GeneralOptimizer(
    for: model,
    TensorVisitorPlan(model.differentiableVectorView),
    defaultOptimizer: makeWeightDecayedAdam(
      learningRate: peakLearningRate,
      beta1: beta1,
      beta2: beta2,
      weightDecayRate: weightDecayRate
    )
)

// Learning-rate schedule: warm-up towards peakLearningRate wrapped in a linear decay.
// learningRateUpdater evaluates it once per update.
// NOTE(review): decay starts at step 10 while warm-up lasts 20 steps; confirm the
// overlap is intended.
var scheduledLearningRate = LinearlyDecayedParameter(
  baseParameter: LinearlyWarmedUpParameter(
      baseParameter: FixedParameter<Float>(peakLearningRate),
      warmUpStepCount: 20,
      warmUpOffset: 0),
  slope: -(peakLearningRate / Float(stepsPerEpoch * nEpochs)),  // The LR decays linearly to zero.
  startStep: 10
)

/// Training-loop callback that sets the optimizer's learning rate before each update.
///
/// Acts only on `.updateStart`. It evaluates `scheduledLearningRate` for the upcoming
/// step and, when `useBiasCorrection` is true, multiplies the result by
/// `sqrt(1 - beta2^step) / (1 - beta1^step)`.
///
/// - Parameters:
///   - loop: The running training loop; its optimizer must be a
///     `GeneralOptimizer<LangMotionTransformer>` (force-cast).
///   - event: The event being dispatched.
public func learningRateUpdater<L: TrainingLoopProtocol>(_ loop: inout L, event: TrainingLoopEvent) throws {
    guard event == .updateStart else { return }

    let generalOptimizer = loop.optimizer as! GeneralOptimizer<LangMotionTransformer>
    // for scheduled rates and bias correction, steps start at 1
    let upcomingStep = generalOptimizer.step + 1
    var learningRate = scheduledLearningRate(forStep: UInt64(upcomingStep))
    if useBiasCorrection {
        let stepAsFloat = Float(upcomingStep)
        learningRate *= sqrtf(1 - powf(beta2, stepAsFloat)) / (1 - powf(beta1, stepAsFloat))
    }
    generalOptimizer.learningRate = learningRate
    // print("\noptimizer: step: \(generalOptimizer.step), learningRate: \(generalOptimizer.learningRate)")
}

## Training helpers

In [21]:
// Loss function
// Arguments for normalMixtureSurrogateLoss. Joint and mixture counts come from the model
// config; the mixture regularizer is disabled ("None", weight 0.0).
let args = LossArgs(
        nb_joints: config.nbJoints,
        nb_mixtures: config.nbMixtures,
        mixture_regularizer_type: "None",  // ["cv", "l2", "None"]
        mixture_regularizer: 0.0,
        device: device
)

/// Two-argument loss for the training loop: forwards to `normalMixtureSurrogateLoss`
/// with the notebook-level `args` bound in.
///
/// - Parameters:
///   - y_pred: Mixture-model predictions; the loss is differentiable with respect to these.
///   - y_true: Target motion for the batch.
/// - Returns: The loss value computed by `normalMixtureSurrogateLoss`.
@differentiable(wrt: y_pred)
func embeddedNormalMixtureSurrogateLoss(y_pred: MixtureModelPreds, y_true: LangMotionBatch.Target) -> Tensor<Float> {
    normalMixtureSurrogateLoss(
        y_pred: y_pred,
        y_true: y_true,
        args: args
    )
}

/// Training-loop callback that writes a model checkpoint at the end of every epoch.
///
/// The checkpoint is written to `checkpointURL` under the name "model.e<N>", where N is
/// the loop's zero-based epoch index plus one. Nothing happens for other events or when
/// the loop reports no epoch index.
///
/// - Parameters:
///   - loop: The running training loop; its model must be a `LangMotionTransformer`
///     (force-cast).
///   - event: The event being dispatched.
/// - Throws: Any error raised while writing the checkpoint, so the caller can handle it
///   instead of the process trapping.
public func saveCheckpoint<L: TrainingLoopProtocol>(_ loop: inout L, event: TrainingLoopEvent) throws {
    guard event == .epochEnd, let epochIndex = loop.epochIndex else {
        return
    }
    let transformer: LangMotionTransformer = loop.model as! LangMotionTransformer
    // Propagate write failures: this function is already `throws`.
    try transformer.writeCheckpoint(to: checkpointURL, name: "model.e\(epochIndex+1)")
}

/// Collects training-loss statistics from training-loop events and writes them as scalar
/// summaries under `logdirURL`.
///
/// Per batch it writes "TrainingLoss" against a running step counter. Per epoch it writes
/// "EpochTrainingLoss", the mean of the batch losses recorded since the last `.epochStart`.
public class StatsRecorder {
    let summaryWriter = SummaryWriter(logdir: logdirURL, flushMillis: 30*1000)
    /// Batches recorded since this recorder was created; used as the "TrainingLoss" step.
    public var trainingStepCount = 0
    /// Batches recorded in the current epoch (reset on `.epochStart`).
    public var trainingBatchCount = 0
    /// Sum of batch losses in the current epoch (reset on `.epochStart`).
    public var trainingLossSum: Float = 0.0
    /// Zero-based epoch index, set by the caller before each `fit` call.
    public var epochIndex = 0 // FIXME: Workaround

    /// Training-loop callback; see the class description for what each event records.
    ///
    /// - Parameters:
    ///   - loop: The running training loop; only `lastLoss` is read.
    ///   - event: The event being dispatched.
    public func writeStats<L: TrainingLoopProtocol>(_ loop: inout L, event: TrainingLoopEvent) throws {
        if event == .batchEnd {
            // Skip batches that report no loss, or a loss that is not a single scalar:
            // there is nothing to log in either case. Unwrapping once also avoids the
            // repeated force-unwrap of `scalar`.
            guard
            // let batchIndex = loop.batchIndex,
            let trainingLoss = loop.lastLoss,
            let lossValue = trainingLoss.scalar else {
                return
            }
            // print("\nbatch stats: batchIndex: \(batchIndex), trainingStepCount: \(trainingStepCount), trainingLoss: \(trainingLoss)")
            summaryWriter.writeScalarSummary(tag: "TrainingLoss", step: trainingStepCount, value: lossValue)
            trainingStepCount += 1
            trainingBatchCount += 1
            trainingLossSum += Float(lossValue)
        }
        if event == .epochStart {
            trainingBatchCount = 0
            trainingLossSum = 0.0
        }
        if event == .epochEnd {
            // guard let epochIndex = loop.epochIndex else {
            //     return
            // }
            // An epoch with no recorded batches has no mean; dividing would log NaN.
            guard trainingBatchCount > 0 else {
                return
            }
            let current_epoch = epochIndex + 1
            let epochTrainingLoss = trainingLossSum / Float(trainingBatchCount)
            // print("\nepoch stats: current_epoch: \(current_epoch), epochTrainingLoss: \(epochTrainingLoss)")
            summaryWriter.writeScalarSummary(tag: "EpochTrainingLoss", step: current_epoch, value: epochTrainingLoss)
        }
        if event == .fitEnd {
            summaryWriter.flush()
        }
    }
}

## Training loop

In [None]:
// Optional override for a short run; it redefines the `nEpochs` from the training-params cell.
// NOTE(review): the decay slope of `scheduledLearningRate` is computed from `nEpochs`
// when the optimizer cell runs, so running this cell afterwards changes only the epoch
// loop, not the schedule. Confirm that is intended.
let nEpochs = 1

In [None]:
// Writes batch and epoch loss summaries to the run's log directory.
let statsRecorder = StatsRecorder()

// Training loop
print("\nSetting up the training loop")
let trainingProgress = TrainingProgress(metrics: [.loss])
// Callbacks: progress display, loss summaries, and per-update learning-rate scheduling.
// Checkpoints are written by the explicit epoch loop below, not by a callback.
var trainingLoop = TrainingLoop(
  training: dataset.trainEpochs,
  validation: dataset.testBatches,
  optimizer: optimizer,
  lossFunction: embeddedNormalMixtureSurrogateLoss,
  callbacks: [trainingProgress.update, statsRecorder.writeStats, learningRateUpdater])

print("\nTraining Transformer for the Lang2motion task!")
// FIXME: epoch loop workaround for checkpoint saving
// Each iteration fits exactly one epoch, so the loop's own epoch counter restarts every
// time. The global epoch number is passed to the stats recorder by hand and used to name
// the checkpoint ("model.e<epoch>", continuing from start_epoch).
for epochIndex in start_epoch..<start_epoch+nEpochs {
    print("epoch \(epochIndex+1)/\(start_epoch + nEpochs)")
    statsRecorder.epochIndex = epochIndex
    try! trainingLoop.fit(&model, epochs: 1, on: device)
    try! model.writeCheckpoint(to: checkpointURL, name: "model.e\(epochIndex+1)")
}

// Also save the final weights under a fixed name.
try! model.writeCheckpoint(to: checkpointURL, name: "model.final")
print("\nFinished training.")


Setting up the training loop

Training Transformer for the Lang2motion task!
epoch 36/65
Epoch 1/1
42/383 [===>..........................] - loss: -5.2839