# Masks in decoder - analysis

## visualize masks

In [None]:
// for local development
%install-location /notebooks/language2motion.gt/swift-install
%install-swiftpm-flags -c release
%install '.package(path: "/notebooks/language2motion.gt")' Datasets TranslationModels TextModels ModelSupport SummaryWriter LangMotionModels Checkpoints

In [None]:
import TensorFlow
import TextModels
import TranslationModels
import Foundation
import FoundationXML
import ModelSupport
import Datasets
import SummaryWriter
import LangMotionModels
import Checkpoints
import PythonKit

In [None]:
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

In [None]:
let plt = Python.import("matplotlib.pyplot")
let np = Python.import("numpy")

## Set training params

In [None]:
let device = Device.defaultTFEager

In [None]:
let maxTextSequenceLength =  20
let maxMotionLength =  100

In [None]:
let datasetSize: DatasetSize = .midi
let batchSize = 2

In [None]:
// Root directory holding the vocabulary, the motion dataset and the run folders.
let dataURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/")
// Dataset file name is selected by `datasetSize`.
// NOTE(review): `rawValue` is interpolated directly before "plist" with no "." in
// between, so the raw value presumably carries its own trailing dot — confirm
// against the DatasetSize definition.
let motionDatasetURL = dataURL.appendingPathComponent("motion_dataset_v3.10Hz.\(datasetSize.rawValue)plist")

In [None]:
/// Instantiate the text processor.
/// The vocabulary is read from `vocab.txt` under `dataURL`; `try!` traps the cell
/// if loading throws.
let vocabularyURL = dataURL.appendingPathComponent("vocab.txt")
let vocabulary: Vocabulary = try! Vocabulary(fromFile: vocabularyURL)
let tokenizer: Tokenizer = BERTTokenizer(vocabulary: vocabulary, caseSensitive: false, unknownToken: "[UNK]", maxTokenLength: nil)
let textProcessor = TextProcessor(vocabulary: vocabulary, tokenizer: tokenizer)

/// Instantiate the model.
/// NOTE(review): these hyper-parameters presumably have to match the ones the
/// checkpoint loaded below was trained with — confirm against the training run.
let config = LangMotionTransformerConfig(
    vocabSize: vocabulary.count,
    nbJoints: 47, // TODO: get value from dataset
    nbMixtures: 20,
    layerCount: 6,
    modelSize: 256,
    feedForwardSize: 1024,
    headCount: 8,
    dropoutProbability:  0.1,
    sentenceMaxPositionalLength: 100,
    motionMaxPositionalLength: 500,
    doMotionDense: false    
)

// Which training run and which epoch's checkpoint to analyse.
let runName = "run_37"
let epoch = 24

// Run folder layout: <data>/runs/Lang2motion/<runName>/{checkpoints,generated_motions}.
let runURL = dataURL.appendingPathComponent("runs/Lang2motion/\(runName)", isDirectory: true)
let checkpointURL = runURL.appendingPathComponent("checkpoints", isDirectory: true)
let motionsURL = runURL.appendingPathComponent("generated_motions", isDirectory: true)
// Make sure the output folder for generated motions exists; `try!` traps on failure.
try! FileManager().createDirectory(at: motionsURL, withIntermediateDirectories: true)

// Load the model from the checkpoint named "model.e<epoch>" in the checkpoints folder.
let model = LangMotionTransformer(checkpoint: checkpointURL, config: config, name: "model.e\(epoch)")

In [None]:
// Load the motion dataset. The trailing closure turns every motion sample into a
// single-example LangMotionBatch.
print("\nLoading dataset...")

var dataset = try Lang2Motion(
    motionDatasetURL: motionDatasetURL,
    batchSize: batchSize,
    trainTestSplit: 1.0,
    demultiplyMotions: false,
    device: device
) { (sample: MotionSample) -> LangMotionBatch in
    // Only the first annotation of a sample is used as its sentence.
    let processedSentence = textProcessor.preprocess(
        sentence: sample.annotations[0],
        maxTextSequenceLength: maxTextSequenceLength
    )
    let (decoderMotionPart, motionTarget) = LangMotionBatch.preprocessTargetMotion(
        sampleID: sample.sampleID,
        motion: sample.motion,
        maxMotionLength: maxMotionLength
    )
    let batchSource = LangMotionBatch.Source(sentence: processedSentence, motionPart: decoderMotionPart)
    return LangMotionBatch(source: batchSource, target: motionTarget)
}

print("Dataset acquired.")

# Helpers

In [None]:
extension LangMotionBatch {
    /// Notebook-local variant of the target-motion preprocessing step.
    ///
    /// A zero frame is placed in front of `motion`, the result is padded/cropped to
    /// `maxMotionLength + 1` frames and given a leading batch axis. The decoder input
    /// is every frame but the last; the target is every frame but the first.
    ///
    /// - Parameters:
    ///   - sampleID: Identifier stored in the returned target.
    ///   - motion: Motion tensor; axis 0 is read as the frame count and axis 1 as the joint count.
    ///   - maxMotionLength: Number of frames in both the decoder input and the target.
    /// - Returns: The decoder-side motion part and the matching target.
    public static func preprocessTargetMotion2(sampleID: Int, motion: Tensor<Float>, maxMotionLength: Int) -> (motionPart: MotionPart, target: Target) {
        let jointCount = motion.shape[1]
        let framesBeforePadding: Tensor<Int32> = Tensor<Int32>([Int32(motion.shape[0])])

        // Prepend one zero frame so the first decoder input frame is the zero frame.
        let leadingZeroFrame = Self.zeroMotionFrame(nbJoints: jointCount)
        let motionWithLeadingFrame = Tensor(concatenating: [leadingZeroFrame, motion], alongAxis: 0)

        // One extra frame is kept so that input and target can be shifted by one frame.
        let (fixedLengthMotion, fixedLengthFlag) = motionWithLeadingFrame.paddedAndCropped(to: maxMotionLength+1)
        // FIXME: move adding batch dimension further down
        let batchedMotion = fixedLengthMotion.expandingShape(at: 0)
        let batchedFlag = fixedLengthFlag.expandingShape(at: 0)

        // Source side (motion part & motion flag): drop the last frame.
        let allButLastFrame = 0..<(batchedMotion.shape[1] - 1)
        let inputMotion = batchedMotion[0..., allButLastFrame, 0...]
        let inputFlag = batchedFlag[0..., allButLastFrame]

        // Previous motion: the input shifted one frame to the right, zero frame first.
        let allButLastInputFrame = 0..<inputMotion.shape[1]-1
        let shiftedInput = Tensor(concatenating: [leadingZeroFrame, inputMotion[0, allButLastInputFrame, 0...]], alongAxis: 0)
        let previousInputMotion = shiftedInput.expandingShape(at: 0)

        let inputMask = makeStandardMask(target: inputFlag, pad: 0) // FIXME: fix target mask

        // Start flag: 1.0 on the first frame, 0.0 on all others.
        var startFlag = Tensor<Float>(zeros: [inputMotion.shape[1], 1]).expandingShape(at: 0)
        startFlag[0, 0, 0] = Tensor(1.0)

        let motionPart = MotionPart(
            motion: inputMotion,
            mask: inputMask,
            previousMotion: previousInputMotion,
            startFlag: startFlag,
            motionFlag: inputFlag
        )

        // Target side (motion & stops): drop the first frame.
        let expectedMotion: Tensor<Float> = batchedMotion[0..., 1..., 0...]
        let expectedFlag = batchedFlag[0..., 1...]
        // The stop signal is the complement of the motion flag.
        let expectedStops: Tensor<Float> = 1.0 - Tensor<Float>(expectedFlag)

        let target = Target(
            sampleID: Tensor([Int32(sampleID)]),
            motion: expectedMotion,
            stops: expectedStops,
            origMotionFramesCount: framesBeforePadding
        )
        return (motionPart: motionPart, target: target)
    }
}

In [None]:
/// Shows `tensor` as a 5x5 matplotlib image using the "Spectral" colour map.
///
/// - Parameters:
///   - tensor: Values to display; converted to a NumPy array for `imshow`.
///   - cmapRange: Colour scale limit; values are mapped over `-cmapRange...cmapRange`.
func tensorShow(_ tensor: Tensor<Float>, cmapRange: Int = 6) {
    let lowerLimit = -cmapRange
    let upperLimit = cmapRange

    plt.figure(figsize: [5, 5])
    let pixels = tensor.makeNumpyArray()
    plt.imshow(pixels, aspect: "auto", cmap: "Spectral", vmin: lowerLimit, vmax: upperLimit)
    plt.show()
}

In [None]:
extension LangMotionTransformer {
    /// Builds the decoder input for `motionPart` so it can be inspected outside the model.
    ///
    /// With `doMotionDense` the motion frames go through `motionDense` and the motion
    /// positional encoding. Otherwise each frame's feature vector is the concatenation of
    /// start flag, positional encoding, current motion, previous motion, a context vector
    /// derived from `memory`, and zero padding up to `modelSize`.
    ///
    /// - Parameters:
    ///   - sourceMask: Source attention mask; passed through and also used to mask `memory`.
    ///   - motionPart: Decoder-side motion input (motion, previous motion, start flag, mask).
    ///   - memory: Encoder output.
    /// - Returns: A `DecoderInput` with the assembled sequence, both masks and `memory`.
    public func getDecoderInput(sourceMask: Tensor<Float>, motionPart: LangMotionBatch.MotionPart, memory: Tensor<Float>) -> DecoderInput<Float> {
        let motionShape = motionPart.motion.shape
        let (sampleCount, frameCount) = (motionShape[0], motionShape[1])

        let decoderSequence: Tensor<Float>
        if doMotionDense {
            // TODO: kill motionDense layer eventually
            // Fold the frame axis into the batch axis so motionDense sees one frame per row.
            let flattenedRowCount = sampleCount * frameCount
            let flattenedFrames = motionPart.motion.reshaped(to: [flattenedRowCount, nbJoints])

            // FIXME: make targetEmbed() work
            let flattenedFeatures = motionDense(flattenedFrames)
            let framewiseFeatures = flattenedFeatures.reshaped(to: [sampleCount, frameCount, self.modelSize])
            decoderSequence = motionPositionalEncoding(framewiseFeatures)
        } else {
            // TODO: refactor this out
            // assuming modelSize = 256

            // Motion positional encoding, applied to an all-zero tensor.
            let zeroEncodingInput = Tensor<Float>(repeating: 0.0, shape: [sampleCount, frameCount, motionPositionalEncodingSize])
            let positionalFeatures = motionPositionalEncoding(zeroEncodingInput)

            // Context vector: masked encoder memory averaged over the token axis.
            // NOTE: the mean runs over every token position, masked ones included.
            let tokenCount = memory.shape[1]
            let tokenMask = sourceMask[0..., 0, 0...]
                .expandingShape(at: 2)
                .broadcasted(to: [sampleCount, tokenCount, modelSize])
            let pooledMemory = (memory * tokenMask).mean(alongAxes: 1).squeezingShape(at: 1)
            let contextFeatures = contextDense(pooledMemory)
                .expandingShape(at: 1)
                .broadcasted(to: [sampleCount, frameCount, contextSize])

            // Zero padding fills whatever width is left up to modelSize.
            let occupiedWidth = 1+motionPositionalEncodingSize+nbJoints*2+contextSize
            let zeroFill = Tensor<Float>(repeating: 0.0, shape: [sampleCount, frameCount, modelSize - occupiedWidth])

            // FIXME: preserve following?
            // tile motion along joints dimension
            // let multiplyBy = modelSize/nbJoints+1
            // let tmpMotionPartFeatures = motionPart.motion.tiled(multiples: [1, 1, multiplyBy])[0..., 0..., 0..<modelSize]
            // motionPartFeatures = motionPositionalEncoding(tmpMotionPartFeatures)
            decoderSequence = Tensor<Float>(
                concatenating: [
                    motionPart.startFlag,
                    positionalFeatures,
                    motionPart.motion,
                    motionPart.previousMotion,
                    contextFeatures,
                    zeroFill
                ],
                alongAxis: 2
            )
        }

        return DecoderInput(sequence: decoderSequence, sourceMask: sourceMask, targetMask: motionPart.mask, memory: memory)
    }
}

## TODO

In [None]:
// TODO: visualize data:
// - mask(s)
// - signals that go through the decoder

## Single batch

In [None]:
let motionSample = dataset.motionSamples[0]
print("sampleID: \(motionSample.sampleID)")
print(motionSample.description)
print(motionSample.annotations)

In [None]:
let s = "A person plays the guitar, dances and kicks, then kneels down."

In [None]:
// Build a single-example batch from `motionSample` using the notebook-local
// `preprocessTargetMotion2`.
// Alternative kept for experiments: feed the hand-written sentence `s` instead of
// the sample's first annotation.
// let sentence = textProcessor.preprocess(sentence: s, maxTextSequenceLength: maxTextSequenceLength)
let sentence = textProcessor.preprocess(sentence: motionSample.annotations[0], maxTextSequenceLength: maxTextSequenceLength)
let (motionPart, target) = LangMotionBatch.preprocessTargetMotion2(sampleID: motionSample.sampleID, motion: motionSample.motion, maxMotionLength: maxMotionLength)
let source = LangMotionBatch.Source(sentence: sentence, motionPart: motionPart)
let singleBatch = LangMotionBatch(source: source, target: target)

In [None]:
let trInput = singleBatch.source

# encoder

In [None]:
trInput.sentence.printSentence()

In [None]:
let encodedMemory = model.encode(input: trInput.sentence)
encodedMemory.shape

# decode

In [None]:
let decoded = model.decode(sourceMask: trInput.sourceAttentionMask, motionPart: trInput.motionPart, memory: encodedMemory)
print(decoded.allOutputs.count)
decoded.lastLayerOutput.shape

# decoder deep dive

In [None]:
extension TransformerDecoderLayer {
    /// Debugging variant of the decoder layer's forward pass that prints the tensor
    /// shapes entering the source-attention sublayer.
    ///
    /// Runs the three sublayers in order: self-attention (masked with
    /// `input.targetMask`), source attention over `input.memory` (masked with
    /// `input.sourceMask`), then the feed-forward block. Each one is invoked through
    /// `sublayers[i].decoderForward`.
    ///
    /// - Parameter input: Decoder input carrying the sequence, both masks and the memory.
    /// - Returns: The output of the third (feed-forward) sublayer.
    @differentiable
    public func callAsFunction2(_ input: DecoderInput<Float>) -> Tensor<Float> {
        // SR-11882
        // We have to pass the input as a param in the Sublayer input because we still need to differentiate
        // targetMask, memory, and sourceMask.
        // `self` and the batch size are captured by the closures below as non-differentiated values.
        let selfNoDerivative = withoutDerivative(at: self)
        let batchSize = withoutDerivative(at: input.batchSize)
        
        var output = input.sequence
        
        
        // Sublayer 0: self-attention; the running sequence is both source and target.
        output = self.sublayers[0].decoderForward(.init(sequence: output, decoderContext: input, activation: {
            selfNoDerivative.selfAttention(.init(source: $0,
                                                 target: $0,
                                                 mask: $1.targetMask,
                                                 batchSize: batchSize))
        }))
        // Sublayer 1: source attention from the running sequence ($0) to the memory ($1.memory).
        // The prints label $0 as "source" and the memory as "target", following the
        // attention input's argument names.
        output = self.sublayers[1].decoderForward(.init(sequence: output, decoderContext: input, activation: {
            print("\nsource attention")
            print("  source.shape: \($0.shape)")
            print("  target.shape: \($1.memory.shape)")
            print("  mask.shape: \($1.sourceMask.shape)")
            return selfNoDerivative.sourceAttention(.init(source: $0,
                                                   target: $1.memory,
                                                   mask: $1.sourceMask,
                                                   batchSize: batchSize))
        }))
        // Sublayer 2: feed-forward; the decoder context is not needed here.
        output = self.sublayers[2].decoderForward(.init(sequence: output, decoderContext: input, activation: {(result, _) in
            selfNoDerivative.feedForward(result)
        }))
        return output
    }
}

In [None]:
extension Decoder {
    /// Debugging variant of the decoder's forward pass.
    ///
    /// Feeds the sequence through every layer's `callAsFunction2`, printing the layer
    /// index and the shapes of the sequence, both masks and the memory before each layer.
    ///
    /// - Parameter input: Decoder input; its masks and memory are reused for every layer.
    /// - Returns: The last layer's output together with the outputs of all layers.
    @differentiable
    public func callAsFunction2(_ input: DecoderInput<Float>) -> DecoderOutput<Float> {
        let encoderMemory = input.memory
        var currentSequence = input.sequence
        var perLayerOutputs: [Tensor<Float>] = []

        // Index-based loop: the layer count is read outside of differentiation.
        for layerIndex in 0..<(withoutDerivative(at: layers) { $0.count }) {
            print("\(layerIndex)")
            print("sequence:\(currentSequence.shape), sourceMask:\(input.sourceMask.shape), targetMask:\(input.targetMask.shape), memory:\(encoderMemory.shape)")

            // Only the sequence changes from layer to layer.
            let layerInput = DecoderInput(
                sequence: currentSequence,
                sourceMask: input.sourceMask,
                targetMask: input.targetMask,
                memory: encoderMemory
            )
            currentSequence = layers[layerIndex].callAsFunction2(layerInput)
            perLayerOutputs.append(currentSequence)
        }

        return DecoderOutput<Float>(lastLayerOutput: currentSequence, allOutputs: perLayerOutputs)
    }
}

In [None]:
trInput.sentence.mask

In [None]:
let decoderInput = model.getDecoderInput(sourceMask: trInput.sourceAttentionMask, motionPart: trInput.motionPart, memory: encodedMemory)

In [None]:
let decoderOuptut = model.decoder.callAsFunction2(decoderInput)

# Create a source attention mask with shape [bs x maxMotionLength x maxTextSequenceLength]

In [None]:
sentence

In [None]:
motionPart.printMotionPart()

In [None]:
target.printTarget()

In [None]:
/// Creates a 2-D source attention mask with ones in the top-left
/// `targetSequenceLength x sourceSequenceLength` corner and zeros elsewhere.
///
/// Lengths are clamped into `0...max`, so a sequence longer than its maximum is
/// treated as cropped to that maximum instead of failing on a shape mismatch.
///
/// - Parameters:
///   - sourceSequenceLength: Number of valid source positions (columns set to one).
///   - targetSequenceLength: Number of valid target positions (rows set to one).
///   - maxSourceSequenceLength: Total number of columns in the mask.
///   - maxTargetSequenceLength: Total number of rows in the mask.
/// - Returns: A `[maxTargetSequenceLength, maxSourceSequenceLength]` tensor of zeros and ones.
func createSourceAttentionMask(sourceSequenceLength: Int, targetSequenceLength: Int, maxSourceSequenceLength: Int, maxTargetSequenceLength: Int) -> Tensor<Float> {
    precondition(
        maxSourceSequenceLength >= 0 && maxTargetSequenceLength >= 0,
        "Maximum sequence lengths must be non-negative"
    )

    // Clamp the valid lengths so the block of ones always fits inside the mask.
    let validSourceLength = min(max(sourceSequenceLength, 0), maxSourceSequenceLength)
    let validTargetLength = min(max(targetSequenceLength, 0), maxTargetSequenceLength)

    var mask = Tensor<Float>(zeros: [maxTargetSequenceLength, maxSourceSequenceLength])

    // Nothing to mark when either side has no valid positions.
    guard validSourceLength > 0, validTargetLength > 0 else {
        return mask
    }

    mask[0..<validTargetLength, 0..<validSourceLength] = Tensor<Float>(ones: [validTargetLength, validSourceLength])
    return mask
}

In [None]:
let mask = createSourceAttentionMask(sourceSequenceLength: 9, targetSequenceLength: 55, maxSourceSequenceLength: 20, maxTargetSequenceLength: 100)

In [None]:
mask

In [None]:
tensorShow(mask)

In [None]:
// Where to get values from:
// sourceSequenceLength
// targetSequenceLength

## Create the source attention mask from two 1-D flags (sentence mask and motion flag)

In [None]:
// Sentence mask of the single-example batch with its size-1 axis 1 removed.
let sentenceMask = singleBatch.source.sentence.mask.squeezingShape(at: 1)
// Bare expression: the notebook displays the value.
sentenceMask

In [None]:
// Motion flag of the decoder input converted to Float, so it can be combined
// arithmetically with other Float tensors.
let motionFlag = Tensor<Float>(singleBatch.source.motionPart.motionFlag)
// Bare expression: the notebook displays the shape.
motionFlag.shape

In [None]:
// Combine the two flags by broadcasting: every motion frame (row) is multiplied
// with every sentence position (column).
// NOTE(review): assumes sentenceMask is [1, maxTextSequenceLength] and motionFlag is
// [1, maxMotionLength], giving a [maxMotionLength, maxTextSequenceLength] mask — confirm.
let sourceAttentionMask = sentenceMask * motionFlag.transposed()
// Bare expression: the notebook displays the value.
sourceAttentionMask

In [None]:
tensorShow(sourceAttentionMask)

### check mask for batch

In [None]:
let motionSample2 = dataset.motionSamples[1]
print("sampleID: \(motionSample2.sampleID)")
print(motionSample2.description)
print(motionSample2.annotations)

In [None]:
// Build a second single-example batch, from `motionSample2`, with the same steps
// as for the first sample (first annotation + notebook-local preprocessTargetMotion2).
let sentence2 = textProcessor.preprocess(sentence: motionSample2.annotations[0], maxTextSequenceLength: maxTextSequenceLength)
let (motionPart2, target2) = LangMotionBatch.preprocessTargetMotion2(sampleID: motionSample2.sampleID, motion: motionSample2.motion, maxMotionLength: maxMotionLength)
let source2 = LangMotionBatch.Source(sentence: sentence2, motionPart: motionPart2)
let singleBatch2 = LangMotionBatch(source: source2, target: target2)

In [None]:
let batch = LangMotionBatch.reduceDataBatches([singleBatch, singleBatch2])

In [None]:
let sourceAttentionMask = batch.data.sourceAttentionMask
sourceAttentionMask.shape

In [None]:
tensorShow(sourceAttentionMask[0])

In [None]:
tensorShow(sourceAttentionMask[1])