# Masks in decoder - analysis

In [None]:
// for local development
%install-location /notebooks/language2motion.gt/swift-install
%install-swiftpm-flags -c release
%install '.package(path: "/notebooks/language2motion.gt")' Datasets TranslationModels TextModels ModelSupport SummaryWriter LangMotionModels Checkpoints

In [None]:
import TensorFlow
import TextModels
import TranslationModels
import Foundation
import FoundationXML
import ModelSupport
import Datasets
import SummaryWriter
import LangMotionModels
import Checkpoints
import PythonKit

In [None]:
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

In [None]:
let plt = Python.import("matplotlib.pyplot")
let np = Python.import("numpy")

## Set training params

In [None]:
let device = Device.defaultTFEager

In [None]:
let maxTextSequenceLength =  20
let maxMotionLength =  50

In [None]:
let datasetSize: DatasetSize = .full
let batchSize = 2

In [None]:
let dataURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/")
let motionDatasetURL = dataURL.appendingPathComponent("motion_dataset_v3.10Hz.\(datasetSize.rawValue)plist")

In [None]:
// Instantiate the text processor: vocabulary file -> BERT tokenizer -> TextProcessor.
let vocabularyURL = dataURL.appendingPathComponent("vocab.txt")
let vocabulary: Vocabulary = try! Vocabulary(fromFile: vocabularyURL)
let tokenizer: Tokenizer = BERTTokenizer(vocabulary: vocabulary, caseSensitive: false, unknownToken: "[UNK]", maxTokenLength: nil)
let textProcessor = TextProcessor(vocabulary: vocabulary, tokenizer: tokenizer)

// Instantiate the model configuration.
let modelSize = 128
let config = LangMotionTransformerConfig(
    vocabSize: vocabulary.count,
    nbJoints: 47, // TODO: get value from dataset
    nbMixtures: 20,
    layerCount: 6,
    modelSize: modelSize,
    feedForwardSize: 512,
    headCount: 4,
    dropoutProbability:  0.1,
    sentenceMaxPositionalLength: 100,
    motionMaxPositionalLength: 500,
    encoderSelfAttentionTemp: sqrt(Double(modelSize)),
    decoderSourceAttentionTemp: sqrt(Double(modelSize)),
    // NOTE(review): this temperature is modelSize itself, while the two above use sqrt(modelSize).
    // Confirm this matches the settings the checkpoint was trained with.
    decoderSelfAttentionTemp: Double(modelSize)
)

// Which training run and epoch to load the checkpoint from.
let runName = "run_51"
let epoch = 150

// Directory layout of the run: <data>/runs/Lang2motion/<runName>/{checkpoints,generated_motions}.
let runURL = dataURL.appendingPathComponent("runs/Lang2motion/\(runName)", isDirectory: true)
let checkpointURL = runURL.appendingPathComponent("checkpoints", isDirectory: true)
let motionsURL = runURL.appendingPathComponent("generated_motions", isDirectory: true)
// Creates the output directory (and any missing parents); crashes the cell on failure because of `try!`.
try! FileManager().createDirectory(at: motionsURL, withIntermediateDirectories: true)

// Load the model from the saved checkpoint; the commented line builds a fresh model from `config` instead.
// var model = LangMotionTransformer(config: config)
let model = LangMotionTransformer(checkpoint: checkpointURL, config: config, name: "model.e\(epoch)")

In [None]:
/// load dataset
print("\nLoading dataset...")

// Build the dataset. The trailing closure converts each motion sample into a
// single-item LangMotionBatch (first annotation as the sentence, motion as the target).
var dataset = try Lang2Motion(
    motionDatasetURL: motionDatasetURL,
    batchSize: batchSize,
    minMotionLength: 20,
    // Use the shared constant so this length filter and the preprocessing below cannot drift apart.
    maxMotionLength: maxMotionLength,
    trainTestSplit: 1.0,
    device: device
) { (motionSample: MotionSample) -> LangMotionBatch in
    let sentence = textProcessor.preprocess(sentence: motionSample.annotations[0], maxTextSequenceLength: maxTextSequenceLength)
    let (motionPart, target) = LangMotionBatch.preprocessTargetMotion(sampleID: motionSample.sampleID, motion: motionSample.motion, maxMotionLength: maxMotionLength, shiftMaskRight: true)
    let source = LangMotionBatch.Source(sentence: sentence, motionPart: motionPart)
    return LangMotionBatch(source: source, target: target)
}

print("Dataset acquired.")

# Helpers

In [None]:
// func tensorShow(_ tensor: Tensor<Float>, cmapRange: Float = 6.0) {
//     plt.figure(figsize: [5, 5])
//     plt.imshow(tensor.makeNumpyArray(), aspect: "auto", cmap: "Spectral", vmin: -cmapRange, vmax: cmapRange)
//     plt.show()
// }

/// Draws `tensor` as an image in a new matplotlib figure (`figsize: [5, 5]`) and shows it.
///
/// - Parameters:
///   - tensor: Values to display; converted with `makeNumpyArray()`.
///   - cmapRange: Half-width of the color scale (`vmin = -cmapRange`, `vmax = cmapRange`) used
///     together with the "Spectral" colormap and `aspect: "auto"`. Passing exactly `0.0`
///     calls `imshow` with no extra arguments instead.
func tensorShow(_ tensor: Tensor<Float>, cmapRange: Float = 6.0) {
    plt.figure(figsize: [5, 5])
    guard cmapRange != 0.0 else {
        // Sentinel value: plot without a fixed color range.
        plt.imshow(tensor.makeNumpyArray())
        plt.show()
        return
    }
    let lowerBound = -cmapRange
    let upperBound = cmapRange
    plt.imshow(tensor.makeNumpyArray(), aspect: "auto", cmap: "Spectral", vmin: lowerBound, vmax: upperBound)
    plt.show()
}

In [None]:
extension LangMotionTransformer {
    /// Assembles the `DecoderInput` for one decoding pass.
    ///
    /// The per-frame features are concatenated along axis 2 in this order: start flag,
    /// motion positional encoding, motion, and a padding made of repeated motion values.
    /// The result is passed through `motionNorm` before being wrapped in `DecoderInput`.
    ///
    /// - Parameters:
    ///   - sourceMask: Forwarded unchanged as the decoder's source mask.
    ///   - motionPart: Supplies `motion`, `startFlag` and `mask` (used as the target mask).
    ///   - memory: Forwarded unchanged as the decoder's memory.
    /// - Returns: Decoder input carrying the normalized features and the model's decoder
    ///   source/self attention temperatures.
    public func getDecoderInput(sourceMask: Tensor<Float>, motionPart: LangMotionBatch.MotionPart, memory: Tensor<Float>) -> DecoderInput<Float> {
        let motion = motionPart.motion
        let motionShape = motion.shape
        let batchSize = motionShape[0]
        let numFrames = motionShape[1]

        // Positional encoding is applied to an all-zero tensor of the encoding width.
        let zeroEncoding = Tensor<Float>(repeating: 0.0, shape: [batchSize, numFrames, motionPositionalEncodingSize])
        let positionalEncoding = motionPositionalEncoding(zeroEncoding)

        // Width still missing to reach modelSize after the other feature groups.
        // (presumably the `1` accounts for the start flag — TODO confirm)
        let paddingSize = modelSize - (1 + motionPositionalEncodingSize + nbJoints)

        // Tile the motion along the last axis often enough to cover the padding, then cut to size.
        let repeatCount = paddingSize/nbJoints + 1
        let motionFramePadding = motion.tiled(multiples: [1, 1, repeatCount])[0..., 0..., 0..<paddingSize]

        let featureGroups = [motionPart.startFlag, positionalEncoding, motion, motionFramePadding]
        let stackedFeatures = Tensor<Float>(concatenating: featureGroups, alongAxis: 2)
        let normalizedFeatures = motionNorm(stackedFeatures)

        return DecoderInput(sequence: normalizedFeatures, sourceMask: sourceMask, targetMask: motionPart.mask, memory: memory,
                            sourceAttentionTemperature: Float(decoderSourceAttentionTemp), selfAttentionTemperature: Float(decoderSelfAttentionTemp))
    }
}

## TODO

In [None]:
// TODO: visualize:
// - mask(s)
//   * encoder padding mask
//   - decoder source padding mask
//   - decoder self attention mask
// - attention weights
//   * encoder
//   - decoder 1
//   - decoder 2
// - outputs
//   - encoder output
//   - decoder output
// - signals that go through the decoder

## Single batch

In [None]:
let motionSample = dataset.motionSamples[0]
print("sampleID: \(motionSample.sampleID)")
print(motionSample.description)
print(motionSample.annotations)

In [None]:
let sentence = textProcessor.preprocess(sentence: motionSample.annotations[0], maxTextSequenceLength: maxTextSequenceLength)
let (motionPart, target) = LangMotionBatch.preprocessTargetMotion(sampleID: motionSample.sampleID, motion: motionSample.motion, maxMotionLength: maxMotionLength, shiftMaskRight: true)
let source = LangMotionBatch.Source(sentence: sentence, motionPart: motionPart)
let singleBatch = LangMotionBatch(source: source, target: target)

In [None]:
let trInput = singleBatch.source

In [None]:
/// Shows `tensor` as an image using the "Spectral" colormap on the current matplotlib figure.
///
/// - Parameter tensor: Values to display; converted with `makeNumpyArray()`.
func tensorShow2(_ tensor: Tensor<Float>) {
    let pixels = tensor.makeNumpyArray()
    plt.imshow(pixels, cmap: "Spectral")
    plt.show()
}

# encoder

In [None]:
trInput.sentence.printSentence()

In [None]:
let encoded = model.encode(input: trInput.sentence)
// let encoded = model.encode(input: bigSentence)
encoded.lastLayerOutput.shape

In [None]:
tensorShow(encoded.lastLayerOutput[0], cmapRange: 0.0)

## encoder attention probs

In [None]:
encoded.allLayerOutputs.count

In [None]:
encoded.allLayerOutputs[0].attentionOutput!.result.shape

In [None]:
encoded.allLayerOutputs[0].attentionOutput!.attentionScores.shape

In [None]:
encoded.allLayerOutputs[0].attentionOutput!.attentionScores.min()

In [None]:
encoded.allLayerOutputs[0].attentionOutput!.attentionScores[0, 1].shape

In [None]:
// Plot the attention probabilities at index [0, 0] for every encoder layer.
// `forEach` is used because the plots are side effects; `map` would build a discarded array of `()`.
encoded.allLayerOutputs.forEach { tensorShow2($0.attentionOutput!.attentionProbs[0, 0]) }

In [None]:
// Plot the first result of every encoder layer without a fixed color range.
// `forEach` is used because the plots are side effects; `map` would build a discarded array of `()`.
encoded.allLayerOutputs.forEach { tensorShow($0.result[0], cmapRange: 0.0) }

In [None]:
encoded.allLayerOutputs[5].result

# sourceAttentionMask

In [None]:
tensorShow2(trInput.sourceAttentionMask.squeezingShape(at: 0))

## self-attention decoder mask

In [None]:
tensorShow2(trInput.motionPart.mask.squeezingShape(at: 0))

# change self-attention target mask

In [None]:
var m1 = 1 - trInput.motionPart.mask.squeezingShape(at: 0)
tensorShow2(m1)

In [None]:
let t1 = Tensor<Float>(repeating: 0.0, shape: [12, 50])
m1[38..<50, 0...] = t1
tensorShow2(m1)

In [None]:
let t2 = Tensor<Float>(repeating: 0.0, shape: [50, 12])
m1[0..., 38..<50] = t2

In [None]:
tensorShow2(m1)

In [None]:
m1 = m1.expandingShape(at: 0)

In [None]:
var m2 = trInput.motionPart.mask.squeezingShape(at: 0)
tensorShow2(m2)

In [None]:
trInput.motionPart.motionFlag.sum()

In [None]:
m2[0..<38, 0..<38+1] -= 1
tensorShow2(m2)

In [None]:
m2 = abs(m2)
tensorShow2(m2)

In [None]:
m2 = m2.expandingShape(at: 0)

In [None]:
m2

# decoding

In [None]:
// let mp = trInput.motionPart
// let mp1 = LangMotionBatch.MotionPart(motion: mp.motion, mask: m1, previousMotion: mp.previousMotion, startFlag: mp.startFlag, motionFlag: mp.motionFlag)

In [None]:
let decoded = model.decode(sourceMask: trInput.sourceAttentionMask, motionPart: trInput.motionPart, memory: encoded.lastLayerOutput)
// let decoded = model.decode(sourceMask: trInput.sourceAttentionMask, motionPart: mp1, memory: encoded.lastLayerOutput)
print(decoded.allLayerOutputs.count)
decoded.lastLayerOutput.shape

## decoder source attention

In [None]:
// Plot the source-attention probabilities at index [0, 0] for every decoder layer.
// `forEach` is used because the plots are side effects; `map` would build a discarded array of `()`.
decoded.allLayerOutputs.forEach { tensorShow2($0.sourceAttentionOutput!.attentionProbs[0, 0]) }

## one source attention

In [None]:
let oneSourceScores = decoded.allLayerOutputs[0].sourceAttentionOutput!.attentionScores[0, 0]
oneSourceScores.shape

In [None]:
tensorShow2(oneSourceScores)

In [None]:
oneSourceScores.min()

In [None]:
// Replace every negative score with zero; build the zeros directly instead of multiplying ones by 0.
let oneSourceScores2 = oneSourceScores.replacing(with: Tensor(zerosLike: oneSourceScores), where: oneSourceScores .< Tensor<Float>([0.0]))

In [None]:
tensorShow2(oneSourceScores2)

In [None]:
oneSourceScores[10]

In [None]:
softmax(oneSourceScores[10])*100

In [None]:
plt.plot(softmax(oneSourceScores[10]).makeNumpyArray())
plt.show()

In [None]:
oneSourceScores2[10]

In [None]:
(oneSourceScores2[10] * 5).sum()

# scale attention scores

In [None]:
softmax(oneSourceScores2[10] * 128)*100

In [None]:
plt.plot(softmax(oneSourceScores2[10]).makeNumpyArray())
plt.show()

## multiply scores by factor

In [None]:
softmax(oneSourceScores[10] * sqrt(128))*100

In [None]:
plt.plot(softmax(oneSourceScores[10] * sqrt(128)).makeNumpyArray())
plt.show()

In [None]:
let factor: Float = 1.0
// let factor: Float = sqrt(128)
// let factor: Float = 128
// let factor: Float = 1000
tensorShow2(softmax(oneSourceScores * factor, alongAxis: 1))

## decoder self-attention

In [None]:
// Plot the self-attention (target) probabilities at index [0, 0] for every decoder layer.
// `forEach` is used because the plots are side effects; `map` would build a discarded array of `()`.
decoded.allLayerOutputs.forEach { tensorShow2($0.targetAttentionOutput!.attentionProbs[0, 0]) }

## self-attention activations mins, maxs

In [None]:
tensorShow2(trInput.motionPart.mask.squeezingShape(at: 0))

In [None]:
let oneMaskProbs = decoded.allLayerOutputs[0].targetAttentionOutput!.attentionProbs[0, 0]
oneMaskProbs.shape

In [None]:
tensorShow2(oneMaskProbs)

In [None]:
oneMaskProbs.min(alongAxes: 1)*100

In [None]:
oneMaskProbs.max(alongAxes: 1)*100

In [None]:
plt.plot(oneMaskProbs[20, 0...20].makeNumpyArray())
plt.show()

In [None]:
plt.plot(oneMaskProbs[20].makeNumpyArray())
plt.show()

# oneMaskScores

In [None]:
let oneMaskScores = decoded.allLayerOutputs[0].targetAttentionOutput!.attentionScores[0, 0]
oneMaskScores.shape

In [None]:
tensorShow2(oneMaskScores)

In [None]:
oneMaskScores.min(alongAxes: 1)

In [None]:
sqrt(128)

In [None]:
oneMaskScores.max(alongAxes: 1)

In [None]:
let oneMaskScores2 = oneMaskScores.replacing(with: Tensor(zerosLike: oneMaskScores), where: oneMaskScores .< Tensor<Float>([0.0]))

In [None]:
tensorShow2(oneMaskScores2)

## multiply scores by factor

In [None]:
let factor: Float = sqrt(128)
// let factor: Float = 128
// let factor: Float = 1
tensorShow2(softmax(oneMaskScores * factor, alongAxis: 1))

## loss

In [None]:
let mixtureModelInput = Tensor<Float>(concatenating: decoded.allResults, alongAxis: 2)
let transformerOutput = LangMotionTransformerOutput(preds: model.mixtureModel(mixtureModelInput), encoded: encoded, decoded: decoded)

In [None]:
// Loss function
let args = LossArgs(
        nb_joints: config.nbJoints,
        nb_mixtures: config.nbMixtures,
        mixture_regularizer_type: "None",  // ["cv", "l2", "None"]
        mixture_regularizer: 0.0,
        device: device
)

/// Variant of the mixture surrogate loss that returns both the mean and the unreduced loss.
///
/// Predictions and targets are squeezed, then only the positions along axis 1 whose
/// `y_true.stops` value differs from 1 are kept before `_normalMixtureSurrogateLoss` is evaluated.
///
/// - Parameters:
///   - y_pred: Mixture model predictions.
///   - y_true: Target values; `stops` selects which positions contribute to the loss.
///   - args: Loss settings; `args.device` is used for the helper tensors created here.
/// - Returns: `(mean of the loss, loss as returned by _normalMixtureSurrogateLoss)`.
@differentiable(wrt: y_pred)
public func normalMixtureSurrogateLoss2(y_pred: MixtureModelPreds, y_true: LangMotionBatch.Target, args: LossArgs) -> (Tensor<Float>, Tensor<Float>) {
    let squeezedPreds = y_pred.squeezed()
    let squeezedTarget = y_true.squeezed()

    // Indices 0..<length along axis 1, filtered down to the positions not flagged as stop.
    let positionCount = Int32(squeezedTarget.stops.shape[1])
    let allPositions = Tensor<Int32>(rangeFrom: 0, to: positionCount, stride: 1, on: args.device)
    let keptPositions = allPositions.gathering(where: squeezedTarget.stops .!= Tensor(1, on: args.device))

    let maskedPreds = squeezedPreds.gathering(atIndices: keptPositions, alongAxis: 1)
    let maskedTarget = squeezedTarget.gathering(atIndices: keptPositions, alongAxis: 1)

    let loss = _normalMixtureSurrogateLoss(y_true: maskedTarget, y_pred: maskedPreds, args: args)
    return (loss.mean(), loss)
}

In [None]:
let (avg_loss, loss) = normalMixtureSurrogateLoss2(y_pred: transformerOutput.preds, y_true: singleBatch.target, args: args)

In [None]:
print(avg_loss)
loss

In [None]:
plt.plot(loss.scalars)
plt.show()

# decoder deep dive

In [None]:
extension TransformerDecoderLayer {
    /// Debug variant of the decoder layer's forward pass that prints the shapes fed to source attention.
    ///
    /// Runs the three sublayers in order: self-attention over the sequence (masked with
    /// `targetMask`), source attention against `memory` (masked with `sourceMask`), and feed-forward.
    ///
    /// - Parameter input: Decoder input; `sequence`, `targetMask`, `memory`, `sourceMask` and
    ///   `batchSize` are read here.
    /// - Returns: The output of the third (feed-forward) sublayer.
    @differentiable
    public func callAsFunction2(_ input: DecoderInput<Float>) -> Tensor<Float> {
        // SR-11882
        // we have to pass the input as a param in the Sublayer input because we still need to differentiate
        // targetMask, memory, and sourceMask
        let selfNoDerivative = withoutDerivative(at: self)
        let batchSize = withoutDerivative(at: input.batchSize)
        
        var output = input.sequence
        
        
        // Sublayer 0: self-attention; the running sequence is both source and target.
        output = self.sublayers[0].decoderForward(.init(sequence: output, decoderContext: input, activation: {
            selfNoDerivative.selfAttention(.init(source: $0,
                                                 target: $0,
                                                 mask: $1.targetMask,
                                                 batchSize: batchSize))
        }))
        // Sublayer 1: source attention from the sequence to the memory; shapes are printed for inspection.
        output = self.sublayers[1].decoderForward(.init(sequence: output, decoderContext: input, activation: {
            print("\nsource attention")
            print("  source.shape: \($0.shape)")
            print("  target.shape: \($1.memory.shape)")
            print("  mask.shape: \($1.sourceMask.shape)")
            return selfNoDerivative.sourceAttention(.init(source: $0,
                                                   target: $1.memory,
                                                   mask: $1.sourceMask,
                                                   batchSize: batchSize))
        }))
        // Sublayer 2: feed-forward; the decoder context is not needed.
        output = self.sublayers[2].decoderForward(.init(sequence: output, decoderContext: input, activation: {(result, _) in
            selfNoDerivative.feedForward(result)
        }))
        return output
    }
}

In [None]:
extension Decoder {
    /// Debug variant of the decoder's forward pass that prints the tensor shapes entering each layer.
    ///
    /// Each layer's `callAsFunction2` receives the previous layer's output as its sequence, with
    /// the same masks and memory for every layer.
    ///
    /// - Parameter input: Decoder input providing the initial sequence, both masks and the memory.
    /// - Returns: A `DecoderOutput` holding the last layer's output and the list of all layer outputs.
    @differentiable
    public func callAsFunction2(_ input: DecoderInput<Float>) -> DecoderOutput<Float> {
        var allOutputs: [Tensor<Float>] = []
        var transformerInput = input.sequence
        let memoryInput = input.memory
        
        // The layer count is read outside of differentiation.
        for layerIndex in 0..<(withoutDerivative(at: layers) { $0.count }) {
            print("\(layerIndex)")
            print("sequence:\(transformerInput.shape), sourceMask:\(input.sourceMask.shape), targetMask:\(input.targetMask.shape), memory:\(memoryInput.shape)")
            // NOTE(review): this per-layer DecoderInput is built without the sourceAttentionTemperature /
            // selfAttentionTemperature arguments that getDecoderInput passes, so any temperatures carried
            // by `input` are presumably not forwarded to the layers — confirm against DecoderInput's initializer.
            let layerOutput = layers[layerIndex].callAsFunction2(DecoderInput(
                sequence: transformerInput,
                sourceMask: input.sourceMask,
                targetMask: input.targetMask,
                memory: memoryInput
            ))
            allOutputs.append(layerOutput)
            // Feed this layer's output into the next layer.
            transformerInput = layerOutput
        }
        
        return DecoderOutput<Float>(lastLayerOutput: transformerInput, allOutputs: allOutputs)
    }
}

In [None]:
trInput.sentence.mask

In [None]:
tensorShow(trInput.sourceAttentionMask.squeezingShape(at: 0))

In [None]:
// Use the encoder's final-layer output as the decoder memory (the same value passed to `model.decode`);
// the previously referenced `encodedMemory` is not defined anywhere in this notebook.
let decoderInput = model.getDecoderInput(sourceMask: trInput.sourceAttentionMask, motionPart: trInput.motionPart, memory: encoded.lastLayerOutput)

In [None]:
let decoderOuptut = model.decoder.callAsFunction2(decoderInput)

# create source attention mask of [bs x maxMotionLength x maxTextSequenceLength] dimensions

In [None]:
sentence

In [None]:
motionPart.printMotionPart()

In [None]:
target.printTarget()

In [None]:
/// Creates a `[maxTargetSequenceLength, maxSourceSequenceLength]` mask that is 1 in the top-left
/// `targetSequenceLength x sourceSequenceLength` corner and 0 everywhere else.
///
/// Lengths larger than the corresponding maximum are clamped to it and negative lengths are
/// treated as 0, so the active region always fits inside the mask.
///
/// - Parameters:
///   - sourceSequenceLength: Number of active source positions (columns).
///   - targetSequenceLength: Number of active target positions (rows).
///   - maxSourceSequenceLength: Total number of columns of the mask.
///   - maxTargetSequenceLength: Total number of rows of the mask.
/// - Returns: The rank-2 mask tensor.
func createSourceAttentionMask(sourceSequenceLength: Int, targetSequenceLength: Int, maxSourceSequenceLength: Int, maxTargetSequenceLength: Int) -> Tensor<Float> {
    var mask = Tensor<Float>(zeros: [maxTargetSequenceLength, maxSourceSequenceLength])

    // Clamp so that over-long or negative lengths cannot index outside the mask.
    let activeRows = min(max(targetSequenceLength, 0), maxTargetSequenceLength)
    let activeColumns = min(max(sourceSequenceLength, 0), maxSourceSequenceLength)

    // Nothing to mark when either dimension of the active region is empty.
    guard activeRows > 0, activeColumns > 0 else { return mask }

    mask[0..<activeRows, 0..<activeColumns] = Tensor<Float>(ones: [activeRows, activeColumns])
    return mask
}

In [None]:
let mask = createSourceAttentionMask(sourceSequenceLength: 9, targetSequenceLength: 55, maxSourceSequenceLength: 20, maxTargetSequenceLength: 100)

In [None]:
mask

In [None]:
tensorShow(mask)

In [None]:
// Where to get values from:
// sourceSequenceLength
// targetSequenceLength

## create source attention mask from two 1-dim flags

In [None]:
let sentenceMask = singleBatch.source.sentence.mask.squeezingShape(at: 1)
sentenceMask

In [None]:
let motionFlag = Tensor<Float>(singleBatch.source.motionPart.motionFlag)
motionFlag.shape

In [None]:
let sourceAttentionMask = sentenceMask * motionFlag.transposed()
sourceAttentionMask

In [None]:
tensorShow(sourceAttentionMask.squeezingShape(at: 0))

### check mask for batch

In [None]:
let motionSample2 = dataset.motionSamples[1]
print("sampleID: \(motionSample2.sampleID)")
print(motionSample2.description)
print(motionSample2.annotations)

In [None]:
// Preprocess the second sample exactly like the first one so that both can be reduced into one batch.
let sentence2 = textProcessor.preprocess(sentence: motionSample2.annotations[0], maxTextSequenceLength: maxTextSequenceLength)
// `shiftMaskRight: true` matches the preprocessing of `singleBatch` and of the dataset closure.
let (motionPart2, target2) = LangMotionBatch.preprocessTargetMotion(sampleID: motionSample2.sampleID, motion: motionSample2.motion, maxMotionLength: maxMotionLength, shiftMaskRight: true)
let source2 = LangMotionBatch.Source(sentence: sentence2, motionPart: motionPart2)
let singleBatch2 = LangMotionBatch(source: source2, target: target2)

In [None]:
let batch = LangMotionBatch.reduceDataBatches([singleBatch, singleBatch2])

In [None]:
let sourceAttentionMask = batch.data.sourceAttentionMask
sourceAttentionMask.shape

In [None]:
tensorShow(sourceAttentionMask[0])

In [None]:
tensorShow(sourceAttentionMask[1])