# Attention temperature analysis

In [None]:
// for local development
%install-location /notebooks/language2motion.gt/swift-install
%install-swiftpm-flags -c release
%install '.package(path: "/notebooks/language2motion.gt")' Datasets TranslationModels TextModels ModelSupport SummaryWriter LangMotionModels Checkpoints

In [None]:
import TensorFlow
import TextModels
import TranslationModels
import Foundation
import FoundationXML
import ModelSupport
import Datasets
import SummaryWriter
import LangMotionModels
import Checkpoints
import PythonKit

In [None]:
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

In [None]:
let plt = Python.import("matplotlib.pyplot")
let np = Python.import("numpy")

## Set training params

In [None]:
let device = Device.defaultTFEager

In [None]:
let maxTextSequenceLength =  20
let maxMotionLength =  50

In [None]:
let datasetSize: DatasetSize = .full
let batchSize = 2

In [None]:
let dataURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/")
let motionDatasetURL = dataURL.appendingPathComponent("motion_dataset_v3.10Hz.\(datasetSize.rawValue)plist")

In [None]:
// Instantiate the text processor.
let vocabularyURL = dataURL.appendingPathComponent("vocab.txt")
// Plain `try` (as already used for the dataset load) reports a missing or unreadable
// vocabulary as a thrown error instead of trapping the way `try!` does.
let vocabulary: Vocabulary = try Vocabulary(fromFile: vocabularyURL)
let tokenizer: Tokenizer = BERTTokenizer(vocabulary: vocabulary, caseSensitive: false, unknownToken: "[UNK]", maxTokenLength: nil)
let textProcessor = TextProcessor(vocabulary: vocabulary, tokenizer: tokenizer)

// Instantiate the model.
let modelSize = 128
// NOTE(review): encoder and decoder-source temperatures are sqrt(modelSize) while the decoder
// self-attention temperature is modelSize itself — presumably it matches how the checkpoint
// was trained; confirm before changing.
let config = LangMotionTransformerConfig(
    vocabSize: vocabulary.count,
    nbJoints: 47, // TODO: get value from dataset
    nbMixtures: 20,
    layerCount: 6,
    modelSize: modelSize,
    feedForwardSize: 512,
    headCount: 4,
    dropoutProbability:  0.1,
    sentenceMaxPositionalLength: 100,
    motionMaxPositionalLength: 500,
    encoderSelfAttentionTemp: sqrt(Double(modelSize)),
    decoderSourceAttentionTemp: sqrt(Double(modelSize)),
    decoderSelfAttentionTemp: Double(modelSize)
)

// Which training run and epoch to load the checkpoint from.
let runName = "run_51"
let epoch = 150

let runURL = dataURL.appendingPathComponent("runs/Lang2motion/\(runName)", isDirectory: true)
let checkpointURL = runURL.appendingPathComponent("checkpoints", isDirectory: true)
let motionsURL = runURL.appendingPathComponent("generated_motions", isDirectory: true)
// Use the shared file manager; `withIntermediateDirectories: true` makes this a no-op
// when the directory already exists.
try FileManager.default.createDirectory(at: motionsURL, withIntermediateDirectories: true)

let model = LangMotionTransformer(checkpoint: checkpointURL, config: config, name: "model.e\(epoch)")

In [None]:
// Load the dataset.
print("\nLoading dataset...")

var dataset = try Lang2Motion(
    motionDatasetURL: motionDatasetURL,
    batchSize: batchSize,
    minMotionLength: 20,
    // Reuse the notebook-wide limit so the dataset filter and the per-sample
    // preprocessing in the closure below cannot drift apart.
    maxMotionLength: maxMotionLength,
    trainTestSplit: 1.0,
    device: device
) { (motionSample: MotionSample) -> LangMotionBatch in
    // Only the first annotation of each sample is used as the input sentence.
    let sentence = textProcessor.preprocess(sentence: motionSample.annotations[0], maxTextSequenceLength: maxTextSequenceLength)
    let (motionPart, target) = LangMotionBatch.preprocessTargetMotion(sampleID: motionSample.sampleID, motion: motionSample.motion, maxMotionLength: maxMotionLength, shiftMaskRight: true)
    let source = LangMotionBatch.Source(sentence: sentence, motionPart: motionPart)
    return LangMotionBatch(source: source, target: target)
}

print("Dataset acquired.")

# Helpers

In [None]:
// func tensorShow(_ tensor: Tensor<Float>, cmapRange: Float = 6.0) {
//     plt.figure(figsize: [5, 5])
//     plt.imshow(tensor.makeNumpyArray(), aspect: "auto", cmap: "Spectral", vmin: -cmapRange, vmax: cmapRange)
//     plt.show()
// }

/// Renders a tensor as an image in a new 5x5 matplotlib figure and shows it.
///
/// - Parameters:
///   - tensor: Values to draw; converted with `makeNumpyArray()` and passed to `plt.imshow`.
///   - cmapRange: Symmetric colour limit. Exactly `0.0` selects a plain `plt.imshow` call with
///     matplotlib's defaults; any other value draws with `aspect: "auto"`, the "Spectral"
///     colormap and colour limits `-cmapRange` to `cmapRange`.
func tensorShow(_ tensor: Tensor<Float>, cmapRange: Float = 6.0) {
    plt.figure(figsize: [5, 5])
    // Whichever drawing path is taken, the figure is displayed on the way out.
    defer { plt.show() }

    let pixels = tensor.makeNumpyArray()
    guard cmapRange != 0.0 else {
        plt.imshow(pixels)
        return
    }
    plt.imshow(pixels, aspect: "auto", cmap: "Spectral", vmin: -cmapRange, vmax: cmapRange)
}

In [None]:
extension LangMotionTransformer {
    /// Builds the decoder input for a motion part, with explicit attention temperatures.
    ///
    /// Per frame, the start flag, the motion positional encoding, the motion itself and a
    /// padding made of repeated motion values are concatenated along axis 2, and the result is
    /// passed through `motionNorm`.
    ///
    /// - Parameters:
    ///   - sourceMask: Forwarded unchanged as the decoder input's `sourceMask`.
    ///   - motionPart: Supplies `motion`, `startFlag` and `mask` (used as the target mask).
    ///   - memory: Forwarded unchanged as the decoder input's `memory`.
    ///   - decoderSourceAttentionTemp: Forwarded as `sourceAttentionTemperature`.
    ///   - decoderSelfAttentionTemp: Forwarded as `selfAttentionTemperature`.
    /// - Returns: The assembled `DecoderInput`.
    public func getDecoderInput(sourceMask: Tensor<Float>, motionPart: LangMotionBatch.MotionPart, memory: Tensor<Float>,
                                decoderSourceAttentionTemp: Float = 1.0,
                                decoderSelfAttentionTemp: Float = 1.0
                               ) -> DecoderInput<Float> {
        let motion = motionPart.motion
        let batchSize = motion.shape[0]
        let numFrames = motion.shape[1]

        // Positional encoding applied to an all-zero tensor of the encoding's own width.
        let positionalSeed = Tensor<Float>(repeating: 0.0, shape: [batchSize, numFrames, motionPositionalEncodingSize])
        let positionalEncoding = motionPositionalEncoding(positionalSeed)

        // Width left over after the start flag, positional encoding and joints.
        // NOTE(review): assumes startFlag has width 1 and motion has width nbJoints — confirm.
        let paddingSize = modelSize - (1 + motionPositionalEncodingSize + nbJoints)

        // Tile the motion often enough to cover the padding width, then cut it to size.
        let repeats = paddingSize / nbJoints + 1
        let tiledMotion = motion.tiled(multiples: [1, 1, repeats])
        let framePadding = tiledMotion[0..., 0..., 0..<paddingSize]

        // Concatenate all per-frame features along the feature axis and normalise them.
        let stackedFeatures = Tensor<Float>(
            concatenating: [motionPart.startFlag, positionalEncoding, motion, framePadding],
            alongAxis: 2
        )
        let normalizedFeatures = self.motionNorm(stackedFeatures)

        return DecoderInput(sequence: normalizedFeatures, sourceMask: sourceMask, targetMask: motionPart.mask, memory: memory,
                            sourceAttentionTemperature: decoderSourceAttentionTemp, selfAttentionTemperature: decoderSelfAttentionTemp)
    }
}

## Single batch

In [None]:
let motionSample = dataset.motionSamples[0]
print("sampleID: \(motionSample.sampleID)")
print(motionSample.description)
print(motionSample.annotations)

In [None]:
// Build one batch by hand from `motionSample`, using the same steps as the closure passed to
// `Lang2Motion` when the dataset was loaded.
// Only the first annotation is used as the sentence.
let sentence = textProcessor.preprocess(sentence: motionSample.annotations[0], maxTextSequenceLength: maxTextSequenceLength)
// Split the motion into the decoder-side input (`motionPart`) and the training `target`.
let (motionPart, target) = LangMotionBatch.preprocessTargetMotion(sampleID: motionSample.sampleID, motion: motionSample.motion, maxMotionLength: maxMotionLength, shiftMaskRight: true)
let source = LangMotionBatch.Source(sentence: sentence, motionPart: motionPart)
let singleBatch = LangMotionBatch(source: source, target: target)

In [None]:
let trInput = singleBatch.source

In [None]:
/// Draws `tensor` with `plt.imshow` using the "Spectral" colormap and shows the plot.
///
/// No new figure is created and no colour limits are set; matplotlib's defaults apply.
///
/// - Parameter tensor: Values to draw; converted with `makeNumpyArray()`.
func tensorShow2(_ tensor: Tensor<Float>) {
    let pixels = tensor.makeNumpyArray()
    plt.imshow(pixels, cmap: "Spectral")
    plt.show()
}

In [None]:
/// Draws a grid of heat maps, one per `[i, j]` slice of `tensor`, and shows the figure.
///
/// The grid has `tensor.shape[0]` rows and `tensor.shape[1]` columns, so it follows whatever
/// tensor is passed in (the notebook passes tensors stacked per layer along axis 0).
/// All panels share the same upper colour limit.
///
/// - Parameters:
///   - tensor: Tensor whose first two axes index the panels; each `tensor[i, j]` is drawn
///     with `imshow`.
///   - contrast: Divisor applied to the tensor's maximum to obtain the shared `vmax`;
///     values above 1 saturate the brightest cells.
///   - figsize: Figure size forwarded to `plt.subplots`.
func showAll(_ tensor: Tensor<Float>, contrast: Float = 1.0, figsize: [Float] = [5, 8]) {
    let rowCount = tensor.shape[0]
    let columnCount = tensor.shape[1]
    let npArr = tensor.makeNumpyArray()

    // `squeeze: false` keeps the axes array two-dimensional even for a single row or column,
    // so `axarr[row, column]` is always valid.
    let axarr = plt.subplots(rowCount, columnCount, figsize: figsize, squeeze: false)[1]
    let vmax = (tensor.max()/contrast).scalar

    for row in 0..<rowCount {
        for column in 0..<columnCount {
            let axis = axarr[row, column]
            axis.imshow(npArr[row, column], vmax: vmax, cmap: "Spectral")
            // Hide ticks and labels: the panels are too small for them to be readable.
            axis.tick_params(labelbottom: false, labelleft: false, bottom: false, left: false)
        }
    }

    plt.show()
}

# Encoder

In [None]:
trInput.sentence.printSentence()

In [None]:
// Run the encoder on the single batch's sentence with a hand-picked self-attention temperature.
let input = trInput.sentence
let embedded = model.sourceEmbed(input.tokenIds)
// Temperature under study; change this value and re-run the cells below to compare.
let encoderSelfAttentionTemp: Float = 100.0
let encoderInput = TransformerInput(sequence: embedded, attentionMask: input.mask, selfAttentionTemperature: encoderSelfAttentionTemp)
let encoded = model.encoder(encoderInput)

print("encoderSelfAttentionTemp: \(encoderSelfAttentionTemp)")
print(encoded.lastLayerOutput.shape)
// cmapRange 0.0 makes tensorShow use matplotlib's default imshow settings.
tensorShow(encoded.lastLayerOutput[0], cmapRange: 0.0)

## self-attention mask

In [None]:
trInput.sentence.mask

## encoder self-attention
### one output

In [None]:
// Attention output of the first encoder layer; the plots show index [0, 0] of it.
let encAttnOutput = encoded.allLayerOutputs[0].attentionOutput!
tensorShow2(encAttnOutput.attentionProbs[0, 0])
// Slice first, then clamp negatives to zero: same picture as clamping the whole tensor,
// without the extra work, and consistent with the decoder source-attention cell.
tensorShow2(max(encAttnOutput.attentionScores[0, 0], 0))

### all outputs

In [None]:
let attentionProbs = Tensor(stacking: encoded.allLayerOutputs.map { $0.attentionOutput!.attentionProbs[0]}, alongAxis: 0)
print("attentionProbs.shape: \(attentionProbs.shape)")
print(attentionProbs.min(), attentionProbs.max())
showAll(attentionProbs, contrast: 1, figsize: [3, 6])

In [None]:
let attentionScores = max(Tensor(stacking: encoded.allLayerOutputs.map { $0.attentionOutput!.attentionScores[0]}, alongAxis: 0), 0)
print("attentionScores.shape: \(attentionScores.shape)")
print(attentionScores.min(), attentionScores.max())
showAll(attentionScores, contrast: 1, figsize: [3, 6])

## encoder output

In [None]:
// Stack the first batch element of every encoder layer's result along a new leading axis.
let encodedResults = Tensor(stacking: encoded.allLayerOutputs.map { $0.result[0] })
print(encodedResults.shape)
print(encodedResults.min(), encodedResults.max())
// Merge the two leading axes so all layers are drawn below one another in one image.
// The target shape is derived from the tensor itself rather than hard-coded, so the cell
// keeps working if the layer count, sentence length or model size change.
tensorShow(
    encodedResults.reshaped(to: [
        encodedResults.shape[0] * encodedResults.shape[1],
        encodedResults.shape[2]
    ]),
    cmapRange: 0.0
)

# Decoder

In [None]:
sqrt(Double(modelSize))

In [None]:
// Temperatures under study for the decoder; change and re-run the cells below to compare.
let decoderSourceAttentionTemp: Float = 160.0
let decoderSelfAttentionTemp: Float = 100.0

// Build the decoder input with the temperature-aware getDecoderInput overload,
// feeding the encoder's last-layer output as memory.
let decoderInput = model.getDecoderInput(
    sourceMask: trInput.sourceAttentionMask, 
    motionPart: trInput.motionPart, 
    memory: encoded.lastLayerOutput,
    decoderSourceAttentionTemp: decoderSourceAttentionTemp,
    decoderSelfAttentionTemp: decoderSelfAttentionTemp
)

let decoded = model.decoder(decoderInput)

print("decoderSourceAttentionTemp: \(decoderSourceAttentionTemp)")
print("decoderSelfAttentionTemp: \(decoderSelfAttentionTemp)")
// Number of per-layer outputs returned by the decoder.
print(decoded.allLayerOutputs.count)
decoded.lastLayerOutput.shape

## source attention mask

In [None]:
tensorShow2(trInput.sourceAttentionMask.squeezingShape(at: 0))

## self-attention decoder mask

In [None]:
tensorShow2(trInput.motionPart.mask.squeezingShape(at: 0))

## decoder source attention

In [None]:
let srcAttnOutput = decoded.allLayerOutputs[0].sourceAttentionOutput!
tensorShow2(srcAttnOutput.attentionProbs[0, 0])
tensorShow2(max(srcAttnOutput.attentionScores[0, 0], 0))

In [None]:
let srcAttnProbs = Tensor(stacking: decoded.allLayerOutputs.map { $0.sourceAttentionOutput!.attentionProbs[0]}, alongAxis: 0)
print("srcAttnProbs.shape: \(srcAttnProbs.shape)")
print(srcAttnProbs.min(), srcAttnProbs.max())
showAll(srcAttnProbs, contrast: 2, figsize: [3, 10])

In [None]:
let srcAttnScores = max(Tensor(stacking: decoded.allLayerOutputs.map { $0.sourceAttentionOutput!.attentionScores[0]}, alongAxis: 0), 0)
print("srcAttnScores.shape: \(srcAttnScores.shape)")
print(srcAttnScores.min(), srcAttnScores.max())
showAll(srcAttnScores, contrast: 10, figsize: [3, 10])

## one source attention

In [None]:
let oneSourceScores = decoded.allLayerOutputs[0].sourceAttentionOutput!.attentionScores[0, 0]
oneSourceScores.shape

In [None]:
let oneSourceScores2 = max(oneSourceScores, 0)

In [None]:
tensorShow2(oneSourceScores2)

In [None]:
let slice = 20

In [None]:
oneSourceScores[slice]

In [None]:
softmax(oneSourceScores[slice])*100

In [None]:
plt.plot(softmax(oneSourceScores[slice]).makeNumpyArray())
plt.show()

In [None]:
oneSourceScores2[slice]

In [None]:
(oneSourceScores2[slice]).sum()

In [None]:
plt.plot(softmax(oneSourceScores2[slice]).makeNumpyArray())
plt.show()

## decoder self-attention

In [None]:
let tgtAttnProbs = Tensor(stacking: decoded.allLayerOutputs.map { $0.targetAttentionOutput!.attentionProbs[0]}, alongAxis: 0)
print("tgtAttnProbs.shape: \(tgtAttnProbs.shape)")
print(tgtAttnProbs.min(), tgtAttnProbs.max())
showAll(tgtAttnProbs, contrast: 1, figsize: [7, 12])

In [None]:
let tgtAttnScores = max(Tensor(stacking: decoded.allLayerOutputs.map { $0.targetAttentionOutput!.attentionScores[0]}, alongAxis: 0), 0)
print("tgtAttnScores.shape: \(tgtAttnScores.shape)")
print(tgtAttnScores.min(), tgtAttnScores.max())
showAll(tgtAttnScores, contrast: 1, figsize: [7, 12])

## self-attention activations mins, maxs

In [None]:
tensorShow2(trInput.motionPart.mask.squeezingShape(at: 0))

In [None]:
let oneMaskProbs = decoded.allLayerOutputs[0].targetAttentionOutput!.attentionProbs[0, 0]
oneMaskProbs.shape

In [None]:
tensorShow2(oneMaskProbs)

In [None]:
plt.plot(oneMaskProbs[20].makeNumpyArray())
plt.show()

## oneMaskScores

In [None]:
let oneMaskScores = decoded.allLayerOutputs[0].targetAttentionOutput!.attentionScores[0, 0]
oneMaskScores.shape

In [None]:
tensorShow2(oneMaskScores)

In [None]:
let oneMaskScores2 = oneMaskScores.replacing(with: Tensor(zerosLike: oneMaskScores), where: oneMaskScores .< Tensor<Float>([0.0]))

In [None]:
tensorShow2(oneMaskScores2)

## loss

In [None]:
let mixtureModelInput = Tensor<Float>(concatenating: decoded.allResults, alongAxis: 2)
let transformerOutput = LangMotionTransformerOutput(preds: model.mixtureModel(mixtureModelInput), encoded: encoded, decoded: decoded)

In [None]:
// Loss function
let args = LossArgs(
        nb_joints: config.nbJoints,
        nb_mixtures: config.nbMixtures,
        mixture_regularizer_type: "None",  // ["cv", "l2", "None"]
        mixture_regularizer: 0.0,
        device: device
)

/// Computes the mixture surrogate loss over the selected positions and returns both its
/// mean and the unreduced loss.
///
/// Both inputs are squeezed, then only the positions along axis 1 whose `y_true.stops`
/// value is not equal to 1 are kept before `_normalMixtureSurrogateLoss` is evaluated.
/// NOTE(review): presumably a stop value of 1 marks padded / past-the-end frames — confirm.
///
/// - Parameters:
///   - y_pred: Mixture model predictions; the function is differentiable with respect to this.
///   - y_true: Target batch; its `stops` field drives the masking.
///   - args: Loss settings; `args.device` is used to place the helper tensors.
/// - Returns: A tuple `(mean_loss, loss)` where `mean_loss` is `loss.mean()` and `loss` is
///   the value returned by `_normalMixtureSurrogateLoss` for the kept positions.
@differentiable(wrt: y_pred)
public func normalMixtureSurrogateLoss2(y_pred: MixtureModelPreds, y_true: LangMotionBatch.Target, args: LossArgs) -> (Tensor<Float>, Tensor<Float>) {
    // masking
    var y_pred = y_pred.squeezed()
    var y_true = y_true.squeezed()
    // Candidate positions 0..<stops.shape[1].
    let ids = Tensor<Int32>(rangeFrom: 0, to: Int32(y_true.stops.shape[1]), stride: 1, on: args.device)
    // Keep only the positions whose stop value differs from 1.
    let indices = ids.gathering(where: y_true.stops .!= Tensor(1, on: args.device))
    // Apply the same selection to predictions and targets so they stay aligned.
    y_pred = y_pred.gathering(atIndices: indices, alongAxis: 1)
    y_true = y_true.gathering(atIndices: indices, alongAxis: 1)
    
    let loss = _normalMixtureSurrogateLoss(y_true: y_true, y_pred: y_pred, args: args)
    let mean_loss = loss.mean()
    return (mean_loss, loss)
}

In [None]:
let (avg_loss, loss) = normalMixtureSurrogateLoss2(y_pred: transformerOutput.preds, y_true: singleBatch.target, args: args)

In [None]:
print(avg_loss)
loss

In [None]:
plt.plot(loss.scalars)
plt.show()