# Transformer-motion2label2

In [1]:
%install-location /notebooks/language2motion.gt/swift-install
%install-swiftpm-flags -c release
%install '.package(path: "/notebooks/language2motion.gt/code")' Batcher ModelSupport Datasets ImageClassificationModels TextModels

Installing packages:
	.package(path: "/notebooks/language2motion.gt/code")
		Batcher
		ModelSupport
		Datasets
		ImageClassificationModels
		TextModels
With SwiftPM flags: ['-c', 'release']
Working in: /tmp/tmpdtkvuv9u/swift-install
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
Initializing Swift...
Installation complete!


In [2]:
// + load Motion2Label dataset
// + create sliding resnet feature extractor
// * create tiny transformer encoder (small hiddenSize, smaller than 768)
// + copy relevant sources here
// * feed sliced features to transformer
// TODO: make it train (with classifier)

In [3]:
import Foundation
import TensorFlow
import PythonKit

import Batcher
import ModelSupport
import Datasets
import ImageClassificationModels
import TextModels

In [4]:
%include "EnableIPythonDisplay.swift"
// Ask the IPython shell to render matplotlib figures inline in the notebook.
IPythonDisplay.shell.enable_matplotlib("inline")

('inline', 'module://ipykernel.pylab.backend_inline')


# load dataset

In [49]:
// Batch size and `tensorWidth` handed to the dataset loader.
// `tensorWidth` matches axis 1 of the batch tensors (the recorded batch shape is [2, 60, 45, 1]);
// presumably the number of motion frames per sample — TODO confirm.
let batchSize = 2
let tensorWidth = 60

// Input files: the serialized motion samples and the CSV with their labels.
let serializedDatasetURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/motion_dataset.motion_flag.normalized.500.plist")
let labelsURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/labels_ds_v2.csv")

// Build the Motion2Label dataset from the two files above.
let dataset = Motion2Label(
    batchSize: batchSize, 
    serializedDatasetURL: serializedDatasetURL,
    labelsURL: labelsURL,
    tensorWidth: tensorWidth
)
// NOTE(review): these counts look like batch counts, not sample counts — the recorded output
// shows 198 for 395 training pairs with batchSize 2. Confirm against Motion2Label.
print("dataset.training.count: \(dataset.training.count)")
print("dataset.test.count: \(dataset.test.count)")

MotionData(motionSamples: 494)
trainTensorPairs.count = 395
testTensorPairs.count = 99
dataset.training.count: 198
dataset.test.count: 50


# get Motion2LabelBatch batch or batch of Motion2LabelSample(s)

# 1-channel ResNet model

In [6]:
// Number of outputs of the ResNet feature extractor. It equals the BERT `hiddenSize` (768)
// set further down, so the features can be summed with the position embeddings.
let nOutputs = 768

In [7]:
// ResNet-18 configured for single-channel input with `nOutputs` outputs; in this notebook its
// output is used as a feature vector per motion window rather than as class scores.
var resnet = ResNet(classCount: nOutputs, depth: .resNet18, downsamplingInFirstStage: false, channelCount: 1)
// Optimizer is left disabled: nothing is trained in this notebook yet.
// let optimizer = SGD(for: resnet, learningRate: 0.001)

# Sliding ResNet feature extractor

In [58]:
// Iterator over the training batches; `var` because `next()` advances it.
var batchIterator = dataset.training.sequenced()

In [59]:
// Pull a single batch to experiment with; the result is optional (nil once the iterator is exhausted).
let batch = batchIterator.next()

In [60]:
// `first` is the motion tensor of the batch. The force unwrap traps if no batch was returned.
let batchTensor = batch!.first
// Display the shape in the notebook (recorded as [2, 60, 45, 1]).
batchTensor.shape

▿ [2, 60, 45, 1]
  ▿ dimensions : 4 elements
    - 0 : 2
    - 1 : 60
    - 2 : 45
    - 3 : 1


In [61]:
// Sliding-window geometry used by `extractMotionFeatures`: a window starts every `stride`
// positions along axis 1 and spans `tWidth` positions, so consecutive windows overlap by half.
let stride = 10
let tWidth = stride*2

In [62]:
/// Extracts per-window ResNet features from a batch of motion tensors.
///
/// Cuts `nElements = tensorWidth / stride - 1` overlapping windows of `tWidth` positions
/// (step `stride`) along axis 1 of `batchTensor`, runs every window through `resnet`, and
/// returns the features grouped per sample and per window.
///
/// Relies on the notebook-level constants `tensorWidth`, `stride` and `tWidth`.
///
/// - Parameters:
///   - batchTensor: Motion batch; axis 0 is the batch axis, axis 1 is the axis being windowed.
///   - resnet: Feature extractor applied to each window.
/// - Returns: Tensor of shape `[batch, nElements, featureCount]` in which row `[b, i]` holds the
///   features of window `i` of sample `b`.
func extractMotionFeatures(_ batchTensor: Tensor<Float>, resnet: ResNet) -> Tensor<Float> {
    let origBatchSize = batchTensor.shape[0]
    let nElements = (tensorWidth / stride) - 1
    precondition(nElements > 0, "tensorWidth must cover at least one window of tWidth positions")

    // One slice per window, each shaped [batch, tWidth, ...].
    let windows: [Tensor<Float>] = (0..<nElements).map { i in
        batchTensor[0..., (i * stride)..<(i * stride + tWidth)]
    }

    // Stack along a NEW axis 1 -> [batch, nElements, tWidth, ...], then fold the window axis
    // into the batch axis. This keeps the rows ordered sample-major (b0w0, b0w1, ..., b1w0, ...),
    // which is the order the final reshape to [batch, nElements, features] expects.
    // Concatenating the windows along axis 0 instead yields window-major order and makes the
    // reshape interleave features of different samples whenever the batch size is > 1.
    let stacked = Tensor(stacking: windows, alongAxis: 1)
    let windowShape = Array(stacked.shape.dimensions[2...])
    let flattened = stacked.reshaped(to: TensorShape([origBatchSize * nElements] + windowShape))

    let features = resnet(flattened)
    let outShape: [Int] = [origBatchSize, nElements, resnet.classifier.weight.shape[1]]
    return features.reshaped(to: TensorShape(outShape))
}
// Holder for the extracted features so that later cells can use them; it is assigned inside
// the timed closure below.
var emb3: Tensor<Float>? = nil
// Run the extractor once under `time` and print the input and output shapes.
time() {
    print(batchTensor.shape)
    emb3 = extractMotionFeatures(batchTensor, resnet: resnet)
    print(emb3!.shape)
}

[2, 60, 45, 1]
[2, 5, 768]
average: 387.7104 ms,   min: 387.7104 ms,   max: 387.7104 ms


# Tiny BERT/Transformer

In [13]:
// let bertPretrained = BERT.PreTrainedModel.bertBase(cased: false, multilingual: false)
// let workspaceURL = URL(
//     fileURLWithPath: "bert_models", isDirectory: true,
//     relativeTo: URL(
//         fileURLWithPath: NSTemporaryDirectory(),
//         isDirectory: true))
// let bert = try BERT.PreTrainedModel.load(bertPretrained)(from: workspaceURL)
// var bertClassifier = BERTClassifier(bert: bert, classCount: 5)

In [14]:
// Case-sensitivity flag passed to both the BERTTokenizer and the BERT initializer below.
var caseSensitive: Bool = false

In [15]:
// Sub-directory of `directory` that holds the vocabulary file (`vocab.txt`).
var subDirectory: String = "uncased_L-12_H-768_A-12"

In [16]:
// Root data directory under which the vocabulary sub-directory is looked up.
let directory = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/")

In [17]:
// Path to `vocab.txt` inside the selected sub-directory of the data directory.
let vocabularyURL = directory
    .appendingPathComponent(subDirectory)
    .appendingPathComponent("vocab.txt")

// Load the vocabulary; `try!` stops the notebook with a trap if the file cannot be loaded.
let vocabulary: Vocabulary = try! Vocabulary(fromFile: vocabularyURL)

In [18]:
// BERT tokenizer over the loaded vocabulary, with "[UNK]" as the unknown token and no maximum
// token length (nil). It is only needed here because the BERT initializer below takes a tokenizer;
// the motion features fed to the encoder in this notebook are never tokenized.
let tokenizer: Tokenizer = try 
                     BERTTokenizer(
                        vocabulary: vocabulary,
                        caseSensitive: caseSensitive,
                        unknownToken: "[UNK]",
                        maxTokenLength: nil)

In [19]:
// Transformer hyper-parameters passed to the BERT initializer below.
// NOTE(review): the plan at the top of the notebook asks for a tiny encoder (hiddenSize < 768);
// these values still match the L-12_H-768_A-12 configuration named in `subDirectory`.
// `hiddenSize` has to stay equal to `nOutputs` for the ResNet features to be usable as embeddings.
var variant: BERT.Variant = .bert          
var hiddenSize: Int = 768
var hiddenLayerCount: Int = 12
var attentionHeadCount: Int = 12
var intermediateSize: Int = hiddenSize*4 // 3072/768=4

In [20]:
/// Builds the attention mask for a batch of already-embedded sequences.
///
/// - Parameters:
///   - input2: Tensor whose first two dimensions are read as batch size and sequence length.
///   - mask: Per-position mask; it is reshaped to `[batchSize, 1, sequenceLength]`.
/// - Returns: A Float tensor of shape `[batchSize, sequenceLength, sequenceLength]`, obtained by
///   broadcasting an all-ones column against the reshaped mask.
public func createAttentionMask2(input2: Tensor<Float>, mask: Tensor<Int32>) -> Tensor<Float> {
    let batchCount = input2.shape[0]
    // The "from" and "to" sequence lengths coincide: both are taken from `input2`.
    let sequenceLength = input2.shape[1]

    // Attending *from* padded positions is harmless (only attending *to* them matters), so the
    // "from" side is simply all ones, created on the same device as `mask`.
    let fromOnes = Tensor<Float>(ones: [batchCount, sequenceLength, 1], on: mask.device)
    let toMask = Tensor<Float>(mask.reshaped(to: [batchCount, 1, sequenceLength]))

    // [B, S, 1] * [B, 1, S] broadcasts along two dimensions to [B, S, S].
    return fromOnes * toMask
}

In [21]:
extension BERT {
    /// Runs the BERT encoder stack on precomputed embeddings instead of token ids.
    ///
    /// `input2` takes the place of the token embeddings: position embeddings are added to it,
    /// the sum goes through the embedding layer norm and dropout, and the result is passed
    /// through every encoder layer. No token-type embeddings are added.
    ///
    /// - Parameter input2: Embeddings of shape `[batchSize, sequenceLength, hiddenSize]`.
    /// - Returns: Encoder output with the same shape as `input2`.
    @differentiable(wrt: self)
    public func callAsFunction(_ input2: Tensor<Float>) -> Tensor<Scalar> {
        // Debug traces; the recorded notebook output depends on them.
        print("ala ma kota")
        print("input2 = \(input2.shape)")
        let sequenceLength = input2.shape[1]
        print(1)

        // The caller supplies the "token" embeddings directly.
        let tokenEmbeddings = input2
        print("tokenEmbeddings: \(tokenEmbeddings.shape)")

        // Take the first `sequenceLength` position embeddings and add a leading axis so they
        // broadcast over the batch.
        let positionPaddingIndex = 0
        let positionEmbeddings = positionEmbedding.embeddings.slice(
            lowerBounds: [positionPaddingIndex, 0],
            upperBounds: [positionPaddingIndex + sequenceLength, -1]
        ).expandingShape(at: 0)
        print("positionEmbeddings: \(positionEmbeddings.shape)")

        var embeddings = tokenEmbeddings + positionEmbeddings
        embeddings = embeddingLayerNorm(embeddings)
        embeddings = embeddingDropout(embeddings)

        // TODO: do masking, but outside
        // TODO: get mask from 45th row (motion flag)
        // Until a real mask is available every position is attended to. The mask is sized from
        // the input (it used to be a literal that only fit batch size 2 and sequence length 5).
        let mask = Tensor<Int32>(ones: [input2.shape[0], sequenceLength], on: input2.device)

        // Attention mask with shape `[batchSize, sequenceLength, sequenceLength]`.
        let attentionMask = createAttentionMask2(input2: input2, mask: mask)

        // We keep the representation as a 2-D tensor to avoid reshaping it back and forth from a
        // 3-D tensor to a 2-D tensor. Reshapes are normally free on GPUs/CPUs but may not be free
        // on TPUs, and so we want to minimize them to help the optimizer.
        var transformerInput = embeddings.reshapedToMatrix()
        let batchSize = embeddings.shape[0]

        // Run the stacked transformer.
        for layerIndex in 0..<(withoutDerivative(at: encoderLayers) { $0.count }) {
            transformerInput = encoderLayers[layerIndex](TransformerInput(
                sequence: transformerInput,
                attentionMask: attentionMask,
                batchSize: batchSize))
        }

        // Reshape back to the original tensor shape.
        return transformerInput.reshapedFromMatrix(originalShape: embeddings.shape)
    }
}

In [22]:
// Build a BERT model from the hyper-parameters declared above. Nothing is loaded from a
// checkpoint in this cell (the pre-trained loading code earlier in the notebook is commented out).
// `vocabulary` and `tokenizer` are passed because the initializer takes them; the
// `callAsFunction(_ input2:)` overload defined above does not use them.
var bert = BERT(
    variant: variant,
    vocabulary: vocabulary,
    tokenizer: tokenizer,
    caseSensitive: caseSensitive,
    hiddenSize: hiddenSize,
    hiddenLayerCount: hiddenLayerCount,
    attentionHeadCount: attentionHeadCount,
    intermediateSize: intermediateSize,
    intermediateActivation: gelu,
    hiddenDropoutProbability: 0.1,
    attentionDropoutProbability: 0.1,
    maxSequenceLength: 512,
    typeVocabularySize: 2,
    initializerStandardDeviation: 0.02,
    useOneHotEmbeddings: false)

In [23]:
// Smoke test: feed the extracted motion features through the encoder and show the output shape.
// The force unwrap traps if the feature-extraction cell has not been run.
bert(emb3!).shape

ala ma kota
input2 = [2, 5, 768]
1
tokenEmbeddings: [2, 5, 768]
positionEmbeddings: [1, 5, 768]


▿ [2, 5, 768]
  ▿ dimensions : 3 elements
    - 0 : 2
    - 1 : 5
    - 2 : 768


# chain resnet and bert

# train