# BERT-language2label

In [1]:
%install-location /notebooks/language2motion.gt/swift-install
%install-swiftpm-flags -c release
%install '.package(path: "/notebooks/language2motion.gt/code")' Batcher ModelSupport Datasets TextModels

import Datasets
import Foundation
import ModelSupport
import TensorFlow
import TextModels
import PythonKit

Installing packages:
	.package(path: "/notebooks/language2motion.gt/code")
		Batcher
		ModelSupport
		Datasets
		TextModels
With SwiftPM flags: ['-c', 'release']
Working in: /tmp/tmpts_beqsx/swift-install
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
Initializing Swift...
Installation complete!


In [2]:
// Descriptor of the pre-trained checkpoint to use: the base model, uncased and
// non-multilingual.
let bertPretrained = BERT.PreTrainedModel.bertBase(cased: false, multilingual: false)

// Working directory handed to the model loader: "bert_models", resolved
// relative to the system temporary directory.
let workspaceURL = URL(
    fileURLWithPath: "bert_models",
    isDirectory: true,
    relativeTo: URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true))

// Load the pre-trained encoder from the workspace, then wrap it in a classifier
// with five output classes.
let bert = try bertPretrained.load(from: workspaceURL)
var bertClassifier = BERTClassifier(bert: bert, classCount: 5)

Loading BERT pre-trained model 'BERT Base Uncased'.
Loading resource: uncased_L-12_H-768_A-12


## load dataset

In [3]:
// TODO: 
// - configure dataset path outside of Language2Label
// + train

In [4]:
%include "/notebooks/language2motion.gt/code/Sources/BERT-language2label/Language2Label.swift"

In [5]:
// Token limit passed to BERT preprocessing, and the batch-size setting handed
// to the dataset. Both are also read by later cells.
let maxSequenceLength = 20
let batchSize = 2048

// Build the dataset. The trailing closure turns one raw example into the
// (data, label) pair the dataset stores.
var dataset = try Language2Label(
    taskDirectoryURL: workspaceURL,
    maxSequenceLength: maxSequenceLength,
    batchSize: batchSize,
    entropy: SystemRandomNumberGenerator()
) { (example: Language2LabelExample) -> LabeledTextBatch in
    // Tokenize the single text of this example.
    let tokens = bertClassifier.bert.preprocess(
        sequences: [example.text],
        maxSequenceLength: maxSequenceLength)
    // Every example is expected to carry a label; a missing one traps here.
    let labelIdx = Int32(example.label!.idx)
    return (data: tokens, label: Tensor(labelIdx))
}

print("Dataset acquired.")

Dataset acquired.


In [58]:
dataset.trainingExamples.count

2409


In [59]:
dataset.trainingExamples[0]

▿ 2 elements
  ▿ data : TextBatch
    - tokenIds : [[ 101, 1037, 2529, 2003, 3788, 3407,  102]]
    - tokenTypeIds : [[0, 0, 0, 0, 0, 0, 0]]
    - mask : [[1, 1, 1, 1, 1, 1, 1]]
  - label : 4


In [60]:
dataset.validationExamples[0]

▿ 2 elements
  ▿ data : TextBatch
    - tokenIds : [[  101,  1037,  2711, 14523,  2830,   102]]
    - tokenTypeIds : [[0, 0, 0, 0, 0, 0]]
    - mask : [[1, 1, 1, 1, 1, 1]]
  - label : 4


## train

In [6]:
// Learning-rate schedule, assembled from the inside out:
//   1. a fixed base rate of 2e-5,
//   2. wrapped in a warm-up configured with 10 steps and offset 0,
//   3. wrapped in a linear decay with slope -5e-7 that starts at step 10.
// NOTE(review): an earlier comment claimed the rate reaches zero in 100 steps,
// but 2e-5 / 5e-7 = 40 — presumably zero is reached about 40 steps after
// `startStep`; confirm against the LinearlyDecayedParameter implementation.
let baseLearningRate = FixedParameter<Float>(2e-5)
let warmedUpLearningRate = LinearlyWarmedUpParameter(
    baseParameter: baseLearningRate,
    warmUpStepCount: 10,
    warmUpOffset: 0)
let learningRateSchedule = LinearlyDecayedParameter(
    baseParameter: warmedUpLearningRate,
    slope: -5e-7,
    startStep: 10)

// Optimizer for the classifier, with weight decay 0.01 and the global gradient
// norm limit set to 1.
var optimizer = WeightDecayedAdam(
    for: bertClassifier,
    learningRate: learningRateSchedule,
    weightDecayRate: 0.01,
    maxGradientGlobalNorm: 1)

In [7]:
print("Training BERT for the Language2Label task!")

// `prefix(1)` limits the run to the first epoch produced by the dataset.
for (epochIndex, trainingBatches) in dataset.trainingEpochs.prefix(1).enumerated() {
    print("[Epoch \(epochIndex + 1)]")

    // ---- Training pass ----
    Context.local.learningPhase = .training
    var trainingLossSum: Float = 0
    var trainingBatchCount = 0
    print("epochBatches.count: \(trainingBatches.count)")

    for trainingBatch in trainingBatches {
        let documents = trainingBatch.data
        let labels = Tensor<Int32>(trainingBatch.label)

        // Loss and its gradient with respect to the classifier's parameters.
        let (loss, gradients) = valueWithGradient(at: bertClassifier) { model -> Tensor<Float> in
            softmaxCrossEntropy(logits: model(documents), labels: labels)
        }

        trainingLossSum += loss.scalarized()
        trainingBatchCount += 1
        optimizer.update(&bertClassifier, along: gradients)

        // Report the running mean over the batches processed so far.
        let meanTrainingLoss = trainingLossSum / Float(trainingBatchCount)
        print("  Training loss: \(meanTrainingLoss)")
    }

    // ---- Validation pass ----
    print("dataset.validationBatches.count: \(dataset.validationBatches.count)")
    Context.local.learningPhase = .inference
    var devLossSum: Float = 0
    var devBatchCount = 0
    var correctGuessCount = 0
    var totalGuessCount = 0

    for validationBatch in dataset.validationBatches {
        let documents = validationBatch.data
        let labels = Tensor<Int32>(validationBatch.label)

        let logits = bertClassifier(documents)
        devLossSum += softmaxCrossEntropy(logits: logits, labels: labels).scalarized()
        devBatchCount += 1

        // A guess is correct when the highest-scoring class equals the label.
        let isCorrect = logits.argmax(squeezingAxis: 1) .== labels
        correctGuessCount += Int(Tensor<Int32>(isCorrect).sum().scalarized())
        // The leading dimension of `tokenIds` is used as this batch's example count.
        totalGuessCount += documents.tokenIds.shape[0]
    }

    let accuracy = Float(correctGuessCount) / Float(totalGuessCount)
    let meanDevLoss = devLossSum / Float(devBatchCount)
    print("Accuracy: \(correctGuessCount)/\(totalGuessCount) (\(accuracy)) Eval loss: \(meanDevLoss)")
}

Training BERT for the Language2Label task!
[Epoch 1]
epochBatches.count: 23
  Training loss: 1.9004849
  Training loss: 1.8883357
  Training loss: 1.8536991
  Training loss: 1.8264134
  Training loss: 1.7747473
  Training loss: 1.7418782
  Training loss: 1.7116826
  Training loss: 1.6916653
  Training loss: 1.6637888
  Training loss: 1.6384094
  Training loss: 1.6082419
  Training loss: 1.5870118
  Training loss: 1.5518224
  Training loss: 1.5203637
  Training loss: 1.4909145
  Training loss: 1.4588009
  Training loss: 1.4305873
  Training loss: 1.4030278
  Training loss: 1.3693583
  Training loss: 1.337987
  Training loss: 1.3086315
  Training loss: 1.2816018
  Training loss: 1.2540916
dataset.validationBatches.count: 6
Accuracy: 478/603 (0.79270315) Eval loss: 0.60312885


In [8]:
/// The outcome of classifying a single text.
///
/// The members carry no explicit access modifier: the struct itself is
/// internal, so `public` on its members had no effect.
struct Prediction {
    /// Index of the predicted class.
    let classIdx: Int
    /// Name of the predicted class.
    let className: String
    /// Probability associated with the predicted class.
    let probability: Float
}

In [128]:
// TODO: get num_best preds
/// Classifies each text and returns the single most likely class per text.
///
/// Besides its arguments, the function reads the notebook-level values
/// `maxSequenceLength`, `batchSize` and `dataset.labels`. It prints progress
/// information while it runs.
///
/// The learning phase is switched to `.inference` for the duration of the call
/// and restored on exit, so the result does not depend on whichever phase an
/// earlier (possibly interrupted) training cell left behind.
///
/// - Parameters:
///   - texts: The texts to classify.
///   - bertClassifier: The classifier used to score the texts.
/// - Returns: One `Prediction` per input text, in input order.
func predict(_ texts: [String], bertClassifier: BERTClassifier) -> [Prediction] {
    print("predict()")
    print("texts: \(texts.count)")

    // Force inference mode for this call only; put the previous phase back on exit.
    let previousLearningPhase = Context.local.learningPhase
    Context.local.learningPhase = .inference
    defer { Context.local.learningPhase = previousLearningPhase }

    // Preprocess every text on its own; batching and padding happen below.
    let validationExamples: [TextBatch] = texts.map { text in
        bertClassifier.bert.preprocess(
            sequences: [text],
            maxSequenceLength: maxSequenceLength
        )
    }

    print("validationExamples.count: \(validationExamples.count)")

    print("batchSize: \(batchSize)")
    print("maxSequenceLength: \(maxSequenceLength)")
    print("batchSize / maxSequenceLength: \(batchSize / maxSequenceLength)")

    let validationBatches = validationExamples.inBatches(of: batchSize / maxSequenceLength).map {
        $0.paddedAndCollated(to: maxSequenceLength)
    }
    print("validationBatches: \(validationBatches.count)")

    var preds: [Prediction] = []
    preds.reserveCapacity(texts.count)

    for batch in validationBatches {
        print("batch")
        let logits = bertClassifier(batch)
        let probs = softmax(logits, alongAxis: 1)

        // Read each tensor back once per batch instead of once per element.
        let classIdxs = logits.argmax(squeezingAxis: 1).scalars
        let probScalars = probs.scalars
        // `probs` is indexed as [row, class]; `scalars` is its row-major flattening.
        let classCount = probs.shape[1]

        for (row, rawClassIdx) in classIdxs.enumerated() {
            let classIdx = Int(rawClassIdx)
            preds.append(
                Prediction(
                    classIdx: classIdx,
                    className: dataset.labels[classIdx],
                    probability: probScalars[row * classCount + classIdx]))
        }
    }
    return preds
}

// A handful of sample sentences for spot-checking `predict`.
let texts: [String] = [
    "A person is walking forwards.",
    "A person walks 4 steps forward.",
    "A person walks in a circle counter clockwise.",
    "A person getting done on their knees",
]

// The spot-check itself is currently disabled:
// let preds = predict(texts, bertClassifier: bertClassifier)

// for (idx, pred) in preds.enumerated() {
//     print(idx, texts[idx], pred)
// }

## Run inference on the whole dataset

In [73]:
// File URL of the CSV that is loaded below.
let dsURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/labels_ds_v1.csv")
// Read the CSV through the `pd` handle.
// NOTE(review): `pd` is not declared in any visible cell — presumably a pandas
// module handle provided by the included Language2Label.swift; confirm.
let df = pd.read_csv(dsURL.path)

In [75]:
// Unique values of the `label` column, sorted and converted to Swift strings.
// The force-unwrap traps if a value cannot be converted to `String`.
let labels = df.label.unique().sorted().map {String($0)!}
// Bare expression so the notebook displays the resulting list.
labels

▿ 5 elements
  - 0 : "Doing something"
  - 1 : "Performing motions with hands"
  - 2 : "Walking and turning"
  - 3 : "Walking forward few steps"
  - 4 : "Walking or running"


In [135]:
// All values of the `text` column converted to a Swift array; the force-unwrap
// traps if the conversion to `[String]` fails.
let texts2: [String] = Array(df.text.to_list())! // .iloc[0..<2000]
// Bare expression so the notebook displays how many texts were loaded.
texts2.count

3012


In [136]:
// Classify every text loaded from the CSV; one prediction per entry of `texts2`.
let preds2: [Prediction] = predict(texts2, bertClassifier: bertClassifier)

predict()
texts: 3012
validationExamples.count: 3012
batchSize: 2048
maxSequenceLength: 20
batchSize / maxSequenceLength: 102
validationBatches: 30
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch
batch


In [None]:
// Print each position together with its source text and the matching prediction.
for idx in preds2.indices {
    print(idx, texts2[idx], preds2[idx])
}