# BERT-language2label

In [1]:
%install-location /notebooks/language2motion.gt/swift-install
%install-swiftpm-flags -c release
%install '.package(path: "/notebooks/language2motion.gt/code")' Batcher ModelSupport Datasets TextModels

import Datasets
import Foundation
import ModelSupport
import TensorFlow
import TextModels
import PythonKit

Installing packages:
	.package(path: "/notebooks/language2motion.gt/code")
		Batcher
		ModelSupport
		Datasets
		TextModels
With SwiftPM flags: ['-c', 'release']
Working in: /tmp/tmp6v1o5k6u/swift-install
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
Initializing Swift...
Installation complete!


In [2]:
// Directory "bert_models", resolved relative to the system temporary directory; it is the
// location handed to `load(from:)` below.
let workspaceURL = URL(
    fileURLWithPath: "bert_models",
    isDirectory: true,
    relativeTo: URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true))

// Uncased, English-only BERT Base.
let bertPretrained = BERT.PreTrainedModel.bertBase(cased: false, multilingual: false)
let bert = try bertPretrained.load(from: workspaceURL)

// Classifier on top of the loaded BERT with five output classes.
var bertClassifier = BERTClassifier(bert: bert, classCount: 5)

Loading BERT pre-trained model 'BERT Base Uncased'.
Loading resource: uncased_L-12_H-768_A-12


## load dataset

In [3]:
// TODO: 
// + load csv
// + split into train/dev
// + convert to [Example]
// + get sorted labels
// + integrate with Language2Label
// + Language2Label code cleanups
// - extract Language2Label codes
// + train

In [4]:
let pd  = Python.import("pandas")
let model_selection  = Python.import("sklearn.model_selection")

In [5]:
/// A `TextBatch` with the corresponding labels.
///
/// `data` holds the model input and `label` the `Int32` class indices that belong to it. The
/// same tuple type is used in this file both for a single mapped example and for a collated
/// batch of examples.
public typealias LabeledTextBatch = (data: TextBatch, label: Tensor<Int32>)


/// One Language2Label sample: an identifier, the text, and an optional class label.
public struct Language2LabelExample {
    /// A class label given as its index (`idx`) together with its string name (`label`).
    public typealias LabelTuple = (idx: Int, label: String)

    /// Identifier of the sample.
    public let id: String
    /// The text of the sample.
    public let text: String
    /// The label of the sample, or `nil` when no label is attached.
    public let label: LabelTuple?

    /// Creates an example from its identifier, text and (optional) label.
    public init(id: String, text: String, label: LabelTuple?) {
        self.label = label
        self.text = text
        self.id = id
    }
}


/// The Language2Label text-classification dataset: mapped training and validation examples
/// plus the batched sequences derived from them.
///
/// `Entropy` is the random number generator type used by `TrainingEpochs` for the training
/// epochs. The struct declares no initializer of its own; the initializers live in extensions.
public struct Language2Label <Entropy: RandomNumberGenerator> {
    /// A lazily mapped array of examples, each converted to a `LabeledTextBatch`.
    public typealias Samples = LazyMapSequence<[Language2LabelExample], LabeledTextBatch>
    
    /// The training texts.
    public let trainingExamples: Samples
    /// The validation texts.
    public let validationExamples: Samples

    /// The sequence length to which every sentence will be padded.
    public let maxSequenceLength: Int
    /// The batch size passed to the initializer. The initializer in this file divides it by
    /// `maxSequenceLength` to obtain the number of examples per batch.
    public let batchSize: Int
    /// The label names; a label's position in this array is its class index.
    public let labels: [String]

    /// The type of the collection of batches.
    public typealias Batches = Slices<Sampling<Samples, ArraySlice<Int>>>
    /// The type of the training sequence of epochs.
    public typealias TrainEpochs = LazyMapSequence<TrainingEpochs<Samples, Entropy>, 
        LazyMapSequence<Batches, LabeledTextBatch>>
    /// The sequence of training data (epochs of batches).
    public var trainingEpochs: TrainEpochs
    /// The validation batches.
    public var validationBatches: LazyMapSequence<Slices<Samples>, LabeledTextBatch>    
}

//===-----------------------------------------------------------------------------------------===//
// Data
//===-----------------------------------------------------------------------------------------===//

extension Language2Label {

    /// Names of the dataset splits.
    enum FileType: String {
        case train
        case dev
    }

    /// Converts the rows of a pandas data frame into `Language2LabelExample` values.
    ///
    /// Each row is read through its `sample_id`, `text` and `label` attributes. The function
    /// traps when `text` or `label` cannot be converted to a Swift `String`, or when a row's
    /// label is not contained in `labels`.
    ///
    /// - Parameters:
    ///   - df: the data frame whose rows are converted.
    ///   - labels: the label names; a label's index in this array becomes the example's `idx`.
    /// - Returns: one example per row, in the order produced by `df.iterrows()`.
    static func Df2Example(df: PythonObject, labels: [String]) -> [Language2LabelExample] {
        var examples: [Language2LabelExample] = []
        for indexedRow in Python.list(df.iterrows()) {
            // `iterrows()` yields (index, row) pairs; only the row is needed.
            let (_, row) = indexedRow.tuple2
            let identifier = "\(row.sample_id)" // Int to String
            let text = String(row.text)!
            let labelName = String(row.label)!
            let labelIndex = labels.firstIndex(of: labelName)!
            examples.append(
                Language2LabelExample(
                    id: identifier,
                    text: text,
                    label: (idx: labelIndex, label: labelName)))
        }
        return examples
    }
}

extension Language2Label {
  /// Creates a dataset from the labels CSV, split into training and validation examples.
  ///
  /// The CSV at `/notebooks/language2motion.gt/data/labels.csv` is read with pandas and split
  /// with `train_test_split(df, test_size: 0.2)`. No random seed is passed to the split, so it
  /// is presumably different on every run.
  ///
  /// - Parameters:
  ///   - taskDirectoryURL: currently unused; the data is always read from the fixed CSV path.
  ///     The parameter is kept so that existing call sites continue to compile.
  ///   - maxSequenceLength: the length to which the texts of a batch are padded. Must be
  ///     positive.
  ///   - batchSize: the batch budget; every batch holds `batchSize / maxSequenceLength`
  ///     examples, but never fewer than one.
  ///   - entropy: a source of randomness used to shuffle sample ordering. It is handed to
  ///     `TrainingEpochs`, so if it is only pseudorandom and has value semantics, the sequence
  ///     of epochs is deterministic and not dependent on other operations.
  ///   - exampleMap: a transform that turns a `Language2LabelExample` into a
  ///     `LabeledTextBatch`.
  public init(
    taskDirectoryURL: URL,
    maxSequenceLength: Int,
    batchSize: Int,
    entropy: Entropy,
    exampleMap: @escaping (Language2LabelExample) -> LabeledTextBatch
  ) throws {
    precondition(maxSequenceLength > 0, "maxSequenceLength must be positive")
    // The integer division yields 0 whenever batchSize < maxSequenceLength; clamp it so that a
    // batch always contains at least one example.
    let examplesPerBatch = max(1, batchSize / maxSequenceLength)

    // Load the data file.
    let dsURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/labels.csv")
    let df = pd.read_csv(dsURL.path)
    // The position of a label in this sorted list is its class index.
    let sortedLabels: [String] = df.label.unique().sorted().map { String($0)! }
    let (trainDF, validationDF) = model_selection.train_test_split(df, test_size: 0.2).tuple2

    let training: Samples =
      Language2Label.Df2Example(df: trainDF, labels: sortedLabels).lazy.map(exampleMap)
    let validation: Samples =
      Language2Label.Df2Example(df: validationDF, labels: sortedLabels).lazy.map(exampleMap)

    self.labels = sortedLabels
    self.trainingExamples = training
    self.validationExamples = validation
    self.maxSequenceLength = maxSequenceLength
    self.batchSize = batchSize

    // Create the training sequence of epochs.
    self.trainingEpochs = TrainingEpochs(
      samples: training, batchSize: examplesPerBatch, entropy: entropy
    ).lazy.map { (batches: Batches) -> LazyMapSequence<Batches, LabeledTextBatch> in
      batches.lazy.map {
        (
          data: $0.map(\.data).paddedAndCollated(to: maxSequenceLength),
          label: Tensor($0.map(\.label))
        )
      }
    }

    // Create the validation collection of batches.
    self.validationBatches = validation.inBatches(of: examplesPerBatch).lazy.map {
      (
        data: $0.map(\.data).paddedAndCollated(to: maxSequenceLength),
        label: Tensor($0.map(\.label))
      )
    }
  }
}

extension Language2Label where Entropy == SystemRandomNumberGenerator {
  /// Creates an instance that shuffles with the system random number generator.
  ///
  /// All arguments are forwarded unchanged to the initializer that takes an explicit
  /// `entropy` value.
  ///
  /// - Parameter exampleMap: a transform that processes `Example` in `LabeledTextBatch`.
  public init(
    taskDirectoryURL: URL,
    maxSequenceLength: Int,
    batchSize: Int,
    exampleMap: @escaping (Language2LabelExample) -> LabeledTextBatch
  ) throws {
    let systemEntropy = SystemRandomNumberGenerator()
    self = try Language2Label(
      taskDirectoryURL: taskDirectoryURL,
      maxSequenceLength: maxSequenceLength,
      batchSize: batchSize,
      entropy: systemEntropy,
      exampleMap: exampleMap)
  }
}

In [6]:
// Regarding the batch size, note that the way batching is performed currently is that we bucket
// input sequences based on their length (e.g., first bucket contains sequences of length 1 to 10,
// second 11 to 20, etc.). We then keep processing examples in the input data pipeline until a
// bucket contains enough sequences to form a batch. The batch size specified in the task
// constructor specifies the *total number of tokens in the batch* and not the total number of
// sequences. So, if the batch size is set to 1024, the first bucket (i.e., lengths 1 to 10)
// will need 1024 / 10 = 102 examples to form a batch (every sentence in the bucket is padded
// to the max length of the bucket). This kind of bucketing is common practice with NLP models and
// it is done to improve memory usage and computational efficiency when dealing with sequences of
// varied lengths. Note that this is not used in the original BERT implementation released by
// Google and so the batch size setting here is expected to differ from that one.
//
// NOTE(review): the `Language2Label` initializer in this notebook shows no length bucketing; it
// forms batches of `batchSize / maxSequenceLength` examples, i.e. 1024 / 128 = 8 with the
// values below.
let maxSequenceLength = 128
let batchSize = 1024

// Every example is tokenized on its own; its label index becomes an `Int32` tensor. An
// example without a label traps here.
var dataset = try Language2Label(
  taskDirectoryURL: workspaceURL,
  maxSequenceLength: maxSequenceLength,
  batchSize: batchSize,
  entropy: SystemRandomNumberGenerator(),
  exampleMap: { (example: Language2LabelExample) -> LabeledTextBatch in
    let encodedText = bertClassifier.bert.preprocess(
      sequences: [example.text],
      maxSequenceLength: maxSequenceLength)
    let classIndex = Int32(example.label!.idx)
    return (data: encodedText, label: Tensor<Int32>(classIndex))
  })

print("Dataset acquired.")

Dataset acquired.


In [7]:
// Number of examples in the training split.
dataset.trainingExamples.count

2409


In [8]:
// Inspect the first mapped training example (its `TextBatch` and label).
dataset.trainingExamples[0]

▿ 2 elements
  ▿ data : TextBatch
    - tokenIds : [[  101,  1037,  2711, 13529,  2015,  2006,  1996,  2723,   102]]
    - tokenTypeIds : [[0, 0, 0, 0, 0, 0, 0, 0, 0]]
    - mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1]]
  - label : 0


In [9]:
// Inspect the first mapped validation example (its `TextBatch` and label).
dataset.validationExamples[0]

▿ 2 elements
  ▿ data : TextBatch
    - tokenIds : [[ 101, 1037, 2529, 7365, 2830, 1998, 3632, 2039, 1996, 5108, 1012,  102]]
    - tokenTypeIds : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    - mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
  - label : 0


## train

In [10]:
// Optimizer for fine-tuning `bertClassifier`: weight-decayed Adam with a base learning rate of
// 2e-5, a 10-step warm-up and a linear decay that starts at step 10.
var optimizer = WeightDecayedAdam(
    for: bertClassifier,
    learningRate: LinearlyDecayedParameter(
        baseParameter: LinearlyWarmedUpParameter(
            baseParameter: FixedParameter<Float>(2e-5),
            warmUpStepCount: 10,
            warmUpOffset: 0),
        // NOTE(review): an earlier comment said the LR decays linearly to zero in 100 steps,
        // but 2e-5 / 5e-7 = 40, so with this slope the rate presumably reaches zero 40 steps
        // after `startStep` — confirm against `LinearlyDecayedParameter`.
        slope: -5e-7,
        startStep: 10),
    weightDecayRate: 0.01,
    maxGradientGlobalNorm: 1)

In [11]:
print("Training BERT for the Language2Label task!")

// Fine-tune for three epochs. After every epoch the classifier is evaluated on the validation
// batches and the accuracy and mean loss are printed.
for (epoch, epochBatches) in dataset.trainingEpochs.prefix(3).enumerated() {
    print("[Epoch \(epoch + 1)]")
    Context.local.learningPhase = .training
    var trainingLossSum: Float = 0
    var trainingBatchCount = 0

    for batch in epochBatches {
        let (documents, labels) = (batch.data, Tensor<Int32>(batch.label))
        let (loss, gradients) = valueWithGradient(at: bertClassifier) { model -> Tensor<Float> in
            let logits = model(documents)
            return softmaxCrossEntropy(logits: logits, labels: labels)
        }

        trainingLossSum += loss.scalarized()
        trainingBatchCount += 1
        optimizer.update(&bertClassifier, along: gradients)

        // Running mean of the training loss over the batches seen so far in this epoch.
        print(
            """
              Training loss: \(trainingLossSum / Float(trainingBatchCount))
            """
        )
    }

    Context.local.learningPhase = .inference
    var devLossSum: Float = 0
    var devBatchCount = 0
    var devCorrectCount = 0
    var devExampleCount = 0
    for batch in dataset.validationBatches {
        let (documents, labels) = (batch.data, Tensor<Int32>(batch.label))
        let logits = bertClassifier(documents)
        let loss = softmaxCrossEntropy(logits: logits, labels: labels)
        devLossSum += loss.scalarized()
        devBatchCount += 1

        // The classifier has several classes, so the prediction is the index of the largest
        // logit. (The previous sigmoid threshold on a squeezed last axis only fits a
        // single-logit binary classifier.)
        let predictedClasses = logits.argmax(squeezingAxis: -1).scalars
        let trueClasses = labels.scalars
        devCorrectCount += zip(predictedClasses, trueClasses)
            .filter { pair in pair.0 == pair.1 }
            .count
        devExampleCount += trueClasses.count
    }

    // Report 0 instead of NaN when there were no validation examples.
    let accuracy = devExampleCount > 0 ? Float(devCorrectCount) / Float(devExampleCount) : 0

    print(
        """
          Accuracy: \(accuracy)
          Eval loss: \(devLossSum / Float(devBatchCount))
        """
    )
}

Training BERT for the Language2Label task!
[Epoch 1]
  Training loss: 1.9629791
  Training loss: 1.7798477


: 