# MotionDataset2label with ResNet

In [None]:
%install-location /notebooks/language2motion.gt/swift-install
%install-swiftpm-flags -c release
%install '.package(path: "/notebooks/language2motion.gt/code")' Batcher ModelSupport Datasets MotionDataset 
// ImageClassificationModels
// %install '.package(path: "/notebooks/language2motion.gt/code")' MotionDataset 

In [None]:
import Foundation
import TensorFlow
import MotionDataset

import Batcher
import ModelSupport
import Datasets

// import ImageClassificationModels

In [None]:
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

let plt = Python.import("matplotlib.pyplot")
let np  = Python.import("numpy")
let random  = Python.import("random")
let sklearn  = Python.import("sklearn")
let model_selection  = Python.import("sklearn.model_selection")
let subprocess = Python.import("subprocess")
let glob = Python.import("glob")
let pil = Python.import("PIL")
let Image = Python.import("PIL.Image")
let pd = Python.import("pandas")

# Create motion sample tensor

In [None]:
extension Tensor where Scalar: Numeric {
    /// Returns a copy of this rank-2 tensor whose 0-th axis has exactly `width` entries.
    ///
    /// A tensor shorter than `width` is padded at the end of axis 0 via `padded(forSizes:)`.
    /// A tensor longer than `width` is cropped to a randomly positioned window of `width`
    /// consecutive rows. Axis 1 is never changed.
    ///
    /// - Parameter width: Target size of axis 0; must not be negative.
    /// - Returns: A tensor whose axis 0 has size `width` and whose axis 1 is unchanged.
    func paddedOrCropped(to width: Int) -> Tensor<Scalar> {
        precondition(
            shape.count == 2,
            "paddedOrCropped(to:) expects a rank-2 tensor, got rank \(shape.count)")
        precondition(width >= 0, "paddedOrCropped(to:) expects a non-negative width, got \(width)")

        let currentWidth = shape[0]
        let nPadding = Swift.max(width - currentWidth, 0)
        let maxCropping = Swift.max(currentWidth - width, 0)
        // Closed range: an offset of exactly `maxCropping` is valid as well (the window then ends
        // on the last row), so it must be selectable.
        let nCropping = (maxCropping > 0) ? Int.random(in: 0...maxCropping) : 0
        // Never slice past the end of the tensor; the missing rows are supplied by the padding.
        let sliceEnd = Swift.min(nCropping + width, currentWidth)
        return self[nCropping..<sliceEnd]
            .padded(forSizes: [(before: 0, after: nPadding), (before: 0, after: 0)])
    }
}
// t.paddedOrCropped(to: 100).shape

# 1-channel ResNet model

In [None]:
// import TensorFlow

// Original Paper:
// "Deep Residual Learning for Image Recognition"
// Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
// https://arxiv.org/abs/1512.03385
// This uses shortcut layers to connect residual blocks
// (aka Option (B) in https://arxiv.org/abs/1812.01187).
//
// The structure of this implementation was inspired by the Flax ResNet example:
// https://github.com/google/flax/blob/master/examples/imagenet/models.py

/// A bias-free 2-D convolution immediately followed by batch normalization.
public struct ConvBN: Layer {
    /// The convolution; created without a bias term.
    public var conv: Conv2D<Float>
    /// Batch normalization applied to the convolution output.
    public var norm: BatchNorm<Float>

    /// Creates the convolution / batch-norm pair.
    ///
    /// - Parameters:
    ///   - filterShape: Convolution filter shape; its last component is also used as the
    ///     feature count of the batch normalization.
    ///   - strides: Convolution strides.
    ///   - padding: Convolution padding mode.
    public init(
        filterShape: (Int, Int, Int, Int),
        strides: (Int, Int) = (1, 1),
        padding: Padding = .valid
    ) {
        let outputFeatureCount = filterShape.3
        conv = Conv2D<Float>(
            filterShape: filterShape,
            strides: strides,
            padding: padding,
            useBias: false)
        norm = BatchNorm<Float>(
            featureCount: outputFeatureCount,
            momentum: 0.9,
            epsilon: 1e-5)
    }

    /// Applies the convolution and then the batch normalization to `input`.
    @differentiable
    public func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        let convolved = conv(input)
        return norm(convolved)
    }
}

/// A residual block: a stack of `ConvBN` layers whose output is added to a skip connection
/// (the input itself, or a 1x1 `ConvBN` projection of it) before a final ReLU.
public struct ResidualBlock: Layer {
    /// 1x1 projection applied to the skip connection when `needsProjection` is true.
    /// When it is false this holds a minimal placeholder that is never called.
    public var projection: ConvBN
    /// Whether the skip connection has to be projected to match the convolution path.
    @noDerivative public let needsProjection: Bool
    /// All convolutions except the last one; each is followed by a ReLU.
    public var earlyConvs: [ConvBN] = []
    /// Final convolution; its output is added to the skip connection.
    public var lastConv: ConvBN

    /// Creates a residual block.
    ///
    /// - Parameters:
    ///   - inputFilters: Channel count of the block's input.
    ///   - filters: Base channel count of the block; the output has `filters` channels for
    ///     basic blocks and `4 * filters` channels otherwise.
    ///   - strides: Strides applied once inside the block (and by the projection).
    ///   - useLaterStride: For non-basic blocks, whether the stride sits on the 3x3 convolution
    ///     (ResNet v1.5) instead of the first 1x1 convolution (ResNet v1).
    ///   - isBasic: Whether to build the two-convolution basic block instead of the
    ///     three-convolution block.
    public init(
        inputFilters: Int, filters: Int, strides: (Int, Int), useLaterStride: Bool, isBasic: Bool
    ) {
        let outFilters = filters * (isBasic ? 1 : 4)
        // Both stride components are checked: a stride on either axis changes the spatial size
        // of the convolution path, so the skip connection must be projected to match it.
        self.needsProjection = (inputFilters != outFilters) || (strides != (1, 1))
        // TODO: Replace the following, so as to not waste memory for non-projection cases.
        if needsProjection {
            projection = ConvBN(filterShape: (1, 1, inputFilters, outFilters), strides: strides)
        } else {
            projection = ConvBN(filterShape: (1, 1, 1, 1))
        }

        if isBasic {
            earlyConvs = [
                ConvBN(filterShape: (3, 3, inputFilters, filters), strides: strides, padding: .same)
            ]
            lastConv = ConvBN(filterShape: (3, 3, filters, outFilters), padding: .same)
        } else {
            if useLaterStride {
                // Configure for ResNet V1.5 (the more common implementation).
                earlyConvs.append(ConvBN(filterShape: (1, 1, inputFilters, filters)))
                earlyConvs.append(
                    ConvBN(filterShape: (3, 3, filters, filters), strides: strides, padding: .same))
            } else {
                // Configure for ResNet V1 (the paper implementation).
                earlyConvs.append(
                    ConvBN(filterShape: (1, 1, inputFilters, filters), strides: strides))
                earlyConvs.append(ConvBN(filterShape: (3, 3, filters, filters), padding: .same))
            }
            lastConv = ConvBN(filterShape: (1, 1, filters, outFilters))
        }
    }

    /// Runs the convolution path and adds the (possibly projected) skip connection.
    @differentiable
    public func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        let residual: Tensor<Float>
        // TODO: Find a way for this to be checked only at initialization, not during training or
        // inference.
        if needsProjection {
            residual = projection(input)
        } else {
            residual = input
        }

        let earlyConvsReduced = earlyConvs.differentiableReduce(input) { last, layer in
            relu(layer(last))
        }
        let lastConvResult = lastConv(earlyConvsReduced)

        return relu(lastConvResult + residual)
    }
}

/// ResNet v1 / v1.5 classifier, available at several depths, with a configurable number of
/// input channels.
public struct MyResNet: Layer {
    /// First convolution applied to the raw input.
    public var initialLayer: ConvBN
    /// Pooling applied right after the first convolution.
    public var maxPool: MaxPool2D<Float>
    /// All residual blocks, in the order they are applied.
    public var residualBlocks: [ResidualBlock] = []
    public var avgPool = GlobalAvgPool2D<Float>()
    public var flatten = Flatten<Float>()
    /// Final dense layer producing one logit per class.
    public var classifier: Dense<Float>

    /// Builds a ResNet v1 or v1.5 network.
    ///
    /// - Parameters:
    ///   - classCount: Number of output classes (size of the classifier output).
    ///   - depth: Network depth, one of the `MyResNet.Depth` cases.
    ///   - downsamplingInFirstStage: If true, the network starts with a strided 7x7 convolution
    ///     (64 filters) and a strided 3x3 max pool; if false, it starts with an unstrided 3x3
    ///     convolution (16 filters) and a pooling layer that does nothing.
    ///   - useLaterStride: If false, the stride within a residual block sits on the first 1x1
    ///     convolution (ResNet v1); if true, it is moved to the 3x3 convolution (v1.5).
    ///   - channelCount: Number of channels of the input tensor.
    public init(
        classCount: Int, depth: Depth, downsamplingInFirstStage: Bool = true,
        useLaterStride: Bool = true, channelCount: Int = 3
    ) {
        let stemFilters = downsamplingInFirstStage ? 64 : 16

        if downsamplingInFirstStage {
            initialLayer = ConvBN(
                filterShape: (7, 7, channelCount, stemFilters), strides: (2, 2), padding: .same)
            maxPool = MaxPool2D(poolSize: (3, 3), strides: (2, 2), padding: .same)
        } else {
            initialLayer = ConvBN(filterShape: (3, 3, channelCount, stemFilters), padding: .same)
            maxPool = MaxPool2D(poolSize: (1, 1), strides: (1, 1))  // no-op
        }

        let isBasic = depth.usesBasicBlocks
        // Output channels of a block are `filters * expansion`.
        let expansion = isBasic ? 1 : 4
        let stageSizes = depth.layerBlockSizes

        var blocks: [ResidualBlock] = []
        var incomingFilters = stemFilters
        for (stage, blockCount) in stageSizes.enumerated() {
            // The filter count doubles with every stage.
            let stageFilters = stemFilters << stage
            for position in 0..<blockCount {
                // Only the first block of every stage after the first one downsamples.
                let downsamples = stage > 0 && position == 0
                let block = ResidualBlock(
                    inputFilters: incomingFilters,
                    filters: stageFilters,
                    strides: downsamples ? (2, 2) : (1, 1),
                    useLaterStride: useLaterStride,
                    isBasic: isBasic)
                blocks.append(block)
                incomingFilters = stageFilters * expansion
            }
        }
        residualBlocks = blocks

        let lastStageFilters = stemFilters << (stageSizes.count - 1)
        classifier = Dense(inputSize: lastStageFilters * expansion, outputSize: classCount)
    }

    /// Runs the stem, all residual blocks, global average pooling and the classifier.
    @differentiable
    public func callAsFunction(_ input: Tensor<Float>) -> Tensor<Float> {
        let stem = maxPool(relu(initialLayer(input)))
        let features = residualBlocks.differentiableReduce(stem) { activations, block in
            block(activations)
        }
        return classifier(flatten(avgPool(features)))
    }
}

extension MyResNet {
    /// The network depths this implementation can build.
    public enum Depth {
        case resNet18
        case resNet34
        case resNet50
        case resNet56
        case resNet101
        case resNet152

        /// Whether this depth is built from basic residual blocks.
        var usesBasicBlocks: Bool {
            switch self {
            case .resNet18, .resNet34, .resNet56:
                return true
            case .resNet50, .resNet101, .resNet152:
                return false
            }
        }

        /// Number of residual blocks in each stage, in network order.
        var layerBlockSizes: [Int] {
            switch self {
            case .resNet18:
                return [2, 2, 2, 2]
            case .resNet34, .resNet50:
                return [3, 4, 6, 3]
            case .resNet56:
                return [9, 9, 9]
            case .resNet101:
                return [3, 4, 23, 3]
            case .resNet152:
                return [3, 8, 36, 3]
            }
        }
    }
}

In [None]:
// ResNet-18 classifier for single-channel input with 5 output classes; first-stage
// downsampling is switched off.
// NOTE(review): `classCount` is hard-coded here while the label list is derived from the labels
// CSV elsewhere in this notebook — confirm the two agree.
var model = MyResNet(classCount: 5, depth: .resNet18, downsamplingInFirstStage: false, channelCount: 1)

In [None]:
// SGD optimizer for `model` with a learning rate of 0.001.
let optimizer = SGD(for: model, learningRate: 0.001)

# load labels

# put dataset together

In [None]:
/// Converts a motion sample into a fixed-width input tensor and the index of its label.
///
/// - Parameters:
///   - ms: The motion sample to convert.
///   - labelsDict: Maps sample IDs to label strings. Samples without an entry get the label
///     "Doing something".
///   - labels: All label strings; the returned index is the position of the sample's label in
///     this array.
///   - tensorWidth: Size of axis 0 of the returned tensor (the sample is padded or randomly
///     cropped to this width).
/// - Returns: The sample's frames padded/cropped to `tensorWidth` with an extra axis appended at
///   position 2, and the label index.
/// - Precondition: The sample's label (or the fallback label) is contained in `labels`.
func getTensorLabel(_ ms: MotionSample, labelsDict: [Int: String], labels: [String], tensorWidth: Int) -> (Tensor<Float>, Int32) {
    // TODO: code _unknown_ label
    let labelStr = labelsDict[ms.sampleID] ?? "Doing something"

    // Fail with context instead of an anonymous force-unwrap crash.
    guard let labelIndex = labels.firstIndex(of: labelStr) else {
        fatalError("Label \"\(labelStr)\" of sample \(ms.sampleID) is not contained in `labels`")
    }

    let tensor = Tensor<Float>(ms.motionFramesArray)
        .paddedOrCropped(to: tensorWidth)
        .expandingShape(at: 2)
    return (tensor, Int32(labelIndex))
}

// let (tensor, label) = getTensorLabel(ms, labelsDict: labelsDict, labels: labels, tensorWidth: 100)
// (tensor.shape, label)

# train test split of collection

In [None]:
extension Array {
    /// Shuffles the array and splits it into a training and a test part.
    ///
    /// - Parameter split: Fraction of the elements that goes into `train`. Values outside
    ///   `0...1` are clamped to that range; NaN is a programmer error.
    /// - Returns: `train` holds the first `round(split * count)` shuffled elements, `test` holds
    ///   the rest. Together they contain every element exactly once.
    func trainTestSplit(split: Float) -> (train: [Element], test: [Element]) {
        precondition(!split.isNaN, "trainTestSplit(split:) requires a numeric split fraction")

        let shuffledElements = shuffled()
        let fraction = Swift.min(Swift.max(split, 0), 1)
        let roundedIndex = Int((fraction * Float(count)).rounded())
        // `Float(count)` is inexact for very large counts, so clamp the index as well.
        let splitIdx = Swift.min(Swift.max(roundedIndex, 0), count)

        return (
            train: Array(shuffledElements[..<splitIdx]),
            test: Array(shuffledElements[splitIdx...])
        )
    }
}

In [None]:
/// Motion-classification dataset: every motion sample becomes a fixed-width tensor paired with
/// the index of its label; the pairs are shuffled and split 80/20 into training and test
/// batchers.
public struct Motion2Label {
    /// One element per motion sample: the input tensor and its label index.
    public typealias SourceDataSet = [TensorPair<Float, Int32>]
    /// Batches over the training part; created with shuffling enabled.
    public let training: Batcher<SourceDataSet>
    /// Batches over the test part; created with shuffling disabled.
    public let test: Batcher<SourceDataSet>
    /// Distinct labels from the labels CSV, sorted. A sample's label index is the position of
    /// its label in this array.
    public let labels: [String]
    /// The motion data the tensors were built from.
    public let motionData: MotionData

    /// Loads `MotionData` from `serializedDatasetURL`, printing the elapsed time and the
    /// dataset description.
    ///
    /// Nothing in the body currently throws; the `throws` is kept so existing `try` call sites
    /// keep compiling.
    func readBinaryMotionData(_ serializedDatasetURL: URL) throws -> MotionData {
        print("Reading..., decoding...")
        let date = Date()
        let motionData2 = MotionData(from: serializedDatasetURL)
        print("Done in \(abs(date.timeIntervalSinceNow)) sec.")
        print(motionData2.description)
        return motionData2
    }

    /// Loads the motion data and labels and prepares the training and test batchers.
    ///
    /// - Parameters:
    ///   - batchSize: Batch size used for both batchers.
    ///   - serializedDatasetURL: Location of the serialized motion dataset.
    ///   - labelsURL: Location of a CSV file read with pandas; its `sample_id` and `label`
    ///     columns are used.
    ///   - tensorWidth: Size of axis 0 every sample tensor is padded or cropped to.
    public init(
        batchSize: Int, serializedDatasetURL: URL, labelsURL: URL, tensorWidth: Int = 224
    ) {
        let motionData2 = MotionData(from: serializedDatasetURL)
        print(motionData2.description)

        let df = pd.read_csv(labelsURL.path)
        let labels2: [String] = df.label.unique().sorted().map { (pyLabel: PythonObject) -> String in
            guard let label = String(pyLabel) else {
                fatalError("Non-string label \(pyLabel) in \(labelsURL.path)")
            }
            return label
        }

        // sample ID -> label string
        var labelsDict: [Int: String] = [:]
        for pythonTuple in df.iterrows() {
            // `iterrows()` yields (index, row) pairs; element 1 is the row.
            let row = pythonTuple[1]
            guard let sampleID = Int(row.sample_id), let label = String(row.label) else {
                fatalError("Malformed row in \(labelsURL.path): \(row)")
            }
            labelsDict[sampleID] = label
        }

        // Build the tensor pairs in one pass instead of going through an intermediate array of
        // tuples.
        let tensorPairs: SourceDataSet = motionData2.motionSamples.map {
            (sample: MotionSample) -> TensorPair<Float, Int32> in
            let (tensor, label) = getTensorLabel(
                sample, labelsDict: labelsDict, labels: labels2, tensorWidth: tensorWidth)
            return TensorPair(first: tensor, second: Tensor<Int32>(label))
        }
        print("tensorPairs: \(tensorPairs.count)")

        let (trainTensorPairs, testTensorPairs) = tensorPairs.trainTestSplit(split: 0.8)
        print("trainTensorPairs.count = \(trainTensorPairs.count)")
        print("testTensorPairs.count = \(testTensorPairs.count)")

        self.training = Batcher(
            on: trainTensorPairs,
            batchSize: batchSize,
            numWorkers: 1, //No need to use parallelism since everything is loaded in memory
            shuffle: true)
        self.test = Batcher(
            on: testTensorPairs,
            batchSize: batchSize,
            numWorkers: 1,
            shuffle: false)
        self.labels = labels2
        self.motionData = motionData2
    }
}

In [None]:
// Batch size used for both the training and the test batcher.
let batchSize = 100

// Locations of the serialized motion dataset and of the CSV with the per-sample labels.
let serializedDatasetURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/motion_dataset.4000.plist")
let labelsURL = URL(fileURLWithPath: "/notebooks/language2motion.gt/data/labels_ds_v2.csv")

// Loads everything and builds the training/test batchers.
let dataset = Motion2Label(
    batchSize: batchSize, 
    serializedDatasetURL: serializedDatasetURL,
    labelsURL: labelsURL
)
// NOTE(review): `count` is presumably the number of batches, not of samples — confirm against
// Batcher.
print("dataset.training.count: \(dataset.training.count)")
print("dataset.test.count: \(dataset.test.count)")

In [None]:
print("Starting motion2label training...")

// Trains `model` for 10 epochs; after every epoch the model is evaluated on the test batches and
// the training loss, test accuracy and test loss are printed.
for epoch in 1...10 {
    print("epoch \(epoch)")

    // Training pass.
    Context.local.learningPhase = .training
    var trainingLossSum: Float = 0
    var trainingBatchCount = 0
    for batch in dataset.training.sequenced() {
        print("progress \(100.0*Float(trainingBatchCount)/Float(dataset.training.count))%")
        let (tensors, labels) = (batch.first, batch.second)
        let (loss, gradients) = valueWithGradient(at: model) { model -> Tensor<Float> in
            let logits = model(tensors)
            return softmaxCrossEntropy(logits: logits, labels: labels)
        }
        trainingLossSum += loss.scalarized()
        trainingBatchCount += 1
        optimizer.update(&model, along: gradients)
    }
    // The training loss was accumulated but never shown; report its per-batch mean.
    print("[Epoch \(epoch)] Training loss: \(trainingLossSum / Float(trainingBatchCount))")

    // Evaluation pass.
    Context.local.learningPhase = .inference
    var testLossSum: Float = 0
    var testBatchCount = 0
    var correctGuessCount = 0
    var totalGuessCount = 0
    for batch in dataset.test.sequenced() {
        let (tensors, labels) = (batch.first, batch.second)
        let logits = model(tensors)
        testLossSum += softmaxCrossEntropy(logits: logits, labels: labels).scalarized()
        testBatchCount += 1

        let correctPredictions = logits.argmax(squeezingAxis: 1) .== labels
        correctGuessCount += Int(Tensor<Int32>(correctPredictions).sum().scalarized())
        // Count the samples actually present in this batch: the last batch may hold fewer than
        // `batchSize` samples, and adding `batchSize` would understate the accuracy.
        totalGuessCount += labels.shape[0]
    }

    let accuracy = Float(correctGuessCount) / Float(totalGuessCount)
    print(
        """
        [Epoch \(epoch)] \
        Accuracy: \(correctGuessCount)/\(totalGuessCount) (\(accuracy*100)%) \
        Loss: \(testLossSum / Float(testBatchCount))
        """
    )
}