In [None]:
!wget http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip
!unzip Sentiment-Analysis-Dataset.zip
!rm Sentiment-Analysis-Dataset.zip

In [10]:
const fs = require('fs');
const csv_parse = require('csv-parse');
const pp = require('promisepipe');
const tf = require('@tensorflow/tfjs');
require('@tensorflow/tfjs-node-gpu'); undefined;

cpu backend was already registered. Reusing existing backend


In [2]:
tf.getBackend()  // should be "tensorflow", not "cpu"

'tensorflow'

# Loading the dataset

In [11]:
/**
 * In-memory dataset of short texts with binary sentiment labels.
 *
 * Texts are stored as truncated UTF-8 byte arrays. When a batch is built, every
 * byte is shifted by `minOrd - 1` so that index 0 stays free for padding (and
 * for bytes that were never seen while loading). Labels are emitted as one-hot
 * rows of width 2: `[1, 0]` for label 0 and `[0, 1]` for label 1.
 *
 * The batch buffers are allocated once in the constructor and reused by every
 * `nextBatch` call.
 */
class TextsLoader {
    /**
     * @param {Object} config
     * @param {number} config.maxTextLength - texts are truncated to this many UTF-8 bytes.
     * @param {number} config.trainBatchSize - samples per training batch.
     * @param {number} config.validBatchSize - samples per validation batch.
     * @param {number} config.validRatio - fraction of the samples held out for validation.
     */
    constructor(config) {
        const maxLength = config.maxTextLength;
        const trainBatchSize = config.trainBatchSize;
        const validBatchSize = config.validBatchSize;
        const validRatio = config.validRatio;

        this.trainSet = null;
        this.trainPos = 0;
        this.trainBatchX = tf.buffer([trainBatchSize, maxLength], 'int32');
        this.trainBatchY = tf.buffer([trainBatchSize, 2], 'float32');
        this.validSet = null;
        this.validPos = 0;
        this.validBatchX = tf.buffer([validBatchSize, maxLength], 'int32');
        this.validBatchY = tf.buffer([validBatchSize, 2], 'float32');
        this.validRatio = validRatio;
        this.minOrd = 0;
        this.maxOrd = 256;
        this.X = [];
        this.Y = [];
    }

    /** @returns {number} the width of the text batches (second dimension of the X buffers). */
    maxTextLength() {
        return this.trainBatchX.shape[1];
    }

    /** @returns {number} number of samples in a training batch. */
    trainBatchSize() {
        return this.trainBatchX.shape[0];
    }

    /** @returns {number} number of samples in a validation batch. */
    validBatchSize() {
        return this.validBatchX.shape[0];
    }

    /**
     * Trims `text`, encodes it as UTF-8 and keeps at most `maxLength` bytes.
     *
     * @param {string} text
     * @param {number} maxLength
     * @returns {Uint8Array} a standalone copy of the (possibly truncated) bytes.
     */
    static utf8Bytes(text, maxLength) {
        const encoded = Buffer.from(text.trim(), 'utf8');
        // Copy the slice so the stored array does not keep the longer Buffer alive.
        return new Uint8Array(encoded.subarray(0, maxLength));
    }

    /**
     * Reads a CSV file with `SentimentText` and `Sentiment` columns, records the
     * observed byte range and splits the samples randomly into train and
     * validation index sets.
     *
     * @param {string} csvPath
     * @returns {Promise<TextsLoader>} this loader, to allow chaining.
     */
    async load(csvPath) {
        const maxLength = this.maxTextLength();
        let minOrd = 256;
        let maxOrd = 0;
        let trueCount = 0;
        this.X = [];
        this.Y = [];
        await pp(fs.createReadStream(csvPath)
          .pipe(csv_parse({skip_lines_with_error: true, columns: true}))
          .on("data", (chunk) => {
            const bytes = TextsLoader.utf8Bytes(chunk.SentimentText, maxLength);
            for (let i = 0; i < bytes.length; i++) {
                const ord = bytes[i];
                if (ord > maxOrd) {
                    maxOrd = ord;
                }
                if (ord < minOrd) {
                    minOrd = ord;
                }
            }
            const label = Number.parseInt(chunk.Sentiment, 10);
            if (label) {
                trueCount++;
            }
            this.X.push(bytes);
            this.Y.push(label);
          }));
        this.minOrd = minOrd;
        this.maxOrd = maxOrd;
        console.log("char range:", minOrd, maxOrd);
        // Identity permutation built directly; no tensor is allocated for it.
        const permutation = new Int32Array(this.X.length);
        for (let i = 0; i < permutation.length; i++) {
            permutation[i] = i;
        }
        await TextsLoader.shuffle(permutation);
        const cutoff = Math.floor(this.validRatio * this.X.length);
        this.trainSet = permutation.slice(cutoff, this.X.length);
        this.validSet = permutation.slice(0, cutoff);
        console.log('train:', this.trainSet.length, 'samples,', this.countTrainBatches(1), 'batches');
        console.log('validation:', this.validSet.length, 'samples,', this.countValidBatches(1), 'batches');
        console.log('balance:', trueCount / this.X.length);
        return this;
    }

    /**
     * @returns {number} number of distinct indices a batch can contain:
     *     one per byte value in [minOrd, maxOrd] plus index 0.
     */
    charRange() {
        return this.maxOrd + 1 - (this.minOrd - 1);
    }

    /** @returns {number} total number of loaded samples. */
    size() {
        return this.X.length;
    }

    /**
     * @param {number} numEpochs
     * @returns {number} number of full training batches in `numEpochs` epochs.
     */
    countTrainBatches(numEpochs) {
        const batchesPerEpoch = Math.floor(this.trainSet.length / this.trainBatchSize());
        return Math.floor(numEpochs * batchesPerEpoch);
    }

    /**
     * @param {number} numEpochs
     * @returns {number} number of full validation batches in `numEpochs` epochs.
     */
    countValidBatches(numEpochs) {
        const batchesPerEpoch = Math.floor(this.validSet.length / this.validBatchSize());
        return Math.floor(numEpochs * batchesPerEpoch);
    }

    /** @returns {Promise<Array>} `[X, Y]` tensors of the next training batch. */
    async nextTrainBatch() {
        return await this.nextBatch('train');
    }

    /** @returns {Promise<Array>} `[X, Y]` tensors of the next validation batch. */
    async nextValidBatch() {
        return await this.nextBatch('valid');
    }

    /**
     * Fills the reusable buffers of the given split with the next batch and
     * converts them to tensors. When fewer than a full batch of samples remain,
     * the split is reshuffled and iteration restarts from the beginning.
     *
     * @param {string} name - 'train' or 'valid'.
     * @returns {Promise<Array>} `[X, Y]` tensors built from the batch buffers.
     */
    async nextBatch(name) {
        const X = this[name + 'BatchX'];
        const Y = this[name + 'BatchY'];
        const set = this[name + 'Set'];
        const posName = name + 'Pos';
        const batchSize = X.shape[0];
        // The sequence length is the width of X; Y is always 2 columns wide.
        const maxLength = X.shape[1];
        const offset = this.minOrd - 1;
        if (this[posName] + batchSize > set.length) {
            await TextsLoader.shuffle(set);
            this[posName] = 0;
        }
        for (let i = 0; i < batchSize; i++) {
            const pos = set[i + this[posName]];
            const text = this.X[pos];
            for (let j = 0; j < text.length; j++) {
                X.set(text[j] - offset, i, j);
            }
            // The buffer is reused, so the tail must be cleared explicitly.
            for (let j = text.length; j < maxLength; j++) {
                X.set(0, i, j);
            }
            const label = this.Y[pos];
            // One-hot: 1 in the label's column, 0 in the other one.
            Y.set(1, i, label);
            Y.set(0, i, 1 - label);
        }
        this[posName] += batchSize;
        return [X.toTensor(), Y.toTensor()];
    }

    /**
     * Encodes a single text as a `[1, maxTextLength]` int32 batch using the same
     * byte offset as the training batches. Bytes outside the [minOrd, maxOrd]
     * range are mapped to index 0 so the result never exceeds `charRange()`.
     *
     * @param {string} text
     * @returns {Object} tensor created from the filled buffer.
     */
    toBatch(text) {
        const maxLength = this.maxTextLength();
        const bytes = TextsLoader.utf8Bytes(text, maxLength);
        const arr = tf.buffer([1, maxLength], 'int32');
        const offset = this.minOrd - 1;
        for (let i = 0; i < bytes.length; i++) {
            const ord = bytes[i];
            const known = ord >= this.minOrd && ord <= this.maxOrd;
            arr.set(known ? ord - offset : 0, 0, i);
        }
        return arr.toTensor();
    }

    /**
     * Shuffles `array` in place (Fisher-Yates) using uniform samples drawn
     * through TensorFlow.js.
     *
     * @param {Array|Int32Array} array
     * @returns {Promise<Array|Int32Array>} the same array, shuffled.
     */
    static async shuffle(array) {
        const noise = tf.randomUniform([array.length], 0, 1);
        let entropy;
        try {
            entropy = await noise.data();
        } finally {
            noise.dispose();
        }
        for (let i = array.length - 1; i > 0; i--) {
            // A sample stored as float32 can round up to exactly 1.0; clamp so
            // the swap target never lands one past the current position.
            const swapped = Math.min(i, Math.floor(entropy[i] * (i + 1)));
            const tmp = array[i];
            array[i] = array[swapped];
            array[swapped] = tmp;
        }
        return array;
    }
}

In [12]:
// Build the dataset loader and read the CSV: texts are capped at 140 bytes,
// training batches hold 100 samples, validation batches 500, and 20% of the
// samples are held out for validation.
// `loader` is referenced as a free variable by createModel, trainCharnn and
// trainClassification, so it has to stay visible to the later cells.
var loader = await (new TextsLoader({
    maxTextLength: 140,
    trainBatchSize: 100,
    validBatchSize: 500,
    validRatio: 0.2
})).load('Sentiment Analysis Dataset.csv');

char range: 9 226
train: 1262850 samples, 12628 batches
validation: 315712 samples, 631 batches
balance: 0.5005505010256169


# Model

In [13]:
/**
 * Builds and compiles a character-level LSTM model on top of the global `loader`.
 *
 * @param {Object} [hyperp] - hyper-parameters; every field is optional.
 * @param {string} [hyperp.type="classification"] - "classification" (2-way softmax
 *     over the whole text) or "charnn" (softmax over characters at every timestep).
 * @param {number} [hyperp.learning_rate=0.001] - Adam learning rate.
 * @param {number} [hyperp.lstm_units=96] - units per LSTM (per direction when bidi).
 * @param {number} [hyperp.lstm_layers=2] - number of stacked LSTM layers.
 * @param {boolean} [hyperp.bidi=false] - wrap every LSTM in a bidirectional layer.
 * @param {boolean} [hyperp.attention=false] - add the attention branch; ignored for "charnn".
 * @param {number} [hyperp.dropout=0] - dropout rate applied between stacked LSTM layers.
 * @returns {Object} the compiled model.
 * @throws {Error} if `hyperp.type` is not one of the supported types.
 */
function createModel(hyperp = {}) {
    const hp = {
        type: hyperp.type || "classification",
        learning_rate: hyperp.learning_rate || 0.001,
        lstm_units: hyperp.lstm_units || 96,
        lstm_layers: hyperp.lstm_layers || 2,
        bidi: hyperp.bidi || false,
        attention: hyperp.attention || false,
        dropout: hyperp.dropout || 0,
    };
    if (hp.type !== 'charnn' && hp.type !== 'classification') {
        // Fail early instead of compiling a model with an undefined loss.
        throw new Error(`Unknown model type: ${hp.type}`);
    }
    const numChars = loader.charRange();
    const numDirections = hp.bidi ? 2 : 1;

    // uint8 dtype is not supported yet
    const input = tf.input({shape: [loader.maxTextLength()], dtype: "int32"});
    // Square, identity-initialised and frozen: acts as a fixed one-hot encoding.
    let head = tf.layers.embedding({inputDim: numChars,
                                    outputDim: numChars,
                                    embeddingsInitializer: "identity",
                                    trainable: false}).apply(input);
    for (let i = 0; i < hp.lstm_layers; i++) {
        const lstm = tf.layers.lstm({
            units: hp.lstm_units,
            unitForgetBias: true,
            returnSequences: true,
        });
        if (hp.bidi) {
            head = tf.layers.bidirectional({layer: lstm, mergeMode: 'concat'}).apply(head);
        } else {
            head = lstm.apply(head);
        }
        // Dropout goes between stacked layers, for both uni- and bidirectional stacks.
        if (i < hp.lstm_layers - 1 && hp.dropout > 0) {
            head = tf.layers.dropout({"rate": hp.dropout, "seed": 7}).apply(head);
        }
    }

    if (hp.attention && hp.type !== 'charnn') {
        // One score per timestep, normalised over time, then broadcast over the
        // feature axis so it can be multiplied with the LSTM outputs.
        let attention = tf.layers.dense({units: 1, activation: 'tanh'}).apply(head);
        attention = tf.layers.flatten().apply(attention);
        attention = tf.layers.activation({activation: 'softmax'}).apply(attention);
        attention = tf.layers.repeatVector({n: hp.lstm_units * numDirections}).apply(attention);
        attention = tf.layers.permute({dims: [2, 1]}).apply(attention);
        head = tf.layers.multiply().apply([head, attention]);
        head = tf.layers.globalAveragePooling1d({}).apply(head);
    }
    let loss;
    if (hp.type === 'charnn') {
        head = tf.layers.timeDistributed({
            layer: tf.layers.dense({units: numChars, activation: 'softmax'})
        }).apply(head);
        loss = 'categoricalCrossentropy';
    } else {
        if (!hp.attention) {
            head = tf.layers.flatten().apply(head);
        }
        head = tf.layers.dense({units: 2, activation: 'softmax'}).apply(head);
        // two classes; change to 'categoricalCrossentropy' if > 2
        loss = 'binaryCrossentropy';
    }
    const model = tf.model({inputs: input, outputs: head});
    model.compile({optimizer: tf.train.adam(hp.learning_rate), loss: loss, metrics: ['accuracy']});
    model.summary();
    return model;
}

In [6]:
/**
 * Trains a "charnn" model one batch at a time on batches from the global
 * `loader`. The target of every batch is the one-hot encoding of its own
 * input, so the label tensor produced by the loader is discarded.
 *
 * Every `validationInterval` batches a validation batch is passed to `fit`,
 * the training loss/accuracy are logged, and the model's greedy (arg-max)
 * decoding of a fixed sample sentence is printed.
 *
 * @param {Object} model - compiled model returned by createModel({type: 'charnn'}).
 * @param {number} epochs - number of passes over the training set.
 * @param {number} [validationInterval=50] - report every this many batches.
 * @returns {Promise<void>}
 */
async function trainCharnn(model, epochs, validationInterval) {
    const interval = validationInterval || 50;
    const numChars = loader.charRange();
    const batchesCount = loader.countTrainBatches(epochs);

    for (let i = 0; i < batchesCount; i++) {
        const isReportStep = i % interval === 0;
        let validBatch;
        if (isReportStep) {
            validBatch = await loader.nextValidBatch();
            validBatch[1].dispose();
            validBatch[1] = tf.oneHot(validBatch[0], numChars);
        }

        const trainBatch = await loader.nextTrainBatch();
        trainBatch[1].dispose();
        trainBatch[1] = tf.oneHot(trainBatch[0], numChars);
        let result;
        try {
            result = await model.fit(trainBatch[0], trainBatch[1], {
                batchSize: loader.trainBatchSize(),
                epochs: 1,
                validationData: validBatch,
                shuffle: false
            });
        } finally {
            // Release the batch tensors even when fit() fails.
            tf.dispose(trainBatch);
            if (validBatch) {
                tf.dispose(validBatch);
            }
        }

        if (isReportStep) {
            const loss = result.history.loss[0];
            const accuracy = result.history.acc[0];
            console.log(`${i + 1}/${batchesCount}`, 'loss:', loss, '   accuracy:', accuracy);
            const text = "Awesome, wonderful evening in the prison!";
            const predBatch = loader.toBatch(text);
            const prediction = model.predict(predBatch);
            let out;
            try {
                // buffer() returns a promise in current TensorFlow.js; awaiting is
                // harmless if an older version returns the buffer directly.
                out = await prediction.buffer();
            } finally {
                predBatch.dispose();
                prediction.dispose();
            }
            // Never read past the sequence length the model was built for.
            const steps = Math.min(text.length, loader.maxTextLength());
            let predText = "";
            for (let t = 0; t < steps; t++) {
                let maxP = 0;
                let maxJ = -1;
                for (let j = 0; j < numChars; j++) {
                    const p = out.get(0, t, j);
                    if (p > maxP) {
                        maxP = p;
                        maxJ = j;
                    }
                }
                // Undo the loader's index offset to get back a character code.
                maxJ += loader.minOrd - 1;
                predText += String.fromCharCode(maxJ);
            }
            console.log(predText);
        }
    }
}

In [None]:
// Experiment 101 - at every timestep, predict the character that was just read
// (trainCharnn uses the one-hot encoding of the input itself as the target).
// This is a trivial task with an obvious information leak; learning it
// proves that all the plumbing works correctly.
// We gradually turn on the features: bidi, attention.
// Expected accuracy: 0.99(9).
// NOTE(review): the promise returned by trainCharnn is not awaited here, so a
// failure during training would surface as an unhandled rejection.

trainCharnn(createModel({'type': 'charnn'}), 3);

Orthogonal initializer is being called on a matrix with more than 2000 (36864) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (36864) elements: Slowness may result.


_________________________________________________________________
Layer (type)                 Output shape              Param #   
input1 (InputLayer)          [null,140]                0         
_________________________________________________________________
embedding_Embedding1 (Embedd [null,140,219]            47961     
_________________________________________________________________
lstm_LSTM1 (LSTM)            [null,140,96]             121344    
_________________________________________________________________
lstm_LSTM2 (LSTM)            [null,140,96]             74112     
_________________________________________________________________
time_distributed_TimeDistrib [null,140,219]            21243     
Total params: 264660
Trainable params: 216699
Non-trainable params: 47961
_________________________________________________________________
1/37884 loss: 5.396839618682861    accuracy: 0.0077142855152487755
d                       


In [None]:
// Same character-echo sanity check, now with bidirectional LSTMs
// (64 units per direction), trained for a single epoch.
trainCharnn(createModel({'type': 'charnn', 'bidi': true, 'lstm_units': 64}), 1);

Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.


_________________________________________________________________
Layer (type)                 Output shape              Param #   
input1 (InputLayer)          [null,140]                0         
_________________________________________________________________
embedding_Embedding1 (Embedd [null,140,219]            47961     
_________________________________________________________________
bidirectional_Bidirectional1 [null,140,128]            145408    
_________________________________________________________________
bidirectional_Bidirectional2 [null,140,128]            98816     
_________________________________________________________________
time_distributed_TimeDistrib [null,140,219]            28251     
Total params: 320436
Trainable params: 272475
Non-trainable params: 47961
_________________________________________________________________
1/12628 loss: 5.386876106262207    accuracy: 0.0022857142612338066
ttÅÅttttttÆÆÆ
51/12628 loss: 3.26692533493042    accuracy: 0.165214

In [8]:
/**
 * Trains a "classification" model one batch at a time on batches from the
 * global `loader`.
 *
 * Every `validationInterval` batches a validation batch is passed to `fit`,
 * the training loss/accuracy are logged, and the two class probabilities the
 * model assigns to a fixed sample sentence are printed.
 *
 * @param {Object} model - compiled model returned by createModel({type: 'classification'}).
 * @param {number} epochs - number of passes over the training set.
 * @param {number} [validationInterval=50] - report every this many batches.
 * @returns {Promise<void>}
 */
async function trainClassification(model, epochs, validationInterval) {
    const interval = validationInterval || 50;
    const batchesCount = loader.countTrainBatches(epochs);

    for (let i = 0; i < batchesCount; i++) {
        const isReportStep = i % interval === 0;
        let validBatch;
        if (isReportStep) {
            validBatch = await loader.nextValidBatch();
        }

        const trainBatch = await loader.nextTrainBatch();
        let result;
        try {
            result = await model.fit(trainBatch[0], trainBatch[1], {
                batchSize: loader.trainBatchSize(),
                epochs: 1,
                validationData: validBatch,
                shuffle: false
            });
        } finally {
            // Release the batch tensors even when fit() fails.
            tf.dispose(trainBatch);
            if (validBatch) {
                tf.dispose(validBatch);
            }
        }

        if (isReportStep) {
            const loss = result.history.loss[0];
            const accuracy = result.history.acc[0];
            console.log(`${i + 1}/${batchesCount}`, 'loss:', loss, '   accuracy:', accuracy);
            const text = "Awesome, wonderful evening in the prison!";
            const predBatch = loader.toBatch(text);
            const prediction = model.predict(predBatch);
            let out;
            try {
                // buffer() returns a promise in current TensorFlow.js; awaiting is
                // harmless if an older version returns the buffer directly.
                out = await prediction.buffer();
            } finally {
                predBatch.dispose();
                prediction.dispose();
            }
            console.log("negative:", out.get(0, 0), "   positive:", out.get(0, 1));
        }
    }
}

In [None]:
// Sentiment classifier with bidirectional LSTMs (64 units per direction) and
// no attention: the LSTM outputs are flattened before the final dense layer.
trainClassification(createModel({'type': 'classification', 'bidi': true, 'lstm_units': 64}), 1);

Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.


_________________________________________________________________
Layer (type)                 Output shape              Param #   
input5 (InputLayer)          [null,140]                0         
_________________________________________________________________
embedding_Embedding5 (Embedd [null,140,219]            47961     
_________________________________________________________________
bidirectional_Bidirectional9 [null,140,128]            145408    
_________________________________________________________________
bidirectional_Bidirectional1 [null,140,128]            98816     
_________________________________________________________________
flatten_Flatten2 (Flatten)   [null,17920]              0         
_________________________________________________________________
dense_Dense4 (Dense)         [null,2]                  35842     
Total params: 328027
Trainable params: 280066
Non-trainable params: 47961
_________________________________________________________________
1/

In [None]:
// Same bidirectional classifier with the attention branch of createModel enabled.
trainClassification(createModel({'type': 'classification', 'bidi': true, 'attention': true, 'lstm_units': 64}), 1);

Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.
Orthogonal initializer is being called on a matrix with more than 2000 (16384) elements: Slowness may result.


__________________________________________________________________________________________________
Layer (type)                    Output shape         Param #     Receives inputs                  
input1 (InputLayer)             [null,140]           0                                            
__________________________________________________________________________________________________
embedding_Embedding1 (Embedding [null,140,219]       47961       input1[0][0]                     
__________________________________________________________________________________________________
bidirectional_Bidirectional1 (B [null,140,128]       145408      embedding_Embedding1[0][0]       
__________________________________________________________________________________________________
bidirectional_Bidirectional2 (B [null,140,128]       98816       bidirectional_Bidirectional1[0][0
__________________________________________________________________________________________________
dense_Dens