In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow as tf
import numpy as np
import random

In [3]:
MAX_DOC_LENGTH = 500
NUM_CLASSES = 20

In [4]:
class DataReader:
    def __init__(self, data_path, batch_size):
        self._batch_size = batch_size
        with open(data_path, encoding='latin-1') as f:
            d_lines = f.read().splitlines()

        self._data = []
        self._labels = []
        self._sentence_lengths = []
        self._final_tokens = []

        for data_id, line in enumerate(d_lines):
            features = line.split('<fff>')
            label, doc_id, sentence_length = int(features[0]), int(features[1]), int(features[2])
            tokens = features[3].split()

            self._data.append(tokens)
            self._labels.append(label)
            self._sentence_lengths.append(sentence_length)
            self._final_tokens.append(tokens[-1])

        self._data = np.array(self._data)
        self._labels = np.array(self._labels)
        self._sentence_lengths = np.array(self._sentence_lengths)
        self._final_tokens = np.array(self._final_tokens)

        self._num_epoch = 0
        self._batch_id = 0

    def next_batch(self):
        start = self._batch_id * self._batch_size
        end = start + self._batch_size
        self._batch_id += 1

        if end + self._batch_size > len(self._data):
            end = len(self._data)
            start = end - self._batch_size

            self._num_epoch += 1
            self._batch_id = 0

            indices = list(range(len(self._data)))
            random.seed(2021)
            random.shuffle(indices)
            self._data, self._labels = self._data[indices], self._labels[indices]
            self._sentence_lengths = self._sentence_lengths[indices]
            self._final_tokens = self._final_tokens[indices]
        return self._data[start:end], self._labels[start:end], self._sentence_lengths[start:end], self._final_tokens[start:end]

In [5]:
class RNN:
    def __init__(self, vocab_size, embedding_size, lstm_size, batch_size):
        self._vocab_size = vocab_size
        self._embedding_size = embedding_size
        self._lstm_size = lstm_size
        self._batch_size = batch_size

        self._data = tf.compat.v1.placeholder(tf.int32, shape=[batch_size, MAX_DOC_LENGTH])
        self._labels = tf.compat.v1.placeholder(tf.int32, shape=[batch_size, ])
        self._sentence_lengths = tf.compat.v1.placeholder(tf.int32, shape=[batch_size, ])
        self._final_token = tf.compat.v1.placeholder(tf.int32, shape=[batch_size, ])

    def embedding_layer(self, indices):
        pretrained_vector = []
        pretrained_vector.append(np.zeros(self._embedding_size))
        np.random.seed(2021)
        for _ in range(self._vocab_size + 1):
            pretrained_vector.append(np.random.normal(loc=0., scale=1., size=self._embedding_size))

        pretrained_vector = np.array(pretrained_vector)

        self._embedding_matrix = tf.compat.v1.get_variable(
            name='embedding',
            shape=(self._vocab_size + 2, self._embedding_size),
            initializer=tf.constant_initializer(pretrained_vector)
        )
        return tf.nn.embedding_lookup(self._embedding_matrix, indices)

    def LSTM_layer(self, embeddings):
        lstm_cell = tf.compat.v1.nn.rnn_cell.BasicLSTMCell(self._lstm_size)
        zero_state = tf.zeros(shape=(self._batch_size, self._lstm_size))
        initial_state = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(zero_state, zero_state)

        lstm_inputs = tf.unstack(tf.transpose(embeddings, perm=[1, 0, 2]))
        lstm_outputs, last_state = tf.compat.v1.nn.static_rnn(
            cell=lstm_cell,
            inputs=lstm_inputs,
            initial_state=initial_state,
            sequence_length=self._sentence_lengths
        )  # a length-500 list of [num_docs, lstm_size]

        lstm_outputs = tf.unstack(tf.transpose(lstm_outputs, perm=[1, 0, 2]))
        lstm_outputs = tf.concat(lstm_outputs, axis=0)  # [num docs * MAX_SENT_LENGTH, lstm_size]

        # self.mask: [num_docs * MAX_SENT_LENGTH, ]
        mask = tf.sequence_mask(
            lengths=self._sentence_lengths,
            maxlen=MAX_DOC_LENGTH,
            dtype=tf.float32
        )  # [num_docs, MAX_SENTENCE_LENGTH]
        mask = tf.concat(tf.unstack(mask, axis=0), axis=0)
        mask = tf.expand_dims(mask, -1)

        lstm_outputs = mask * lstm_outputs
        lstm_outputs_split = tf.split(lstm_outputs, num_or_size_splits=self._batch_size)
        lstm_outputs_sum = tf.reduce_sum(lstm_outputs_split, axis=1)    # [num_docs, lstm_size]
        lstm_outputs_average = lstm_outputs_sum / tf.expand_dims(
            tf.cast(self._sentence_lengths, tf.float32), -1) # [num_docs, lstm_size]

        return lstm_outputs_average

    def build_graph(self):
        embeddings = self.embedding_layer(self._data)
        lstm_outputs = self.LSTM_layer(embeddings)

        weights = tf.compat.v1.get_variable(
            name='final_layer_weights',
            shape=(self._lstm_size, NUM_CLASSES),
            initializer=tf.random_normal_initializer(seed=2021)
        )
        biases = tf.compat.v1.get_variable(
            name='final_layer_biases',
            shape=(NUM_CLASSES),
            initializer=tf.random_normal_initializer(seed=2021)
        )
        logits = tf.matmul(lstm_outputs, weights) + biases

        labels_one_hot = tf.one_hot(
            indices=self._labels,
            depth=NUM_CLASSES,
            dtype=tf.float32
        )

        loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=labels_one_hot,
            logits=logits
        )
        loss = tf.reduce_mean(loss)

        probs = tf.nn.softmax(logits)
        predicted_labels = tf.argmax(probs, axis=1)
        predicted_labels = tf.squeeze(predicted_labels)
        return predicted_labels, loss

    def trainer(self, loss, learning_rate):
        train_op = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(loss)
        return train_op

In [6]:
def train_and_evaluate_RNN():
    with open('/content/drive/MyDrive/Session4/w2v/vocab-raw.txt', encoding='latin-1') as f:
        vocab_size = len(f.read().splitlines())

    train_data_path = '/content/drive/MyDrive/Session4/w2v/20news-train-encoded.txt'
    test_data_path = '/content/drive/MyDrive/Session4/w2v/20news-test-encoded.txt'

    tf.random.set_seed(2021)
    rnn = RNN(
        vocab_size=vocab_size,
        embedding_size=300,
        lstm_size=50,
        batch_size=50
    )
    predicted_labels, loss = rnn.build_graph()
    train_op = rnn.trainer(loss=loss, learning_rate=0.01)

    with tf.compat.v1.Session() as sess:
        train_data_reader = DataReader(
            data_path=train_data_path,
            batch_size=50
        )
        test_data_reader = DataReader(
            data_path=test_data_path,
            batch_size=50
        )
        step = 0
        MAX_STEP = 1000

        sess.run(tf.compat.v1.global_variables_initializer())
        while step < MAX_STEP:
            next_train_batch = train_data_reader.next_batch()
            train_data, train_labels, train_sentence_lengths, train_final_tokens = next_train_batch
            plabels_eval, loss_eval, _ = sess.run(
                [predicted_labels, loss, train_op],
                feed_dict={
                    rnn._data: train_data,
                    rnn._labels: train_labels,
                    rnn._sentence_lengths: train_sentence_lengths,
                    rnn._final_token: train_final_tokens
                }
            )
            step += 1
            if step % 20 == 0:
                print('loss:', loss_eval)

            if train_data_reader._batch_id == 0:
                num_true_preds = 0
                while True:
                    next_test_batch = test_data_reader.next_batch()
                    test_data, test_labels, test_sentence_lengths, test_final_tokens = next_test_batch

                    test_plabels_eval = sess.run(
                        predicted_labels,
                        feed_dict={
                            rnn._data: test_data,
                            rnn._labels: test_labels,
                            rnn._sentence_lengths: test_sentence_lengths,
                            rnn._final_token: test_final_tokens
                        }
                    )
                    matches = np.equal(test_plabels_eval, test_labels)
                    num_true_preds += np.sum(matches.astype(float))

                    if test_data_reader._batch_id == 0:
                        break
                print('Epoch:', train_data_reader._num_epoch)
                print('Accuracy on test data:', num_true_preds * 100. / len(test_data_reader._data))

In [7]:
tf.compat.v1.disable_eager_execution()
train_and_evaluate_RNN()

loss: 1.2600601
loss: 2.7552705
loss: 3.2598214
loss: 0.4841204
loss: 1.8978115
loss: 3.4846506
loss: 1.9305673
loss: 1.6547973
loss: 3.1990278
loss: 4.3268514
loss: 3.9816291
Epoch: 1
Accuracy on test data: 2.8412108337758895
loss: 3.082207
loss: 2.9578834
loss: 2.8207355
loss: 2.6995373
loss: 2.6000586
loss: 2.520143
loss: 2.3366468
loss: 2.1218448
loss: 2.0876868
loss: 2.0293484
loss: 1.8363167
Epoch: 2
Accuracy on test data: 47.22517259691981
loss: 1.5220538
loss: 1.3614072
loss: 1.1936047
loss: 1.2628952
loss: 1.0057307
loss: 1.067099
loss: 0.8837765
loss: 0.6301675
loss: 0.8061423
loss: 0.7356995
loss: 0.5982385
Epoch: 3
Accuracy on test data: 67.43228890069038
loss: 0.47074273
loss: 0.31024608
loss: 0.5430601
loss: 0.42074466
loss: 0.22054943
loss: 0.26697937
loss: 0.25895855
loss: 0.32652858
loss: 0.24538912
loss: 0.36663327
loss: 0.45304948
loss: 0.44568467
Epoch: 4
Accuracy on test data: 71.4551248008497
loss: 0.09487627
loss: 0.119782336
loss: 0.1639022
loss: 0.19712849
loss