In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow as tf
import numpy as np
import random

In [3]:
class MLP:
    def __init__(self, vocab_size, hidden_size):
        self._vocab_size = vocab_size
        self._hidden_size = hidden_size

    def build_graph(self):
        self._X = tf.compat.v1.placeholder(tf.float32, shape=[None, self._vocab_size])
        self._real_Y = tf.compat.v1.placeholder(tf.int32, shape=[None, ])
        NUM_CLASSES = 20

        weights_1 = tf.compat.v1.get_variable(
            name='weight_input_hidden',
            shape=(self._vocab_size, self._hidden_size),
            initializer=tf.random_normal_initializer(seed=2020),
        )
        biases_1 = tf.compat.v1.get_variable(
            name='biases_input_hidden',
            shape=(self._hidden_size),
            initializer=tf.random_normal_initializer(seed=2020)
        )

        weights_2 = tf.compat.v1.get_variable(
            name='weight_hidden_output',
            shape=(self._hidden_size, NUM_CLASSES),
            initializer=tf.random_normal_initializer(seed=2020),
        )

        biases_2 = tf.compat.v1.get_variable(
            name='biases_hidden_output',
            shape=(NUM_CLASSES),
            initializer=tf.random_normal_initializer(seed=2020),
        )

        hidden = tf.matmul(self._X, weights_1) + biases_1
        hidden = tf.sigmoid(hidden)
        logits = tf.matmul(hidden, weights_2) + biases_2

        labels_one_hot = tf.one_hot(indices=self._real_Y,
                                    depth=NUM_CLASSES,
                                    dtype=tf.float32)

        loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels_one_hot,
                                                       logits=logits)

        loss = tf.reduce_mean(loss)

        probs = tf.nn.softmax(logits)
        predicted_labels = tf.argmax(probs, axis=1)
        predicted_labels = tf.squeeze(predicted_labels)

        return predicted_labels, loss

    def trainer(self, loss, learning_rate):
        train_op = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(loss)
        return train_op

In [4]:
class DataReader:
    def __init__(self, data_path, batch_size, vocab_size):
        self._batch_size = batch_size
        with open(data_path) as f:
            d_lines = f.read().splitlines()

        self._data = []
        self._labels = []

        for data_id, line in enumerate(d_lines):
            vector = [0.0 for _ in range(vocab_size)]
            features = line.split('<fff>')
            label, doc_id = int(features[0]), int(features[1])
            tokens = features[2].split()
            for token in tokens:
                index, value = int(token.split(':')[0]), float(token.split(':')[1])
                vector[index] = value
            self._data.append(vector)
            self._labels.append(label)

        self._data = np.array(self._data)
        self._labels = np.array(self._labels)

        self._num_epoch = 0
        self._batch_id = 0

    def next_batch(self):
        start = self._batch_id * self._batch_size
        end = start + self._batch_size
        self._batch_id += 1

        if end + self._batch_size > len(self._data):
            end = len(self._data)
            self._num_epoch += 1
            self._batch_id = 0
            indices = list(range(len(self._data)))
            random.seed(2020)
            random.shuffle(indices)
            self._data, self._labels = self._data[indices], self._labels[indices]

        return self._data[start:end], self._labels[start:end]

In [5]:
def load_dataset():
    train_data_reader = DataReader(
        data_path='/content/drive/MyDrive/Session3/20news-train-tfidf.txt',
        batch_size=50,
        vocab_size=vocab_size
    )
    test_data_reader = DataReader(
        data_path='/content/drive/MyDrive/Session3/20news-test-tfidf.txt',
        batch_size=50,
        vocab_size=vocab_size
    )
    return train_data_reader, test_data_reader

In [6]:
def save_parameters(name, value, epoch):
    filename = name.replace(':', '-colon-') + '-epoch-{}.txt'.format(epoch)
    if len(value.shape) == 1:
        string_form = ','.join([str(number) for number in value])
    else:
        string_form = '\n'.join([','.join([str(number) for number in value[row]]) for row in range(value.shape[0])])

    with open('/content/drive/MyDrive/Session3/saved-paras/' + filename, "w") as f:
        f.write(string_form)

In [7]:
def restore_parameters(name, epoch):
    # use saved parameters
    filename = name.replace(':', '-colon-') + '-epoch-{}.txt'.format(epoch)
    with open('/content/drive/MyDrive/Session3/saved-paras/' + filename) as f:
        lines = f.read().splitlines()
    if len(lines) == 1:
        value = [float(number) for number in lines[0].split(',')]
    else:
        value = [[float(number) for number in lines[row].split(',')] for row in range(len(lines))]
    return value

Main:

In [8]:
tf.compat.v1.disable_eager_execution()

In [9]:
# Create a computation graph
with open('/content/drive/MyDrive/Session3/words_idfs.txt') as f:
    vocab_size = len(f.read().splitlines())
mlp = MLP(
    vocab_size=vocab_size,
    hidden_size=50
)
predicted_labels, loss = mlp.build_graph()
train_op = mlp.trainer(loss=loss, learning_rate=0.1)

In [10]:
# Open a session to run
with tf.compat.v1.Session() as sess:
    train_data_reader, test_data_reader = load_dataset()
    step, MAX_STEP = 0, 1000

    sess.run(tf.compat.v1.global_variables_initializer())
    while step <= MAX_STEP:
        train_data, train_labels = train_data_reader.next_batch()
        plabels_eval, loss_eval, _ = sess.run(
            [predicted_labels, loss, train_op],
            feed_dict={
                mlp._X: train_data,
                mlp._real_Y: train_labels
            }
        )
        step += 1
        print('step: {}, loss: {}'.format(step, loss_eval))

        # save parameters
        trainable_variables = tf.compat.v1.trainable_variables()
        for variable in trainable_variables:
            save_parameters(
                name=variable.name,
                value=variable.eval(),
                epoch=train_data_reader._num_epoch
            )

step: 1, loss: 2.8072032928466797
step: 2, loss: 0.05456365644931793
step: 3, loss: 0.0005230632959865034
step: 4, loss: 9.753648555488326e-06
step: 5, loss: 5.078313733974937e-07
step: 6, loss: 0.0
step: 7, loss: 0.0
step: 8, loss: 0.0
step: 9, loss: 0.0
step: 10, loss: 10.867098808288574
step: 11, loss: 24.5706844329834
step: 12, loss: 20.508913040161133
step: 13, loss: 15.349947929382324
step: 14, loss: 10.322065353393555
step: 15, loss: 5.678048133850098
step: 16, loss: 2.1246843338012695
step: 17, loss: 0.408387154340744
step: 18, loss: 0.10515070706605911
step: 19, loss: 0.05951683595776558
step: 20, loss: 0.08250648528337479
step: 21, loss: 0.06250625103712082
step: 22, loss: 5.287806510925293
step: 23, loss: 6.58612060546875
step: 24, loss: 5.132270336151123
step: 25, loss: 4.346368312835693
step: 26, loss: 3.3659327030181885
step: 27, loss: 2.869305372238159
step: 28, loss: 2.481956720352173
step: 29, loss: 2.276973009109497
step: 30, loss: 1.9516394138336182
step: 31, loss: 1

In [13]:
# Evaluate model on test data
test_data_reader = DataReader(
    data_path='/content/drive/MyDrive/Session3/20news-test-tfidf.txt',
    batch_size=50,
    vocab_size=vocab_size
)
with tf.compat.v1.Session() as sess:
    epoch = 4

    trainable_variables = tf.compat.v1.trainable_variables()
    for variable in trainable_variables:
        saved_value = restore_parameters(variable.name, epoch)
        assign_op = variable.assign(saved_value)
        sess.run(assign_op)

    num_true_preds = 0
    while True:
        test_data, test_labels = test_data_reader.next_batch()
        test_plabels_eval = sess.run(
            predicted_labels,
            feed_dict={
                mlp._X: test_data,
                mlp._real_Y: test_labels
            }
        )
        matches = np.equal(test_plabels_eval, test_labels)
        num_true_preds += np.sum(matches.astype(float))

        if test_data_reader._batch_id == 0:
            break
    print('Epoch:', epoch)
    print('Accuracy on test data:', num_true_preds / len(test_data_reader._data))

Epoch: 4
Accuracy on test data: 0.7450876261285183
