In [0]:
import numpy as np
import re

In [17]:
# Mount Google Drive into the Colab runtime; every path used below lives
# under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
    def collectDataFrom(parentPath, newsgroupList, wordCount=None):
        """Read every document of the given newsgroups into one-line records.

        Each record has the form ``<label><fff><fileName><fff><content>`` where
        ``label`` is the index of the newsgroup in ``newsgroupList`` and
        ``content`` is the lower-cased document reduced to its word tokens,
        joined by single spaces.

        Args:
            parentPath: directory that contains one sub-directory per newsgroup.
            newsgroupList: newsgroup directory names; the position in this list
                becomes the numeric label.
            wordCount: optional mapping (``dict`` or ``defaultdict``) that is
                updated in place with the number of occurrences of each token.

        Returns:
            A list of record strings, ordered by newsgroup and then by file name.
        """
        data = []
        for groupId, newsgroup in enumerate(newsgroupList):
            dirPath = parentPath + '/' + newsgroup + '/'
            # Sort so that the output order does not depend on listdir().
            fileNames = sorted(fileName for fileName in listdir(dirPath)
                               if isfile(dirPath + fileName))
            print('Processing: {}-{}'.format(groupId, newsgroup))

            for fileName in fileNames:
                with open(dirPath + fileName, 'r',
                          encoding='utf-8', errors='ignore') as f:
                    text = f.read().lower()

                # findall(r'\w+') keeps only real tokens; splitting on \W+ would
                # also yield empty strings at the document edges, which used to
                # be counted as a word and leak into the vocabulary.
                words = re.findall(r'\w+', text)
                if wordCount is not None:
                    for word in words:
                        # .get() keeps this working for plain dicts as well.
                        wordCount[word] = wordCount.get(word, 0) + 1

                content = ' '.join(words)
                # A record must stay on one line; an empty document yields ''.
                assert len(content.splitlines()) <= 1
                data.append(str(groupId) + '<fff>'
                            + fileName + '<fff>' + content)

        return data


In [0]:
from os import listdir
from os.path import isfile
from collections import defaultdict

def getDataAndVocab(newsPath='/content/drive/My Drive/ML_DS_training/20news-bydate/',
                    outputDir='/content/drive/My Drive/ML_DS_training/datasets/w2v/'):
    """Build the vocabulary and the raw train/test record files.

    Reads the train and test sub-directories of ``newsPath``, counts token
    frequencies on the training split only, and writes three files into
    ``outputDir``: ``vocab-raw.txt`` (tokens seen more than 10 times, sorted),
    ``20news-train-raw.txt`` and ``20news-test-raw.txt``.

    Args:
        newsPath: dataset root; must end with '/' and contain one directory
            whose name includes 'train' plus one other directory (the test
            split).
        outputDir: existing directory (ending with '/') for the output files.

    Raises:
        ValueError: if the train/test directories cannot be identified.
    """
    wordCount = defaultdict(int)

    dirNames = sorted(dirName for dirName in listdir(newsPath)
                      if not isfile(newsPath + dirName))
    # Match on the directory NAME only: the full path contains
    # 'ML_DS_training', so testing the whole path for 'train' is always true
    # and would pick whichever directory listdir() happens to return first.
    trainNames = [dirName for dirName in dirNames if 'train' in dirName]
    if len(dirNames) < 2 or not trainNames:
        raise ValueError(
            'Expected a train and a test directory under ' + newsPath
            + ', found: ' + str(dirNames))
    trainName = trainNames[0]
    otherNames = [dirName for dirName in dirNames if dirName != trainName]
    # Prefer a directory explicitly named as the test split, if there is one.
    testName = next((dirName for dirName in otherNames if 'test' in dirName),
                    otherNames[0])
    trainPath = newsPath + trainName + '/'
    testPath = newsPath + testName + '/'

    # Only directories are newsgroups; stray files would break the reader.
    newsgroupList = sorted(newsgroup for newsgroup in listdir(trainPath)
                           if not isfile(trainPath + newsgroup))

    trainData = collectDataFrom(parentPath=trainPath,
                                newsgroupList=newsgroupList,
                                wordCount=wordCount)

    # The vocabulary is built from the training split only.
    vocab = sorted(word for word, freq in wordCount.items() if freq > 10)
    with open(outputDir + 'vocab-raw.txt', 'w') as f:
        f.write('\n'.join(vocab))

    testData = collectDataFrom(parentPath=testPath,
                               newsgroupList=newsgroupList)

    with open(outputDir + '20news-train-raw.txt', 'w') as f:
        f.write('\n'.join(trainData))
    with open(outputDir + '20news-test-raw.txt', 'w') as f:
        f.write('\n'.join(testData))


In [0]:
# Build the vocabulary and the raw train/test record files on Drive.
getDataAndVocab()

In [0]:
MAX_DOC_LENGTH = 500  # documents are truncated / padded to this many tokens
unknownID = 0         # ID written for tokens that are not in the vocabulary
paddingID = 1         # ID used to pad documents shorter than MAX_DOC_LENGTH

def encodeData(dataPath, vocabPath):
    """Convert a raw record file into fixed-length sequences of word IDs.

    Input records have the form ``<label><fff><docID><fff><text>``. Each output
    record has the form ``<label><fff><docID><fff><length><fff><ids>`` where
    ``length`` is the number of real tokens kept (at most MAX_DOC_LENGTH) and
    ``ids`` are exactly MAX_DOC_LENGTH space-separated integers. Vocabulary
    words get IDs starting at 2, in file order.

    The result is written next to ``dataPath``, with the last '-'-separated
    part of the file name replaced by ``encoded.txt``
    (``20news-train-raw.txt`` -> ``20news-train-encoded.txt``).

    Args:
        dataPath: path of the raw record file.
        vocabPath: path of the vocabulary file, one word per line.
    """
    import os  # local import: only this function needs the path helpers

    with open(vocabPath) as f:
        vocab = {word: wordID + 2
                 for wordID, word in enumerate(f.read().splitlines())}

    encodedData = []
    with open(dataPath) as f:
        for line in f.read().splitlines():
            if not line:
                continue  # tolerate blank lines instead of crashing
            label, docID, text = line.split('<fff>')[:3]
            words = text.split()[:MAX_DOC_LENGTH]
            sentenceLength = len(words)

            encodedText = [str(vocab.get(word, unknownID)) for word in words]
            # Right-pad so every document has exactly MAX_DOC_LENGTH IDs.
            encodedText.extend(
                [str(paddingID)] * (MAX_DOC_LENGTH - sentenceLength))

            encodedData.append(str(label) + '<fff>' + str(docID) + '<fff>' +
                               str(sentenceLength) + '<fff>' + ' '.join(encodedText))

    # os.path keeps relative paths relative; joining the split path by hand
    # turned 'file.txt' into '/file-encoded.txt' (the filesystem root).
    dirName, baseName = os.path.split(dataPath)
    if '-' in baseName:
        stem = baseName.rsplit('-', 1)[0]
    else:
        stem = os.path.splitext(baseName)[0]
    with open(os.path.join(dirName, stem + '-encoded.txt'), 'w') as f:
        f.write('\n'.join(encodedData))


In [0]:
# Encode both splits with the same vocabulary; each call writes a
# '20news-<split>-encoded.txt' file next to its raw input.
encodeData('/content/drive/My Drive/ML_DS_training/datasets/w2v/20news-train-raw.txt', '/content/drive/My Drive/ML_DS_training/datasets/w2v/vocab-raw.txt')
encodeData('/content/drive/My Drive/ML_DS_training/datasets/w2v/20news-test-raw.txt', '/content/drive/My Drive/ML_DS_training/datasets/w2v/vocab-raw.txt')

In [0]:
import tensorflow.compat.v1 as tf
# The model below uses the TF1 graph/session API (placeholders, tf.Session),
# so TF2 behaviour is switched off for the rest of the notebook.
tf.disable_v2_behavior()

In [0]:
# Number of token IDs per encoded document; must equal the value that was
# used when the '-encoded.txt' files were written.
MAX_DOC_LENGTH = 500
# Number of target classes (newsgroups) predicted by the classifier.
NUM_CLASSES = 20

In [0]:
class RNN:
    """LSTM document classifier built with the TF1 graph API.

    Word IDs are embedded, fed through a single LSTM, the per-step outputs are
    averaged over the real (unpadded) tokens of each document, and a linear
    layer maps that average to NUM_CLASSES logits.

    The placeholders leave the batch dimension open, so batches of any size
    can be fed, including a shorter final batch at the end of an epoch.
    """

    def __init__(self, vocabSize, embeddingSize, lstmSize, batchSize):
        """Store the hyper-parameters and create the input placeholders.

        Args:
            vocabSize: number of vocabulary words (excluding the two special
                IDs for unknown and padding tokens).
            embeddingSize: dimensionality of the word embeddings.
            lstmSize: number of LSTM units.
            batchSize: nominal batch size; kept for reference only, the graph
                itself accepts any batch size.
        """
        self._vocabSize = vocabSize
        self._embeddingSize = embeddingSize
        self._lstmSize = lstmSize
        self._batchSize = batchSize

        # `None` keeps the batch dimension dynamic.
        self._data = tf.placeholder(tf.int32, shape=[None, MAX_DOC_LENGTH])
        self._labels = tf.placeholder(tf.int32, shape=[None, ])
        self._sentence_lengths = tf.placeholder(tf.int32, shape=[None, ])
        # Not used by the graph built in this class.
        self._finalTokens = tf.placeholder(tf.int32, shape=[None, ])

    def embeddingLayer(self, indices):
        """Look up trainable embeddings for a tensor of word IDs.

        Row 0 of the embedding matrix starts as the zero vector; the remaining
        vocabSize + 1 rows are drawn from a standard normal distribution with
        a fixed seed so the initialisation is reproducible.
        """
        # A private generator avoids reseeding NumPy's global RNG.
        rng = np.random.RandomState(2020)
        pretrainedVectors = np.vstack([
            np.zeros((1, self._embeddingSize)),
            rng.normal(loc=0., scale=1.,
                       size=(self._vocabSize + 1, self._embeddingSize)),
        ])

        with tf.variable_scope("rnn_variables", reuse=tf.AUTO_REUSE) as scope:
            self._embeddingMatrix = tf.get_variable(
              name= 'embedding',
              shape= (self._vocabSize + 2, self._embeddingSize),
              initializer= tf.constant_initializer(pretrainedVectors)
            )

        return tf.nn.embedding_lookup(self._embeddingMatrix, indices)

    def LSTMLayer(self, embeddings):
        """Run the LSTM and return the mean output over each document's tokens.

        Args:
            embeddings: float tensor of shape
                [batch, MAX_DOC_LENGTH, embeddingSize].

        Returns:
            Float tensor of shape [batch, lstmSize].
        """
        lstmCell = tf.nn.rnn_cell.BasicLSTMCell(self._lstmSize)
        # Take the batch size from the input so any batch size works.
        dynamicBatchSize = tf.shape(embeddings)[0]
        initialState = lstmCell.zero_state(dynamicBatchSize, tf.float32)

        # static_rnn expects a list of MAX_DOC_LENGTH tensors [batch, emb].
        lstmInputs = tf.unstack(tf.transpose(embeddings, perm=[1, 0, 2]))
        lstmOutputs, lastState = tf.nn.static_rnn(
            cell=lstmCell,
            inputs=lstmInputs,
            initial_state=initialState,
            sequence_length=self._sentence_lengths
        )
        # Back to batch-major: [batch, MAX_DOC_LENGTH, lstmSize].
        lstmOutputs = tf.stack(lstmOutputs, axis=1)

        # 1.0 for real tokens, 0.0 for padding: [batch, MAX_DOC_LENGTH, 1].
        mask = tf.sequence_mask(
            lengths=self._sentence_lengths,
            maxlen=MAX_DOC_LENGTH,
            dtype=tf.float32
        )
        mask = tf.expand_dims(mask, -1)
        lstmOutputsSum = tf.reduce_sum(mask * lstmOutputs, axis=1)  # [batch, lstmSize]

        # Clamp to 1 so a document with no tokens gives zeros instead of NaN.
        tokenCounts = tf.cast(tf.maximum(self._sentence_lengths, 1), tf.float32)
        lstmOutputsAverage = lstmOutputsSum / tf.expand_dims(tokenCounts, -1)
        return lstmOutputsAverage

    def buildGraph(self):
        """Assemble the full model.

        Returns:
            (predictedLabels, loss): the arg-max class per document and the
            mean softmax cross-entropy over the batch.
        """
        embeddings = self.embeddingLayer(self._data)
        lstmOutputs = self.LSTMLayer(embeddings)

        weights = tf.get_variable(
            name='final_layer_weights',
            shape=(self._lstmSize, NUM_CLASSES),
            initializer=tf.random_normal_initializer(seed=2020)
        )
        biases = tf.get_variable(
            name='final_layer_biases',
            shape=NUM_CLASSES,
            initializer=tf.random_normal_initializer(seed=2020)
        )

        logits = tf.matmul(lstmOutputs, weights) + biases
        labels_one_hot = tf.one_hot(
            indices=self._labels,
            depth=NUM_CLASSES,
            dtype=tf.float32
        )

        loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=labels_one_hot,
            logits=logits
        )
        loss = tf.reduce_mean(loss)

        probs = tf.nn.softmax(logits)
        predictedLabels = tf.argmax(probs, axis=1)
        predictedLabels = tf.squeeze(predictedLabels)

        return predictedLabels, loss

    def trainer(self, loss, learningRate):
        """Return the Adam training op that minimises `loss`."""
        with tf.variable_scope("rnn_variables", reuse=tf.AUTO_REUSE) as scope:
          trainOp = tf.train.AdamOptimizer(learningRate).minimize(loss)
        return trainOp



In [0]:
class DataReader:
    """Serve mini-batches from an encoded record file, reshuffling every epoch.

    Each non-empty line of the file has the form
    ``<label><fff><docID><fff><length><fff><space-separated word IDs>``.
    """

    def __init__(self, dataPath, batchSize, vocabSize):
        """Load the whole file into memory.

        Args:
            dataPath: path of the encoded record file.
            batchSize: maximum number of documents per batch.
            vocabSize: accepted for interface compatibility; not used.
        """
        self._batch_size = batchSize
        with open(dataPath) as f:
            lines = f.read().splitlines()

        data, labels, sentenceLengths = [], [], []
        for line in lines:
            if len(line) > 1:
                # The document ID is not needed for training, so it is not
                # parsed; this also allows non-numeric file names.
                label, _docId, sentenceLength, vec = line.split("<fff>")
                data.append([int(ele) for ele in vec.split()])
                labels.append(int(label))
                sentenceLengths.append(int(sentenceLength))

        self._data = np.array(data)
        self._labels = np.array(labels)
        self._sentence_lengths = np.array(sentenceLengths)

        self._num_epoch = 0
        self._batch_id = 0
        # Private generator: reproducible shuffles without touching NumPy's
        # global random state.
        self._rng = np.random.RandomState(2020)

    def nextBatch(self):
        """Return the next (data, labels, sentence_lengths) batch.

        The final batch of an epoch may hold fewer than ``batchSize``
        documents. Once it has been taken, ``_num_epoch`` is incremented,
        ``_batch_id`` is reset to 0 and the data are reshuffled for the next
        epoch, so ``_batch_id == 0`` after a call marks the end of an epoch.
        """
        numDocs = len(self._data)
        start = self._batch_size * self._batch_id
        end = min(start + self._batch_size, numDocs)

        # Slice BEFORE shuffling so the last batch is the unseen tail of this
        # epoch rather than arbitrary rows of the reshuffled data.
        batch = (self._data[start:end],
                 self._labels[start:end],
                 self._sentence_lengths[start:end])
        self._batch_id += 1

        # '>=' ends the epoch on the batch that reaches the last document,
        # which avoids an extra empty batch when numDocs is a multiple of
        # the batch size.
        if end >= numDocs:
            self._num_epoch += 1
            self._batch_id = 0

            order = self._rng.permutation(numDocs)
            # Fancy indexing creates new arrays, so `batch` stays valid.
            self._data = self._data[order]
            self._labels = self._labels[order]
            self._sentence_lengths = self._sentence_lengths[order]

        return batch


In [0]:
def trainAndEvaluateRNN():
    """Train the RNN classifier and report test accuracy after every epoch.

    Reads the vocabulary size from ``vocab-raw.txt``, builds the graph, then
    runs up to MAX_STEP training steps on the encoded training file. Each time
    the training reader finishes an epoch, one full pass over the encoded test
    file is used to compute the accuracy. Progress is printed to stdout.
    """
    batchSize = 50  # shared by the model and both data readers

    with open('/content/drive/My Drive/ML_DS_training/datasets/w2v/vocab-raw.txt') as f:
        vocabSize = len(f.read().splitlines())

    tf.set_random_seed(2020)
    rnn = RNN(
        vocabSize=vocabSize,
        embeddingSize=300,
        lstmSize=50,
        batchSize=batchSize
    )
    predictedLabels, loss = rnn.buildGraph()
    trainOp = rnn.trainer(loss=loss, learningRate=0.01)

    with tf.Session() as sess:
        trainDataReader = DataReader(
            dataPath='/content/drive/My Drive/ML_DS_training/datasets/w2v/20news-train-encoded.txt',
            batchSize=batchSize,
            vocabSize=vocabSize
        )

        testDataReader = DataReader(
            dataPath='/content/drive/My Drive/ML_DS_training/datasets/w2v/20news-test-encoded.txt',
            batchSize=batchSize,
            vocabSize=vocabSize
        )

        step = 0
        MAX_STEP = 1500

        sess.run(tf.global_variables_initializer())
        while step < MAX_STEP:
            trainData, trainLabels, trainSentenceLengths = trainDataReader.nextBatch()
            plabelsEval, lossEval, _ = sess.run(
                [predictedLabels, loss, trainOp],
                feed_dict={
                    rnn._data: trainData,
                    rnn._labels: trainLabels,
                    rnn._sentence_lengths: trainSentenceLengths,
                }
            )
            step += 1
            if step % 20 == 0:
                print("step: " + str(step) + " loss: " + str(lossEval))

            # The reader resets _batch_id to 0 when an epoch has just ended.
            if trainDataReader._batch_id == 0:
                numTruePreds = 0
                numEvaluated = 0
                while True:
                    testData, testLabels, testSentenceLengths = testDataReader.nextBatch()

                    # Evaluation must only run the prediction op: running
                    # trainOp here would update the weights on test data.
                    testPlabelsEval = sess.run(
                        predictedLabels,
                        feed_dict={
                            rnn._data: testData,
                            rnn._sentence_lengths: testSentenceLengths,
                        }
                    )
                    matches = np.equal(testPlabelsEval, testLabels)
                    numTruePreds += np.sum(matches.astype(float))
                    numEvaluated += len(testLabels)

                    if testDataReader._batch_id == 0: break
                print('Epoch: ', trainDataReader._num_epoch)
                # Divide by the number of documents actually scored.
                print('Accuracy on test data: ', numTruePreds * 100. / max(numEvaluated, 1))



In [0]:
# Build and train the model inside one shared variable scope. AUTO_REUSE
# makes get_variable create a variable when it is missing and reuse it
# otherwise (per the TF1 variable_scope documentation).
with tf.variable_scope("rnn_variables", reuse=tf.AUTO_REUSE) as scope:
        trainAndEvaluateRNN()