In [28]:
from os import listdir
from os.path import  isfile
from collections import defaultdict
import re

def gen_data_and_vocab(data_root='D:\\movedFromC\\123\\20192\\PRJ2\\20news-bydate\\',
                       output_dir='D:\\movedFromC\\123\\20192\\PRJ2\\data_RNN\\',
                       freq_threshold=10):
    """Build raw train/test text files and a vocabulary from a newsgroup corpus.

    ``data_root`` must contain exactly two sub-directories: one whose name
    contains ``'train'`` and one whose name does not (used as the test split).
    Each split holds one directory per newsgroup, each of which holds one
    file per document.

    Three files are written into ``output_dir`` (which must already exist):

    * ``vocab_raw.txt``  - one word per line, sorted; words whose count in the
      training split is strictly greater than ``freq_threshold``.
    * ``train_raw.txt`` / ``test_raw.txt`` - one document per line, formatted
      as ``label<fff>filename<fff>content`` where ``label`` is the index of
      the newsgroup in the sorted list of training newsgroups.

    Args:
        data_root: directory holding the train and test split directories.
        output_dir: directory that receives the three output files.
        freq_threshold: a word enters the vocabulary only if it occurs more
            than this many times in the training split.

    Raises:
        ValueError: if the train/test split directories cannot be identified
            unambiguously under ``data_root``.
    """
    # Local import: the notebook's import cell only brings in `listdir` and
    # `isfile`, and `join` is what makes the path handling OS-independent.
    from os.path import join

    def collect_data_from(par_path, newsgroup_list, word_count=None):
        """Return one formatted line per document found under ``par_path``.

        When ``word_count`` is given, every token of every document is
        counted into it (used for the training split only).
        """
        data = []
        for group_id, newsgroup in enumerate(newsgroup_list):
            dir_path = join(par_path, newsgroup)
            # Sorting the (filename, path) pairs orders documents by filename.
            files = sorted(
                (filename, join(dir_path, filename))
                for filename in listdir(dir_path)
                if isfile(join(dir_path, filename))
            )

            for filename, file_path in files:
                # NOTE(review): files are read with the platform default
                # encoding - confirm that this suits the corpus.
                with open(file_path) as f:
                    text = f.read().lower()
                # Raw string: '\W' in a plain literal is an invalid escape.
                # The split can yield an empty token at either end of the
                # text; these are kept, as before.
                words = re.split(r'\W+', text)
                if word_count is not None:    # only count words from training data
                    for word in words:
                        word_count[word] += 1
                content = ' '.join(words)
                # Each document must occupy exactly one line of the output.
                assert len(content.splitlines()) == 1
                data.append(str(group_id) + '<fff>' + filename + '<fff>' + content)
        return data

    word_count = defaultdict(int)

    # Identify the splits by directory *name* rather than by full path or by
    # the (arbitrary) order in which listdir returns them.
    split_dirs = [name for name in listdir(data_root) if not isfile(join(data_root, name))]
    train_dirs = [name for name in split_dirs if 'train' in name]
    test_dirs = [name for name in split_dirs if 'train' not in name]
    if len(train_dirs) != 1 or len(test_dirs) != 1:
        raise ValueError(
            'expected exactly one train and one test directory under '
            + repr(data_root) + ', found ' + repr(sorted(split_dirs))
        )
    train_path = join(data_root, train_dirs[0])
    test_path = join(data_root, test_dirs[0])

    # Labels are positions in this sorted list, shared by both splits.
    newsgroups_list = sorted(listdir(train_path))

    train_data = collect_data_from(par_path=train_path, newsgroup_list=newsgroups_list, word_count=word_count)
    vocab = sorted(word for word, fre in word_count.items() if fre > freq_threshold)
    with open(join(output_dir, 'vocab_raw.txt'), 'w') as f:
        f.write('\n'.join(vocab))

    test_data = collect_data_from(par_path=test_path, newsgroup_list=newsgroups_list)

    with open(join(output_dir, 'train_raw.txt'), 'w') as f:
        f.write('\n'.join(train_data))
    with open(join(output_dir, 'test_raw.txt'), 'w') as f:
        f.write('\n'.join(test_data))


# Every encoded document is truncated / padded to exactly this many tokens.
MAX_DOC_LENGTH = 500
# Reserved token IDs; real vocabulary words are numbered from 2 upwards.
unknown_ID = 0
padding_ID = 1


def encode_data(data_path, vocab_path):
    """Convert a raw text data file into fixed-length sequences of word IDs.

    Args:
        data_path: file with one document per line, formatted as
            ``label<fff>doc_id<fff>text``.
        vocab_path: file with one vocabulary word per line. The word on
            line ``i`` (0-based) receives ID ``i + 2``.

    The result is written next to ``data_path``: the part of the file name
    after its last underscore is replaced by ``encoded.txt`` (for example
    ``train_raw.txt`` -> ``train_encoded.txt``). Each output line is
    ``label<fff>doc_id<fff>sentence_len<fff>ids`` where ``ids`` holds exactly
    ``MAX_DOC_LENGTH`` space-separated IDs: out-of-vocabulary words map to
    ``unknown_ID`` and short documents are right-padded with ``padding_ID``.
    ``sentence_len`` is the number of real (non-padding) tokens.
    """
    # Local import: `os` as a module is not imported by the notebook's
    # import cell, and os.path keeps the path handling OS-independent.
    import os

    with open(vocab_path) as f:
        vocab = {word: word_ID + 2 for word_ID, word in enumerate(f.read().splitlines())}

    with open(data_path) as f:
        lines = f.read().splitlines()

    encoded_data = []
    for line in lines:
        fields = line.split('<fff>')  # split once instead of once per field
        label, doc_id, text = fields[0], fields[1], fields[2]
        words = text.split()[:MAX_DOC_LENGTH]
        sentence_len = len(words)

        encode_text = [str(vocab.get(word, unknown_ID)) for word in words]
        # Pads with zero items when the document already fills the window.
        encode_text.extend([str(padding_ID)] * (MAX_DOC_LENGTH - sentence_len))

        encoded_data.append(str(label) + '<fff>' + str(doc_id) + '<fff>' + str(sentence_len) + '<fff>' + ' '.join(encode_text))

    # Write once, after all documents are encoded (previously the whole file
    # was rewritten on every loop iteration).
    dir_name, raw_name = os.path.split(data_path)
    file_name = '_'.join(raw_name.split('_')[:-1]) + '_encoded.txt'
    with open(os.path.join(dir_name, file_name), 'w') as f:
        f.write('\n'.join(encoded_data))



In [26]:
# Generate vocab_raw.txt, train_raw.txt and test_raw.txt from the raw corpus.
# gen_data_and_vocab must already be defined in the kernel before this runs.
gen_data_and_vocab()

In [30]:
# Locations of the raw files written by gen_data_and_vocab().
train_data_path = 'D:\\movedFromC\\123\\20192\\PRJ2\\data_RNN\\train_raw.txt'
test_data_path = 'D:\\movedFromC\\123\\20192\\PRJ2\\data_RNN\\test_raw.txt'
vocab_path = 'D:\\movedFromC\\123\\20192\\PRJ2\\data_RNN\\vocab_raw.txt'
# Encode both splits with the same vocabulary; encode_data writes each
# result next to its input with the file name suffix '_encoded.txt'.
encode_data(train_data_path, vocab_path)
encode_data(test_data_path, vocab_path)

In [None]:
import numpy as np
import random

class DataReader:
    """Serve fixed-size mini-batches from an encoded data file.

    Every line of the file longer than one character must have the form
    ``label<fff>doc_id<fff>sentence_len<fff>id id id ...`` where ``label``
    and ``sentence_len`` are integers and the last field is a
    space-separated list of integer token IDs.

    Attributes:
        _data: array with one row of token IDs per document.
        _labels: array with one integer label per document.
        _sentence_len: array with the ``sentence_len`` field of each document.
        _num_epoch: number of completed passes over the data.
        _batch_id: index of the next batch inside the current epoch; it is
            reset to 0 whenever an epoch has just been completed.
    """

    def __init__(self, data_path, bs, vocab_size):
        """Load the whole file at ``data_path`` into memory.

        Args:
            data_path: path of the encoded data file.
            bs: batch size returned by ``next_batch``.
            vocab_size: accepted for interface compatibility; not used.
        """
        self._bs = bs
        with open(data_path) as f:
            d_lines = f.read().splitlines()

        self._data = []
        self._labels = []
        self._sentence_len = []
        for line in d_lines:
            if len(line) > 1:  # skip blank / degenerate lines
                feature = line.split('<fff>')
                # feature[1] is the document id; it is not needed here.
                label, sentence_len = int(feature[0]), int(feature[2])
                vector = [int(token) for token in feature[3].split()]
                self._data.append(vector)
                self._labels.append(label)
                self._sentence_len.append(sentence_len)
        self._data = np.array(self._data)
        self._labels = np.array(self._labels)
        self._sentence_len = np.array(self._sentence_len)

        self._num_epoch = 0
        self._batch_id = 0

    def next_batch(self):
        """Return the next ``(data, labels, sentence_len)`` batch.

        The final batch of an epoch is taken from the *last* ``bs`` samples,
        so it has full size but may overlap the previous batch. Once that
        batch has been sliced, the epoch counter is incremented,
        ``_batch_id`` is reset to 0 and data, labels and sentence lengths are
        shuffled together with a fixed seed.
        """
        num_samples = len(self._data)
        start = self._batch_id * self._bs
        end = start + self._bs
        self._batch_id += 1

        # `>=` so that a dataset whose size is an exact multiple of the batch
        # size ends its epoch on the true last batch instead of producing one
        # extra, duplicated batch.
        epoch_finished = end >= num_samples
        if epoch_finished:
            end = num_samples
            # Clamp at 0 so a dataset smaller than one batch is returned whole
            # rather than sliced with a negative start index.
            start = max(0, end - self._bs)

        # Slice BEFORE shuffling so the tail of the current epoch is served.
        batch = (
            self._data[start:end],
            self._labels[start:end],
            self._sentence_len[start:end],
        )

        if epoch_finished:
            self._num_epoch += 1
            self._batch_id = 0
            indices = list(range(num_samples))
            # A private generator with the same seed yields the same
            # permutation as seeding the global one, without side effects.
            random.Random(2101).shuffle(indices)
            self._data = self._data[indices]
            # Labels must be permuted together with the data (the original
            # assigned to a stray local name, leaving labels unshuffled).
            self._labels = self._labels[indices]
            self._sentence_len = self._sentence_len[indices]

        return batch

In [None]:
# Import here so this cell works on a fresh kernel: it uses `tf` before the
# cell that otherwise performs the TensorFlow import.
import tensorflow.compat.v1 as tf

# Name of the GPU device TensorFlow reports; consumed by RNN.__init__ via
# tf.device(device_name).
device_name = tf.test.gpu_device_name()
print(device_name)

In [None]:
import tensorflow.compat.v1 as tf
# The model below uses placeholders and tf.Session, so eager execution is
# turned off before any graph is built.
tf.disable_eager_execution()
# Width of the final classification layer in RNN.build_graph.
NUM_CLASSES = 20
# Number of token IDs per document fed to the model; same value as the
# MAX_DOC_LENGTH used by encode_data earlier in this notebook.
MAX_DOC_LENGTH = 500

class RNN:
    """LSTM document classifier built as a TensorFlow 1.x graph.

    The graph embeds each token ID, runs an LSTM over the sequence, averages
    the LSTM outputs over the real (non-padding) time steps and feeds the
    average to a linear layer with ``NUM_CLASSES`` outputs.

    Relies on the module-level names ``tf``, ``np``, ``NUM_CLASSES``,
    ``MAX_DOC_LENGTH`` and ``device_name``.
    """

    def __init__(self, vocab_size, embedding_size, lstm_size, bs):
        """Store hyper-parameters and create the input placeholders.

        Args:
            vocab_size: number of vocabulary words (reserved IDs excluded).
            embedding_size: dimensionality of each word embedding.
            lstm_size: number of units in the LSTM cell.
            bs: batch size; the placeholder shapes are fixed to it.
        """
        self._vocab_size = vocab_size
        self._embedding_size = embedding_size
        self._lstm_size = lstm_size
        self._bs = bs

        # `device_name` is a notebook-level global set in an earlier cell.
        with tf.device(device_name):
            self._data = tf.placeholder(tf.int32, shape=[bs, MAX_DOC_LENGTH])
            self._labels = tf.placeholder(tf.int32, shape=[bs,])
            self._sentence_len = tf.placeholder(tf.int32, shape=[bs,])

    def embedding_layer(self, indices):
        """Create the embedding matrix and look up ``indices`` in it.

        The matrix has ``vocab_size + 2`` rows: row 0 is all zeros and the
        remaining rows are drawn from a standard normal distribution with a
        fixed NumPy seed.
        NOTE(review): the encoding constants in this notebook assign ID 0 to
        unknown words and ID 1 to padding, so the zero row belongs to the
        unknown token - confirm that this is intended.
        """
        pretrain_vecs = []
        pretrain_vecs.append(np.zeros(self._embedding_size))
        np.random.seed(2101)
        for i in range(self._vocab_size+1):
            pretrain_vecs.append(np.random.normal(
                loc=0.,
                scale=1.,
                size=self._embedding_size
            ))
        pretrain_vecs = np.array(pretrain_vecs)

        self._embedding_maxtrix = tf.get_variable(
            name = 'embedding',
            shape = (self._vocab_size+2, self._embedding_size),
            initializer = tf.constant_initializer(pretrain_vecs)
        )

        return tf.nn.embedding_lookup(self._embedding_maxtrix, indices)

    def LSTM_layer(self, embeddings):
        """Run the LSTM and return the per-document mean of its outputs.

        Args:
            embeddings: tensor of shape ``[bs, MAX_DOC_LENGTH, embedding_size]``.

        Returns:
            Tensor of shape ``[bs, lstm_size]``: the LSTM outputs averaged
            over the first ``sentence_len`` time steps of each document.
        """
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._lstm_size)
        zero_state = tf.zeros(shape=(self._bs, self._lstm_size))
        initial_state = tf.nn.rnn_cell.LSTMStateTuple(zero_state, zero_state)

        # static_rnn expects a list with one [bs, embedding_size] tensor per
        # time step, hence the transpose to time-major followed by unstack.
        lstm_inputs = tf.unstack(tf.transpose(embeddings, perm=[1,0,2]))
        lstm_outputs, last_state = tf.nn.static_rnn(
            cell = lstm_cell,
            inputs = lstm_inputs,
            initial_state = initial_state,
            sequence_length = self._sentence_len
        )
        # Back to batch-major, then flatten to [bs * MAX_DOC_LENGTH, lstm_size].
        lstm_outputs = tf.unstack(tf.transpose(lstm_outputs, perm=[1,0,2]))
        lstm_outputs = tf.concat(lstm_outputs, axis=0)

        # 1.0 for real tokens, 0.0 for padding, flattened the same way as
        # the outputs so the two line up row by row.
        mask = tf.sequence_mask(
            lengths = self._sentence_len,
            maxlen = MAX_DOC_LENGTH,
            dtype=tf.float32
        )
        mask = tf.concat(tf.unstack(mask, axis=0), axis=0)
        mask = tf.expand_dims(mask, -1)

        lstm_outputs = mask*lstm_outputs
        lstm_outputs_split = tf.split(lstm_outputs, num_or_size_splits=self._bs)
        lstm_outputs_sum = tf.reduce_sum(lstm_outputs_split, axis=1)
        # Clamp the divisor to 1 so a zero-length document yields a zero
        # vector instead of NaN (0 / 0), which would poison the loss.
        doc_lengths = tf.maximum(tf.cast(self._sentence_len, tf.float32), 1.0)
        lstm_outputs_aver = lstm_outputs_sum / tf.expand_dims(doc_lengths, -1)

        return lstm_outputs_aver

    def build_graph(self):
        """Assemble the full model.

        Returns:
            ``(pred_labels, loss)``: the predicted class index per document
            and the mean softmax cross-entropy over the batch.
        """
        embeddings = self.embedding_layer(self._data)
        lstm_outputs = self.LSTM_layer(embeddings)

        weights = tf.get_variable(
            name = 'final_layer_weights',
            shape = (self._lstm_size, NUM_CLASSES),
            initializer = tf.random_normal_initializer(seed=2101)
        )
        biases = tf.get_variable(
            name = 'final_layer_biases',
            # One-element tuple; `(NUM_CLASSES)` is just an int.
            shape = (NUM_CLASSES,),
            initializer = tf.random_normal_initializer(seed=2101)
        )
        logits = tf.matmul(lstm_outputs, weights) + biases

        label_one_hot = tf.one_hot(
            indices = self._labels,
            depth = NUM_CLASSES,
            dtype = tf.float32
        )

        loss = tf.nn.softmax_cross_entropy_with_logits(
            labels = label_one_hot,
            logits = logits
        )

        loss = tf.reduce_mean(loss)

        probs = tf.nn.softmax(logits)
        pred_labels = tf.argmax(probs, axis=1)
        pred_labels = tf.squeeze(pred_labels)

        return pred_labels, loss

    def trainer(self, loss, lr):
        """Return an Adam training op that minimises ``loss`` at rate ``lr``."""
        optimizer = tf.train.AdamOptimizer(lr).minimize(loss)
        return optimizer


In [None]:
def train_and_eval_RNN(
        vocab_path='/content/drive/My Drive/Colab Notebooks/LAB/data_20newsgroups/w2v/vocab_raw.txt',
        train_data_path='/content/drive/My Drive/Colab Notebooks/LAB/data_20newsgroups/w2v/train_encoded.txt',
        test_data_path='/content/drive/My Drive/Colab Notebooks/LAB/data_20newsgroups/w2v/test_encoded.txt',
        max_step=2000):
    """Train the RNN classifier and report test accuracy after every epoch.

    Args:
        vocab_path: vocabulary file; its number of lines is the vocab size.
        train_data_path: encoded training data, read by ``DataReader``.
        test_data_path: encoded test data, read by ``DataReader``.
        max_step: number of training steps (mini-batches) to run.

    The training loss is printed every 20 steps. Each time the training
    reader completes an epoch, one full pass over the test reader is run and
    the accuracy printed. Nothing is returned.
    """
    batch_size = 20

    with open(vocab_path, 'rb') as f:
        vocab_size = len(f.readlines())

    tf.set_random_seed(2101)

    rnn = RNN(
        vocab_size=vocab_size,
        embedding_size=300,
        lstm_size=50,
        bs=batch_size
    )

    pred_labels, loss = rnn.build_graph()
    op = rnn.trainer(loss=loss, lr=0.0005)

    with tf.Session() as sess:
        train_data_reader = DataReader(
            data_path=train_data_path,
            bs=batch_size,
            vocab_size=vocab_size
        )
        test_data_reader = DataReader(
            data_path=test_data_path,
            bs=batch_size,
            vocab_size=vocab_size
        )

        step = 0

        sess.run(tf.global_variables_initializer())

        while step < max_step:
            train_data, train_labels, train_sentence_len = train_data_reader.next_batch()
            _, loss_eval, _ = sess.run(
                [pred_labels, loss, op],
                feed_dict={
                    rnn._data: train_data,
                    rnn._labels: train_labels,
                    rnn._sentence_len: train_sentence_len
                }
            )
            step += 1
            if step % 20 == 0:
                print("step: " + str(step) + " loss: " + str(loss_eval))

            # The reader resets _batch_id to 0 when it has just finished an
            # epoch; use that as the trigger for an evaluation pass.
            if train_data_reader._batch_id == 0:
                corr = 0
                num_evaluated = 0
                while True:
                    test_data, test_labels, test_sentence_len = test_data_reader.next_batch()
                    test_plabel_eval = sess.run(
                        pred_labels,
                        feed_dict={
                            rnn._data: test_data,
                            rnn._labels: test_labels,
                            rnn._sentence_len: test_sentence_len
                        }
                    )
                    matches = np.equal(test_plabel_eval, test_labels)
                    corr += np.sum(matches.astype(float))
                    num_evaluated += len(test_labels)
                    if test_data_reader._batch_id == 0:
                        break
                # Reporting lives inside the `if`: `corr` only exists after an
                # evaluation pass, so printing on every step raised an error.
                print("Epoch: ", train_data_reader._num_epoch)
                # Divide by the number of predictions actually scored: the
                # reader keeps batches full-sized, so the wrap-around batch
                # can re-score samples and the total may exceed the set size.
                print("Test accuracy: ", corr*100. / num_evaluated)

In [None]:
# Clear the default graph first so re-running this cell does not collide
# with variables ('embedding', 'final_layer_weights', ...) created by a
# previous run, then train and evaluate.
tf.reset_default_graph()
train_and_eval_RNN()