Demonstrate the use of an RNN model for text classification
---

Some of the code is taken from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/skflow/text_classification_builtin_rnn_model.py

In [1]:
import numpy as np
from sklearn import metrics
import pandas

import tensorflow as tf
from tensorflow.contrib import learn

from utils import *

In [2]:
### Training data
# Load the labeled GSE table and index it by the `id` column.
# The module is referenced as `pandas`, the name imported at the top of this
# notebook; the `pd` alias is not imported explicitly here.
# Missing cells are replaced with empty strings (presumably so the text
# columns can be handed to the vocabulary processor as plain strings).
df_labeled = (
    pandas.read_csv('data/Labeled_GSEs_texts_with_labels.csv')
    .set_index('id')
    .fillna('')
)
# Call-style print: behaves the same for a single argument and matches the
# print(...) form used by the training cells.
print(df_labeled.shape)
df_labeled.head()

(1785, 5)


Unnamed: 0_level_0,Series_summary,Series_title,label,label_code,split
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE1001,Sprague-Dawley rat retina post-injury and cont...,retina injury timecourse,dz,1,0
GSE10064,This study aims to determine if global gene ex...,Gene expression in immortalized B-lymphocytes ...,dz,1,0
GSE10082,Conventional biochemical and molecular techniq...,Aryl Hydrocarbon Receptor Regulates Distinct D...,gene,2,0
GSE1009,Gene expression profiling in glomeruli from hu...,Diabetic nephropathy,dz,1,0
GSE1010,RNA samples prepared from lymphoblastic cells ...,FCHL study,dz,1,0


In [3]:
### Process vocabulary

# Each summary is encoded as a sequence of MAX_DOCUMENT_LENGTH word indexes,
# so X ends up with one row per document and MAX_DOCUMENT_LENGTH columns.
MAX_DOCUMENT_LENGTH = 100

vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)

# NOTE: the vocabulary is fit on every labeled summary, i.e. before the
# train/test split that is performed in a later cell.
X = np.array(list(vocab_processor.fit_transform(df_labeled['Series_summary'])))
print(X.shape)

# Vocabulary size; used as the number of classes of the embedding lookup.
n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

(1785, 100)
Total words: 19474


In [4]:
### Models

# Width of each learned word embedding.
EMBEDDING_SIZE = 50


def input_op_fn(X):
    """Turn a batch of word-index sequences into per-position embeddings.

    Handed to the RNN classifier through its `input_op_fn` argument.

    Args:
        X: batched word indexes, presumably shaped
            [batch_size, MAX_DOCUMENT_LENGTH] (TODO confirm against caller).

    Returns:
        Whatever `learn.ops.split_squeeze` yields for the embedded batch;
        per the original notes, a list with one
        [batch_size, EMBEDDING_SIZE] tensor per word position.
    """
    # Embedding lookup: an [n_words, EMBEDDING_SIZE] matrix named 'words'
    # maps the indexes to [batch_size, sequence_length, EMBEDDING_SIZE].
    embedded = learn.ops.categorical_variable(
        X,
        n_classes=n_words,
        embedding_size=EMBEDDING_SIZE,
        name='words',
    )
    # Split along the document-length axis (dim 1) into MAX_DOCUMENT_LENGTH
    # pieces, dropping that axis from each piece.
    return learn.ops.split_squeeze(1, MAX_DOCUMENT_LENGTH, embedded)


In [5]:
# Partition the rows with the precomputed `split` column: rows flagged 0 are
# held out for evaluation, every other row is used for training.
splits = df_labeled['split']
y = df_labeled['label_code'].values

# Positional (integer) row indexes of each partition.
train_idx = np.flatnonzero(splits != 0)
valid_idx = np.flatnonzero(splits == 0)

X_train = X[train_idx]
y_train = y[train_idx]
X_test = X[valid_idx]
y_test = y[valid_idx]

In [8]:
# Unidirectional, single-layer GRU classifier trained for 200 steps.
classifier = learn.TensorFlowRNNClassifier(
    rnn_size=EMBEDDING_SIZE,  # rnn_size reuses the EMBEDDING_SIZE constant
    n_classes=3,
    cell_type='gru',
    input_op_fn=input_op_fn,
    num_layers=1,
    bidirectional=False,
    sequence_length=None,
    steps=200,
    optimizer='Adam',
    learning_rate=0.01,
    continue_training=True,
)

classifier.fit(X_train, y_train)

# Accuracy on the held-out rows.
predicted = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, predicted)
print('Accuracy: {0:f}'.format(score))


Step #100, epoch #2, avg. train loss: 0.69750
Step #200, epoch #5, avg. train loss: 0.07701
Accuracy: 0.748322


In [9]:
# Unidirectional, single-layer GRU classifier with the same settings as the
# 200-step run, but a freshly built model trained for 500 steps.
classifier = learn.TensorFlowRNNClassifier(
    rnn_size=EMBEDDING_SIZE,  # rnn_size reuses the EMBEDDING_SIZE constant
    n_classes=3,
    cell_type='gru',
    input_op_fn=input_op_fn,
    num_layers=1,
    bidirectional=False,
    sequence_length=None,
    steps=500,
    optimizer='Adam',
    learning_rate=0.01,
    continue_training=True,
)

classifier.fit(X_train, y_train)

# Accuracy on the held-out rows.
predicted = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, predicted)
print('Accuracy: {0:f}'.format(score))


Step #100, epoch #2, avg. train loss: 0.69750
Step #200, epoch #5, avg. train loss: 0.07701
Step #300, epoch #7, avg. train loss: 0.02212
Step #400, epoch #10, avg. train loss: 0.01688
Step #500, epoch #13, avg. train loss: 0.02350
Accuracy: 0.775168
