Paragraph Vector for text classification
---

This notebook uses the `gensim` implementation of the Distributed Memory Model of Paragraph Vectors (PV-DM). Some of the code was taken from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [1]:
import cPickle as pickle
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

from utils import *

In [2]:
# Read the full GSE table and key it by the 'id' column.
# Only the two free-text columns get their missing values replaced by '';
# the 'label' column is deliberately left as-is (unlabeled rows stay NaN).
text_columns = ['Series_summary', 'Series_title']
df = pd.read_csv('data/GSEs_texts_with_labels.csv').set_index('id')
df[text_columns] = df[text_columns].fillna('')
print(df.shape)
df.head()

(31905, 3)


Unnamed: 0_level_0,Series_summary,Series_title,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GSE1,This series represents a group of cutaneous ma...,NHGRI_Melanoma_class,
GSE1000,Amino acid conjugated surfaces and controls at...,Osteosarcoma TE85 cell tissue culture study,
GSE10000,We previously observed that formation of aorta...,Age-dependent aorta transcriptomes in wild-typ...,
GSE10001,The thyroid hormone receptor (TR) has been pro...,Gene expression profiling in NCoR deficient mo...,
GSE10002,Primitive erythropoiesis in the mouse yolk sac...,Identification of Erythroid-Enriched Gene Expr...,


In [3]:
# Read the labeled subset, key it by 'id', and blank out every missing value
# (all columns, not just the text ones).
df_labeled = (
    pd.read_csv('data/Labeled_GSEs_texts_with_labels.csv')
    .set_index('id')
    .fillna('')
)
print(df_labeled.shape)
df_labeled.head()

(1785, 5)


Unnamed: 0_level_0,Series_summary,Series_title,label,label_code,split
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE1001,Sprague-Dawley rat retina post-injury and cont...,retina injury timecourse,dz,1,0
GSE10064,This study aims to determine if global gene ex...,Gene expression in immortalized B-lymphocytes ...,dz,1,0
GSE10082,Conventional biochemical and molecular techniq...,Aryl Hydrocarbon Receptor Regulates Distinct D...,gene,2,0
GSE1009,Gene expression profiling in glomeruli from hu...,Diabetic nephropathy,dz,1,0
GSE1010,RNA samples prepared from lymphoblastic cells ...,FCHL study,dz,1,0


In [4]:
# Lightweight record type for one document. gensim's doc2vec reads the
# `tags` and `words` attributes; `label` is carried along for later use.
LabeledDocument = namedtuple('LabeledDocument', 'tags words label')

# Tokens are runs of two or more word characters (unicode-aware), so
# single-character tokens such as "a" are dropped.
tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")

# One LabeledDocument per row of `df`, in the frame's original order.
# (Iterating `df_labeled` instead would restrict the corpus to labeled rows.)
alldocs = []
for gse_id, record in df.iterrows():
    doc = LabeledDocument(
        tags=[gse_id],
        words=tokenizer.tokenize(record['Series_summary']),
        label=record['label'],
    )
    alldocs.append(doc)
    # Show one example document as a sanity check.
    if gse_id == 'GSE1':
        print(doc)

print('%d documents collected.' % len(alldocs))

LabeledDocument(tags=['GSE1'], words=['This', 'series', 'represents', 'group', 'of', 'cutaneous', 'malignant', 'melanomas', 'and', 'unrelated', 'controls', 'which', 'were', 'clustered', 'based', 'on', 'correlation', 'coefficients', 'calculated', 'through', 'comparison', 'of', 'gene', 'expression', 'profiles', 'Keywords', 'other'], label=nan)
31905 documents collected.


In [5]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec

# Abort early rather than train at an impractically slow speed.
# NOTE(review): presumably FAST_VERSION == -1 means gensim's compiled training
# routines are unavailable and the slow fallback would be used -- confirm
# against the installed gensim version.
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

In [19]:
simple_models = []

# Hyper-parameters shared by both paragraph-vector models.
shared_params = dict(size=200, negative=5, hs=0, min_count=2, workers=8)

# PV-DM with concatenation: window=5 (both sides) approximates the paper's
# 10-word total window size.
pvdm = Doc2Vec(dm=1, dm_concat=1, window=5, **shared_params)

# PV-DM with averaging of the context vectors.
pvdm2 = Doc2Vec(dm=1, dm_mean=1, window=10, **shared_params)

# Not built in this notebook (left disabled):
#   PV-DBOW:  Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=8)
#   combined: gensim.test.test_doc2vec.ConcatenatedDoc2Vec([pvdm, pvdm2])

# The vocabulary is scanned once, on the concat model (PV-DM/concat requires
# one special NULL word, so it serves as the template) ...
pvdm.build_vocab(alldocs)

# ... and the second model reuses the results of that scan.
pvdm2.reset_from(pvdm)

In [20]:
# Shallow copy that can be reshuffled on every pass without disturbing the
# original ordering kept in `alldocs`.
doc_list = list(alldocs)
print(len(doc_list))

31905


# Bulk Training

Using explicit multiple-pass, alpha-reduction approach as sketched in gensim doc2vec blog post – with added shuffling of corpus on each pass.

Note that vector training is occurring on all documents of the dataset, which includes all TRAIN/TEST/DEV docs.

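In the gensim IMDB notebook that this section is adapted from, evaluation of each model's sentiment-predictive power is repeated after each pass, as an error rate (lower is better), to see the rates of relative improvement; there, the base numbers reuse the TRAIN and TEST vectors stored in the models for the logistic regression, while the inferred results use newly-inferred TEST vectors. The training loop below does not run that per-pass evaluation: the classifiers are instead evaluated once, after training, in the cells that follow.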

In [21]:
from random import shuffle
import datetime

# Manual learning-rate schedule: start at `alpha` and step down by a fixed
# `alpha_delta` after every pass over the corpus.
alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results

    # Train both models for one pass at the current learning rate.
    for model in [pvdm, pvdm2]:
        # Pin alpha and min_alpha to the same value on the model being
        # trained, so the rate cannot decay inside a single train() call;
        # the decay across passes is applied by hand below.
        # (This used to assign `concat_model.min_alpha`, but `concat_model`
        # is never defined in this notebook and the cell raised NameError.)
        model.alpha, model.min_alpha = alpha, alpha
        model.train(doc_list)

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta

print("END %s" % str(datetime.datetime.now()))


START 2016-05-24 16:45:13.895411
completed pass 1 at alpha 0.025000
completed pass 2 at alpha 0.023800
completed pass 3 at alpha 0.022600
completed pass 4 at alpha 0.021400
completed pass 5 at alpha 0.020200
completed pass 6 at alpha 0.019000
completed pass 7 at alpha 0.017800
completed pass 8 at alpha 0.016600
completed pass 9 at alpha 0.015400
completed pass 10 at alpha 0.014200
completed pass 11 at alpha 0.013000
completed pass 12 at alpha 0.011800
completed pass 13 at alpha 0.010600
completed pass 14 at alpha 0.009400
completed pass 15 at alpha 0.008200
completed pass 16 at alpha 0.007000
completed pass 17 at alpha 0.005800
completed pass 18 at alpha 0.004600
completed pass 19 at alpha 0.003400
completed pass 20 at alpha 0.002200
END 2016-05-24 17:05:28.123754


In [9]:
# NOTE(review): `concat_model` is only created in commented-out code in this
# notebook (the ConcatenatedDoc2Vec lines), so on a fresh kernel this cell
# raises NameError. The `embedding_mat` it produces is reassigned by the
# following cell, which stacks pvdm/pvdm2 vectors directly -- this cell looks
# like a leftover from an earlier session; confirm and remove or re-enable.
docvecs = concat_model.docvecs
# Look up the document vectors for the labeled GSE ids.
embedding_mat = docvecs[df_labeled.index] 
print embedding_mat.shape

(3570, 100)


In [22]:
# Concat the paragraph embedding matrices from the two models
embedding_mat = np.hstack((pvdm.docvecs[df_labeled.index], pvdm2.docvecs[df_labeled.index]))
np.savetxt('models/pvdm_200_pvdm2_200_embedding.mat' ,embedding_mat)
print embedding_mat.shape

(1785, 400)


In [24]:
# Load the concatenated embedding matrix
embedding_mat = np.loadtxt('models/pvdm_100_pvdm2_100_embedding.mat')
print embedding_mat.shape

(1785, 200)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, log_loss, accuracy_score)

In [25]:
# Baseline classifier: L2-regularised logistic regression on the paragraph
# embeddings, scored with the `evaluate_clf` helper (pulled in by the
# `from utils import *` at the top of the notebook).
y = df_labeled['label_code'].values
split_ids = df_labeled['split']
clf = LogisticRegression(penalty='l2')

scores = evaluate_clf(clf, embedding_mat, y, split_ids)
# Recorded result: accuracy  0.761923 (concat((pvdm_100, pvdm2_100)))
print(scores.mean(axis=0))

f1          0.755071
accuracy    0.761923
logloss     0.658141
dtype: float64


In [36]:
from tensorflow.contrib import learn

# Small feed-forward network (hidden layers of 100 and 10 units) over the same
# embeddings, predicting one of 3 classes.
clf = learn.TensorFlowDNNClassifier(
    hidden_units=[100, 10],
    n_classes=3,
    steps=500,
    learning_rate=0.01,
    batch_size=128,
)

# Previously noted score: 0.784892

scores = evaluate_clf(clf, embedding_mat, y, df_labeled['split'])
print(scores.mean(axis=0))

Step #100, epoch #10, avg. train loss: 0.95136
Step #200, epoch #20, avg. train loss: 0.58923
Step #300, epoch #30, avg. train loss: 0.46338
Step #400, epoch #40, avg. train loss: 0.39758
Step #500, epoch #50, avg. train loss: 0.35676
Step #100, epoch #10, avg. train loss: 0.94522
Step #200, epoch #20, avg. train loss: 0.57103
Step #300, epoch #30, avg. train loss: 0.44425
Step #400, epoch #40, avg. train loss: 0.37967
Step #500, epoch #50, avg. train loss: 0.34232
Step #100, epoch #10, avg. train loss: 0.96696
Step #200, epoch #20, avg. train loss: 0.61894
Step #300, epoch #30, avg. train loss: 0.50531
Step #400, epoch #40, avg. train loss: 0.44441
Step #500, epoch #50, avg. train loss: 0.39959
f1          0.778588
accuracy    0.784321
logloss     0.545451
dtype: float64


In [15]:
import xgboost as xgb

# Fixed seed so the boosted-tree run is repeatable.
RNG = 2016

# Gradient-boosted trees on the same embeddings, for comparison.
xgb_params = dict(
    n_estimators=1000,
    colsample_bytree=1,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    min_child_weight=1,
    seed=RNG,
    nthread=4,
    silent=0,
)
clf = xgb.XGBClassifier(**xgb_params)

scores = evaluate_clf(clf, embedding_mat, y, df_labeled['split'])
print(scores.mean(axis=0))

f1          0.706875
accuracy    0.729408
logloss     0.761081
dtype: float64
