In [1]:
import sys
import cPickle as pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.contrib import learn
from nltk.tokenize import RegexpTokenizer

from utils import *

sys.path.append('../tensorflow-playground/')
from doc2vec import Doc2Vec

In [2]:
# Read every GSE record (title, summary, label) and key the frame by GSE id.
# Missing text is replaced by the empty string in the two text columns only;
# the `label` column keeps its NaNs for unlabeled series.
df = (
    pd.read_csv('data/GSEs_texts_with_labels.csv')
    .set_index('id')
    .fillna({'Series_summary': '', 'Series_title': ''})
)
print(df.shape)
df.head()

(31905, 3)


Unnamed: 0_level_0,Series_summary,Series_title,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GSE1,This series represents a group of cutaneous ma...,NHGRI_Melanoma_class,
GSE1000,Amino acid conjugated surfaces and controls at...,Osteosarcoma TE85 cell tissue culture study,
GSE10000,We previously observed that formation of aorta...,Age-dependent aorta transcriptomes in wild-typ...,
GSE10001,The thyroid hormone receptor (TR) has been pro...,Gene expression profiling in NCoR deficient mo...,
GSE10002,Primitive erythropoiesis in the mouse yolk sac...,Identification of Erythroid-Enriched Gene Expr...,


In [3]:
# Read the labeled subset of GSEs, keyed by GSE id, with every missing value
# (in any column) replaced by the empty string.
df_labeled = (
    pd.read_csv('data/Labeled_GSEs_texts_with_labels.csv')
    .set_index('id')
    .fillna('')
)
print(df_labeled.shape)
df_labeled.head()

(1785, 5)


Unnamed: 0_level_0,Series_summary,Series_title,label,label_code,split
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE1001,Sprague-Dawley rat retina post-injury and cont...,retina injury timecourse,dz,1,0
GSE10064,This study aims to determine if global gene ex...,Gene expression in immortalized B-lymphocytes ...,dz,1,0
GSE10082,Conventional biochemical and molecular techniq...,Aryl Hydrocarbon Receptor Regulates Distinct D...,gene,2,0
GSE1009,Gene expression profiling in glomeruli from hu...,Diabetic nephropathy,dz,1,0
GSE1010,RNA samples prepared from lymphoblastic cells ...,FCHL study,dz,1,0


In [4]:
# Tokens are runs of two or more word characters (pattern compiled with the
# inline unicode flag).
tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")

# One token list per row of `df`, in row order, so alldocs[k] belongs to the
# k-th GSE of `df`. Iterating the column directly avoids df.iterrows(), which
# builds a full Series for every row just to read a single field.
alldocs = [tokenizer.tokenize(summary) for summary in df['Series_summary']]

print('%d documents collected.' % len(alldocs))

31905 documents collected.


In [27]:
# PV-DM model; with concat=True the word vectors and the document vector are
# concatenated.
pvdm_config = dict(
    architecture='pvdm',
    concat=True,
    # batching / context window
    batch_size=128,
    window_size=8,
    # embedding sizes for words and documents
    embedding_size_w=100,
    embedding_size_d=100,
    # vocabulary cap and number of documents (one per tokenized summary)
    vocabulary_size=50000,
    document_size=len(alldocs),
    # loss and optimizer settings
    loss_type='sampled_softmax_loss',
    n_neg_samples=64,
    optimize='Adagrad',
    learning_rate=0.025,
    n_steps=5001,
)
pvdm = Doc2Vec(**pvdm_config)
pvdm.fit(alldocs)

Initialized
Average loss at step 0: 6.987235
Average loss at step 2000: 4.023542
Average loss at step 4000: 3.491124


Doc2Vec(architecture='pvdm', batch_size=128, concat=True, document_size=31905,
    embedding_size_d=100, embedding_size_w=100, learning_rate=0.025,
    loss_type='sampled_softmax_loss', n_neg_samples=64, n_steps=5001,
    optimize='Adagrad', vocabulary_size=50000, window_size=8)

Initialized
Average loss at step 0: 6.908161
Average loss at step 2000: 4.370573
Average loss at step 4000: 3.641112
Average loss at step 6000: 3.474434
Average loss at step 8000: 3.225485
Average loss at step 10000: 3.139582
Average loss at step 12000: 3.067585
Average loss at step 14000: 2.994210
Average loss at step 16000: 2.962597
Average loss at step 18000: 2.881052
Average loss at step 20000: 2.854527
Average loss at step 22000: 2.799672
Average loss at step 24000: 2.688952
Average loss at step 26000: 1.378508
Average loss at step 28000: 1.444336
Average loss at step 30000: 1.508199
Average loss at step 32000: 1.509807
Average loss at step 34000: 1.542219
Average loss at step 36000: 1.561338
Average loss at step 38000: 1.581813
Average loss at step 40000: 1.598038
Average loss at step 42000: 1.599657
Average loss at step 44000: 1.614385
Average loss at step 46000: 1.615477
Average loss at step 48000: 1.591394
Average loss at step 50000: 1.186858
Average loss at step 52000: 1.2379

Doc2Vec(architecture='pvdm', batch_size=128, concat=True, document_size=31905,
    embedding_size_d=100, embedding_size_w=100, learning_rate=0.1,
    loss_type='sampled_softmax_loss', n_neg_samples=64, n_steps=100001,
    optimize='Adagrad', vocabulary_size=50000, window_size=8)

In [8]:
# Write the PV-DM (concat) model under models/pvdm_100/ and echo the value
# returned by save().
# NOTE(review): this cell's execution count is lower than that of the training
# cell above, so the checkpoint on disk may predate the model shown there —
# re-run after training to be sure.
pvdm_ckpt_path = pvdm.save('models/pvdm_100/')
print(pvdm_ckpt_path)

Model saved in file: models/pvdm_100/model.ckpt
models/pvdm_100/model.ckpt


In [28]:
# PV-DM model; with concat=False the word vectors and the document vector are
# not concatenated. All other settings match the concat=True model.
pvdm2_config = dict(
    architecture='pvdm',
    concat=False,
    # batching / context window
    batch_size=128,
    window_size=8,
    # embedding sizes for words and documents
    embedding_size_w=100,
    embedding_size_d=100,
    # vocabulary cap and number of documents (one per tokenized summary)
    vocabulary_size=50000,
    document_size=len(alldocs),
    # loss and optimizer settings
    loss_type='sampled_softmax_loss',
    n_neg_samples=64,
    optimize='Adagrad',
    learning_rate=0.025,
    n_steps=5001,
)
pvdm2 = Doc2Vec(**pvdm2_config)
pvdm2.fit(alldocs)

Initialized
Average loss at step 0: 8.556091
Average loss at step 2000: 4.576419
Average loss at step 4000: 4.149445


Doc2Vec(architecture='pvdm', batch_size=128, concat=False,
    document_size=31905, embedding_size_d=100, embedding_size_w=100,
    learning_rate=0.025, loss_type='sampled_softmax_loss',
    n_neg_samples=64, n_steps=5001, optimize='Adagrad',
    vocabulary_size=50000, window_size=8)

In [10]:
# Write the PV-DM (no concat) model under models/pvdm2_100/ and echo the value
# returned by save().
# NOTE(review): this cell's execution count is lower than that of the training
# cell above, so the checkpoint on disk may predate the model shown there —
# re-run after training to be sure.
pvdm2_ckpt_path = pvdm2.save('models/pvdm2_100/')
print(pvdm2_ckpt_path)

Model saved in file: models/pvdm2_100/model.ckpt
models/pvdm2_100/model.ckpt


In [29]:
# Feature matrix for the labeled GSEs: the document vectors of both PV-DM
# models placed side by side.
# Assumes row k of each model's doc_embeddings belongs to alldocs[k], i.e. to
# the k-th row of `df` — TODO confirm against Doc2Vec.
doc_vectors = np.hstack((pvdm.doc_embeddings, pvdm2.doc_embeddings))

# Look up each labeled GSE's row position in `df`, in df_labeled order.
# A boolean membership mask would return rows in `df` order instead, which
# silently misaligns features with df_labeled['label_code'] / ['split']
# whenever the two frames are ordered differently.
labeled_positions = df.index.get_indexer(df_labeled.index)
assert (labeled_positions >= 0).all(), 'some labeled GSE ids are missing from df'

embedding_mat = doc_vectors[labeled_positions]
print(embedding_mat.shape)

(1785, 200)


In [32]:
# Class codes of the labeled GSEs, in df_labeled row order.
y = df_labeled['label_code'].values

# DNN classifier with two hidden layers (100 and 10 units) over 3 classes.
clf = learn.TensorFlowDNNClassifier(
    hidden_units=[100, 10],
    n_classes=3,
    steps=500,
    learning_rate=0.05,
    batch_size=128,
)

# Numbers left here by the author without explanation — presumably scores
# from earlier runs; metric and configuration are not stated:
# 0.784892
# 0.637533

# Evaluate using the split assignment stored in df_labeled['split'], then
# average every metric column of the returned scores.
# NOTE(review): evaluate_clf is presumably provided by `from utils import *`.
scores = evaluate_clf(clf, embedding_mat, y, df_labeled['split'])
mean_scores = scores.mean(axis=0)
print(mean_scores)

Step #100, epoch #10, avg. train loss: 1.03064
Step #200, epoch #20, avg. train loss: 0.89497
Step #300, epoch #30, avg. train loss: 0.86816
Step #400, epoch #40, avg. train loss: 0.84719
Step #500, epoch #50, avg. train loss: 0.82077
Step #100, epoch #10, avg. train loss: 1.03507
Step #200, epoch #20, avg. train loss: 0.90456
Step #300, epoch #30, avg. train loss: 0.88563
Step #400, epoch #40, avg. train loss: 0.86475
Step #500, epoch #50, avg. train loss: 0.84810
Step #100, epoch #10, avg. train loss: 1.03011
Step #200, epoch #20, avg. train loss: 0.90006
Step #300, epoch #30, avg. train loss: 0.87678
Step #400, epoch #40, avg. train loss: 0.86105
Step #500, epoch #50, avg. train loss: 0.83865
f1          0.459738
accuracy    0.581510
logloss     0.989088
dtype: float64
