In [1]:
import cPickle as pickle
import numpy as np
import pandas as pd
RNG = 2016

# for language processing
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from wordcloud import WordCloud

# for classification
from sklearn.metrics import (f1_score, log_loss, accuracy_score)
from sklearn import (naive_bayes, ensemble, svm)
import xgboost as xgb

from sklearn.pipeline import Pipeline

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

from utils import evaluate_clf

In [2]:
# load the labeled df
df = pd.read_csv('data/Labeled_GSEs_texts_with_labels.csv').set_index('id')
df = df.fillna('')
print df.shape
df.head()

(1785, 5)


Unnamed: 0_level_0,Series_summary,Series_title,label,label_code,split
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE1001,Sprague-Dawley rat retina post-injury and cont...,retina injury timecourse,dz,1,0
GSE10064,This study aims to determine if global gene ex...,Gene expression in immortalized B-lymphocytes ...,dz,1,0
GSE10082,Conventional biochemical and molecular techniq...,Aryl Hydrocarbon Receptor Regulates Distinct D...,gene,2,0
GSE1009,Gene expression profiling in glomeruli from hu...,Diabetic nephropathy,dz,1,0
GSE1010,RNA samples prepared from lymphoblastic cells ...,FCHL study,dz,1,0


In [3]:
# tokenize texts 
tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")
stemmer = PorterStemmer()

doc = df.ix[1]['Series_summary']
print 'Original document: "%s"' % doc

tokens = tokenizer.tokenize(doc)
print '\nAfter tokenizing:', tokens

stems = [stemmer.stem(t) for t in tokens]
print '\nAfter stemming:',  stems

Original document: "This study aims to determine if global gene expression and transcription factor networks in B-lympocytes of siblings with MS were different from healthy siblings. Keywords: Multiple sclerosis, sibling comparisons"

After tokenizing: ['This', 'study', 'aims', 'to', 'determine', 'if', 'global', 'gene', 'expression', 'and', 'transcription', 'factor', 'networks', 'in', 'lympocytes', 'of', 'siblings', 'with', 'MS', 'were', 'different', 'from', 'healthy', 'siblings', 'Keywords', 'Multiple', 'sclerosis', 'sibling', 'comparisons']

After stemming: [u'Thi', u'studi', u'aim', u'to', u'determin', u'if', u'global', u'gene', u'express', u'and', u'transcript', u'factor', u'network', u'in', u'lympocyt', u'of', u'sibl', u'with', u'MS', u'were', u'differ', u'from', u'healthi', u'sibl', u'Keyword', u'Multipl', u'sclerosi', u'sibl', u'comparison']


In [4]:
preprocess_func = lambda x: ' '.join( tokenizer.tokenize(x) )
df['Series_summary'] = df['Series_summary'].apply(preprocess_func)

## Bag-of-words

We use [`CountVectorizer`](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to count the words in each document and generate a sparse word count matrix with the shape = (n_documents, n_tokens)

In [5]:
# count tokens 
ctvec = CountVectorizer(min_df=1,
                        max_df=0.8, # max document frequency, words with higher frequency than this will be ignored  
                        max_features=None, 
                        strip_accents='unicode', 
                        decode_error='ignore',
                        lowercase=True,
                        tokenizer=None,
                        analyzer='word', 
                        ngram_range=(1, 2), # only keep uni-grams
                        binary=True, # whether to return binary numbers or word counts
                        stop_words='english')

X = ctvec.fit_transform(df['Series_summary'])
print X.shape

(1785, 115703)


In [6]:
clf = naive_bayes.BernoulliNB()
y = df['label_code'].values

scores = evaluate_clf(clf, X, y, df['split'])
print scores.mean(axis=0)

f1           0.476865
accuracy     0.612329
logloss     13.248159
dtype: float64


  'precision', 'predicted', average, warn_for)


In [7]:
# count tokens 
ctvec = CountVectorizer(min_df=1,
                        max_df=1000, # max document frequency, words with higher frequency than this will be ignored  
                        max_features=None, 
                        strip_accents='unicode', 
                        decode_error='ignore',
                        lowercase=True,
                        tokenizer=None,
                        analyzer='word', 
                        ngram_range=(1, 2), # only keep uni-grams
                        binary=False, # whether to return binary numbers or word counts
                        stop_words='english')

X = ctvec.fit_transform(df['Series_summary'])
print X.shape

(1785, 115701)


In [8]:
clf = naive_bayes.MultinomialNB()

scores = evaluate_clf(clf, X, y, df['split'])
print scores.mean(axis=0)

f1          0.795405
accuracy    0.807280
logloss     3.095285
dtype: float64


## Latent semantic analysis


In [9]:
tfidf = TfidfVectorizer(min_df=1, max_df=1000, 
                        max_features=None, strip_accents='unicode', 
                        decode_error='ignore',
                        analyzer='word', 
                        ngram_range=(1, 2), 
                        use_idf=True, smooth_idf=True, 
                        sublinear_tf=True, stop_words = 'english')
svd = TruncatedSVD(n_components=90, algorithm='randomized', n_iter=5, random_state=RNG, tol=0.0)

pipeline = Pipeline([
        ('tfidf', tfidf),
        ('svd', svd)
    ])

X = pipeline.fit_transform(df['Series_summary'])
print X.shape

(1785, 90)


In [10]:
clf = xgb.XGBClassifier(n_estimators=1000, colsample_bytree=1, 
                         learning_rate=0.05, max_depth=8, subsample=0.9, 
                         min_child_weight=1, seed=RNG, nthread=4, silent=0)

scores = evaluate_clf(clf, X, y, df['split'])
print scores.mean(axis=0)

f1          0.809426
accuracy    0.816796
logloss     0.608025
dtype: float64


In [11]:
rf = ensemble.RandomForestClassifier(n_estimators=2000, criterion='entropy', 
                                     max_depth=None, random_state=RNG, n_jobs=4, verbose=0)
et = ensemble.ExtraTreesClassifier(n_estimators=1500, criterion='entropy', max_depth=None, 
                                   random_state=RNG, n_jobs=4, verbose=0)
svc = svm.SVC(C=100, kernel='rbf', probability=True, random_state=RNG, verbose=0)

for clf in [rf, et, svc]:
    scores = evaluate_clf(rf, X, y, df['split'])
    print clf
    print scores.mean(axis=0)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=4,
            oob_score=False, random_state=2016, verbose=0,
            warm_start=False)
f1          0.751212
accuracy    0.777590
logloss     0.617417
dtype: float64
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=4,
           oob_score=False, random_state=2016, verbose=0, warm_start=False)
f1          0.751212
accuracy    0.777590
logloss     0.617417
dtype: float64
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter