In [1]:
import cPickle as pickle
import numpy as np
import pandas as pd
RNG = 2016

# for language processing
from nltk.tokenize import (RegexpTokenizer, WordPunctTokenizer, TreebankWordTokenizer)
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from wordcloud import WordCloud

# for classification
from sklearn.metrics import (f1_score, log_loss, accuracy_score)
from sklearn import (naive_bayes, ensemble, svm)
import xgboost as xgb

from sklearn.pipeline import Pipeline

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

from utils import evaluate_clf

In [2]:
# Read the labeled GSE table, index it by the `id` column, and replace every
# missing value with an empty string.
labeled_csv = 'data/Labeled_GSEs_texts_with_labels.csv'
df = pd.read_csv(labeled_csv).set_index('id').fillna('')
print(df.shape)
df.head()

(1785, 5)


Unnamed: 0_level_0,Series_summary,Series_title,label,label_code,split
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSE1001,Sprague-Dawley rat retina post-injury and cont...,retina injury timecourse,dz,1,0
GSE10064,This study aims to determine if global gene ex...,Gene expression in immortalized B-lymphocytes ...,dz,1,0
GSE10082,Conventional biochemical and molecular techniq...,Aryl Hydrocarbon Receptor Regulates Distinct D...,gene,2,0
GSE1009,Gene expression profiling in glomeruli from hu...,Diabetic nephropathy,dz,1,0
GSE1010,RNA samples prepared from lymphoblastic cells ...,FCHL study,dz,1,0


# Natural Language Processing Basics

## 1. Text normalizations

###  1.1. Word Tokenization: segmenting words in running text
+ **Token**: a word, phrase, symbol, or other meaningful elements in a running text.
+ Tokenization is not a trivial problem:
    + Finland's capital
    + What're, I'm, isn't
    + state-of-the-art
    + Lowercase
    + New York
    + Ph.D
+ Other languages are even harder!
    + **French**: l'ensemble -> un ensemble
    + **German**: Rindfleischetikettierungsueberwachungsaufgabenuebertragungsgesetz, meaning "law delegating beef label monitoring", [the longest German word](http://www.bbc.com/news/world-europe-22762040).
    + **Chinese**: 自然語言處理是人工智慧和語言學領域的分支學科   
    `自然 語言 處理 是 人工智慧 和 語言學 領域 的 分支 學科`   
    `Natural language processing is artificial intelligence and linguistics 's branch field`

---

Demonstration of different tokenization algorithms:

In [None]:
example_text = """
Finland's capital, what're, I'm, state-of-the-art San Francisco Ph.D post-injury
"""

# Tokenizers to compare on the same sentence.
candidate_tokenizers = (
    # regex tokenizer: keeps runs of two or more word characters
    RegexpTokenizer(r"(?u)\b\w\w+\b"),
    WordPunctTokenizer(),
    TreebankWordTokenizer(),
)

# For each tokenizer, print a separator, the tokenizer itself, and its tokens.
for tokenizer in candidate_tokenizers:
    print('-' * 10)
    print(tokenizer)
    print(tokenizer.tokenize(example_text))


### 1.2. Word normalization
+ **Lemma**: the canonical (dictionary) form shared by a set of inflected word forms, e.g. `dog` is the lemma of both `dog` and `dogs`.
+ **Lemmatization**: reduce the variant forms to the base form
    + dogs -> dog
    + am, are, is -> be
+ **Stem**: the core meaning-bearing unit of a word
+ **Stemming**: reduce terms to their stems, a simpler form of Lemmatization.
    + automates, automatic, automation -> automat

+ Differences between **Lemmatization** and **Stemming** [(to read more)](http://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)   
**Stemming** returns the stems of words.    
**Lemmatization** returns the dictionary form of a word.

---

Demonstration of stemming and Lemmatization:

In [None]:
# Regex tokenizer: keeps runs of two or more word characters, so punctuation
# and single-character tokens are dropped (this is not a whitespace split).
tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")

stemmer = PorterStemmer()
lmmr = WordNetLemmatizer()

# Use the second row's summary as the worked example. `.iloc` makes the
# positional lookup explicit: the frame is indexed by `id`, so the integer key
# previously passed to the deprecated `.ix` indexer only resolved through its
# positional fallback.
doc = df.iloc[1]['Series_summary']
print('Original document: \n"%s"' % doc)

# Split the summary into tokens.
tokens = tokenizer.tokenize(doc)
print('\nAfter tokenizing: %s' % (tokens,))

# Reduce every token to its Porter stem.
stems = [stemmer.stem(t) for t in tokens]
print('\nAfter stemming: %s' % (stems,))

# Map every token to its WordNet lemma (no part-of-speech tag is supplied).
lemmas = [lmmr.lemmatize(t) for t in tokens]
print('\nAfter lemmatization: %s' % (lemmas,))

In [None]:
def preprocess_func(x):
    """Tokenize `x` with the notebook-level `tokenizer` and re-join the tokens with single spaces."""
    return ' '.join(tokenizer.tokenize(x))


# Replace each summary with its normalized, space-joined token string.
df['Series_summary'] = df['Series_summary'].apply(preprocess_func)

## Bag-of-words

We use [`CountVectorizer`](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to count the words in each document and generate a sparse word count matrix with the shape = (n_documents, n_tokens)

In [None]:
# Binary bag-of-words: each matrix entry records whether a term occurs in a document.
binary_bow_options = dict(
    min_df=1,
    max_df=0.8,  # ignore terms found in more than 80% of the documents
    max_features=None,
    strip_accents='unicode',
    decode_error='ignore',
    lowercase=True,
    tokenizer=None,
    analyzer='word',
    ngram_range=(1, 2),  # keep both uni-grams and bi-grams
    binary=True,  # presence/absence indicators instead of raw counts
    stop_words='english',
)
ctvec = CountVectorizer(**binary_bow_options)

# Fit the vocabulary on the summaries and build the document-term matrix.
X = ctvec.fit_transform(df['Series_summary'])
print(X.shape)

In [None]:
# Target vector: the integer label codes from the labeled frame.
y = df['label_code'].values

# Bernoulli naive Bayes is paired here with the binary bag-of-words matrix.
clf = naive_bayes.BernoulliNB()

# Evaluate with the project helper, then average its result over axis 0.
scores = evaluate_clf(clf, X, y, df['split'])
print(scores.mean(axis=0))

In [None]:
# Count-based bag-of-words: each matrix entry is the number of occurrences of a term.
count_bow_options = dict(
    min_df=1,
    max_df=1000,  # integer threshold: ignore terms found in more than 1000 documents
    max_features=None,
    strip_accents='unicode',
    decode_error='ignore',
    lowercase=True,
    tokenizer=None,
    analyzer='word',
    ngram_range=(1, 2),  # keep both uni-grams and bi-grams
    binary=False,  # raw term counts rather than presence/absence
    stop_words='english',
)
ctvec = CountVectorizer(**count_bow_options)

# Fit the vocabulary on the summaries and build the document-term matrix.
X = ctvec.fit_transform(df['Series_summary'])
print(X.shape)

In [None]:
# Multinomial naive Bayes is paired here with the term-count matrix.
clf = naive_bayes.MultinomialNB()

# Evaluate with the project helper, then average its result over axis 0.
nb_split_column = df['split']
scores = evaluate_clf(clf, X, y, nb_split_column)
print(scores.mean(axis=0))

## Latent semantic analysis


In [None]:
# TF-IDF weighting over uni-grams and bi-grams, with sublinear term-frequency
# scaling and smoothed inverse document frequencies.
tfidf = TfidfVectorizer(
    min_df=1,
    max_df=1000,
    max_features=None,
    strip_accents='unicode',
    decode_error='ignore',
    analyzer='word',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# Truncated SVD projects the TF-IDF matrix onto 90 components; the random
# state is pinned to RNG for reproducibility.
svd = TruncatedSVD(
    n_components=90,
    algorithm='randomized',
    n_iter=5,
    random_state=RNG,
    tol=0.0,
)

# Chain both steps so a single fit_transform yields the reduced matrix.
lsa_steps = [('tfidf', tfidf), ('svd', svd)]
pipeline = Pipeline(lsa_steps)

X = pipeline.fit_transform(df['Series_summary'])
print(X.shape)

In [None]:
# Gradient-boosted trees trained on the current feature matrix X.
xgb_options = dict(
    n_estimators=1000,
    colsample_bytree=1,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    min_child_weight=1,
    seed=RNG,
    nthread=4,
    silent=0,
)
clf = xgb.XGBClassifier(**xgb_options)

# Evaluate with the project helper, then average its result over axis 0.
scores = evaluate_clf(clf, X, y, df['split'])
print(scores.mean(axis=0))

In [None]:
# Three further classifiers to compare on the current feature matrix X.
rf = ensemble.RandomForestClassifier(n_estimators=2000, criterion='entropy',
                                     max_depth=None, random_state=RNG, n_jobs=4, verbose=0)
et = ensemble.ExtraTreesClassifier(n_estimators=1500, criterion='entropy', max_depth=None,
                                   random_state=RNG, n_jobs=4, verbose=0)
svc = svm.SVC(C=100, kernel='rbf', probability=True, random_state=RNG, verbose=0)

for clf in [rf, et, svc]:
    # Evaluate the classifier of this iteration. The loop previously passed
    # `rf` every time, so the random forest's scores were reported under all
    # three model headings.
    scores = evaluate_clf(clf, X, y, df['split'])
    print(clf)
    # NOTE(review): evaluate_clf comes from utils; its result is averaged over
    # axis 0 here, presumably one row of metrics per split -- confirm.
    print(scores.mean(axis=0))