In [1]:
# Here I am using gensim to learn the word embeddings from the EUADR corpus.
# This is only a rough starting point.
import gensim
import logging
import os
import pandas as pd

In [2]:
# Configure the root logger: timestamped records at INFO level and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

In [3]:
# Resolve the sibling 'Datasets' directory. Note that '__file__' is a string
# literal here, so os.path.dirname() returns '' and the path is effectively
# '../Datasets' relative to the current working directory.
data_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Datasets'))+'/'

# Although the files are named .csv they are tab-delimited. They also have
# encoding problems, e.g.
#   UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte
# so each file is opened through codecs with an explicit encoding before it is
# handed to pandas.

import codecs

corpus_dir = data_path+'EUADR_Corpus_IBIgroup/'

# Mode 'r' replaces the former 'rU': the 'U' flag is deprecated and rejected by
# open() from Python 3.11 on. Each handle is opened in a `with` block so it is
# closed as soon as pandas has consumed it.
with codecs.open(corpus_dir+'EUADR_drug_target'+'.csv','r','UTF-8') as doc_d_t:
    EUADR_drug_target = pd.read_csv(doc_d_t, sep='\t', na_filter = False)
EUADR_drug_target['CLASS'] = 'drug_gene'

# errors='ignore' silently drops bytes that are not valid UTF-8 in this file.
with codecs.open(corpus_dir+'EUADR_target_disease'+'.csv','r','UTF-8',errors='ignore') as doc_t_d:
    EUADR_target_disease = pd.read_csv(doc_t_d, sep='\t', na_filter = False)
EUADR_target_disease['CLASS'] = 'gene_disease'

with codecs.open(corpus_dir+'EUADR_drug_disease'+'.csv','r','UTF-8') as doc_d_d:
    EUADR_drug_disease = pd.read_csv(doc_d_d, sep='\t', na_filter = False)
EUADR_drug_disease['CLASS'] = 'drug_disease'

logging.info("Done reading data files")

2019-07-04 17:27:12,394 : INFO : Done reading data files


In [4]:
# Stack the three relation tables into a single frame. pd.concat replaces the
# chained DataFrame.append calls (append was deprecated in pandas 1.4 and
# removed in 2.0). As with append's default, the original row labels are kept,
# so the resulting index is not unique.
FO_data = pd.concat([EUADR_drug_target, EUADR_target_disease, EUADR_drug_disease])


In [5]:
import nltk
# Fetch the NLTK resources needed by the tokenizer and the stopword filter.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# use boolean flags to customize function
def normalize_text(text, lowercase=True, punctuation=True, prune_stopwords=True, stem=True, join=True):
    """Tokenize ``text`` and apply the clean-up steps selected by the flags.

    Parameters
    ----------
    text : str
        Raw text to normalize.
    lowercase : bool
        Lower-case every token.
    punctuation : bool
        Keep only purely alphabetic tokens; this drops punctuation, numbers
        and any token mixing letters with other characters.
    prune_stopwords : bool
        Drop tokens found in NLTK's English stopword list.
    stem : bool
        Replace every token by its Porter stem.
    join : bool
        Return one space-joined string instead of a list of tokens.

    Returns
    -------
    str or list of str
        A single string when ``join`` is true, otherwise the token list.
    """
    # Split hyphenated words first so their parts survive the alphabetic
    # filter below (e.g. "London-based" -> "London", "based").
    tokens = word_tokenize(text.replace('-', ' '))

    if lowercase:
        tokens = [token.lower() for token in tokens]

    if punctuation:
        tokens = [token for token in tokens if token.isalpha()]

    if prune_stopwords:
        stop = set(stopwords.words('english'))
        # The stopword list is lower-case; compare on the lower-cased token so
        # capitalised stopwords ("The") are also removed when lowercase=False.
        tokens = [token for token in tokens if token.lower() not in stop]

    if stem:
        porter = PorterStemmer()
        tokens = [porter.stem(token) for token in tokens]

    if join:
        return ' '.join(tokens)
    return tokens

[nltk_data] Downloading package punkt to /home/stirunag/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/stirunag/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Tokenize every sentence into a list of lower-cased, alphabetic, non-stopword
# tokens (no stemming, tokens kept as a list rather than joined).
FO_data['article_normalized'] = list(map(
    lambda sentence: normalize_text(sentence, lowercase=True, punctuation=True, prune_stopwords=True, stem=False, join=False),
    FO_data['SENTENCE'],
))

In [7]:
# convert text into doc2vec input format, i.e. a TaggedDocument holding the list of tokens and a tag (here the class label) for each doc
from gensim.models.doc2vec import TaggedDocument

def doc_tagger(data, tokens_col_name, class_col_name):
    """Wrap every row of ``data`` in a ``TaggedDocument``.

    The row's ``tokens_col_name`` value becomes the document's words and its
    ``class_col_name`` value becomes the document's single tag.

    Returns the result of applying that wrapping row by row (``axis=1``).
    """
    def _tag_row(row):
        return TaggedDocument(words=row[tokens_col_name], tags=[row[class_col_name]])

    return data.apply(_tag_row, axis=1)


# Specify parameters for doc2vec: DBOW with word2vec training
from gensim.models.doc2vec import Doc2Vec

def train_doc2vec_model(corpus, vec_size, window, n_epochs, n_cpu):
    """Build a Doc2Vec model from ``corpus.values`` and return it.

    Parameters
    ----------
    corpus : object exposing ``.values``
        The tagged documents to train on.
    vec_size : int
        Passed to Doc2Vec as ``vector_size``.
    window : int
        Passed to Doc2Vec as ``window``.
    n_epochs : int
        Passed to Doc2Vec as ``epochs``.
    n_cpu : int
        Passed to Doc2Vec as ``workers``.
    """
    return Doc2Vec(
        corpus.values,
        vector_size=vec_size,
        window=window,
        epochs=n_epochs,
        dm=0,          # DBOW ...
        dbow_words=1,  # ... with word2vec training alongside it
        workers=n_cpu,
    )

def learn_vectors(doc2vec_model, corpus):
    """Infer a vector for every tagged document in ``corpus``.

    Parameters
    ----------
    doc2vec_model : object with an ``infer_vector(words, epochs=...)`` method
        The trained model used for inference.
    corpus : object exposing ``.values``
        Iterable of documents that each have ``words`` and ``tags``.

    Returns
    -------
    (feats, labels) : tuple of tuples
        ``feats[i]`` is the vector inferred for document ``i`` and
        ``labels[i]`` is that document's first tag. Both are empty tuples
        when the corpus is empty.
    """
    feats = []
    labels = []
    for doc in corpus.values:
        labels.append(doc.tags[0])
        # `epochs` is the supported keyword; `steps` was only a deprecated
        # alias and is no longer accepted by gensim 4.
        feats.append(doc2vec_model.infer_vector(doc.words, epochs=20))

    return(tuple(feats), tuple(labels))

In [8]:
from sklearn import model_selection

# A fixed random_state makes the train/test partition identical on every run;
# without it each execution shuffles differently.
FO_train, FO_test = model_selection.train_test_split(FO_data, random_state=42)

In [9]:
# Tag both splits the same way: words from 'article_normalized', tag from 'CLASS'.
FO_train_tagged, FO_test_tagged = (
    doc_tagger(split, 'article_normalized', 'CLASS') for split in (FO_train, FO_test)
)

In [10]:
import multiprocessing

# Train on the tagged training split only, with one worker per reported CPU.
FO_doc2vec_model = train_doc2vec_model(
    FO_train_tagged,
    vec_size=100,
    window=10,
    n_epochs=20,
    n_cpu=multiprocessing.cpu_count(),
)

2019-07-04 17:27:14,165 : INFO : collecting all words and their counts
2019-07-04 17:27:14,167 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-07-04 17:27:14,174 : INFO : collected 2229 word types and 3 unique tags from a corpus of 634 examples and 10765 words
2019-07-04 17:27:14,176 : INFO : Loading a fresh vocabulary
2019-07-04 17:27:14,179 : INFO : min_count=5 retains 629 unique words (28% of original 2229, drops 1600)
2019-07-04 17:27:14,179 : INFO : min_count=5 leaves 7695 word corpus (71% of original 10765, drops 3070)
2019-07-04 17:27:14,182 : INFO : deleting the raw counts dictionary of 2229 items
2019-07-04 17:27:14,183 : INFO : sample=0.001 downsamples 86 most-common words
2019-07-04 17:27:14,184 : INFO : downsampling leaves estimated 6753 word corpus (87.8% of prior 7695)
2019-07-04 17:27:14,185 : INFO : estimated required memory for 629 words and 100 dimensions: 819500 bytes
2019-07-04 17:27:14,187 : INFO : resetting layer weights
2019-0

2019-07-04 17:27:14,652 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-04 17:27:14,655 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-04 17:27:14,659 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-04 17:27:14,664 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-04 17:27:14,666 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-04 17:27:14,670 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-04 17:27:14,677 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-04 17:27:14,722 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-04 17:27:14,723 : INFO : EPOCH - 5 : training on 10765 raw words (7388 effective words) took 0.1s, 74111 effective words/s
2019-07-04 17:27:14,733 : INFO : worker thread finished; awaiting finish of 15 more threads
2019-07-04 17:27:14,735 : INFO : worker thread fi

2019-07-04 17:27:15,145 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-04 17:27:15,150 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-04 17:27:15,153 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-04 17:27:15,160 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-04 17:27:15,213 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-04 17:27:15,214 : INFO : EPOCH - 10 : training on 10765 raw words (7382 effective words) took 0.1s, 75348 effective words/s
2019-07-04 17:27:15,227 : INFO : worker thread finished; awaiting finish of 15 more threads
2019-07-04 17:27:15,230 : INFO : worker thread finished; awaiting finish of 14 more threads
2019-07-04 17:27:15,232 : INFO : worker thread finished; awaiting finish of 13 more threads
2019-07-04 17:27:15,233 : INFO : worker thread finished; awaiting finish of 12 more threads
2019-07-04 17:27:15,235 : INFO : worker threa

2019-07-04 17:27:15,644 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-04 17:27:15,686 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-04 17:27:15,687 : INFO : EPOCH - 15 : training on 10765 raw words (7425 effective words) took 0.1s, 92790 effective words/s
2019-07-04 17:27:15,696 : INFO : worker thread finished; awaiting finish of 15 more threads
2019-07-04 17:27:15,698 : INFO : worker thread finished; awaiting finish of 14 more threads
2019-07-04 17:27:15,701 : INFO : worker thread finished; awaiting finish of 13 more threads
2019-07-04 17:27:15,704 : INFO : worker thread finished; awaiting finish of 12 more threads
2019-07-04 17:27:15,708 : INFO : worker thread finished; awaiting finish of 11 more threads
2019-07-04 17:27:15,711 : INFO : worker thread finished; awaiting finish of 10 more threads
2019-07-04 17:27:15,715 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-04 17:27:15,718 : INFO : worker thr

2019-07-04 17:27:16,150 : INFO : training on a 215300 raw words (147933 effective words) took 1.9s, 75898 effective words/s


In [11]:
# Turn each tagged split into (inferred vectors, labels) for classification.
(FO_train_X, FO_train_y), (FO_test_X, FO_test_y) = (
    learn_vectors(FO_doc2vec_model, tagged) for tagged in (FO_train_tagged, FO_test_tagged)
)

In [12]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Learn the label -> integer mapping from the training labels only.
FO_train_y_le = le.fit_transform(FO_train_y)
# Re-use that mapping for the test labels. Calling fit_transform here would
# re-fit the encoder on the test set, which yields different integer codes
# whenever the test split does not contain exactly the same set of classes.
FO_test_y_le = le.transform(FO_test_y)

In [30]:
from sklearn import metrics

# general classifier model function
def classifier(model, train_feats, train_labels, test_feats, test_labels):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Fitted in place on ``train_feats`` / ``train_labels``.
    test_feats, test_labels
        Held-out features and their true labels.

    Returns
    -------
    The value of ``metrics.accuracy_score`` for the fitted model's
    predictions on ``test_feats`` against ``test_labels``.
    """
    model.fit(train_feats, train_labels)
    return metrics.accuracy_score(test_labels, model.predict(test_feats))

In [31]:
import numpy as np
from sklearn.linear_model import LogisticRegression

# Test accuracy of a default LogisticRegression, as a percentage rounded to
# two decimals. The result is bound to the same name that is displayed below
# (it was previously assigned to `O_logreg_test_acc`, so the display line
# raised NameError on a fresh kernel).
FO_logreg_test_acc = np.round(100*classifier(LogisticRegression(), FO_train_X, FO_train_y_le, FO_test_X, FO_test_y_le), 2)

FO_logreg_test_acc



83.49

In [None]:
# Ten entries of the learned word vectors most similar to 'cancer'.
FO_doc2vec_model.wv.most_similar(positive=['cancer'], topn=10)

In [37]:
# Example sentence to classify with the trained pipeline.
text = 'On multivariate analysis the CYP3A5 A6986G genotype *3/*3 (OR 8.205, 95% CI 1.616-41.667, p = 0.011) and smaller number of treatment cycles (OR 0.156, 95% CI 0.037-0.659, p = 0.011) were independent factors for leukocytopenia (grade 3 or greater) throughout the period of chemotherapy.'
# Same normalization settings as were used for the corpus sentences.
text_normed = normalize_text(text, lowercase=True, punctuation=True, prune_stopwords=True, stem=False, join=False)
# `epochs` is the supported keyword; `steps` was only a deprecated alias and
# is no longer accepted by gensim 4.
vec = FO_doc2vec_model.infer_vector(text_normed, epochs=20)


In [32]:
from sklearn import metrics

# general classifier model function
def classifier(model, train_feats, train_labels, test_feats, test_labels):
    """Fit ``model`` on the training data and return the fitted model.

    NOTE(review): an earlier cell of this notebook defines a function with
    the same name that returns the test accuracy instead. Once this cell has
    run, ``classifier`` refers to this model-returning version, so re-running
    the earlier accuracy cell afterwards will not work as before.

    Parameters
    ----------
    model : estimator with ``fit``
        Fitted in place on ``train_feats`` / ``train_labels``.
    test_feats, test_labels
        Unused; accepted only so the signature stays unchanged. The test
        predictions and accuracy that used to be computed here were never
        returned or stored, so that dead work has been removed.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    model.fit(train_feats, train_labels)
    return model

# Keep the fitted logistic-regression model itself for later predictions.
model_lr = classifier(
    LogisticRegression(),
    FO_train_X, FO_train_y_le,
    FO_test_X, FO_test_y_le,
)




In [47]:
# Present the single inferred vector as a one-row 2-D array for the model.
query_row = vec.reshape(1, -1)
print(model_lr.predict_proba(query_row))

le.inverse_transform(model_lr.predict(query_row))

[[0.57649396 0.23467165 0.18883439]]


array(['drug_disease'], dtype='<U12')