In [1]:
# Here I am using gensim for learning the word embeddings from the EUADR corpus
# This is just a rough start to learn 
import gensim
import logging
import os
import pandas as pd

In [2]:
# start the log
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

In [3]:
# get the current working directory
data_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Datasets'))+'/'

# Although the dataset says csv, it is tab delimited. In addition to this, they have severe codels problems. 
# So best to parse throught codes first. 
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte

#open for reading with "universal" type set

import codecs

doc_d_t = codecs.open(data_path+'EUADR_Corpus_IBIgroup/'+'EUADR_drug_target'+'.csv','rU','UTF-8') 
EUADR_drug_target = pd.read_csv(doc_d_t, sep='\t', na_filter = False)
EUADR_drug_target['CLASS'] = 'drug_gene'

doc_t_d = codecs.open(data_path+'EUADR_Corpus_IBIgroup/'+'EUADR_target_disease'+'.csv','rU','UTF-8',errors='ignore') 
EUADR_target_disease = pd.read_csv(doc_t_d, sep='\t', na_filter = False)
EUADR_target_disease['CLASS'] = 'gene_disease'
       
doc_d_d = codecs.open(data_path+'EUADR_Corpus_IBIgroup/'+'EUADR_drug_disease'+'.csv','rU','UTF-8')                       
EUADR_drug_disease = pd.read_csv(doc_d_d, sep='\t', na_filter = False)
EUADR_drug_disease['CLASS'] = 'drug_disease'

logging.info("Done reading data files")

2019-07-04 16:53:29,542 : INFO : Done reading data files


In [4]:
dataset =  EUADR_drug_target.append(EUADR_target_disease).append(EUADR_drug_disease)


In [5]:
dataset.head(10)

Unnamed: 0,ASSOCIATION_TYPE,PMID,NUM_SENTENCE,ENTITY1_TEXT,ENTITY1_INI,ENTITY1_END,ENTITY1_TYPE,ENTITY2_TEXT,ENTITY2_INI,ENTITY2_END,ENTITY2_TYPE,SENTENCE,CLASS
0,PA,17938326,0,ABCG2,71,76,Genes & Molecular Sequences,doxorubicin,43,54,Chemicals & Drugs,Gefitinib inhibition of drug resistance to dox...,drug_gene
1,PA,17938326,5,EGFR,16,20,Genes & Molecular Sequences,gefitinib,31,40,Chemicals & Drugs,Inactivation of EGFR kinase by gefitinib was a...,drug_gene
2,PA,17938326,0,ABCG2,71,76,Genes & Molecular Sequences,Gefitinib,0,9,Chemicals & Drugs,Gefitinib inhibition of drug resistance to dox...,drug_gene
3,PA,17938326,2,ABCG2 drug transporter,161,183,Genes & Molecular Sequences,doxorubicin,140,151,Chemicals & Drugs,Extrusion assays using flow cytometry analysis...,drug_gene
4,PA,17938326,12,EGFR,20,24,Genes & Molecular Sequences,gefitinib,35,44,Chemicals & Drugs,Inactivation of the EGFR kinase by gefitinib p...,drug_gene
5,PA,17938326,8,ABCG2 drug transporter,80,102,Genes & Molecular Sequences,doxorubicin,173,184,Chemicals & Drugs,Inhibition of EGFR kinase activity by gefitini...,drug_gene
6,PA,17938326,6,ABCG2,128,133,Genes & Molecular Sequences,doxorubicin,172,183,Chemicals & Drugs,A terminal deoxynucleotidyl transferase-mediat...,drug_gene
7,PA,17938326,9,ABCG2,126,131,Genes & Molecular Sequences,doxorubicin,93,104,Chemicals & Drugs,Both ARO and WRO demonstrated differential ABC...,drug_gene
8,PA,17938326,12,EGFR,20,24,Genes & Molecular Sequences,doxorubicin,173,184,Chemicals & Drugs,Inactivation of the EGFR kinase by gefitinib p...,drug_gene
9,PA,17938326,8,EGFR,14,18,Genes & Molecular Sequences,doxorubicin,173,184,Chemicals & Drugs,Inhibition of EGFR kinase activity by gefitini...,drug_gene


In [6]:
import nltk

from nltk.tokenize import word_tokenize

# use boolean flags to customize function
def tokenise_text(text):

    # Step 2: Tokenize
    output = word_tokenize(text)
    return(output)

In [7]:
dataset['SENTENCE_normalised'] = [tokenise_text(article) for article in dataset['SENTENCE']]

In [8]:
# convert text into doc2vec input format i.e. tuple containing list of tokens and index for each doc
from gensim.models.doc2vec import TaggedDocument

def doc_tagger(data, tokens_col_name, class_col_name):
    return(data.apply(lambda r: TaggedDocument(words=r[tokens_col_name], tags=[r[class_col_name]]), axis=1))


# Specify parameters for doc2vec: DBOW with word2vec training
from gensim.models.doc2vec import Doc2Vec

def train_doc2vec_model(corpus, vec_size, window, n_epochs, n_cpu):
    
    docs = corpus.values
    doc2vec_model = Doc2Vec(docs, vector_size=vec_size, window=window, epochs=n_epochs, dm=0, dbow_words=1, workers=n_cpu)
    
    return(doc2vec_model)

def learn_vectors(doc2vec_model, corpus):
    
    docs = corpus.values
    labels, feats = zip(*[(doc.tags[0], doc2vec_model.infer_vector(doc[0], steps=20)) for doc in docs])
    
    return(feats, labels)

In [9]:
train_tagged = doc_tagger(dataset, 'SENTENCE_normalised', 'CLASS')

In [39]:
import multiprocessing

doc2vec_model = train_doc2vec_model(train_tagged, vec_size=10, window=10, n_epochs=20, n_cpu=multiprocessing.cpu_count())

2019-07-04 17:04:05,081 : INFO : collecting all words and their counts
2019-07-04 17:04:05,084 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-07-04 17:04:05,098 : INFO : collected 3311 word types and 3 unique tags from a corpus of 846 examples and 27842 words
2019-07-04 17:04:05,099 : INFO : Loading a fresh vocabulary
2019-07-04 17:04:05,103 : INFO : min_count=5 retains 1048 unique words (31% of original 3311, drops 2263)
2019-07-04 17:04:05,105 : INFO : min_count=5 leaves 23239 word corpus (83% of original 27842, drops 4603)
2019-07-04 17:04:05,108 : INFO : deleting the raw counts dictionary of 3311 items
2019-07-04 17:04:05,109 : INFO : sample=0.001 downsamples 45 most-common words
2019-07-04 17:04:05,110 : INFO : downsampling leaves estimated 15972 word corpus (68.7% of prior 23239)
2019-07-04 17:04:05,112 : INFO : estimated required memory for 1048 words and 10 dimensions: 608560 bytes
2019-07-04 17:04:05,113 : INFO : resetting layer weights
20

2019-07-04 17:04:05,698 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-04 17:04:05,703 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-04 17:04:05,705 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-04 17:04:05,709 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-04 17:04:05,711 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-04 17:04:05,713 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-04 17:04:05,790 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-04 17:04:05,800 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-04 17:04:05,811 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-04 17:04:05,813 : INFO : EPOCH - 5 : training on 27842 raw words (16808 effective words) took 0.1s, 126226 effective words/s
2019-07-04 17:04:05,824 : INFO : worker thread f

2019-07-04 17:04:06,460 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-04 17:04:06,464 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-04 17:04:06,467 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-04 17:04:06,563 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-04 17:04:06,587 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-04 17:04:06,590 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-04 17:04:06,591 : INFO : EPOCH - 10 : training on 27842 raw words (16866 effective words) took 0.2s, 100844 effective words/s
2019-07-04 17:04:06,605 : INFO : worker thread finished; awaiting finish of 15 more threads
2019-07-04 17:04:06,608 : INFO : worker thread finished; awaiting finish of 14 more threads
2019-07-04 17:04:06,612 : INFO : worker thread finished; awaiting finish of 13 more threads
2019-07-04 17:04:06,614 : INFO : worker thre

2019-07-04 17:04:07,308 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-04 17:04:07,325 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-04 17:04:07,331 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-04 17:04:07,332 : INFO : EPOCH - 15 : training on 27842 raw words (16816 effective words) took 0.1s, 123470 effective words/s
2019-07-04 17:04:07,344 : INFO : worker thread finished; awaiting finish of 15 more threads
2019-07-04 17:04:07,348 : INFO : worker thread finished; awaiting finish of 14 more threads
2019-07-04 17:04:07,353 : INFO : worker thread finished; awaiting finish of 13 more threads
2019-07-04 17:04:07,357 : INFO : worker thread finished; awaiting finish of 12 more threads
2019-07-04 17:04:07,360 : INFO : worker thread finished; awaiting finish of 11 more threads
2019-07-04 17:04:07,361 : INFO : worker thread finished; awaiting finish of 10 more threads
2019-07-04 17:04:07,363 : INFO : worker t

2019-07-04 17:04:08,035 : INFO : EPOCH - 20 : training on 27842 raw words (16863 effective words) took 0.1s, 134473 effective words/s
2019-07-04 17:04:08,036 : INFO : training on a 556840 raw words (336357 effective words) took 2.9s, 115444 effective words/s


In [40]:
doc2vec_model.wv.most_similar('MDR1', topn=10)

2019-07-04 17:04:09,768 : INFO : precomputing L2-norms of word weight vectors


[('adults', 0.9315659999847412),
 ('Previous', 0.9239931106567383),
 ('systemic', 0.9200870990753174),
 ('Recently', 0.9191992282867432),
 ('a', 0.9117146730422974),
 ('TNFSF4', 0.9076061844825745),
 ('genome-wide', 0.9035455584526062),
 ('HIV-1', 0.8988310694694519),
 ('COL1A2', 0.8981502652168274),
 ('there', 0.8970949649810791)]

In [44]:
# from scipy import spatial

sentence = 'However, the majority of colon cancer cells have deregulation of the Wnt/beta-catenin pathway.'

inferred_vector=doc2vec_model.infer_vector(sentence.split())

# print (1-  spatial.distance.cosine(inferred_vector,doc2vec_model.docvecs['SA']))
sims = doc2vec_model.docvecs.most_similar([inferred_vector], topn=len(doc2vec_model.docvecs))
sims

[('drug_gene', 0.8196185231208801),
 ('drug_disease', 0.5184629559516907),
 ('gene_disease', 0.4750712513923645)]