In [1]:
# Use gensim to learn word embeddings from the EUADR corpus.
# This is only a rough first attempt.
import gensim
import logging
import os
import pandas as pd

In [2]:
# Configure the root logger: timestamped messages at INFO level and above
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

In [3]:
# Locate the Datasets directory, one level above the current working directory.
# (A notebook has no __file__, so the path is anchored at the kernel's cwd.)
data_path = os.path.abspath(os.path.join(os.pardir, 'Datasets'))+'/'

# Although the files are named .csv, they are tab delimited. In addition, they have
# severe encoding (codec) problems, so they are opened through `codecs` first, e.g.:
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte

import codecs


def _read_euadr_table(name, class_label, errors='strict'):
    """Read one tab-delimited EUADR corpus file and label its rows.

    Parameters
    ----------
    name : str
        File name without extension, looked up under
        ``data_path + 'EUADR_Corpus_IBIgroup/'``.
    class_label : str
        Value written to the new ``CLASS`` column for every row.
    errors : str
        Decoding error policy passed to ``codecs.open``
        (``'strict'`` raises on bad bytes, ``'ignore'`` drops them).

    Returns
    -------
    pandas.DataFrame
        The parsed table with an added ``CLASS`` column.
    """
    file_path = data_path+'EUADR_Corpus_IBIgroup/'+name+'.csv'
    # Plain 'r' replaces the deprecated 'rU' mode (rejected by open() from Python 3.11);
    # the context manager closes the handle as soon as pandas has consumed it.
    with codecs.open(file_path, 'r', 'UTF-8', errors=errors) as handle:
        table = pd.read_csv(handle, sep='\t', na_filter=False)
    table['CLASS'] = class_label
    return table


EUADR_drug_target = _read_euadr_table('EUADR_drug_target', 'drug_gene')

# Only this file is decoded with errors='ignore' (undecodable bytes are dropped);
# presumably it is the one that triggers the UnicodeDecodeError quoted above.
EUADR_target_disease = _read_euadr_table('EUADR_target_disease', 'gene_disease', errors='ignore')

EUADR_drug_disease = _read_euadr_table('EUADR_drug_disease', 'drug_disease')

logging.info("Done reading data files")

2019-07-04 16:53:29,542 : INFO : Done reading data files


In [4]:
# Stack the three relation tables into a single frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# Row labels are kept as they are (no ignore_index), exactly as the chained append calls did.
dataset = pd.concat([EUADR_drug_target, EUADR_target_disease, EUADR_drug_disease])


In [5]:
# Preview the first 10 rows of the combined dataset
dataset.head(10)

Unnamed: 0,ASSOCIATION_TYPE,PMID,NUM_SENTENCE,ENTITY1_TEXT,ENTITY1_INI,ENTITY1_END,ENTITY1_TYPE,ENTITY2_TEXT,ENTITY2_INI,ENTITY2_END,ENTITY2_TYPE,SENTENCE,CLASS
0,PA,17938326,0,ABCG2,71,76,Genes & Molecular Sequences,doxorubicin,43,54,Chemicals & Drugs,Gefitinib inhibition of drug resistance to dox...,drug_gene
1,PA,17938326,5,EGFR,16,20,Genes & Molecular Sequences,gefitinib,31,40,Chemicals & Drugs,Inactivation of EGFR kinase by gefitinib was a...,drug_gene
2,PA,17938326,0,ABCG2,71,76,Genes & Molecular Sequences,Gefitinib,0,9,Chemicals & Drugs,Gefitinib inhibition of drug resistance to dox...,drug_gene
3,PA,17938326,2,ABCG2 drug transporter,161,183,Genes & Molecular Sequences,doxorubicin,140,151,Chemicals & Drugs,Extrusion assays using flow cytometry analysis...,drug_gene
4,PA,17938326,12,EGFR,20,24,Genes & Molecular Sequences,gefitinib,35,44,Chemicals & Drugs,Inactivation of the EGFR kinase by gefitinib p...,drug_gene
5,PA,17938326,8,ABCG2 drug transporter,80,102,Genes & Molecular Sequences,doxorubicin,173,184,Chemicals & Drugs,Inhibition of EGFR kinase activity by gefitini...,drug_gene
6,PA,17938326,6,ABCG2,128,133,Genes & Molecular Sequences,doxorubicin,172,183,Chemicals & Drugs,A terminal deoxynucleotidyl transferase-mediat...,drug_gene
7,PA,17938326,9,ABCG2,126,131,Genes & Molecular Sequences,doxorubicin,93,104,Chemicals & Drugs,Both ARO and WRO demonstrated differential ABC...,drug_gene
8,PA,17938326,12,EGFR,20,24,Genes & Molecular Sequences,doxorubicin,173,184,Chemicals & Drugs,Inactivation of the EGFR kinase by gefitinib p...,drug_gene
9,PA,17938326,8,EGFR,14,18,Genes & Molecular Sequences,doxorubicin,173,184,Chemicals & Drugs,Inhibition of EGFR kinase activity by gefitini...,drug_gene


In [6]:
import nltk

from nltk.tokenize import word_tokenize

def tokenise_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Tokenisation is the only normalisation step applied here; no optional
    flags are implemented.

    Parameters
    ----------
    text : str
        The raw sentence to tokenise.

    Returns
    -------
    The token sequence produced by ``word_tokenize``.
    """
    return word_tokenize(text)

In [7]:
# Tokenise every sentence and store the token lists in a new column (assigned positionally)
dataset['SENTENCE_normalised'] = list(map(tokenise_text, dataset['SENTENCE']))

In [8]:
# convert text into doc2vec input format, i.e. a TaggedDocument holding the list of tokens and a tag (the value of the class column) for each doc
from gensim.models.doc2vec import TaggedDocument

def doc_tagger(data, tokens_col_name, class_col_name):
    """Turn each row of ``data`` into a gensim ``TaggedDocument``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with one document per row.
    tokens_col_name : str
        Column whose value is used as the document's ``words``.
    class_col_name : str
        Column whose value becomes the document's single tag.

    Returns
    -------
    The result of the row-wise ``apply``: one ``TaggedDocument`` per row.
    """
    def _tag_row(row):
        # One tag per document: the value found in the class column
        return TaggedDocument(words=row[tokens_col_name], tags=[row[class_col_name]])

    return data.apply(_tag_row, axis=1)


# Specify parameters for doc2vec: DBOW with word2vec training
from gensim.models.doc2vec import Doc2Vec

def train_doc2vec_model(corpus, vec_size, window, n_epochs, n_cpu):
    """Build and train a Doc2Vec model on ``corpus``.

    The model is created with ``dm=0`` and ``dbow_words=1`` (DBOW with
    word-vector training, as noted in the notebook).

    Parameters
    ----------
    corpus : object exposing ``.values``
        The tagged documents; ``corpus.values`` is handed to ``Doc2Vec``.
    vec_size : int
        Passed as ``vector_size``.
    window : int
        Passed as ``window``.
    n_epochs : int
        Passed as ``epochs``.
    n_cpu : int
        Passed as ``workers``.

    Returns
    -------
    The constructed ``Doc2Vec`` model.
    """
    hyperparams = dict(
        vector_size=vec_size,
        window=window,
        epochs=n_epochs,
        dm=0,
        dbow_words=1,
        workers=n_cpu,
    )
    return Doc2Vec(corpus.values, **hyperparams)

def learn_vectors(doc2vec_model, corpus):
    """Infer a vector for every document in ``corpus``.

    Parameters
    ----------
    doc2vec_model : object exposing ``infer_vector(words, epochs=...)``
        The trained model used for inference.
    corpus : object exposing ``.values``
        The tagged documents; each one must provide ``.words`` and ``.tags``.

    Returns
    -------
    (feats, labels) : tuple of two tuples
        ``feats`` holds the inferred vectors and ``labels`` the first tag of
        each document, in corpus order. Both are empty for an empty corpus.
    """
    docs = corpus.values

    # zip(*[]) yields nothing to unpack, so an empty corpus is handled explicitly
    if len(docs) == 0:
        return((), ())

    # `epochs` replaces the `steps` keyword, which gensim 4 no longer accepts
    labels, feats = zip(*[(doc.tags[0], doc2vec_model.infer_vector(doc.words, epochs=20)) for doc in docs])

    return(feats, labels)

In [None]:
# Wrap every tokenised sentence in a TaggedDocument tagged with its CLASS label
train_tagged = doc_tagger(dataset, 'SENTENCE_normalised', 'CLASS')

In [None]:
import multiprocessing

# Train on the tagged sentences, using one worker per available CPU core
doc2vec_model = train_doc2vec_model(
    train_tagged,
    vec_size=300,
    window=10,
    n_epochs=200,
    n_cpu=multiprocessing.cpu_count(),
)

In [None]:
# Ten nearest neighbours of the token 'MDR1' among the learned word vectors
# NOTE(review): presumably raises KeyError if 'MDR1' is not in the trained vocabulary — confirm
doc2vec_model.wv.most_similar('MDR1', topn=10)

In [None]:
sentence = 'However, the majority of colon cancer cells have deregulation of the Wnt/beta-catenin pathway.'

# Tokenise exactly as the training sentences were (tokenise_text -> NLTK word_tokenize).
# A plain str.split() leaves punctuation glued to words ("However,", "pathway."),
# giving tokens that differ from the ones the model was trained on.
inferred_vector = doc2vec_model.infer_vector(tokenise_text(sentence))

# Rank every learned document tag by its similarity to the inferred sentence vector
sims = doc2vec_model.docvecs.most_similar([inferred_vector], topn=len(doc2vec_model.docvecs))
sims

In [None]:
# Documents are tagged with their CLASS label ('drug_gene', 'gene_disease', 'drug_disease'),
# so the query must use one of those; an ASSOCIATION_TYPE value such as 'PA' is not a known tag.
doc2vec_model.docvecs.most_similar('drug_gene')