In [1]:
# Learn word embeddings from the EUADR corpus with gensim.
# This is just a rough first attempt at training the embeddings.
import gensim
import logging
import os
import pandas as pd

In [2]:
# Configure the root logger so that INFO-level messages (including gensim's
# training progress) are shown with a timestamp and severity.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

In [4]:
# Resolve the dataset directory: <current working directory>/../Datasets.
# `__file__` is not defined inside a notebook, so the path is anchored on the
# working directory (the previous os.path.dirname('__file__') evaluated to ''
# and had exactly the same effect).
data_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'Datasets')) + '/'

# Although the dataset files are named *.csv, they are tab delimited. In addition,
# they have encoding problems, e.g.
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte
# so the files are decoded explicitly before being handed to pandas.

import codecs  # no longer used by this cell; kept in case other cells rely on it


def _read_euadr_table(name, errors='strict'):
    """Read one tab-delimited EUADR corpus table into a DataFrame.

    Parameters
    ----------
    name : str
        File name without the '.csv' extension, relative to
        `data_path`/EUADR_Corpus_IBIgroup.
    errors : str, optional
        Decoding error policy passed to `open` ('strict' raises on bytes that
        are not valid UTF-8, 'ignore' silently drops them).

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If `errors='strict'` and the file contains bytes that are not valid UTF-8.
    """
    path = os.path.join(data_path, 'EUADR_Corpus_IBIgroup', name + '.csv')
    # The old 'rU' mode was removed in Python 3.11. newline='' passes the raw line
    # endings through to pandas, as the previous binary-mode codecs.open call did.
    # The `with` block guarantees the handle is closed once the table is parsed.
    with open(path, 'r', encoding='UTF-8', errors=errors, newline='') as handle:
        return pd.read_csv(handle, sep='\t')


EUADR_drug_target = _read_euadr_table('EUADR_drug_target')

# This is the only table read with errors='ignore' -- presumably the file that
# triggered the UnicodeDecodeError quoted above (TODO confirm). Undecodable
# bytes are dropped from the text.
EUADR_target_disease = _read_euadr_table('EUADR_target_disease', errors='ignore')

EUADR_drug_disease = _read_euadr_table('EUADR_drug_disease')

logging.info("Done reading data files")

2019-06-24 11:33:18,458 : INFO : Done reading data files


In [5]:
# List the column names of the target-disease table to see which fields are available.
EUADR_target_disease.columns

Index(['ASSOCIATION_TYPE', 'PMID', 'NUM_SENTENCE', 'ENTITY1_TEXT',
       'ENTITY1_INI', 'ENTITY1_END', 'ENTITY1_TYPE', 'ENTITY2_TEXT',
       'ENTITY2_INI', 'ENTITY2_END', 'ENTITY2_TYPE', 'SENTENCE'],
      dtype='object')

In [6]:
# Gather the SENTENCE column of all three tables into one flat list, keeping the
# order drug-target, target-disease, drug-disease.
sentences = [
    sentence
    for table in (EUADR_drug_target, EUADR_target_disease, EUADR_drug_disease)
    for sentence in table['SENTENCE'].tolist()
]


In [7]:
# Preview a handful of the collected sentences (indices 1-9; index 0 is not shown).
sentences[1:10]

['Inactivation of EGFR kinase by gefitinib was analyzed by Western blot analysis and immunofluorescence.',
 'Gefitinib inhibition of drug resistance to doxorubicin by inactivating ABCG2 in thyroid cancer cell lines.',
 'Extrusion assays using flow cytometry analysis were used to determine the ability of thyroid cancer cells to extrude the chemotherapy drug, doxorubicin, via the ABCG2 drug transporter in the presence or absence of gefitinib.',
 'Inactivation of the EGFR kinase by gefitinib potentiates the cytotoxic effect of doxorubicin in thyroid cancer, most likely by decreasing the ability of the cell to extrude doxorubicin.',
 'Inhibition of EGFR kinase activity by gefitinib causes the translocation of the ABCG2 drug transporter away from the plasma membrane, resulting in a concomitant decrease in doxorubicin extrusion in thyroid cancer cell lines.',
 'A terminal deoxynucleotidyl transferase-mediated deoxyuridine triphosphate nick-end labeling assay was performed to demonstrate ABCG

In [8]:
#########################  tokenisation ###############################################
# Lower-case every sentence and split it on whitespace to get one token list per sentence.
# Only real strings are tokenised; anything else (presumably NaN from empty SENTENCE
# cells -- TODO confirm) is skipped explicitly instead of being hidden behind a
# blanket `except AttributeError`, which could also mask unrelated bugs.
# NOTE(review): punctuation stays attached to tokens (e.g. 'lines.'), because the
# split is purely on whitespace.
documents = [row.lower().split() for row in sentences if isinstance(row, str)]

# Report how many rows were dropped so that data problems do not go unnoticed.
skipped_rows = len(sentences) - len(documents)
if skipped_rows:
    logging.warning("Skipped %d non-string SENTENCE values during tokenisation", skipped_rows)


In [9]:
# Inspect two tokenised sentences (indices 1 and 2) to check the tokenisation result.
documents[1:3]

[['inactivation',
  'of',
  'egfr',
  'kinase',
  'by',
  'gefitinib',
  'was',
  'analyzed',
  'by',
  'western',
  'blot',
  'analysis',
  'and',
  'immunofluorescence.'],
 ['gefitinib',
  'inhibition',
  'of',
  'drug',
  'resistance',
  'to',
  'doxorubicin',
  'by',
  'inactivating',
  'abcg2',
  'in',
  'thyroid',
  'cancer',
  'cell',
  'lines.']]

In [11]:
# Create an untrained Word2Vec model. The corpus is deliberately NOT passed to the
# constructor: doing so already builds the vocabulary and runs a complete training
# pass, so the explicit train() call below would train on the corpus a second time
# with a restarted learning-rate schedule.
#   size=300     -> 300-dimensional word vectors (gensim 3.x keyword; renamed
#                   `vector_size` in gensim 4)
#   window=10    -> context window of 10 tokens on each side
#   min_count=0  -> keep every word, however rare
#   workers=10   -> number of training threads
model = gensim.models.Word2Vec(
        size=300,
        window=10,
        min_count=0,
        workers=10)

# Scan the corpus once to build the vocabulary and initialise the weights.
model.build_vocab(documents)

# Train for exactly 100 epochs; the returned tuple is the cell's output.
model.train(documents, total_examples=model.corpus_count, epochs=100)

2019-06-24 11:33:42,069 : INFO : collecting all words and their counts
2019-06-24 11:33:42,070 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-06-24 11:33:42,082 : INFO : collected 3704 word types from a corpus of 23456 raw words and 846 sentences
2019-06-24 11:33:42,084 : INFO : Loading a fresh vocabulary
2019-06-24 11:33:42,097 : INFO : effective_min_count=0 retains 3704 unique words (100% of original 3704, drops 0)
2019-06-24 11:33:42,099 : INFO : effective_min_count=0 leaves 23456 word corpus (100% of original 23456, drops 0)
2019-06-24 11:33:42,109 : INFO : deleting the raw counts dictionary of 3704 items
2019-06-24 11:33:42,110 : INFO : sample=0.001 downsamples 35 most-common words
2019-06-24 11:33:42,112 : INFO : downsampling leaves estimated 19182 word corpus (81.8% of prior 23456)
2019-06-24 11:33:42,120 : INFO : estimated required memory for 3704 words and 300 dimensions: 10741600 bytes
2019-06-24 11:33:42,121 : INFO : resetting layer weights
2

2019-06-24 11:33:42,506 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:42,508 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:42,510 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:42,513 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:42,516 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:42,527 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:42,528 : INFO : EPOCH - 2 : training on 23456 raw words (19135 effective words) took 0.0s, 419812 effective words/s
2019-06-24 11:33:42,537 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:42,545 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:42,546 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:42,547 : INFO : worker thread f

2019-06-24 11:33:42,901 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:42,903 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:42,905 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:42,908 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:42,910 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:42,916 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:42,925 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:42,926 : INFO : EPOCH - 10 : training on 23456 raw words (19176 effective words) took 0.0s, 494033 effective words/s
2019-06-24 11:33:42,937 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:42,942 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:42,948 : INFO : worker thread 

2019-06-24 11:33:43,304 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:43,306 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:43,311 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:43,315 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:43,318 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:43,321 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:43,323 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:43,329 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:43,330 : INFO : EPOCH - 18 : training on 23456 raw words (19140 effective words) took 0.0s, 451229 effective words/s
2019-06-24 11:33:43,349 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:43,354 : INFO : worker thread 

2019-06-24 11:33:43,705 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:43,717 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:43,718 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:43,722 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:43,724 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:43,725 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:43,725 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:43,730 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:43,737 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:43,739 : INFO : EPOCH - 26 : training on 23456 raw words (19194 effective words) took 0.0s, 506040 effective words/s
2019-06-24 11:33:43,759 : INFO : worker thread 

2019-06-24 11:33:44,046 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:44,050 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:44,056 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:44,060 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:44,063 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:44,064 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:44,066 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:44,069 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:44,075 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:44,082 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:44,084 : INFO : EPOCH - 34 : training on 23456 raw words (19216 effective

2019-06-24 11:33:44,469 : INFO : EPOCH - 41 : training on 23456 raw words (19178 effective words) took 0.0s, 440359 effective words/s
2019-06-24 11:33:44,478 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:44,484 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:44,486 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:44,487 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:44,489 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:44,489 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:44,491 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:44,500 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:44,508 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:44,509 : INFO : worker thread 

2019-06-24 11:33:44,811 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:44,813 : INFO : EPOCH - 49 : training on 23456 raw words (19176 effective words) took 0.0s, 811895 effective words/s
2019-06-24 11:33:44,825 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:44,828 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:44,829 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:44,831 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:44,833 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:44,834 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:44,835 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:44,841 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:44,843 : INFO : worker thread 

2019-06-24 11:33:45,169 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:45,173 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:45,174 : INFO : EPOCH - 57 : training on 23456 raw words (19114 effective words) took 0.0s, 665384 effective words/s
2019-06-24 11:33:45,186 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:45,192 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:45,196 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:45,198 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:45,199 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:45,201 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:45,201 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:45,202 : INFO : worker thread 

2019-06-24 11:33:45,537 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:45,543 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:45,550 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:45,552 : INFO : EPOCH - 65 : training on 23456 raw words (19183 effective words) took 0.0s, 514130 effective words/s
2019-06-24 11:33:45,563 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:45,567 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:45,569 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:45,574 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:45,576 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:45,577 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:45,578 : INFO : worker thread 

2019-06-24 11:33:45,911 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:45,916 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:45,923 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:45,930 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:45,932 : INFO : EPOCH - 73 : training on 23456 raw words (19173 effective words) took 0.0s, 538250 effective words/s
2019-06-24 11:33:45,942 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:45,951 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:45,954 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:45,956 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:45,958 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:45,959 : INFO : worker thread 

2019-06-24 11:33:46,291 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:46,292 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:46,298 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:46,302 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:46,308 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:46,309 : INFO : EPOCH - 81 : training on 23456 raw words (19195 effective words) took 0.0s, 537458 effective words/s
2019-06-24 11:33:46,317 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:46,319 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:46,322 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:46,328 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:46,330 : INFO : worker thread 

2019-06-24 11:33:46,667 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:46,669 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:46,669 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:46,670 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:46,676 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:46,684 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:46,685 : INFO : EPOCH - 89 : training on 23456 raw words (19222 effective words) took 0.0s, 623821 effective words/s
2019-06-24 11:33:46,706 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:46,710 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:46,712 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-06-24 11:33:46,713 : INFO : worker thread 

2019-06-24 11:33:47,049 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-06-24 11:33:47,052 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-06-24 11:33:47,055 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-06-24 11:33:47,060 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-06-24 11:33:47,063 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-06-24 11:33:47,068 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-06-24 11:33:47,073 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-06-24 11:33:47,074 : INFO : EPOCH - 97 : training on 23456 raw words (19218 effective words) took 0.0s, 437123 effective words/s
2019-06-24 11:33:47,084 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-06-24 11:33:47,088 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-06-24 11:33:47,091 : INFO : worker thread 

(1917905, 2345600)

In [12]:
# Nearest neighbours of ABCG2 in the embedding space. The corpus was lower-cased
# during tokenisation, so the lookup key is lower-cased while the label keeps its
# original spelling.
query_label = 'ABCG2'
w1 = query_label.lower()

neighbours = model.wv.most_similar(positive=w1)
print("Most similar to {0}".format(query_label), neighbours)

2019-06-24 11:33:47,246 : INFO : precomputing L2-norms of word weight vectors


Most similar to ABCG2 [('gefitinib', 0.9102101922035217), ('transporter', 0.8825482726097107), ('doxorubicin', 0.8278737664222717), ('drug', 0.8244372010231018), ('causes', 0.821057915687561), ('potentiates', 0.8089941143989563), ('away', 0.799569308757782), ('inactivating', 0.7715502381324768), ('membrane,', 0.7569605112075806), ('cancer,', 0.736693799495697)]


In [13]:
# Nearest neighbours of "glaucoma" in the embedding space.
query_label = 'glaucoma'
w1 = query_label.lower()

neighbours = model.wv.most_similar(positive=w1)
print("Most similar to {0}".format(query_label), neighbours)

Most similar to glaucoma [('(poag).', 0.9869673252105713), ('open-angle', 0.9821245670318604), ('loxl1', 0.8221797347068787), ('intracranial', 0.6899855732917786), ('familial', 0.676161527633667), ('been', 0.6667029857635498), ('aneurysms', 0.6640276312828064), ('rs3825942,', 0.663216769695282), ('rs1048661,', 0.6599628329277039), ('(ias)', 0.6545814871788025)]


In [None]:
# Nearest neighbours of "ias" in the embedding space.
# NOTE(review): tokens keep their punctuation, so the vocabulary may only contain
# '(ias)' rather than 'ias' -- verify that this lookup does not fail.
w1 = "ias".lower()

neighbours = model.wv.most_similar(positive=w1)
print("Most similar to {0}".format(w1), neighbours)

In [None]:
# Query the 100 nearest neighbours for a (possibly multi-word) phrase, using only
# those words of the phrase that are present in the model's vocabulary.
word = 'glaucoma'
new_sentence = word.split()
known_words = [token for token in new_sentence if token in model.wv.vocab]
model.wv.most_similar(positive=known_words, topn=100)
