In [1]:
# Here I am using gensim for learning the word embeddings from the EUADR corpus
# This is just a rough start to learn 
import gensim
import logging
import os
import pandas as pd

In [2]:
# start the log
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

In [3]:
# get the current working directory
data_path = os.path.abspath(os.path.join(os.path.dirname( '__file__' ), '..', 'Datasets'))+'/'

# Although the dataset says csv, it is tab delimited. In addition to this, they have severe codels problems. 
# So best to parse throught codes first. 
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 2: invalid start byte

#open for reading with "universal" type set

import codecs

doc_d_t = codecs.open(data_path+'EUADR_Corpus_IBIgroup/'+'EUADR_drug_target'+'.csv','rU','UTF-8') 
EUADR_drug_target = pd.read_csv(doc_d_t, sep='\t')


doc_t_d = codecs.open(data_path+'EUADR_Corpus_IBIgroup/'+'EUADR_target_disease'+'.csv','rU','UTF-8',errors='ignore') 
EUADR_target_disease = pd.read_csv(doc_t_d, sep='\t')
       
doc_d_d = codecs.open(data_path+'EUADR_Corpus_IBIgroup/'+'EUADR_drug_disease'+'.csv','rU','UTF-8')                       
EUADR_drug_disease = pd.read_csv(doc_d_d, sep='\t')

logging.info("Done reading data files")

2019-07-03 16:13:56,471 : INFO : Done reading data files


In [4]:
EUADR_target_disease.columns

Index(['ASSOCIATION_TYPE', 'PMID', 'NUM_SENTENCE', 'ENTITY1_TEXT',
       'ENTITY1_INI', 'ENTITY1_END', 'ENTITY1_TYPE', 'ENTITY2_TEXT',
       'ENTITY2_INI', 'ENTITY2_END', 'ENTITY2_TYPE', 'SENTENCE'],
      dtype='object')

In [5]:
sentences =  EUADR_drug_target['SENTENCE'].tolist() + EUADR_target_disease['SENTENCE'].tolist()+ EUADR_drug_disease['SENTENCE'].tolist()


In [6]:
sentences[1:10]

['Inactivation of EGFR kinase by gefitinib was analyzed by Western blot analysis and immunofluorescence.',
 'Gefitinib inhibition of drug resistance to doxorubicin by inactivating ABCG2 in thyroid cancer cell lines.',
 'Extrusion assays using flow cytometry analysis were used to determine the ability of thyroid cancer cells to extrude the chemotherapy drug, doxorubicin, via the ABCG2 drug transporter in the presence or absence of gefitinib.',
 'Inactivation of the EGFR kinase by gefitinib potentiates the cytotoxic effect of doxorubicin in thyroid cancer, most likely by decreasing the ability of the cell to extrude doxorubicin.',
 'Inhibition of EGFR kinase activity by gefitinib causes the translocation of the ABCG2 drug transporter away from the plasma membrane, resulting in a concomitant decrease in doxorubicin extrusion in thyroid cancer cell lines.',
 'A terminal deoxynucleotidyl transferase-mediated deoxyuridine triphosphate nick-end labeling assay was performed to demonstrate ABCG

In [19]:
#########################  tokenisation ###############################################
documents = []
for row in sentences:
    try:
        documents.append(row.split())
    except AttributeError:
        pass


In [20]:
documents[1:3]

[['Inactivation',
  'of',
  'EGFR',
  'kinase',
  'by',
  'gefitinib',
  'was',
  'analyzed',
  'by',
  'Western',
  'blot',
  'analysis',
  'and',
  'immunofluorescence.'],
 ['Gefitinib',
  'inhibition',
  'of',
  'drug',
  'resistance',
  'to',
  'doxorubicin',
  'by',
  'inactivating',
  'ABCG2',
  'in',
  'thyroid',
  'cancer',
  'cell',
  'lines.']]

In [21]:
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors


# PMC_wv = KeyedVectors.load_word2vec_format('/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.bin', binary=True)

min_count = 1
PMC_wv_model = Word2Vec(size=200, min_count=min_count, iter=100, window=10, workers=10)
PMC_wv_model.build_vocab(documents)
training_examples_count = PMC_wv_model.corpus_count
# below line will make it 1, so saving it before
PMC_wv_model.build_vocab([list(PMC_wv.vocab.keys())], update=True)
PMC_wv_model.intersect_word2vec_format("/home/stirunag/pre-trained_word_embeddings/PubMed-and-PMC-w2v.bin", binary=True, lockf=1.0)

PMC_wv_model.train(sentences, total_examples=training_examples_count, epochs=PMC_wv_model.iter)


2019-07-03 16:23:56,531 : INFO : collecting all words and their counts
2019-07-03 16:23:56,534 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-07-03 16:23:56,543 : INFO : collected 3838 word types from a corpus of 23456 raw words and 846 sentences
2019-07-03 16:23:56,545 : INFO : Loading a fresh vocabulary
2019-07-03 16:23:56,555 : INFO : min_count=1 retains 3838 unique words (100% of original 3838, drops 0)
2019-07-03 16:23:56,558 : INFO : min_count=1 leaves 23456 word corpus (100% of original 23456, drops 0)
2019-07-03 16:23:56,567 : INFO : deleting the raw counts dictionary of 3838 items
2019-07-03 16:23:56,568 : INFO : sample=0.001 downsamples 33 most-common words
2019-07-03 16:23:56,570 : INFO : downsampling leaves estimated 19389 word corpus (82.7% of prior 23456)
2019-07-03 16:23:56,576 : INFO : estimated required memory for 3838 words and 200 dimensions: 8059800 bytes
2019-07-03 16:23:56,579 : INFO : resetting layer weights
2019-07-03 16:23:56,69

2019-07-03 16:25:31,367 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:31,371 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:31,372 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:31,376 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:31,377 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:31,378 : INFO : EPOCH - 6 : training on 168987 raw words (146377 effective words) took 0.2s, 908964 effective words/s
2019-07-03 16:25:31,488 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:31,489 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:31,491 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:31,504 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:31,517 : INFO : worker thread

2019-07-03 16:25:32,632 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:32,636 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:32,638 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:32,640 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:32,642 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:32,644 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:32,645 : INFO : EPOCH - 14 : training on 168987 raw words (146377 effective words) took 0.2s, 929580 effective words/s
2019-07-03 16:25:32,750 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:32,752 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:32,755 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:32,775 : INFO : worker threa

2019-07-03 16:25:33,897 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:33,910 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:33,917 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:33,922 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:33,923 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:33,927 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:33,932 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:33,934 : INFO : EPOCH - 22 : training on 168987 raw words (146377 effective words) took 0.2s, 916317 effective words/s
2019-07-03 16:25:34,053 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:34,057 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:34,060 : INFO : worker threa

2019-07-03 16:25:35,182 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:35,196 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:35,199 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:35,201 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:35,203 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:35,211 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:35,213 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:35,214 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:35,215 : INFO : EPOCH - 30 : training on 168987 raw words (146377 effective words) took 0.1s, 987784 effective words/s
2019-07-03 16:25:35,316 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:35,319 : INFO : worker threa

2019-07-03 16:25:36,437 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:36,439 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:36,441 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:36,455 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:36,458 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:36,460 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:36,466 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:36,475 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:36,477 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:36,478 : INFO : EPOCH - 38 : training on 168987 raw words (146377 effective words) took 0.1s, 989765 effective words/s
2019-07-03 16:25:36,588 : INFO : worker threa

2019-07-03 16:25:37,676 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:37,677 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:37,678 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:37,693 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:37,696 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:37,709 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:37,717 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:37,719 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:37,721 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:37,723 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:37,724 : INFO : EPOCH - 46 : training on 168987 raw words (146377 effecti

2019-07-03 16:25:38,843 : INFO : EPOCH - 53 : training on 168987 raw words (146377 effective words) took 0.2s, 920232 effective words/s
2019-07-03 16:25:38,959 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:38,960 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:38,962 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:38,963 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:38,979 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:38,985 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:38,986 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:38,994 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:38,996 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:39,001 : INFO : worker threa

2019-07-03 16:25:40,120 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:40,121 : INFO : EPOCH - 61 : training on 168987 raw words (146377 effective words) took 0.2s, 932012 effective words/s
2019-07-03 16:25:40,229 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:40,231 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:40,233 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:40,243 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:40,258 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:40,260 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:40,262 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:40,265 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:40,266 : INFO : worker threa

2019-07-03 16:25:41,386 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:41,388 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:41,390 : INFO : EPOCH - 69 : training on 168987 raw words (146377 effective words) took 0.2s, 909158 effective words/s
2019-07-03 16:25:41,501 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:41,503 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:41,507 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:41,508 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:41,531 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:41,535 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:41,537 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:41,543 : INFO : worker threa

2019-07-03 16:25:42,645 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:42,652 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:42,654 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:42,655 : INFO : EPOCH - 77 : training on 168987 raw words (146377 effective words) took 0.2s, 925777 effective words/s
2019-07-03 16:25:42,769 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:42,774 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:42,775 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:42,778 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:42,798 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:42,803 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:42,805 : INFO : worker threa

2019-07-03 16:25:43,911 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:43,915 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:43,916 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:43,920 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:43,921 : INFO : EPOCH - 85 : training on 168987 raw words (146377 effective words) took 0.1s, 994627 effective words/s
2019-07-03 16:25:44,029 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:44,031 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:44,035 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:44,052 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:44,061 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:44,067 : INFO : worker threa

2019-07-03 16:25:45,158 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:45,160 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:45,162 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:45,173 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:45,175 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:45,177 : INFO : EPOCH - 93 : training on 168987 raw words (146377 effective words) took 0.2s, 934760 effective words/s
2019-07-03 16:25:45,293 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:45,298 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:45,299 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:45,301 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:45,318 : INFO : worker threa

(14637700, 16898700)

In [22]:
model = gensim.models.Word2Vec(
        documents,
        size=200,
        window=10,
        min_count=1,
        workers=10)

model.train(documents, total_examples=len(documents), epochs=100)

2019-07-03 16:25:55,792 : INFO : collecting all words and their counts
2019-07-03 16:25:55,793 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-07-03 16:25:55,802 : INFO : collected 3838 word types from a corpus of 23456 raw words and 846 sentences
2019-07-03 16:25:55,803 : INFO : Loading a fresh vocabulary
2019-07-03 16:25:55,816 : INFO : min_count=1 retains 3838 unique words (100% of original 3838, drops 0)
2019-07-03 16:25:55,818 : INFO : min_count=1 leaves 23456 word corpus (100% of original 23456, drops 0)
2019-07-03 16:25:55,835 : INFO : deleting the raw counts dictionary of 3838 items
2019-07-03 16:25:55,836 : INFO : sample=0.001 downsamples 33 most-common words
2019-07-03 16:25:55,837 : INFO : downsampling leaves estimated 19389 word corpus (82.7% of prior 23456)
2019-07-03 16:25:55,843 : INFO : estimated required memory for 3838 words and 200 dimensions: 8059800 bytes
2019-07-03 16:25:55,844 : INFO : resetting layer weights
2019-07-03 16:25:55,88

2019-07-03 16:25:56,181 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:56,183 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:56,187 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:56,187 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:56,194 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:56,205 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:56,207 : INFO : EPOCH - 2 : training on 23456 raw words (19324 effective words) took 0.0s, 430949 effective words/s
2019-07-03 16:25:56,219 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:56,220 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:56,223 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:56,227 : INFO : worker thread f

2019-07-03 16:25:56,532 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:56,535 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:56,539 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:56,543 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:56,551 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:56,558 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:56,570 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:56,571 : INFO : EPOCH - 10 : training on 23456 raw words (19376 effective words) took 0.0s, 395913 effective words/s
2019-07-03 16:25:56,583 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:56,587 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:56,596 : INFO : worker thread 

2019-07-03 16:25:56,910 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:56,911 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:56,915 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:56,918 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:56,920 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:56,923 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:56,931 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:56,939 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:56,940 : INFO : EPOCH - 18 : training on 23456 raw words (19375 effective words) took 0.0s, 507632 effective words/s
2019-07-03 16:25:56,951 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:56,957 : INFO : worker thread 

2019-07-03 16:25:57,275 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:57,278 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:57,280 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:57,282 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:57,283 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:57,284 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:57,290 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:57,291 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:57,303 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:57,304 : INFO : EPOCH - 26 : training on 23456 raw words (19388 effective words) took 0.0s, 510062 effective words/s
2019-07-03 16:25:57,312 : INFO : worker thread 

2019-07-03 16:25:57,629 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:57,635 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:57,637 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:57,639 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:57,640 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:57,642 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:57,643 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:57,645 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:57,647 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:57,651 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:57,652 : INFO : EPOCH - 34 : training on 23456 raw words (19349 effective

2019-07-03 16:25:57,962 : INFO : EPOCH - 41 : training on 23456 raw words (19390 effective words) took 0.0s, 533342 effective words/s
2019-07-03 16:25:57,970 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:57,976 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:57,978 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:57,980 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:57,982 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:57,983 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:57,985 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:57,987 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:57,994 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:57,997 : INFO : worker thread 

2019-07-03 16:25:58,316 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:58,317 : INFO : EPOCH - 49 : training on 23456 raw words (19434 effective words) took 0.0s, 575286 effective words/s
2019-07-03 16:25:58,329 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:58,332 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:58,338 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:58,342 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:58,344 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:58,345 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:58,346 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:58,353 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:58,355 : INFO : worker thread 

2019-07-03 16:25:58,652 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:58,659 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:58,659 : INFO : EPOCH - 57 : training on 23456 raw words (19331 effective words) took 0.0s, 483437 effective words/s
2019-07-03 16:25:58,668 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:58,671 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:58,676 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:58,680 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:58,683 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:58,684 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:58,685 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:58,692 : INFO : worker thread 

2019-07-03 16:25:59,003 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:59,007 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:59,015 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:59,017 : INFO : EPOCH - 65 : training on 23456 raw words (19351 effective words) took 0.0s, 512989 effective words/s
2019-07-03 16:25:59,028 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:59,031 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:59,036 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:59,037 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:59,042 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:59,044 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:59,046 : INFO : worker thread 

2019-07-03 16:25:59,356 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:59,356 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:59,361 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:59,370 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:59,371 : INFO : EPOCH - 73 : training on 23456 raw words (19358 effective words) took 0.0s, 568215 effective words/s
2019-07-03 16:25:59,382 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:59,383 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:59,386 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:59,392 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:59,396 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:25:59,398 : INFO : worker thread 

2019-07-03 16:25:59,687 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:25:59,688 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:25:59,690 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:25:59,693 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:25:59,699 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:25:59,700 : INFO : EPOCH - 81 : training on 23456 raw words (19460 effective words) took 0.0s, 963179 effective words/s
2019-07-03 16:25:59,712 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:25:59,718 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:25:59,721 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:25:59,722 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:25:59,723 : INFO : worker thread 

2019-07-03 16:26:00,035 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:26:00,036 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:26:00,040 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:26:00,042 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:26:00,045 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:26:00,047 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:26:00,048 : INFO : EPOCH - 89 : training on 23456 raw words (19402 effective words) took 0.0s, 801583 effective words/s
2019-07-03 16:26:00,058 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:26:00,060 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:26:00,066 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-07-03 16:26:00,068 : INFO : worker thread 

2019-07-03 16:26:00,370 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-07-03 16:26:00,373 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-07-03 16:26:00,375 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-07-03 16:26:00,378 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-03 16:26:00,380 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-03 16:26:00,382 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-03 16:26:00,389 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-03 16:26:00,391 : INFO : EPOCH - 97 : training on 23456 raw words (19414 effective words) took 0.0s, 521945 effective words/s
2019-07-03 16:26:00,404 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-07-03 16:26:00,406 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-07-03 16:26:00,408 : INFO : worker thread 

(1939165, 2345600)

In [25]:
w1 = "ABCG2"

print("Most similar to {0}".format('ABCG2'), model.wv.most_similar(positive=w1))

Most similar to ABCG2 [('gefitinib', 0.8900555968284607), ('transporter', 0.8850417733192444), ('causes', 0.8334230184555054), ('potentiates', 0.8311495184898376), ('drug', 0.8252782821655273), ('doxorubicin', 0.8115504384040833), ('away', 0.7972399592399597), ('Gefitinib', 0.7803680896759033), ('inactivating', 0.7681549787521362), ('Inactivation', 0.7612934708595276)]


In [26]:
w1 = "ABCG2"

print("Most similar to {0}".format('ABCG2'), PMC_wv_model.wv.most_similar(positive=w1))

2019-07-03 16:26:20,733 : INFO : precomputing L2-norms of word weight vectors


Most similar to ABCG2 [('ABCC2', 0.8482607007026672), ('BCRP', 0.8397014141082764), ('MDR1', 0.8259393572807312), ('MRP1', 0.8125120401382446), ('ABCB1', 0.8084306716918945), ('ABCC1', 0.8067789673805237), ('ABCC3', 0.8049155473709106), ('ABCC4', 0.7944767475128174), ('ABCB5', 0.7827481627464294), ('BCRP1', 0.7802361845970154)]


In [28]:
w1 = "glaucoma"

print("Most similar to {0}".format('glaucoma'), model.wv.most_similar(positive=w1))

Most similar to glaucoma [('(POAG).', 0.9856012463569641), ('open-angle', 0.9833759069442749), ('LOXL1', 0.7793064117431641), ('been', 0.6976262331008911), ('far.', 0.6939014196395874), ('familial', 0.6724284291267395), ('thoroughly', 0.6578390598297119), ('investigated.', 0.6562251448631287), ('intracranial', 0.6520474553108215), ('aneurysms', 0.6509785652160645)]


In [29]:
w1 = "glaucoma"

print("Most similar to {0}".format('glaucoma'), PMC_wv_model.wv.most_similar(positive=w1))

Most similar to glaucoma [('glaucomas', 0.8366590142250061), ('PACG', 0.8033803701400757), ('POAG', 0.800217866897583), ('NVG', 0.7882044911384583), ('open-angle', 0.7721532583236694), ('keratoconus', 0.7651363015174866), ('pseudoexfoliation', 0.7581974864006042), ('cataract', 0.7508412003517151), ('aphakic', 0.7416537404060364), ('myopia', 0.740774393081665)]


In [35]:
w1 = "MDR1"

print("Most similar to {0}".format(w1), model.wv.most_similar(positive=w1))

Most similar to MDR1 [('G2677T/A', 0.773526132106781), ('concentration-dependent.', 0.6436530947685242), ("Crohn's", 0.6064810156822205), ('Therefore,', 0.5913057923316956), ('be', 0.5565676689147949), ('found', 0.5557518601417542), ('rs42524', 0.5531126856803894), ('COL1A2', 0.5488646626472473), ('Ala394Thr', 0.5357307195663452), ('observed', 0.5354852080345154)]


In [36]:
w1 = "MDR1"

print("Most similar to {0}".format(w1), PMC_wv_model.wv.most_similar(positive=w1))

Most similar to MDR1 [('MDR-1', 0.9147017598152161), ('mdr1', 0.8428719639778137), ('ABCG2', 0.8259393572807312), ('ABCB1', 0.8249936103820801), ('MRP1', 0.8175784945487976), ('BCRP', 0.8138895630836487), ('mdr-1', 0.8121477365493774), ('ABCC2', 0.8070204854011536), ('MRP2', 0.791068434715271), ('MRP3', 0.7856488227844238)]


In [30]:
word = 'glaucoma'
new_sentence = word.split()
model.wv.most_similar(positive=filter(lambda x: x in model.wv.vocab, new_sentence), topn=100)


[('(POAG).', 0.9856012463569641),
 ('open-angle', 0.9833759069442749),
 ('LOXL1', 0.7793064117431641),
 ('been', 0.6976262331008911),
 ('far.', 0.6939014196395874),
 ('familial', 0.6724284291267395),
 ('thoroughly', 0.6578390598297119),
 ('investigated.', 0.6562251448631287),
 ('intracranial', 0.6520474553108215),
 ('aneurysms', 0.6509785652160645),
 ('hand,', 0.6433296799659729),
 ('Recently,', 0.6347602605819702),
 ('rs1048661,', 0.6341402530670166),
 ('(IAs)', 0.6138873100280762),
 ('rs3825942,', 0.613766610622406),
 ('rs2165241,', 0.6131080985069275),
 ('Individual', 0.5989713668823242),
 ('old', 0.5954360365867615),
 ('peginterferon', 0.5791369080543518),
 ('primary', 0.5782892107963562),
 ('prevalent', 0.573296308517456),
 ('insulin-induced', 0.5589690804481506),
 ('POAG', 0.5558907985687256),
 ('teratogenic', 0.5538408756256104),
 ('Caucasian', 0.5495583415031433),
 ('United', 0.5466031432151794),
 ('Cardiovascular', 0.5464194416999817),
 ('potential.', 0.5460395216941833),
 ('E