In [1]:
import json 
import spacy
import scispacy
from scispacy.linking import EntityLinker
import pubmed_parser as pp
from tqdm import tqdm

nlp = spacy.load("en_core_sci_lg")
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

nlp2 = spacy.load('en_ner_bc5cdr_md')
nlp2.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

dicts_out = pp.parse_medline_xml('../data/pubmed_data/processed_data/processed_full_data.xml',
                                 year_info_only=False,
                                 nlm_category=False,
                                 author_list=False,
                                 reference_list=False)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [2]:
## HoldOut 125 GS + 111 No/Little Abstract

pmids_HO = set()
with open('../data/HoldOut_Total.txt','r') as f:
    for line in f.readlines():
        pmids_HO.add(line.strip())
print("Hold Out Data: ",len(pmids_HO))

Hold Out Data:  236


In [3]:
N=len(dicts_out)
i=0
j=0
while i<N:
    if dicts_out[i]['pmid'] not in pmids_HO:
        dicts_out[j]=dicts_out[i]
        j+=1
    i+=1
dicts_out[j:]=[]

print("Full Data for further processing: ",len(dicts_out))

Full Data for further processing:  3100


In [4]:
comb_arr=[]
pmid_arr =[]
for paper in dicts_out:
    comb_arr.append(paper['title']+' '+paper['abstract'])
    pmid_arr.append(paper['pmid'])
    
idx=0
processed_texts = {}
scispacy_kw = {}
no_kws_trimmed =[]
no_kws=[]
post_no_kws =[]
outlier_kws=[
    'http','www','university','department','antibiotic','antimicrobial','institute','ministry', 'pubmed',
    '.gov','.org','.com','.edu','.net',
    'city','disease','injury','trauma','syndrome','country','national','regimen','swelling','cholesterol','cerebrovascular','leukemia'
    'surgery', 'medication','infection','stroke','diabetes','bleeding','comorbid','java','python',
    "united states",'united kingdom','india','china','germany','france','ghana','australia','italy','england','japan',
    'english','spanish','french','british','spain'
    ]

remove_sent_no_outliers = []
with tqdm(nlp.pipe(comb_arr),total=len(comb_arr)) as pbar:
    for doc in pbar:
        sents = [[token.orth_.lower() for token in sent if not token.is_punct | token.is_space] for sent in doc.sents]
        if len(sents)<4:
            remove_sent_no_outliers.append(idx)
            idx+=1
            continue
        text = [token.orth_ for token in doc if not token.is_punct | token.is_space] 
        rule_out_ents = [str(_) for _ in nlp2(comb_arr[idx]).ents]
        rule_out_ents = [_ for _ in rule_out_ents if not _.isupper()]
        kws = [str(_) for _ in doc.ents]
        # print('-'*50)
        # print(len(kws))
        temp_kws_len = len(kws)
        # print(kws)
        if rule_out_ents==[]:
            kws = [_ for _ in kws]
        else:
            kws = [_ for _ in kws if _ not in rule_out_ents]
        # print(kws)
        kws = [_ for _ in kws if all([True if n_ not in _.lower() else False for n_ in outlier_kws])]
        no_kws_trimmed.append(temp_kws_len-len(kws))
        no_kws.append(temp_kws_len)
        post_no_kws.append(len(kws))
        # print(len(kws))
        # print(rule_out_ents)
        # print(kws)
        # print('-'*50)
        kws.sort(reverse=True, key=len)
        
        dicts_out[idx]['fullText'] = text
        dicts_out[idx]['sentences'] = sents
        dicts_out[idx]["keywords"]=kws=kws
        
        idx+=1
        pbar.set_description("The %s document is processed" % (idx + 1))

## Filter Docs with less than minimum senetences
i=0
j=0
print("Total Articles removed: ",len(remove_sent_no_outliers))
print([pmid_arr[x] for x in remove_sent_no_outliers])
while i<idx:
    if i not in remove_sent_no_outliers:
        dicts_out[j]=dicts_out[i]
        j+=1
    i+=1
dicts_out[j:]=[]

dataset_length = len(dicts_out)

print("Total Dataset Length: ",dataset_length)
print("Train Dataset Length: ",dataset_length//3)
print("Test  Dataset Length: ",dataset_length-dataset_length//3)

with open("GNA_Files/train.json", "w") as outfile:
    json.dump(dicts_out[:dataset_length//3], outfile)

with open("GNA_Files/test.json", "w") as outfile:
    json.dump(dicts_out[dataset_length//3:], outfile)

print("Total KWS generated:", sum(no_kws))
print("Avg. KWS generated/abstract:", sum(no_kws)/len(no_kws))

print("Total KWS trimmed: ",sum(no_kws_trimmed))
print("Avg. KWS trimmed:", sum(no_kws_trimmed)/len(no_kws_trimmed))

print("Total KWS - post processing: ",sum(post_no_kws))
print("Avg. KWS - post processing:", sum(post_no_kws)/len(post_no_kws))

  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]
The 3101 document is processed: 100%|██████████| 3100/3100 [04:39<00:00, 11.10it/s]
Total Articles removed:  48
['34042851', '31349252', '31225983', '30399617', '28269917', '28086985', '27762202', '27235214', '26262257', '26262255', '23920886', '23920721', '23861123', '23851288', '22929351', '22219234', '22060071', '21295751', '21095686', '20592007', '20501532', '20407168', '18999221', '18999012', '18998900', '18998875', '18694272', '18694237', '18580428', '18487843', '17945725', '17911954', '17238710', '17238621', '17238566', '17238503', '16779188', '15846970', '15770911', '15455842', '14728543', '12365297', '11813521', '10587916', '10338700', '9700425', '10177918', '10177693']
Total Dataset Length:  3052
Train Dataset Length:  1017
Test  Dataset Length:  2035
Total KWS generated: 224833
Avg. KWS generated/abstract: 73.66743119

In [5]:
train_data = dicts_out[:dataset_length//3]
test_data = dicts_out[dataset_length//3:]

In [6]:
train_corpus_sentences = []
for x in train_data:
    train_corpus_sentences.extend(x['sentences'])
print("Total Sentences in Train Corpus: ", len(train_corpus_sentences))

test_corpus_sentences = []
for x in test_data:
    test_corpus_sentences.extend(x['sentences'])
print("Total Sentences in Test Corpus: ", len(test_corpus_sentences))

Total Sentences in Train Corpus:  12023
Total Sentences in Test Corpus:  20555


In [7]:
print('Sample: ', train_corpus_sentences[0])

Sample:  ['continuous', 'cardiorespiratory', 'monitoring', 'is', 'a', 'dominant', 'source', 'of', 'predictive', 'signal', 'in', 'machine', 'learning', 'for', 'risk', 'stratification', 'and', 'clinical', 'decision', 'support']


In [8]:
import gensim

modelW2V = gensim.models.Word2Vec(sentences=train_corpus_sentences,size=300,window=5,min_count=0,workers=10,sg=1,seed=42,compute_loss=True)

In [9]:
# modelW2V.save('GNA_Files/word2vec.embedding')
modelW2V.wv.save_word2vec_format('GNA_Files/word2vec.bin', binary=True)

In [10]:
# modelW2V = gensim.models.Word2Vec.load('GNA_Files/word2vec.bin')
from gensim.models import KeyedVectors

filename = 'GNA_Files/word2vec.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [11]:
model.vectors

array([[ 1.1465875e-02,  2.4020582e-01,  2.2017255e-02, ...,
        -1.6683304e-01, -9.4120137e-02, -2.5385439e-01],
       [-2.4647003e-02,  8.3521746e-02, -5.6839019e-02, ...,
        -1.9443564e-01, -2.1609481e-01, -4.0310875e-02],
       [ 4.9027428e-02,  1.1255401e-01,  4.6202015e-02, ...,
        -2.1141855e-01, -2.1330103e-01, -6.1953351e-02],
       ...,
       [-1.0731264e-02, -1.6498059e-02,  7.8044878e-03, ...,
        -2.8128992e-04, -1.4324478e-04, -9.6140197e-03],
       [-1.5212214e-02, -2.6840825e-02,  9.3136178e-03, ...,
         6.8916059e-03, -4.0749582e-03, -1.8798176e-02],
       [-2.3664381e-02, -1.8525474e-02,  7.0460709e-03, ...,
        -8.2431443e-04, -6.1397515e-03, -7.5974977e-03]], dtype=float32)

In [12]:
modelW2V.wv.most_similar(positive=['clinical','decision'], topn = 10)

[('computer', 0.9198338985443115),
 ('computerized', 0.91490238904953),
 ('decision-support', 0.9090330600738525),
 ('guideline-based', 0.9056036472320557),
 ('web-based', 0.9044430255889893),
 ('computerised', 0.9039899110794067),
 ('cpoe', 0.9003762602806091),
 ('knowledge-based', 0.8994190692901611),
 ('point-of-care', 0.8981181979179382),
 ('real-time', 0.8958467841148376)]

In [13]:
modelW2V.wv.most_similar(positive=['cardiorespiratory'], topn = 10)

[('speech', 0.9970884919166565),
 ('navigate', 0.9967279434204102),
 ('endpoint-directed', 0.9965219497680664),
 ('irrational', 0.9964563846588135),
 ('∼80', 0.9964200258255005),
 ('editing', 0.9964097738265991),
 ('massachusetts', 0.996346116065979),
 ('replicable', 0.9962502121925354),
 ('unrelated', 0.996247410774231),
 ('citizens', 0.9961835741996765)]

In [14]:
modelW2V.wv.most_similar(positive=['machine','learning'], topn = 10)

[('ml', 0.948789119720459),
 ('artificial', 0.9476273655891418),
 ('techniques', 0.9451085329055786),
 ('intelligence', 0.9407348036766052),
 ('deep', 0.9374027252197266),
 ('fuzzy', 0.9372469782829285),
 ('neural', 0.9352610111236572),
 ('networks', 0.9342448115348816),
 ('algorithm', 0.9317860007286072),
 ('bayesian', 0.9288842678070068)]

In [15]:
modelW2V.wv.most_similar(positive=['clinical','cdss'], topn = 10)

[('computer', 0.9187260866165161),
 ('dss', 0.9186125993728638),
 ('guideline-based', 0.9170367121696472),
 ('computerised', 0.915226936340332),
 ('cdsss', 0.9142353534698486),
 ('knowledge-based', 0.9130967855453491),
 ('ccds', 0.9096958637237549),
 ('ccdss', 0.909058690071106),
 ('cpoe', 0.9084491729736328),
 ('web-based', 0.9077417254447937)]

In [16]:
modelW2V.wv.most_similar(positive=['clinical decision','cdss'], topn = 10)

KeyError: "word 'clinical decision' not in vocabulary"

In [18]:
## Word2Vec 300 Dim vector for a word
# model.wv['machine']

In [19]:
from gensim.models.fasttext import FastText as FT_gensim
modelFT = FT_gensim(size=300)

# build the vocabulary
modelFT.build_vocab(sentences=train_corpus_sentences)
# help(model.build_vocab)

# train the model
modelFT.train(
    sentences=train_corpus_sentences, epochs=modelFT.epochs,
    total_examples=modelFT.corpus_count, total_words=modelFT.corpus_total_words
)

In [20]:
modelFT.wv.save_word2vec_format('GNA_Files/fastText.bin', binary=True)

In [21]:
modelFT.save('GNA_Files/fastText.embedding')
modelFT = FT_gensim.load('GNA_Files/fastText.embedding')

In [22]:
from pprint import pprint
pprint('breakfast' in modelFT.wv.vocab)

False


In [23]:
# model['breakfast']
print(modelFT.most_similar("clinical decision"))

  print(modelFT.most_similar("clinical decision"))
[('decision-making', 0.9911418557167053), ('decision-support', 0.9900878667831421), ('decisional', 0.9900567531585693), ('decisions', 0.9860702753067017), ('decision', 0.9816697835922241), ('provision', 0.9731867909431458), ('deciding', 0.9678906202316284), ('compute', 0.9678195118904114), ('use', 0.9672898650169373), ('information', 0.967155933380127)]


In [24]:
modelFT.wv.most_similar(positive=['machine','learning'], topn = 10)

[('machine-learning', 0.9997397661209106),
 ('machines', 0.9910284876823425),
 ('took', 0.9904155135154724),
 ('form', 0.990329384803772),
 ('algorithm', 0.9895416498184204),
 ('rule', 0.988652229309082),
 ('cdst', 0.9880639314651489),
 ('systematic', 0.9875674247741699),
 ('cdsas', 0.9873975515365601),
 ('algorithm-based', 0.9872178435325623)]

In [25]:
modelFT.wv.most_similar(positive=['cardiorespiratory'], topn = 10)

[('tinnitus', 0.9976578950881958),
 ('diverse', 0.9976482391357422),
 ('injuries', 0.9971669316291809),
 ('t2d', 0.9963470697402954),
 ('diastolic', 0.9961583018302917),
 ('failure', 0.9960494041442871),
 ('comorbidities', 0.995898962020874),
 ('failures', 0.9958155155181885),
 ('drugs', 0.9955398440361023),
 ('ventricular', 0.9952231645584106)]

In [26]:
modelFT.wv.most_similar(positive=['clinical','decision'], topn = 10)

[('decision-support', 0.9952415823936462),
 ('decision-making', 0.9858211278915405),
 ('decisional', 0.9835367798805237),
 ('compute', 0.9824199676513672),
 ('computing', 0.9763392210006714),
 ('decisions', 0.974543571472168),
 ('implementing', 0.973444938659668),
 ('use', 0.9734196066856384),
 ('cds', 0.9723677039146423),
 ('ecosystem', 0.970734179019928)]

In [27]:
modelFT.wv.most_similar(positive=['cdss'], topn = 10)

[('cdss-t', 0.9993053078651428),
 ('cdsss', 0.9965502619743347),
 ('cdr', 0.9922662973403931),
 ('cdsi', 0.9913590550422668),
 ('developing', 0.9905264377593994),
 ('cdc', 0.9903756380081177),
 ('ccdss', 0.989611029624939),
 ('cdsas', 0.9889644384384155),
 ('cdsms', 0.98882657289505),
 ('computerized', 0.9875862002372742)]

In [28]:
modelFT.wv.most_similar(positive=['clinical decision support system'], topn = 10)

[('decision-support', 0.9942960143089294),
 ('compute', 0.9920271635055542),
 ('cds', 0.9911463260650635),
 ('systems', 0.9896308183670044),
 ('cdc', 0.9884072542190552),
 ('ecosystem', 0.9880969524383545),
 ('system', 0.9880873560905457),
 ('cdsi', 0.986912727355957),
 ('clinical', 0.9853878021240234),
 ('tools', 0.9849950671195984)]

In [29]:
modelFT.wv.most_similar(positive=['clinical','CDS'], topn = 10)

[('into', 0.6904014945030212),
 ('integral', 0.688778817653656),
 ('integrate', 0.6847514510154724),
 ('making', 0.684546709060669),
 ('clinic', 0.6836258172988892),
 ('provide', 0.6830564141273499),
 ('integrates', 0.6796035766601562),
 ('intelligent', 0.6759086847305298),
 ('providing', 0.6752824783325195),
 ('inform', 0.6748147010803223)]

In [30]:
# modelW2V.wv['machine']

In [31]:
# modelFT.wv['machine']

In [32]:
# modelW2V.wv.most_similar(positive=['machine'])

In [33]:
# modelFT.wv.most_similar(positive=['machine'])

In [34]:
print(modelFT.wv.similarity('cdss', 'clinical'))
print(modelFT.wv.similarity('cdss', 'decision'))
print(modelFT.wv.similarity('cdss', 'support'))
print(modelFT.wv.similarity('cdss', 'systems'))
print(modelFT.wv.similarity('cdss', 'clinical decision support systems'))

0.95006484
0.90896904
0.9493738
0.9691944
0.97116977


In [35]:
print(modelW2V.wv.similarity('cdss', 'clinical'))
print(modelW2V.wv.similarity('cdss', 'decision'))
print(modelW2V.wv.similarity('cdss', 'support'))
print(modelW2V.wv.similarity('cdss', 'systems'))

0.789354
0.7820957
0.7978011
0.75958157


In [36]:
print(modelW2V.wv.similarity('cdss', 'clinical decision support systems'))

KeyError: "word 'clinical decision support systems' not in vocabulary"

In [37]:
print("Word2Vec Vocab",len(modelW2V.wv.vocab))
print("FastText Vocab",len(modelFT.wv.vocab))

Word2Vec Vocab 15755
FastText Vocab 4705


In [41]:
w2v_vocabulary = model.vocab.keys()
print(w2v_vocabulary)

n', '863', 'polysomnography', 'antidepressant', 'low-back', 'flags', 'imaged', '.974', 'returned', '.170', 'hepatology', 'gynecology', 'fairly', 'modularized', 'chaining', 'exactness', '99.01', '84.73', 'explicable', 'objectivity', 'postburn', 'resuscitated', '24.9', '29.9', 'inches', 'overresuscitation', 'impetus', 'interplay', 'nursing-sensitive', '5515', 'intra', 'http://www.crd.york.ac.uk/prospero/display_record.asp?id', 'webcite', 'http://www.webcitation.org/6pihmlbzh', 'radiographs', '2012-december', 'credentialed', '14,642', '632', 'control-group', '239/258=92.6', '231/374=61.8', '209/258=81.0', '238/374=63.6', '166/258=64.3', '183/374=48.9', '141/258=54.6', '202/374=54.0', 'p=0.95', '26/307=8.5', '18/385=4.7', 'interventionally', 'web-centric', 'enriching', 'navigate', 'intrauterine', 'confers', 'macrosomia', 'offspring', 'artificial-intelligence-augmented', 'seventh', '247', '(re)usability', 'tmr', 'futile', '95.3', 'epidemiology', 'strobe', 'stard', '389', '42.0', 'ahd', '245

In [44]:
ft_vocabulary = modelFT.wv.vocab.keys()
print(ft_vocabulary)

