Sample data from [STI Treatment Guidelines](https://www.cdc.gov/std/treatment-guidelines/default.htm)

In [None]:
!wget https://www.cdc.gov/std/treatment-guidelines/STI-Guidelines-2021.pdf 

--2022-07-31 19:59:04--  https://www.cdc.gov/std/treatment-guidelines/STI-Guidelines-2021.pdf
Resolving www.cdc.gov (www.cdc.gov)... 104.102.138.220, 2600:1408:9000:798::2461, 2600:1408:9000:79a::2461
Connecting to www.cdc.gov (www.cdc.gov)|104.102.138.220|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4448203 (4.2M) [application/pdf]
Saving to: ‘STI-Guidelines-2021.pdf’


2022-07-31 19:59:04 (20.2 MB/s) - ‘STI-Guidelines-2021.pdf’ saved [4448203/4448203]



In [None]:
%%capture
# Install the third-party packages this notebook relies on: tika (PDF text extraction)
# and fasttext (embedding training). %%capture suppresses pip's console output.
# NOTE(review): versions are not pinned, so results may differ between runs/environments.
!pip install tika        #Tika is a Python Library to extract Text from Pdf
!pip install fasttext

In [None]:
%%capture
import nltk
from pprint import pprint
from gensim.models import FastText
import fasttext
nltk.download('punkt')
from nltk import word_tokenize
from pprint import pprint
from tika import parser
import string
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Parse the downloaded PDF with Tika. The path is relative because wget saves the file
# into the current working directory; the previous hard-coded '/content/...' path only
# resolved when the notebook happened to run from /content.
raw_text = parser.from_file('STI-Guidelines-2021.pdf')

In [None]:
# Inspect the shape of the Tika result without dumping the whole document:
# printing raw_text.items() would emit the full extracted text of the PDF.
print(type(raw_text))          # Tika parser returns a dictionary
print(list(raw_text.keys()))   # top-level keys only (the extracted text is under 'content')

<class 'dict'>


In [None]:
# Split the extracted text into lines, drop blank/whitespace-only lines and lower-case
# the rest. The comprehension iterates over the lines split just above; it previously
# referenced an undefined name (`rawList`), which raises NameError on a fresh kernel.
raw_lines = raw_text['content'].splitlines()
rawtext_list = [line.lower() for line in raw_lines if line.strip()]
print(len(rawtext_list))    # number of non-blank lines kept
print(rawtext_list[5000])   # sample line (a PDF text line, not necessarily a full sentence)

23392
follow-up titer should not be repeated until approximately 


In [None]:
# Join the cleaned lines into a single space-separated corpus string.
sti_str = ' '.join(rawtext_list)
# Remove every character in string.punctuation in one pass. str.translate with a deletion
# table yields the same result as calling str.replace once per punctuation character,
# without creating a fresh copy of the whole corpus for each of them.
# NOTE(review): punctuation is deleted rather than replaced by a space, so tokens joined
# by '/', '-' or '.' (URLs, "times/day") fuse into one word.
sti_str = sti_str.translate(str.maketrans('', '', string.punctuation))
len(sti_str)

1294718

In [None]:
# Pretty-print the cleaned corpus string.
# NOTE(review): sti_str holds the entire corpus as one string, so this produces a very
# large output; consider previewing a slice (e.g. sti_str[:1000]) instead.
pprint(sti_str)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 'httpswwwncbinlmnihgoventrezqueryfcgicmdretrievedbpubmedlistuids18701877doptabstract '
 'httpswwwncbinlmnihgoventrezqueryfcgicmdretrievedbpubmedlistuids21307823doptabstract '
 'httpswwwncbinlmnihgoventrezqueryfcgicmdretrievedbpubmedlistuids22572752doptabstract '
 'httpsdoiorg101097inf0b013e31825d3152 httpsdoiorg101097inf0b013e31825d3152 '
 'httpswwwncbinlmnihgoventrezqueryfcgicmdretrievedbpubmedlistuids32578864doptabstract '
 'httpsdoiorg101093cidciaa307 httpsdoiorg101093cidciaa307 '
 'httpswwwncbinlmnihgoventrezqueryfcgicmdretrievedbpubmedlistuids27835625doptabstract '
 'httpsdoiorg101097olq0000000000000524 httpsdoiorg101097olq0000000000000524 '
 'httpswwwncbinlmnihgoventrezqueryfcgicmdretrievedbpubmedlistuids21930610doptabstract '
 'httpsdoiorg101093infdisjir524 httpsdoiorg101093infdisjir524 '
 'httpswwwncbinlmnihgoventrezqueryfcgicmdretrievedbpubmedlistuids21880852doptabstract '
 'httpsdoiorg101128cvi0533511 httpsdoio

In [None]:
# Persist the cleaned corpus as the training file for fastText.
# The encoding is set explicitly: text extracted from the PDF can contain non-ASCII
# characters, and the platform default encoding is not guaranteed to be UTF-8.
with open("sti2021.txt", "w", encoding="utf-8") as text_file:
    text_file.write(sti_str)

In [None]:
%%time
sti_model_10 = fasttext.train_unsupervised('/content/sti2021.txt',epoch=10,dim=300)    #train model using fasttext
sti_model_10.save_model('sti2021_10.bin')    #save trained model

CPU times: user 1min 27s, sys: 8.93 s, total: 1min 36s
Wall time: 1min 40s


In [None]:
%%time
sti_model_20 = fasttext.train_unsupervised('/content/sti2021.txt',epoch=20,dim=300)    #train model using fasttext
sti_model_20.save_model('sti2021_20.bin')    #save trained model

CPU times: user 2min 54s, sys: 6.69 s, total: 3min
Wall time: 3min 9s


In [None]:
%%time
sti_model_50 = fasttext.train_unsupervised('/content/sti2021.txt',epoch=50,dim=300)    #train model using fasttext
sti_model_50.save_model('sti2021_50.bin')    #save trained model

CPU times: user 7min 23s, sys: 4.84 s, total: 7min 28s
Wall time: 7min 33s


In [None]:
# Load the three saved fastText binaries through gensim's FastText wrapper.
# Paths are relative so they match where save_model() wrote the .bin files
# (the current working directory) rather than assuming it is /content.
# NOTE(review): FastText.load_fasttext_format is deprecated in later gensim 3.x releases
# and absent from gensim 4; confirm the installed version before upgrading.
fasttext_trained_model_10 = FastText.load_fasttext_format('sti2021_10.bin')
fasttext_trained_model_20 = FastText.load_fasttext_format('sti2021_20.bin')
fasttext_trained_model_50 = FastText.load_fasttext_format('sti2021_50.bin')

In [None]:
# Five nearest neighbours of "Chlamydia" under each trained model.
# The query is capitalised while the training text was lower-cased beforehand.
chlamydia_query = ["Chlamydia"]
for epoch_label, ft_model in (
    ("For 10 epochs model{}\n", fasttext_trained_model_10),
    ("For 20 epochs model{}\n", fasttext_trained_model_20),
    ("For 50 epochs model{}\n", fasttext_trained_model_50),
):
    neighbours = ft_model.wv.most_similar(chlamydia_query, topn=5)
    print(epoch_label.format(neighbours))

For 10 epochs model[('chlamydia', 0.9820185899734497), ('chlamydial', 0.9320517182350159), ('gonorrhea', 0.8598592281341553), ('trachomatis', 0.804235577583313), ('presumptively', 0.7511805295944214)]

For 20 epochs model[('chlamydia', 0.9670469760894775), ('chlamydial', 0.8564761877059937), ('gonorrhea', 0.6671810150146484), ('trachomatis', 0.6380741596221924), ('149', 0.4469074606895447)]

For 50 epochs model[('chlamydia', 0.9502490758895874), ('chlamydial', 0.7842384576797485), ('gonorrhea', 0.5364363193511963), ('trachomatis', 0.38704758882522583), ('gonococcal', 0.2985709011554718)]



In [None]:
# Five nearest neighbours of "doxycycline" under each trained model.
doxycycline_query = ["doxycycline"]
for epoch_label, ft_model in (
    ("For 10 epochs model{}\n", fasttext_trained_model_10),
    ("For 20 epochs model{}\n", fasttext_trained_model_20),
    ("For 50 epochs model{}\n", fasttext_trained_model_50),
):
    neighbours = ft_model.wv.most_similar(doxycycline_query, topn=5)
    print(epoch_label.format(neighbours))

For 10 epochs model[('tetracycline', 0.9082231521606445), ('500', 0.8898569345474243), ('mg', 0.880082905292511), ('orally', 0.8670501112937927), ('ciprofloxacin', 0.864250898361206)]

For 20 epochs model[('tetracycline', 0.6941334009170532), ('mg', 0.6287527084350586), ('100', 0.620211124420166), ('timesday', 0.6138949394226074), ('orally', 0.6033620834350586)]

For 50 epochs model[('tetracycline', 0.5501561164855957), ('100', 0.48643141984939575), ('azithromycin', 0.4158981740474701), ('decline', 0.38202741742134094), ('efficacious', 0.381359338760376)]



In [None]:
# Five nearest neighbours of "doxycline" under each trained model.
# NOTE(review): "doxycline" looks like a deliberate misspelling of "doxycycline",
# presumably to probe how the models treat an unseen spelling - confirm intent.
misspelt_query = ["doxycline"]
for epoch_label, ft_model in (
    ("For 10 epochs model{}\n", fasttext_trained_model_10),
    ("For 20 epochs model{}\n", fasttext_trained_model_20),
    ("For 50 epochs model{}\n", fasttext_trained_model_50),
):
    neighbours = ft_model.wv.most_similar(misspelt_query, topn=5)
    print(epoch_label.format(neighbours))

For 10 epochs model[('doxycycline', 0.994559109210968), ('tetracycline', 0.8745899200439453), ('500', 0.8650543093681335), ('mg', 0.8578678369522095), ('levofloxacin', 0.8533572554588318)]

For 20 epochs model[('doxycycline', 0.9940784573554993), ('tetracycline', 0.6550569534301758), ('100', 0.6134833097457886), ('mg', 0.608830988407135), ('timesday', 0.5856325626373291)]

For 50 epochs model[('doxycycline', 0.9960484504699707), ('tetracycline', 0.5226961374282837), ('100', 0.48615390062332153), ('azithromycin', 0.4029478430747986), ('decline', 0.3787992000579834)]



In [None]:
# Odd-one-out query: one drug name among three symptom words
# (author's note on this cell: incorrect spelling of word).
symptom_words = ["paracetamol", "headache", "diarrhoea", "dizziness"]
for epoch_label, ft_model in (
    ("For 10 epochs model {}\n", fasttext_trained_model_10),
    ("For 20 epochs model {}\n", fasttext_trained_model_20),
    ("For 50 epochs model {}\n", fasttext_trained_model_50),
):
    odd_one_out = ft_model.wv.doesnt_match(symptom_words)
    print(epoch_label.format(odd_one_out))

For 10 epochs model paracetamol

For 20 epochs model paracetamol

For 50 epochs model paracetamol



  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [None]:
# Odd-one-out query over four drug names, asked of each trained model.
drug_words = ["Hydrochorothiazide", "doxycycline", "tetracycline", "azithromycin"]
for epoch_label, ft_model in (
    ("For 10 epochs model {}\n", fasttext_trained_model_10),
    ("For 20 epochs model {}\n", fasttext_trained_model_20),
    ("For 50 epochs model {}\n", fasttext_trained_model_50),
):
    odd_one_out = ft_model.wv.doesnt_match(drug_words)
    print(epoch_label.format(odd_one_out))

For 10 epochs model Hydrochorothiazide

For 20 epochs model Hydrochorothiazide

For 50 epochs model Hydrochorothiazide



  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [None]:
# Odd-one-out query over three drug names, asked of each trained model.
antibiotic_words = ["doxycycline", "tetracycline", "azithromycin"]
for epoch_label, ft_model in (
    ("For 10 epochs model {}\n", fasttext_trained_model_10),
    ("For 20 epochs model {}\n", fasttext_trained_model_20),
    ("For 50 epochs model {}\n", fasttext_trained_model_50),
):
    odd_one_out = ft_model.wv.doesnt_match(antibiotic_words)
    print(epoch_label.format(odd_one_out))

For 10 epochs model azithromycin

For 20 epochs model azithromycin

For 50 epochs model azithromycin



  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [None]:
# Similarity score between 'drowsiness' and 'headache' under each trained model.
for epoch_label, ft_model in (
    ("For 10 epochs model {}\n", fasttext_trained_model_10),
    ("For 20 epochs model {}\n", fasttext_trained_model_20),
    ("For 50 epochs model {}\n", fasttext_trained_model_50),
):
    score = ft_model.wv.similarity(w1='drowsiness', w2='headache')
    print(epoch_label.format(score))

For 10 epochs model 0.5154387950897217

For 20 epochs model 0.4217087924480438

For 50 epochs model 0.37296628952026367



In [None]:
# Similarity score between 'pain' and 'headache' under each trained model.
for epoch_label, ft_model in (
    ("For 10 epochs model {}\n", fasttext_trained_model_10),
    ("For 20 epochs model {}\n", fasttext_trained_model_20),
    ("For 50 epochs model {}\n", fasttext_trained_model_50),
):
    score = ft_model.wv.similarity(w1='pain', w2='headache')
    print(epoch_label.format(score))

For 10 epochs model 0.8364637494087219

For 20 epochs model 0.5725488066673279

For 50 epochs model 0.3364919424057007



In [None]:
# Similarity score between 'discharge' and 'itching' under each trained model.
for epoch_label, ft_model in (
    ("For 10 epochs model {}\n", fasttext_trained_model_10),
    ("For 20 epochs model {}\n", fasttext_trained_model_20),
    ("For 50 epochs model {}\n", fasttext_trained_model_50),
):
    score = ft_model.wv.similarity(w1='discharge', w2='itching')
    print(epoch_label.format(score))

For 10 epochs model 0.8079519271850586

For 20 epochs model 0.6242057085037231

For 50 epochs model 0.4394967257976532



In [None]:
# Similarity score between 'discharge' and 'bleeding' under each trained model.
for epoch_label, ft_model in (
    ("For 10 epochs model {}\n", fasttext_trained_model_10),
    ("For 20 epochs model {}\n", fasttext_trained_model_20),
    ("For 50 epochs model {}\n", fasttext_trained_model_50),
):
    score = ft_model.wv.similarity(w1='discharge', w2='bleeding')
    print(epoch_label.format(score))

For 10 epochs model 0.7789510488510132

For 20 epochs model 0.48376184701919556

For 50 epochs model 0.41206616163253784

