In [1]:
import gensim
import pyLDAvis
from pyLDAvis import gensim as gensimvis
import spacy
from pprint import pprint

  from collections import Sized
  from collections import defaultdict, Sequence, Sized, Iterable, Callable


In [2]:
# Comment this cell out if you like python warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# TODO: Maybe multithread this so this is less of a bottleneck?
# TODO: Put abstracts into a shared AWS database and maybe even spin up an instance to get more abstracts

# Get n abstracts for a topic from NCBI API
from metapub import PubMedFetcher

fetch = PubMedFetcher()

# Get the first n abstracts for a given search term
def get_abstracts(search_term, n):
    
    # get n pmids for search query
    pmids = fetch.pmids_for_query(search_term, retmax=n)

    # get list of abstracts from pmids:
    abstracts = {}
    for pmid in pmids:
        abstracts[pmid] = fetch.article_by_pmid(pmid).abstract
        
    return list(abstracts.values())

No NCBI API key provided; throttling to 3 requests/second; see https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/


In [4]:
# Get 500 abstracts for P53

p53_abstracts = get_abstracts('P53', 500)

In [5]:
# Get 100 abstracts for zika virus

zika_abstracts = get_abstracts('Zika virus', 100)

In [12]:
# Get 100 abstracts for BRCA

brca_abstracts = get_abstracts('BRCA', 100)

In [15]:
narcolepsy_abstracts = get_abstracts('narcolepsy', 100)

In [17]:
abs_list = p53_abstracts + zika_abstracts + brca_abstracts + narcolepsy_abstracts

In [18]:
len(abs_list)

800

In [21]:
# TODO: Check that this simple preprocessing is actually helping and not hurting our process

# TR: simple_preprocess removes numbers which destroys molecule names, see custom filter below 
# for an alternative that doesn't damage molecule names. 

##########################
##### Depricated #########
##### Use filters ########
##### Below ##############
##########################

# Get words from abstracts (pre processing)
def abs_to_words(abstracts):
    for abstract in abstracts:
        yield(gensim.utils.simple_preprocess(str(abstract), deacc=True))


# Get list of cleaned abstracts 
clean_abs = list(sent_to_words(abs_list))

In [48]:
import gensim.parsing.preprocessing as pp #import preprocess_string

def tr_strip_punctuation(s):
    s = s.replace('"','')
    s = s.replace('.','')
    s = s.replace(':','')
    s = s.replace(';','')
    s = s.replace(',','')    
    s = s.replace('!','')
    s = s.replace("'",'')
    s = s.replace("]",'')
    s = s.replace("[",'')
    s = s.replace(")",'')
    s = s.replace("(",'')
    s = s.replace("/",' ') #I think space is safest, in this case
    s = s.replace("?",'')        
    return s

#can add other pre-defined filters, or custom as above...
CUSTOM_FILTERS = [lambda x: x.lower(), pp.strip_tags, tr_strip_punctuation, 
                  pp.strip_short, pp.remove_stopwords, pp.strip_short]

def tr_abs_to_words(abstracts, filters):
    for abstract in abstracts:
        yield(pp.preprocess_string(str(abstract), filters))

tr_clean_abs = list(tr_abs_to_words(abs_list, CUSTOM_FILTERS)) #pp.preprocess_string(str(my_sentences), CUSTOM_FILTERS)

In [49]:
tr_clean_abs

[[],
 ['ovarian',
  'endometriotic',
  'cysts',
  'identified',
  'possible',
  'origin',
  'ovarian',
  'clear',
  'cell',
  'carcinoma',
  'occc',
  'predicting',
  'preventing',
  'future',
  'transformation',
  'important',
  'early',
  'detection',
  'clear',
  'cell',
  'carcinoma',
  'important',
  'shows',
  'low',
  'sensitivity',
  'chemotherapy',
  'prognosis',
  'worse',
  'histologic',
  'types',
  'recently',
  'treated',
  'patients',
  'occc',
  'young',
  'women',
  'family',
  'history',
  'cancer',
  'received',
  'long-term',
  'oral',
  'contraceptive',
  'therapy',
  'endometriotic',
  'cysts',
  'histologic',
  'diagnosis',
  'typical',
  'clear',
  'cell',
  'carcinoma',
  'patients',
  'case',
  'tumor',
  'detected',
  'periodic',
  'examination',
  'tumor',
  'expression',
  'wt1',
  'positive',
  'stage',
  'hand',
  'case',
  'presented',
  'fever',
  'unknown',
  'origin',
  'tumor',
  'showed',
  'expression',
  'p53',
  'stage',
  'ivb',
  'case',
  'ali

In [43]:
## TODO: I'm not sure if this is the right implementation for the bi/trigrams so this could use more research
# as the current implementation is lifted straight from class notes
# TODO: Look into the threshold parameters of the Phrases class, it could be that bigrams and trigrams are rare

# Build the bigram and trigram models
bigram = gensim.models.Phrases(tr_clean_abs, min_count=2) # higher threshold fewer phrases.

trigram = gensim.models.Phrases(bigram[tr_clean_abs], min_count=2)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

In [44]:
# Print a few exaples of cleaned trigram model abstracts
for s in tr_clean_abs[0:5]:
    print(f'{" ".join(trigram_model[bigram_model[s]]) } \n')

 

ovarian endometriotic cysts identified possible origin ovarian clear_cell_carcinoma occc, predicting preventing future transformation important early_detection clear_cell_carcinoma important shows low sensitivity chemotherapy prognosis worse histologic types recently treated patients occc young women family_history cancer received long-term oral contraceptive therapy endometriotic cysts, histologic diagnosis typical clear_cell_carcinoma patients however, case tumor detected periodic examination, tumor expression wt1 positive, stage hand, case presented fever unknown origin, tumor showed expression p53, stage ivb case alive evidence disease months surgery, case died months despite intensive treatment contrasting cases suggest need aware risk cancer young women_receiving long-term hormone_therapy endometriotic cysts occc greater heterogeneity reported previously 

background_aims: chronic_lymphocytic_leukemia_cll characterized accumulation cells phase_cell_cycle resistance apoptosis g

In [50]:
# TODO: Make sure this is the right implementation as this was also lifted straight from class

tr_clean_abs = [trigram_model[bigram_model[t]] for t in tr_clean_abs]

In [51]:
for t in tr_clean_abs[:5]:
    print(f'{" ".join(t)} \n')

 

ovarian endometriotic cysts identified possible origin ovarian clear_cell_carcinoma occc predicting preventing future transformation important early_detection clear_cell_carcinoma important shows low sensitivity chemotherapy prognosis worse histologic types recently treated patients occc young women family_history cancer received long-term oral contraceptive therapy endometriotic cysts histologic diagnosis typical clear_cell_carcinoma patients case tumor detected periodic examination tumor expression wt1 positive stage hand case presented fever unknown origin tumor showed expression p53 stage ivb case alive evidence disease months surgery case died months despite intensive treatment contrasting cases suggest need aware risk cancer young women_receiving long-term hormone_therapy endometriotic cysts occc greater heterogeneity reported previously 

background aims chronic_lymphocytic_leukemia_cll characterized accumulation cells phase_cell_cycle resistance apoptosis gene_mutation abnor

In [53]:
# TODO: Check that this lemmatization is helping and not hurting our process, maybe a more sophisticated 
# implementation would be better

# Lemmatize the abstracts

nlp = spacy.load('en', disable=['parser', 'ner'])

# NOUN, ADJ, VERB, ADV
def lemmatization(texts, allowed_postags=['NOUN']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        
        output_text = [token.lemma_ for token in doc if token.pos_ in allowed_postags and token.is_stop == False]
        
        if len(output_text) > 0:
            texts_out.append(output_text)
    return texts_out

In [54]:
# Lemmatize the abstracts
tr_clean_abs = lemmatization(tr_clean_abs)

In [55]:
for t in tr_clean_abs[0:5]:
    print(f'{" ".join(t)} \n')

cyst origin clear_cell_carcinoma occc transformation early_detection clear_cell_carcinoma show sensitivity chemotherapy prognosis type patient woman cancer term contraceptive therapy cyst diagnosis clear_cell_carcinoma patient case tumor examination tumor expression stage hand case fever origin tumor expression p53 stage ivb case evidence disease month surgery case month treatment case risk cancer term hormone_therapy cyst heterogeneity 

background accumulation cell resistance apoptosis gene_mutation gene_expression previous_study cop1 binet c phase cll patient regulation cop1 repair_dna damage function cell_apoptosis fludarabine method sensitivity cll_cell fludarabine cck-8 expression p53 cell western_blot cll effect overexpression cell apoptosis transplant mouse survival result cll_cell high_expression sensitivity drug enrichment p53_protein cell overexpression cell sensitivity fludarabine treatment inhibited_cell apoptosis autoubiquitination study p53_degradation disrupt formation 

In [56]:
# TODO: Same as above, this implementation is from class, not sure if this is the best way

# Create bag of words representation of abstracts

#creates a dictionary indexes (representing words) as keys and word as values
id2word = gensim.corpora.Dictionary(tr_clean_abs)

#uses the word-code as key, and the occurance as values
corpus = [id2word.doc2bow(t) for t in tr_clean_abs]

#so we have a dictionary for going from indexes to words (id2word)
#and we have a corpus (ind, occurances_in_doc)

In [70]:
# Uncomment to view an example of a bow representation for an abstract (from corpus)
[(id2word[id], freq) for id, freq in corpus[5]]

[('p53', 1),
 ('therapy', 1),
 ('tumor', 7),
 ('type', 2),
 ('cell', 2),
 ('conclusion', 1),
 ('method', 1),
 ('result', 1),
 ('transcription', 1),
 ('gene', 1),
 ('protein', 2),
 ('association', 1),
 ('dna', 1),
 ('mechanism', 1),
 ('act', 1),
 ('analysis', 2),
 ('angiogenesis', 1),
 ('approach', 1),
 ('ataxia', 1),
 ('biology', 1),
 ('biosignature', 1),
 ('boost', 1),
 ('breakage', 1),
 ('cancer_therapy', 1),
 ('carbon', 2),
 ('cellular_process', 1),
 ('change', 1),
 ('charged_particle', 6),
 ('clinical_trial', 1),
 ('coding', 1),
 ('commonality', 1),
 ('datum', 1),
 ('difference', 1),
 ('eligibility', 1),
 ('event', 1),
 ('example', 1),
 ('exposure', 3),
 ('form', 1),
 ('fraction', 1),
 ('hypoxia', 2),
 ('identification', 1),
 ('impact', 1),
 ('instance', 1),
 ('ion', 1),
 ('junk', 1),
 ('lncrna', 1),
 ('low_let', 5),
 ('marker', 1),
 ('migration_invasion', 1),
 ('mind', 1),
 ('mirna', 2),
 ('mrna', 1),
 ('nijmegen', 1),
 ('normal_tissue', 2),
 ('number', 1),
 ('omic', 2),
 ('oncoge

In [141]:
# TODO: Tune parameters like number of topics, passes, etc

# Create the LDA model

lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=mito_corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=200,
                                           per_word_topics=True)

In [142]:
# Print topics
pprint(lda_model.print_topics())

[(0,
  '0.018*"disturbance" + 0.016*"neuroprotective_agent" + 0.014*"p53-" + '
  '0.009*"cop1" + 0.008*"drinking_water" + 0.006*"p53wt" + '
  '0.006*"denaturation" + 0.006*"tukey" + 0.004*"biotransformation" + '
  '0.004*"β-galactosidase"'),
 (1,
  '0.022*"fuel" + 0.014*"gene_mutation" + 0.011*"hormone_therapy" + '
  '0.008*"disrupt" + 0.008*"failure" + 0.008*"autoubiquitination" + '
  '0.007*"tumor" + 0.006*"mrna" + 0.005*"reperfusion" + 0.004*"risk"'),
 (2,
  '0.007*"gd45" + 0.004*"underlying_molecular_mechanism" + 0.004*"cop1" + '
  '0.004*"polyphenol" + 0.003*"tumor" + 0.003*"potential" + 0.003*"situ" + '
  '0.003*"exposure" + 0.003*"fetuses_gd45" + 0.003*"signaling"')]


In [144]:
# Uncomment to print the top topics of a particular abstract from the corpus
collect={0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,9:0}
for i_, abstract in enumerate(corpus):
    top_topics = lda_model.get_document_topics(abstract)
    top_topics.sort(key=lambda x: x[1], reverse=True)
    #for i in range(len(top_topics)):
    if top_topics[0][0] == 2:
        print(abs_list[i_])
    collect[top_topics[0][0]]+=1#*top_topics[i][1]
    #print(f'ind: {i_}:', top_topics)
print(collect)

None
Ovarian endometriotic cysts have been identified as the possible origin of ovarian clear cell carcinoma (OCCC), so predicting or preventing future transformation is important. Early detection of clear cell carcinoma is important because it shows low sensitivity to chemotherapy and the prognosis is worse than for other histologic types. We recently treated 2 patients with OCCC. They were both young women with no family history of cancer who received long-term oral contraceptive therapy for endometriotic cysts, and the histologic diagnosis was typical clear cell carcinoma in both patients. However, in Case 1, the tumor was detected by periodic examination, tumor expression of WT1 was positive, and the stage was IA. On the other hand, Case 2 presented with fever of unknown origin, her tumor showed expression of p53, and the stage was IVB. Case 1 is alive with no evidence of disease at 38 months after surgery, while Case 2 died after 19 months despite intensive treatment. These contra

BACKGROUND/AIMS: This is a retrospective analysis of 103 patients having locally advanced rectal cancer who received short-course radiotherapy (SCRT). The objective of the study was to check whether a polymorphism in the RAD51 gene (135 G>C), Ku70 protein expression, and tumor microenvironment: proliferation rate measured by BrdUrdLI and Ki-67LI, hypoxia (glucose transporter-1 expression), P53 protein expression, and DNA ploidy can influence DNA repair capacity, the factors contributing to patient overall survival (OS) and the incidence of recurrences and metastases.
MATERIALS AND METHODS: RAD51 (135 G>C) polymorphism was evaluated using restriction fragment length polymorphism polymerase chain reaction, and proteins were identified using immunohistochemistry.
RESULTS: There were 3 (2.9%) tumors with RAD51 CC, 75 (72.8%) with GG, and 25 (24.3%) with GC genotypes. The median follow-up time was 63.1 months (range 2-120). Patients with CC genotype survived significantly longer than those 

In [145]:
# TODO: Make sure this function changes if the above processing implementation changes as this is a mirror of
# those operations

# Now that we have gone through the process of processing/lemmatizing/bow-ing let's abstract to a function
# for any new test topics

CUSTOM_FILTERS = [lambda x: x.lower(), pp.strip_tags, tr_strip_punctuation, 
                  pp.strip_short, pp.remove_stopwords, pp.strip_short]

def abstracts_to_corpus(abstracts, filters):
    clean_abs = list(tr_abs_to_words(abstracts, filters))
    id2word = gensim.corpora.Dictionary(clean_abs)
    corpus = [id2word.doc2bow(t) for t in clean_abs]
    return corpus

In [146]:
# Print topics for n items in a corpus based on our LDA model

def print_top_topics(test_corpus, n):
    for i in range(n):
        top_topics = lda_model.get_document_topics(test_corpus[i])
        top_topics.sort(key=lambda x: x[1], reverse=True)

        print(top_topics)



In [137]:
# Example of the pipline from new search term to summary of topics based on LDA model
# for the search term mitochondria
iPS_abstracts = get_abstracts('induced pluripotent stem cell', 100)
narcolepsy_abstracts = get_abstracts('orexin narcolepsy', 50)
prader_willi_abstracts = get_abstracts('prader willi', 25)
abstracts = iPS_abstracts + narcolepsy_abstracts + prader_willi_abstracts
mito_corpus = abstracts_to_corpus(abstracts, CUSTOM_FILTERS)
print_top_topics(mito_corpus, 20)

[(0, 0.60634613), (2, 0.21127012), (3, 0.12117152), (1, 0.061212245)]
[(3, 0.50745934), (0, 0.44520777), (2, 0.044119403)]
[(0, 0.38115916), (2, 0.2607926), (3, 0.24930108), (1, 0.1087471)]
[(3, 0.58710176), (1, 0.34923106), (2, 0.059121177)]
[(2, 0.5969549), (1, 0.17447183), (0, 0.12471066), (3, 0.10386263)]
[(0, 0.5996338), (1, 0.3424398), (2, 0.053592477)]
[(0, 0.84235865), (3, 0.09746899), (1, 0.057997633)]
[(0, 0.29408705), (2, 0.29311508), (3, 0.23736459), (1, 0.1754333)]
[(3, 0.42578134), (0, 0.3428475), (1, 0.15113084), (2, 0.08024035)]
[(0, 0.3516594), (3, 0.2421717), (1, 0.21832386), (2, 0.18784507)]
[(0, 0.53643984), (1, 0.19312032), (3, 0.14303288), (2, 0.12740695)]
[(0, 0.55429655), (2, 0.32689804), (1, 0.060141493), (3, 0.058663946)]
[(3, 0.32689565), (0, 0.25389993), (1, 0.24235646), (2, 0.176848)]
[(2, 0.37846676), (3, 0.29914752), (0, 0.22381955), (1, 0.098566175)]
[(3, 0.34937495), (0, 0.34924045), (2, 0.2611981), (1, 0.04018649)]
[(3, 0.39312902), (0, 0.36357185), (1

In [147]:
# Visualization for LDA topics sent to lda.html
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, mito_corpus, id2word)
pyLDAvis.save_html(vis, 'prader_narco_iPSlda_2.html')

In [148]:
!open prader_narco_iPSlda_2.html


In [149]:
mito_corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 3),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 2),
  (28, 1),
  (29, 2),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 2),
  (34, 1),
  (35, 6),
  (36, 1),
  (37, 1),
  (38, 2),
  (39, 1),
  (40, 2),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 2),
  (45, 1),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 6),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1)],
 [(15, 1),
  (38, 1),
  (42, 4),
  (50, 1),
  (51, 1),
  (53, 1),
  (54, 1),
  (62, 1),
  (76, 1),
  (77, 1),
  (78, 2),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 2