In [198]:
import gensim
import pyLDAvis
from pyLDAvis import gensim as gensimvis
import spacy
from pprint import pprint

In [None]:
# Comment this cell out if you like python warnings
import warnings
warnings.filterwarnings('ignore')

In [190]:
# TODO: Maybe multithread this so this is less of a bottleneck?
# TODO: Put abstracts into a shared AWS database and maybe even spin up an instance to get more abstracts

# Get n abstracts for a topic from NCBI API
from metapub import PubMedFetcher

fetch = PubMedFetcher()

# Get the first n abstracts for a given search term
def get_abstracts(search_term, n):
    
    # get n pmids for search query
    pmids = fetch.pmids_for_query(search_term, retmax=n)

    # get list of abstracts from pmids:
    abstracts = {}
    for pmid in pmids:
        abstracts[pmid] = fetch.article_by_pmid(pmid).abstract
        
    return list(abstracts.values())



In [None]:
# Get 500 abstracts for P53

p53_abstracts = get_abstracts('P53', 500)

In [191]:
# Get 100 abstracts for zika virus

zika_abstracts = get_abstracts('Zika virus', 100)

In [132]:
# Get 100 abstracts for BRCA

brca_abstracts = get_abstracts('BRCA', 100)

In [133]:
abs_list = p53_abstracts + zika_abstracts + brca_abstracts

In [134]:
len(abs_list)

619

In [192]:
# TODO: Check that this simple preprocessing is actually helping and not hurting our process

# Get words from abstracts (pre processing)
def abs_to_words(abstracts):
    for abstract in abstracts:
        yield(gensim.utils.simple_preprocess(str(abstract), deacc=True))


In [193]:
# Get list of cleaned abstracts 
clean_abs = list(sent_to_words(abs_list))

In [168]:
# TODO: I'm not sure if this is the right implementation for the bi/trigrams so this could use more research
# as the current implementation is lifted straight from class notes
# TODO: Look into the threshold parameters of the Phrases class, it could be that bigrams and trigrams are rare

# Build the bigram and trigram models
bigram = gensim.models.Phrases(clean_abs, min_count=20) # higher threshold fewer phrases.

trigram = gensim.models.Phrases(bigram[clean_abs], min_count=10)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)



In [171]:
# Print a few exaples of cleaned trigram model abstracts
for s in clean_abs[0:5]:
    print(f'{" ".join(trigram_model[bigram_model[s]]) } \n')

verrucous carcinoma of the esophagus vce is_rare variant of squamous_cell cancer with puzzling clinical etiological and molecular profile the etiological involvement of human_papillomavirus hpv in the cancer natural history is controversial this_study considers nine cases of vce focusing on patients clinical history before surgery histological phenotype immunophenotype egfr cadherin cyclin and expression hpv infection and tp_gene mutational_status exons using three different molecular test methods not one of these cases of vce featured hpv infection the only case with synchronous nodal metastasis was characterized_by tp missense point mutation in association_with high egfr and low cadherin expression levels in conclusion hpv infection is probably not involved vce while tp_gene mutation egfr overexpression and cadherin loss might fuel the tumor proliferation and lend it metastatic potential 

background and aim tumor genotyping may allow for improved prognostication and targeted_therapy

In [172]:
# TODO: Make sure this is the right implementation as this was also lifted straight from class

clean_abs = [trigram_model[bigram_model[t]] for t in clean_abs]

In [173]:
for t in clean_sents[:5]:
    print(f'{" ".join(t)} \n')

background treatment of cancer with natural agents induces apoptosis along with remarkable alterations in the expression of apoptosis related genes deregulation of microrna mirna expression is implicated in several human malignancies vinca alkaloids compose class of antimitotic drugs preventing cancer cells from dividing leading to apoptosis they are commonly used in clinical practice for breast cancer treatment objective the present study focused on the effects of vinca alkaloids vincristine vinblastine and vinorelbine on mirna expression of treated breast cancer cells methods we investigated the effect of vincristine vinblastine and vinorelbine on the expression of oncogenic and tumor suppressive mirnas mir mir mir mir mir mir mir mir mir mir and mir as well as on the expression of the apoptosis related genes bax bcl and tp in bt and sk br breast adenocarcinoma cells results treatment of bt cells with vincristine vinblastine and or vinorelbine resulted in upregulation of tp expressio

In [174]:
# TODO: Check that this lemmatization is helping and not hurting our process, maybe a more sophisticated 
# implementation would be better

# Lemmatize the abstracts

nlp = spacy.load('en', disable=['parser', 'ner'])

# NOUN, ADJ, VERB, ADV
def lemmatization(texts, allowed_postags=['NOUN']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        
        output_text = [token.lemma_ for token in doc if token.pos_ in allowed_postags and token.is_stop == False]
        
        if len(output_text) > 0:
            texts_out.append(output_text)
    return texts_out

In [175]:
# Lemmatize the abstracts
clean_abs = lemmatization(clean_abs)

In [176]:
for t in clean_abs[0:5]:
    print(f'{" ".join(t)} \n')

carcinoma esophagus vce variant cancer profile involvement hpv cancer history this_study case vce patient history surgery phenotype immunophenotype egfr cadherin cyclin expression infection tp_gene mutational_status exon test method case vce infection case metastasis missense point mutation egfr cadherin expression level conclusion infection vce tp_gene mutation egfr overexpression cadherin loss tumor proliferation potential 

background tumor genotyping prognostication targeted_therapy ultrasonography eus aspiration fna needle biopsy fnb tissue analysis theranostic method cohort study patient eus fna eus cytology pdac march december cytology datum genetic_alteration survival curve result study patient patient fna patient sample targeted_next_generation sequencing proportion patients_with sample compared_to fna modeling sampling testing tissue tumor tumor head neck pancrea compared_to fna alteration kra smad conclusion eus tissue pdac sequencing prognostication theranostic fnb should_b

In [177]:
# TODO: Same as above, this implementation is from class, not sure if this is the best way

# Create bag of words representation of abstracts

id2word = gensim.corpora.Dictionary(clean_abs)

corpus = [id2word.doc2bow(t) for t in clean_abs]

In [178]:
# Uncomment to view an example of a bow representation for an abstract (from corpus)
# [(id2word[id], freq) for id, freq in corpus[0]]

[('cadherin', 3), ('cancer', 2), ('carcinoma', 1), ('case', 3), ('conclusion', 1), ('cyclin', 1), ('egfr', 3), ('esophagus', 1), ('exon', 1), ('expression', 2), ('history', 2), ('hpv', 1), ('immunophenotype', 1), ('infection', 3), ('involvement', 1), ('level', 1), ('loss', 1), ('metastasis', 1), ('method', 1), ('missense', 1), ('mutation', 2), ('mutational_status', 1), ('overexpression', 1), ('patient', 1), ('phenotype', 1), ('point', 1), ('potential', 1), ('profile', 1), ('proliferation', 1), ('surgery', 1), ('test', 1), ('this_study', 1), ('tp_gene', 2), ('tumor', 1), ('variant', 1), ('vce', 4)]

In [194]:
# TODO: Tune parameters like number of topics, passes, etc

# Create the LDA model

lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=128,
                                           per_word_topics=True)

In [180]:
# Print topics
pprint(lda_model.print_topics())

[(0,
  '0.030*"gene" + 0.027*"patient" + 0.024*"mutation" + 0.022*"breast_cancer" + '
  '0.018*"cancer" + 0.018*"risk" + 0.015*"brca" + 0.012*"result" + '
  '0.011*"year" + 0.010*"study"'),
 (1,
  '0.037*"expression" + 0.027*"gene" + 0.022*"protein" + 0.019*"cell" + '
  '0.016*"apoptosis" + 0.016*"mir" + 0.015*"analysis" + 0.014*"pathway" + '
  '0.013*"mirna" + 0.012*"survival"'),
 (2,
  '0.014*"treatment" + 0.013*"cell" + 0.012*"effect" + 0.011*"cell_line" + '
  '0.009*"emodin" + 0.008*"mosquito" + 0.008*"variant" + 0.007*"hdac" + '
  '0.007*"cancer" + 0.006*"mouse"'),
 (3,
  '0.021*"tumor" + 0.016*"carcinoma" + 0.016*"cancer" + 0.013*"patient" + '
  '0.012*"case" + 0.011*"mutation" + 0.010*"cell" + 0.008*"expression" + '
  '0.008*"patients_with" + 0.007*"conclusion"'),
 (4,
  '0.045*"mutation" + 0.029*"tumor" + 0.020*"gene" + 0.012*"case" + '
  '0.008*"protein" + 0.008*"cancer" + 0.007*"squamous_cell_carcinoma" + '
  '0.007*"alteration" + 0.007*"analysis" + 0.007*"heterogeneity"'),
 

In [182]:
# Uncomment to print the top topics of a particular abstract from the corpus
# top_topics = lda_model.get_document_topics(corpus[400])
# top_topics.sort(key=lambda x: x[1], reverse=True)


# print(top_topics)

[(2, 0.7404139), (0, 0.25069582)]


In [218]:
# TODO: Make sure this function changes if the above processing implementation changes as this is a mirror of
# those operations

# Now that we have gone through the process of processing/lemmatizing/bow-ing let's abstract to a function
# for any new test topics

def abstracts_to_corpus(abstracts):
    clean_abs = list(sent_to_words(abstracts))
    id2word = gensim.corpora.Dictionary(clean_abs)
    corpus = [id2word.doc2bow(t) for t in clean_abs]
    return corpus

In [222]:
# Print topics for n items in a corpus based on our LDA model

def print_top_topics(test_corpus, n):
    for i in range(n):
        top_topics = lda_model.get_document_topics(test_corpus[i])
        top_topics.sort(key=lambda x: x[1], reverse=True)

        print(top_topics)



In [223]:
# Example of the pipline from new search term to summary of topics based on LDA model
# for the search term mitochondria
mito_abstracts = get_abstracts('mitochondria', 20)
mito_corpus = abstracts_to_corpus(mito_abstracts)
print_top_topics(mito_corpus, 20)

[(3, 0.3205298), (6, 0.28452444), (1, 0.15445842), (4, 0.13608019), (8, 0.061582934), (0, 0.04102215)]
[(4, 0.18477465), (3, 0.18393435), (6, 0.17844248), (1, 0.16565396), (0, 0.13249461), (8, 0.06625642), (9, 0.056678552), (5, 0.031113373)]
[(6, 0.33838257), (9, 0.17335859), (0, 0.14897941), (7, 0.10982089), (3, 0.10177885), (1, 0.072340116), (8, 0.053792756)]
[(6, 0.45875052), (4, 0.18954758), (3, 0.123993866), (8, 0.104326375), (0, 0.079684354), (1, 0.032170314)]
[(6, 0.3368403), (3, 0.18091713), (0, 0.16014554), (4, 0.13413037), (1, 0.0962157), (8, 0.077466786), (5, 0.012932634)]
[(0, 0.2716516), (6, 0.2546257), (4, 0.18306825), (3, 0.15645285), (9, 0.07538702), (5, 0.031308983), (8, 0.02487355)]
[(4, 0.3355083), (1, 0.306629), (3, 0.11506423), (8, 0.08112837), (6, 0.07200557), (9, 0.038002986), (5, 0.022619909), (0, 0.02176846)]
[(4, 0.25313357), (6, 0.20196074), (0, 0.14576037), (1, 0.10761401), (8, 0.101119176), (5, 0.088142276), (3, 0.08277756), (7, 0.01087374)]
[(8, 0.25835845

In [189]:
# Visualization for LDA topics sent to lda.html
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, 'lda.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
