In [3]:
import pubmed_parser as pp

# Parse the test MEDLINE XML file with every optional flag switched off.
# The result is sliced and read by 'pmid' / 'abstract' keys in the next cell.
dicts_out = pp.parse_medline_xml(
    'data/pubmed_data/raw_data/44_abstracts_test.xml',
    year_info_only=False,
    nlm_category=False,
    author_list=False,
    reference_list=False,
)

In [4]:
# Print the PMID of each of the first five parsed records and collect their
# abstracts; the bare expression at the end displays the collected list.
test_abstracts = []
for article in dicts_out[0:5]:
    print(article['pmid'])
    test_abstracts.append(article['abstract'])
test_abstracts

11604796
24730353
29295283
23232759
26539547


['Existing computer-based ordering systems for physicians provide effective drug-centered checks but offer little assistance for optimizing the overall patient-centered treatment strategy. Evidence-based clinical practice guidelines have been developed to disseminate state-of-the-art information concerning treatment strategy but these guidelines are poorly used in routine practice. The ASTI project aims to design a guideline-based ordering system to enable general practitioners to avoid prescription errors and to improve compliance with best therapeutic practices. The " critic mode " operates as a background process and corrects the physician\'s prescription on the basis of automatically triggered elementary rules that account for isolated guideline recommendations. The " guided mode " directs the physician to the best treatment by browsing a comprehensive guideline knowledge base represented as a decision tree. A first prototype, applied to hypertension, is currently under development

In [5]:
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    """Return the lowercased phrases of `text` that match a POS-tag chunk grammar.

    The text is sentence-split, tokenized and POS-tagged with NLTK, then each
    tagged sentence is chunked by a ``RegexpParser`` built from `grammar`.
    Every run of consecutive tokens whose chunk tag is not ``'O'`` is joined
    with single spaces and lowercased to form one candidate.

    Args:
        text: Raw document text.
        grammar: Chunk grammar passed to ``nltk.chunk.regexp.RegexpParser``.

    Returns:
        list of str: candidates in document order (duplicates kept), minus
        those that are English stop words or consist only of punctuation.

    NOTE(review): the tagged tokens of all sentences are concatenated before
    grouping, and grouping only separates ``'O'`` from non-``'O'`` tags, so two
    chunks with no ``'O'`` token between them come out as a single candidate.
    """
    import itertools, nltk, string

    # Filters applied to the finished candidates.
    punctuation_chars = set(string.punctuation)
    english_stop_words = set(nltk.corpus.stopwords.words('english'))

    # Tokenize and POS-tag sentence by sentence, then flatten the per-sentence
    # (word, pos, chunk_tag) triples into one sequence.
    parser = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    triples = []
    for tagged_sent in tagged_sents:
        triples.extend(nltk.chunk.tree2conlltags(parser.parse(tagged_sent)))

    # Each run of non-'O' tokens becomes one space-joined, lowercased phrase.
    phrases = []
    for inside_chunk, run in itertools.groupby(triples, key=lambda triple: triple[2] != 'O'):
        if inside_chunk:
            phrases.append(' '.join(token for token, _pos, _tag in run).lower())

    kept = []
    for phrase in phrases:
        if phrase in english_stop_words:
            continue
        if all(char in punctuation_chars for char in phrase):
            continue
        kept.append(phrase)
    return kept

In [6]:
# Show the distinct candidate chunks of each test abstract, one set per
# abstract, separated by a blank line.
for abstract in test_abstracts:
    unique_chunks = set(extract_candidate_chunks(abstract))
    print(unique_chunks, end='\n\n')

{'development', 'guideline-based ordering system', 'routine practice', 'therapeutic practices', 'systems for physicians', 'general practitioners', 'prescription errors', 'critic mode', 'little assistance', 'elementary rules', 'prescription', 'evidence-based clinical practice guidelines', 'hypertension', 'compliance', 'background process', 'first prototype', 'basis', 'asti project', 'state-of-the-art information', 'mode', 'comprehensive guideline knowledge base', 'overall patient-centered treatment strategy', 'physician', 'isolated guideline recommendations', 'decision tree', 'effective drug-centered checks', 'treatment strategy', 'guidelines', 'treatment'}

{'usefulness', 'jena format', 'decision rules', 'ontology for glycemic management', 'classes', 'jena', 'methods', 'materials', 'domain ontology', 'diabetic patients', 'domain experts', 'specification', 'correctness', 'evaluation', 'jena rules', 'patient clinical situations', 'research', 'properties', 'modified ontology development m

In [7]:
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    """Return the lowercased words of `text` whose POS tag is in `good_tags`.

    Args:
        text: Raw document text; it is sentence-split, tokenized and
            POS-tagged with NLTK.
        good_tags: POS tags a token must carry to be kept.

    Returns:
        list of str: lowercased tokens in document order (duplicates kept),
        excluding English stop words and tokens made only of punctuation.
    """
    import nltk, string

    punctuation_chars = set(string.punctuation)
    english_stop_words = set(nltk.corpus.stopwords.words('english'))

    tokenized_sents = (nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

    kept = []
    for tagged_sent in nltk.pos_tag_sents(tokenized_sents):
        for token, tag in tagged_sent:
            if tag not in good_tags:
                continue
            lowered = token.lower()
            if lowered in english_stop_words:
                continue
            if all(char in punctuation_chars for char in token):
                continue
            kept.append(lowered)
    return kept

In [8]:
# Show the distinct candidate words of each test abstract, one set per
# abstract, separated by a blank line.
for abstract in test_abstracts:
    unique_words = set(extract_candidate_words(abstract))
    print(unique_words, end='\n\n')

{'development', 'checks', 'systems', 'effective', 'base', 'first', 'guideline-based', 'assistance', 'evidence-based', 'patient-centered', 'comprehensive', 'prescription', 'best', 'strategy', 'hypertension', 'compliance', 'physicians', 'state-of-the-art', 'general', 'knowledge', 'tree', 'prototype', 'system', 'errors', 'drug-centered', 'little', 'isolated', 'critic', 'decision', 'practitioners', 'practices', 'practice', 'computer-based', 'clinical', 'basis', 'guideline', 'project', 'asti', 'mode', 'process', 'ordering', 'therapeutic', 'physician', 'routine', 'overall', 'information', 'recommendations', 'background', 'rules', 'guidelines', 'treatment', 'elementary'}

{'experts', 'usefulness', 'development', 'acceptance', 'maintenance', 'classes', 'jena', 'patients', 'glycemic', 'materials', 'editor', 'specification', 'correctness', 'concept', 'evaluation', 'evidence-based', 'management', 'situations', 'formal', 'inpatient', 'protégé-web', 'knowledge', 'analysis', 'properties', 'modified'

In [9]:
def score_keyphrases_by_tfidf(texts, candidates='chunks'):
    """Score the candidate keyphrases of every text with a gensim TF-IDF model.

    Args:
        texts: Iterable of raw document strings.
        candidates: ``'chunks'`` to use ``extract_candidate_chunks`` or
            ``'words'`` to use ``extract_candidate_words`` as the candidate
            extractor.

    Returns:
        tuple: ``(corpus_tfidf, dictionary)`` where ``dictionary`` is the
        ``gensim.corpora.Dictionary`` built from the candidates and
        ``corpus_tfidf`` is the bag-of-candidates corpus transformed by the
        fitted ``gensim.models.TfidfModel``.

    Raises:
        ValueError: if `candidates` is neither ``'chunks'`` nor ``'words'``.
    """
    # Validate the mode up front: previously an unknown value fell through
    # both branches and failed later with an UnboundLocalError on boc_texts.
    if candidates == 'chunks':
        extract_candidates = extract_candidate_chunks
    elif candidates == 'words':
        extract_candidates = extract_candidate_words
    else:
        raise ValueError(
            "candidates must be 'chunks' or 'words', got {!r}".format(candidates))

    import gensim

    # one "bag of candidates" per text
    boc_texts = [extract_candidates(text) for text in texts]
    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return corpus_tfidf, dictionary

In [18]:
import heapq 
from operator import itemgetter

# Score the candidate chunks of the test abstracts with TF-IDF.
tfidfs, id2word = score_keyphrases_by_tfidf(test_abstracts)

# For every document print its 20 highest-scoring terms, then a blank line.
for doc in tfidfs:
    top_terms = heapq.nlargest(20, doc, key=itemgetter(1))
    for term_id, weight in top_terms:
        print("{:0.3f}: {}".format(weight, id2word[term_id]))

    print("")

0.357: physician
0.179: asti project
0.179: background process
0.179: compliance
0.179: comprehensive guideline knowledge base
0.179: critic mode
0.179: decision tree
0.179: development
0.179: effective drug-centered checks
0.179: elementary rules
0.179: evidence-based clinical practice guidelines
0.179: first prototype
0.179: general practitioners
0.179: guideline-based ordering system
0.179: guidelines
0.179: hypertension
0.179: isolated guideline recommendations
0.179: little assistance
0.179: mode
0.179: overall patient-centered treatment strategy

0.385: surgery
0.292: ontology
0.257: diabetic patients
0.257: recommendations
0.146: system
0.128: acceptance of recommendations
0.128: classes
0.128: clinical decision support system
0.128: clinicians
0.128: conceptualization
0.128: contribution
0.128: correctness
0.128: decision rules
0.128: domain experts
0.128: domain ontology
0.128: domain ontology with formal concept analysis
0.128: embedded clinical knowledge
0.128: evaluation
0.

In [27]:
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    """Rank keyphrases of `text` by PageRank over a graph of candidate words.

    Candidate words come from ``extract_candidate_words``; each consecutive
    pair in that candidate list is linked by an unweighted edge and the graph
    is scored with ``networkx.pagerank``.  The top-ranked words are then
    merged into phrases wherever they appear next to each other in the
    lowercased token stream.

    Args:
        text: Raw document text.
        n_keywords: If strictly between 0 and 1, the fraction of
            ``len(candidates)`` (duplicates included) to keep, rounded to an
            int; otherwise used directly as the number of top words to keep.

    Returns:
        list of (str, float): ``(keyphrase, average PageRank of its words)``
        pairs sorted by descending score.
    """
    from itertools import takewhile
    import networkx, nltk

    # Every lowercased token of the document, in order.
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sent):
            tokens.append(token.lower())
    candidates = extract_candidate_words(text)

    # One node per distinct candidate; one edge per consecutive candidate pair.
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    for left, right in zip(candidates, candidates[1:]):
        if not right:
            continue
        graph.add_edge(min(left, right), max(left, right))

    # Keep the n_keywords best-ranked words.
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    ranked = sorted(ranks.items(), key=lambda item: item[1], reverse=True)
    word_ranks = dict(ranked[:n_keywords])
    keywords = set(word_ranks)

    # Walk the token stream; at each keyword take the run of keywords that
    # starts there (at most 10 tokens) and jump past it so that the merged
    # keyphrases never overlap.
    keyphrases = {}
    cursor = 0
    while cursor < len(tokens):
        if tokens[cursor] not in keywords:
            cursor += 1
            continue
        run = list(takewhile(lambda tok: tok in keywords, tokens[cursor:cursor+10]))
        keyphrases[' '.join(run)] = sum(word_ranks[tok] for tok in run) / float(len(run))
        cursor += len(run)

    return sorted(keyphrases.items(), key=lambda item: item[1], reverse=True)

In [28]:
# Show the TextRank keyphrases of each test abstract, separated by a blank line.
for abstract in test_abstracts:
    ranked_keyphrases = score_keyphrases_by_textrank(abstract)
    print(ranked_keyphrases, end='\n\n')

[('treatment', 0.035435595386177486), ('prescription', 0.0304338902313484), ('guideline', 0.03038849408029296)]

[('ontology', 0.09994400148702443), ('clinical', 0.0356274537523074), ('evaluation', 0.03414810855673551), ('recommendations', 0.033103198353565524)]

[('soa', 0.04076361751032182), ('cdss', 0.03934718824443082), ('system', 0.038598515429225176), ('ontology', 0.03834964909616907)]

[('architecture', 0.0541778824359536), ('task-based architecture', 0.043141343771745055), ('system', 0.03739572555558461), ('system components', 0.03341824682594001), ('task-based', 0.032104805107536515), ('components', 0.029440768096295415), ('ed', 0.028745688609939896)]

[('data', 0.030239250904916765), ('clinical data', 0.027035401505245707), ('use', 0.026148841959380175), ('project', 0.025532042634200194), ('clinical use', 0.024990197032477415), ('clinical', 0.023831552105574652), ('lhs', 0.02218841241579772)]



In [30]:
def extract_candidate_features(candidates, doc_text, doc_excerpt, doc_title):
    """Compute frequency, statistical and positional features per candidate.

    Args:
        candidates: Iterable of candidate keyphrase strings.  Repeated
            candidates are scored once; empty or whitespace-only candidates
            are skipped with a warning.
        doc_text: Full document text the candidates are searched in.
        doc_excerpt: Key excerpt of the document (may be empty).
        doc_title: Document title.

    Returns:
        collections.OrderedDict: maps each candidate found in `doc_text`, in
        first-seen order, to a dict with the keys

        - ``term_count``: number of case-insensitive matches in `doc_text`
        - ``term_length``: number of whitespace-separated words
        - ``max_word_length``: length of the longest word
        - ``spread``: ``abs_last_occurrence - abs_first_occurrence``
        - ``lexical_cohesion``: 0.0 for one-word terms, otherwise
          ``term_length * (1 + log10(term_count)) * term_count`` divided by
          the summed document counts of the constituent words
        - ``in_excerpt`` / ``in_title``: 1 if the candidate matches there, else 0
        - ``abs_first_occurrence`` / ``abs_last_occurrence``: character offset
          of the first / last match divided by ``len(doc_text)``

        Candidates that do not occur in `doc_text` are reported with a printed
        warning and left out of the result.
    """
    import collections, math, nltk, re

    candidate_scores = collections.OrderedDict()

    # get (lowercased) word counts for document
    doc_word_counts = collections.Counter(word.lower()
                                          for sent in nltk.sent_tokenize(doc_text)
                                          for word in nltk.word_tokenize(sent))
    doc_text_length = float(len(doc_text))

    for candidate in candidates:

        # a repeated candidate would produce exactly the same features again
        if candidate in candidate_scores:
            continue

        candidate_words = candidate.split()
        # an empty candidate has no words to measure (max() below would raise)
        if not candidate_words:
            print('**WARNING: empty candidate skipped')
            continue

        pattern = re.compile(r'\b'+re.escape(candidate)+r'(\b|[,;.!?]|\s)', re.IGNORECASE)

        # frequency-based
        # one scan yields the count as well as the first and last positions
        matches = list(pattern.finditer(doc_text))
        cand_doc_count = len(matches)
        # count can be 0 for several reasons in this simplified example
        if not cand_doc_count:
            print('**WARNING:', candidate, 'not found!')
            continue

        # statistical
        max_word_length = max(len(w) for w in candidate_words)
        term_length = len(candidate_words)
        # doc_word_counts keys are lowercased, and the pattern above matches
        # case-insensitively, so look the constituent words up lowercased too
        sum_doc_word_counts = float(sum(doc_word_counts[w.lower()] for w in candidate_words))
        # lexical cohesion doesn't make sense for 1-word terms and is
        # undefined when none of the constituent words was counted
        if term_length == 1 or not sum_doc_word_counts:
            lexical_cohesion = 0.0
        else:
            lexical_cohesion = term_length * (1 + math.log(cand_doc_count, 10)) * cand_doc_count / sum_doc_word_counts

        # positional
        # found in title, key excerpt
        in_title = 1 if pattern.search(doc_title) else 0
        in_excerpt = 1 if pattern.search(doc_excerpt) else 0
        # first/last position, difference between them (spread);
        # with a single match first == last, so spread is exactly 0.0
        abs_first_occurrence = matches[0].start() / doc_text_length
        abs_last_occurrence = matches[-1].start() / doc_text_length
        spread = abs_last_occurrence - abs_first_occurrence

        candidate_scores[candidate] = {'term_count': cand_doc_count,
                                       'term_length': term_length, 'max_word_length': max_word_length,
                                       'spread': spread, 'lexical_cohesion': lexical_cohesion,
                                       'in_excerpt': in_excerpt, 'in_title': in_title,
                                       'abs_first_occurrence': abs_first_occurrence,
                                       'abs_last_occurrence': abs_last_occurrence}

    return candidate_scores

In [34]:
# Candidate chunks of the first abstract, in the order printed by the
# extract_candidate_chunks cell above.
chunk_candidates = [
    'development', 'guideline-based ordering system', 'routine practice', 'therapeutic practices',
    'systems for physicians', 'general practitioners', 'prescription errors', 'critic mode',
    'little assistance', 'elementary rules', 'prescription', 'evidence-based clinical practice guidelines',
    'hypertension', 'compliance', 'background process', 'first prototype', 'basis', 'asti project',
    'state-of-the-art information', 'mode', 'comprehensive guideline knowledge base',
    'overall patient-centered treatment strategy', 'physician', 'isolated guideline recommendations',
    'decision tree', 'effective drug-centered checks', 'treatment strategy', 'guidelines', 'treatment',
]
# Candidate words of the same abstract, in the order printed by the
# extract_candidate_words cell above.
word_candidates = [
    'development', 'checks', 'systems', 'effective', 'base', 'first', 'guideline-based', 'assistance',
    'evidence-based', 'patient-centered', 'comprehensive', 'prescription', 'best', 'strategy',
    'hypertension', 'compliance', 'physicians', 'state-of-the-art', 'general', 'knowledge', 'tree',
    'prototype', 'system', 'errors', 'drug-centered', 'little', 'isolated', 'critic', 'decision',
    'practitioners', 'practices', 'practice', 'computer-based', 'clinical', 'basis', 'guideline',
    'project', 'asti', 'mode', 'process', 'ordering', 'therapeutic', 'physician', 'routine', 'overall',
    'information', 'recommendations', 'background', 'rules', 'guidelines', 'treatment', 'elementary',
]
candidates = chunk_candidates + word_candidates
extract_candidate_features(
    candidates,
    doc_text=test_abstracts[0],
    doc_excerpt="",
    doc_title="ASTI: a guideline-based drug-ordering system for primary care",
)

OrderedDict([('development',
              {'term_count': 1,
               'term_length': 1,
               'max_word_length': 11,
               'spread': 0.0,
               'lexical_cohesion': 0.0,
               'in_excerpt': 0,
               'in_title': 0,
               'abs_first_occurrence': 0.9879759519038076,
               'abs_last_occurrence': 0.9879759519038076}),
             ('guideline-based ordering system',
              {'term_count': 1,
               'term_length': 3,
               'max_word_length': 15,
               'spread': 0.0,
               'lexical_cohesion': 0.75,
               'in_excerpt': 0,
               'in_title': 0,
               'abs_first_occurrence': 0.41783567134268534,
               'abs_last_occurrence': 0.41783567134268534}),
             ('routine practice',
              {'term_count': 1,
               'term_length': 2,
               'max_word_length': 8,
               'spread': 0.0,
               'lexical_cohesion': 0.66666666