In [1]:
import re
import requests
from bs4 import BeautifulSoup

In [2]:
def get_citations(pubmed_id, timeout=30):
    """Return the PubMed IDs referenced by a PubMed Central article.

    Args:
        pubmed_id: Numeric part of the article's PMC accession. Despite the
            parameter name, the value is appended to ``PMC`` in the URL
            (e.g. ``3168302`` -> ``PMC3168302``), so it is a PMC number and
            not a PubMed ID.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        set[int]: Distinct PubMed IDs found in the article's reference links.
        Empty if the page has no matching links.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC%s' % pubmed_id
    page = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page and returning an empty set.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "lxml")

    pubmed_ids = set()
    for span in soup.find_all("span", {"class": "nowrap ref pubmed"}):
        link = span.a
        href = link.get('href') if link is not None else None
        if not href:
            # Reference marker without a usable link: nothing to extract.
            continue
        # Pull out the numeric ID wherever it sits in the href, so absolute
        # URLs, trailing slashes and query strings do not break int().
        match = re.search(r'/pubmed/(\d+)', href)
        if match:
            pubmed_ids.add(int(match.group(1)))
    return pubmed_ids

In [3]:
# Example: collect the PubMed IDs linked from the reference list of PMC3168302.
get_citations(3168302)

{6344946,
 7949911,
 11079836,
 11825149,
 11825203,
 15360816,
 15919728,
 16287934,
 16779043,
 16779044,
 17646325,
 17971238,
 18436913,
 19166973,
 19376821,
 19497938,
 21303863,
 21346958}

In [40]:
from Bio import Entrez

# Contact address that Biopython passes to NCBI with Entrez requests.
# If we query the database too heavily, NCBI emails this address before
# cutting off our access, so fetch IDs in batches rather than one call per ID.
Entrez.email = "rohan.nagar@utexas.edu"

def get_main_headings(pubmed_ids):
    """Fetch the major-topic MeSH descriptors for one or more PubMed records.

    Args:
        pubmed_ids: A single PubMed ID (str or int) or an iterable of them
            (list, tuple, set, ...). Iterables are sent as one comma-separated
            batch so only a single Entrez request is made.

    Returns:
        list: The ``DescriptorName`` elements whose ``MajorTopicYN`` attribute
        is ``'Y'``, in the order the records and headings were returned.
        Records without MeSH headings contribute nothing. Empty input yields
        an empty list without contacting the server.
    """
    # Normalise to a comma-separated string, so a set of ints can be passed
    # straight in.
    if isinstance(pubmed_ids, (str, int)):
        id_param = str(pubmed_ids)
    else:
        id_param = ",".join(str(pmid) for pmid in pubmed_ids)
    if not id_param:
        return []

    handle = Entrez.efetch(db="pubmed", id=id_param, rettype="medline", retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even if the response cannot be parsed.
        handle.close()

    # Depending on the Biopython release, the parsed result is either a list
    # of articles or a dict holding that list under 'PubmedArticle'.
    if isinstance(records, dict):
        records = records.get('PubmedArticle', [])

    main_headings = []
    for record in records:
        # Entries without a MedlineCitation carry no MeSH terms.
        citation = record.get('MedlineCitation')
        if not citation or 'MeshHeadingList' not in citation:
            continue

        for heading in citation['MeshHeadingList']:
            descriptor = heading['DescriptorName']
            # Keep only descriptors flagged as a major topic.
            if descriptor.attributes.get('MajorTopicYN') == 'Y':
                main_headings.append(descriptor)

    return main_headings

In [39]:
# Example: MeSH descriptors flagged as major topics for PubMed record 6344946.
get_main_headings("6344946")

[StringElement('Abstracting and Indexing as Topic', attributes={'MajorTopicYN': 'Y', 'UI': 'D000043'}),
 StringElement('MEDLARS', attributes={'MajorTopicYN': 'Y', 'UI': 'D008523'})]