## Workflow

- article provided as a DOI (for now; alternatives are possible)
- look up citing documents (OpenAlex)
- extract the PMIDs of the citing documents
- PubMed search using these PMIDs combined with a clinical-trial filter
- estimate impact

In [8]:
import requests

In [2]:
def get_citing_pmids(doi, timeout=30):
    """Return the PubMed ids of the works that cite the article with this DOI.

    The DOI is first resolved to an OpenAlex work id; then every work
    matching the ``cites:<work id>`` filter is paged through (cursor paging,
    100 works per page) and the PubMed id of each work that has one is
    collected.

    Args:
        doi: DOI of the cited article, e.g. '10.1136/annrheumdis-2019-216655'.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of PMID strings without duplicates, in arbitrary order
        (they are gathered in a set).

    Raises:
        IndexError: If OpenAlex returns no work for ``doi``.
        requests.HTTPError: If OpenAlex answers with an error status.
    """
    base_url_works = 'https://api.openalex.org/works'

    # get work id
    r = requests.get(base_url_works, params={'filter': f'doi:{doi}'},
                     timeout=timeout)
    r.raise_for_status()
    results = r.json()['results']
    if not results:
        # Same exception type as the bare results[0] lookup would give,
        # but with a message that names the actual problem.
        raise IndexError(f'no OpenAlex work found for doi {doi!r}')
    work_id = results[0]['id']  # if multiple, take first
    work_id = work_id.replace('https://openalex.org/', '')

    # obtain citing documents/pmids
    params = {'filter': f'cites:{work_id}',
              'cursor': '*', 'per-page': 100}
    pmids = set()
    while True:
        r = requests.get(base_url_works, params=params, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        for work in data['results']:
            pmid = work['ids'].get('pmid')  # not every citing work has a PMID
            if pmid:
                pmids.add(pmid.replace('https://pubmed.ncbi.nlm.nih.gov/', ''))
        next_cursor = data['meta']['next_cursor']
        if not next_cursor:
            # A falsy cursor marks the last page.
            break
        params['cursor'] = next_cursor

    return list(pmids)

In [26]:
def get_clinical_trials(pmids, timeout=30):
    """Return those of ``pmids`` that match PubMed's clinical-trial filter.

    Sends one ESearch request whose term is the OR of all given ids
    combined with ``clinicaltrial[Filter]``.

    Args:
        pmids: Iterable of PubMed ids (strings or ints).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``idlist`` of the ESearch result (presumably a list of PMID
        strings; at most ``retmax`` = 10000 entries are requested).
        An empty list if ``pmids`` is empty.

    Raises:
        requests.HTTPError: If the E-utilities server answers with an
            error status.
    """
    pmids = list(pmids)
    if not pmids:
        # Without ids the term would degenerate to just the filter clause
        # and no longer be restricted to the input, so skip the request.
        return []

    search_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

    # Parenthesise the id group so the filter explicitly applies to all ids
    # instead of relying on the search engine's operator evaluation order.
    id_clause = ' OR '.join(f'{pmid}[pmid]' for pmid in pmids)
    query = f'({id_clause}) AND (clinicaltrial[Filter])'

    # POST rather than GET: the term grows with the number of ids.
    payload = {'term': query.encode('utf-8'), 'db': 'pubmed', 'retmax': 10000, 'retmode': 'json'}
    # https://stackoverflow.com/questions/55887958/what-is-the-default-encoding-when-python-requests-post-data-is-string-type
    headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
               'Accept': 'application/json'}
    r = requests.post(search_url, data=payload, headers=headers, timeout=timeout)
    r.raise_for_status()

    return r.json()['esearchresult']['idlist']

## Test

In [9]:
doi = '10.1136/annrheumdis-2019-216655'  # this article is cited by at least 1 clinical trial

pmid_exp = '35081280'  # PMID of a citing trial we expect to find in the results

In [10]:
pmids = get_citing_pmids(doi)

In [21]:
len(pmids)

1308

In [27]:
pmids_ct = get_clinical_trials(pmids)

In [28]:
len(pmids_ct)

89

In [30]:
pmid_exp in pmids_ct

True