In [8]:
import os
import shutil
import pandas as pd
import requests
import time
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&linkname=pubmed_pubmed_citedin&id=22407813"
pmid = '22407813'


In [9]:
def download_pmc_xml(pmid):
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&linkname=pubmed_pubmed_citedin&id={pmid}&rettype=xml"
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
       
        file_path = f"{pmid}.xml"
        with open("citing_xml", "wb") as f:
            f.write(response.content)  # Save response XML
        print(f" File downloaded")
        return  # Exit function if successful

    #  Handle Other Errors 
    print(f" Failed ... (Status {response.status_code})")
download_pmc_xml(pmid)
xml_file = 'citing_xml'


 File downloaded


In [10]:
import lxml.etree as ET

pmids = []

def extract_pmids(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    #print(root.findall(".//Link"))
    for link in root.findall(".//Link"):
        text = " ".join(link.itertext()).strip()
        pmids.append(text)
extract_pmids(xml_file)      
print(len(pmids), pmids)

64 ['39275059', '38981955', '38622137', '38497004', '38474532', '38076646', '37214711', '37034382', '36979588', '36737845', '36536633', '36475576', '36316402', '36133414', '36132507', '35558776', '35542809', '35541833', '35529211', '35520477', '35514886', '35497143', '35494640', '35424689', '35335711', '34838374', '34377553', '34067118', '34065804', '33803897', '33291853', '33244465', '32688287', '32328432', '32208468', '32195453', '31938900', '31652527', '31459247', '31458732', '31457796', '31410663', '31199142', '31129736', '31038938', '30612201', '30572668', '30460280', '30343449', '29349585', '29269774', '28724997', '28717465', '28605883', '28458628', '28134510', '27830733', '27818705', '27752120', '27266447', '25346649', '24389590', '24121717', '23502324']


In [11]:
import time
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

BASE_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"

def fetch_doi(batch):
    params = {'format': 'json', 'ids': ','.join(batch),}
    try:
        response = requests.get(BASE_URL, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            return [(rec.get('doi', 'No full text available'), rec.get('pmid', 'Not Found'))
                    for rec in data.get('records', [])]
        elif response.status_code == 429:
            print("Rate limit exceeded! Retrying after 10 seconds...")
            time.sleep(10)
            return fetch_ids_batch(doi_batch)  # Retry the same batch
    except requests.RequestException as e:
        print(f"Request failed: {e}")

    # Return errors if request fails
    return [(doi, 'Error', 'Error') for doi in doi_batch]


batch_size = 3
nb_workers = 1
results = []

batches = [pmids[i:i + batch_size] for i in range(0, len(pmids), batch_size)]

with ThreadPoolExecutor(max_workers = nb_workers) as executor:
    results_list = list(executor.map(fetch_doi, batches)) # This returns a list of lists (batches) of tuples

# Flatten results (since each batch returns a list)
results = [item for sublist in results_list for item in sublist] # This is a list of tuples after flattening the batches lists

# Convert results to DataFrame and save as CSV
result_df = pd.DataFrame(results, columns=['DOI', 'PMID'])
display(result_df)
result_df.to_csv('citing_dois.tsv', index=False, sep='\t') 


Unnamed: 0,DOI,PMID
0,10.1038/s41467-024-47549-1,38622137
1,10.3390/molecules29174211,39275059
2,No full text available,38981955
3,10.1016/j.bioactmat.2023.10.025,38076646
4,10.3390/molecules29051020,38474532
...,...,...
59,10.1016/j.nantod.2016.05.010,27818705
60,10.1038/srep02933,24121717
61,10.1038/srep03564,24389590
62,10.1186/1556-276X-9-583,25346649


In [None]:
def fetch_doi(pmid):
    params = {'format': 'json', 'ids': pmid,}
    try:
        response = requests.get(BASE_URL, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            print(data)
            rec = data.get('records', [])
            print(rec)
            return [(rec.get('doi', 'Not Found'))
                    for rec in data.get('records', [])]
        elif response.status_code == 429:
            print("Rate limit exceeded! Retrying after 10 seconds...")
            time.sleep(10)
            return fetch_ids_batch(doi_batch)  # Retry the same batch
    except requests.RequestException as e:
        print(f"Request failed: {e}")

    # Return errors if request fails
    return [(doi, 'Error', 'Error') for doi in doi_batch]
print(fetch_doi('15369599')[0])

In [None]:
print(results)

In [17]:
## useful list of Entrez databases that can be queried through API
# pmc_pubmed	PubMed citations for these articles
# pmc_refs_pubmed	PubMed article citing PMC article
# pmc_pmc_cites	   PMC articles that given PMC article cites
# pmc_pmc_citedby	PMC article citing given PMC article
# pubmed_pubmed	 Calculated set of PubMed citations similar to the selected article(s) retrieved using a word weight algorithm. 
# pubmed_pubmed_refs    Citation referenced in PubMed article. Only valid for PubMed citations that are also in PMC.


from Bio import Entrez

Entrez.email = "mcfrank@stanford.edu"

def get_abstract(pmid):
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    return handle.read()

def get_links_id(pmid):
	link_list = []
	links = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed")	
	record = Entrez.read(links)
	
	records = record[0][u'LinkSetDb'][0][u'Link']

	for link in records:
		link_list.append(link[u'Id'])

	return link_list

def get_links_term(term):
	links = Entrez.esearch(db="pubmed", retmax = 1000, term=term)	
	record = Entrez.read(links)
	link_list = record[u'IdList']

	return link_list


### MAIN -----------------------

#print(get_links_term("Saffran JR[Author] "))

#print ("----------------------------")

#print(get_abstract("22407813"))

#print ("----------------------------")

#print(get_links_id("22407813"))



In [54]:
import time
from habanero import Crossref
import requests

cr = Crossref(mailto="recheinje@gmail.com", timeout=60)

def fetch_crossref_data(doi, retries=3):
    attempt = 0
    while attempt < retries:
        try:
            work = cr.works(ids=doi)
            #print('this is work ',work)
            if "message" not in work:
                print("No data found for DOI.")
                return

            message = work["message"]
            #print('this is message ', message)

            # Extract metadata
            title = message.get("title", ["No title found"])[0]
            '''license_url = None
            if "license" in message:
                license_url = message["license"][0].get("URL")'''
            
            # Extract full-text URLs for text-mining
            links = message.get("link", [])
            text_mining_links = {}
            for link in links:
                if link.get("intended-application") == "text-mining":
                    content_type = link.get("content-type")
                    url = link.get("URL")
                    text_mining_links[content_type] = url
            
            # Output
            print(f"Title: {title}")
            #print(f"License URL: {license_url}")
            print("Text-Mining URLs:")
            for k, v in text_mining_links.items():
                print(f"{k}: {v}")

            return {
                "title": title,
               # "license_url": license_url,
                "text_mining_links": text_mining_links
            }
        
        except requests.exceptions.Timeout:
            print(f"Timeout occurred. Retrying... ({attempt+1}/{retries})")
            attempt += 1
            time.sleep(2)  # wait before retrying
        except Exception as e:
            print(f"Error: {str(e)}")
            break

            
my_df = pd.read_csv('citing_dois.tsv', sep = '\t', dtype=str)
dois = my_df['DOI']


with open("file.txt", "a") as f:
    for d in dois:
        f.write('DOI: ' + d + '\n')
        print('DOI: ', d)
        
        result = fetch_crossref_data(d)
        if result is not None:
            title = result["title"]
            text_mining_links = result["text_mining_links"]
            
            print(title)
            print(text_mining_links)
            
            f.write(title + '\n')
            for content_type, url in text_mining_links.items():
                f.write(f"{content_type}: {url}\n")
        else:
            print(f"Warning: No data found for DOI {d}")
            f.write("No data found for this DOI\n")


DOI:  10.1038/s41467-024-47549-1
Title: Alkyne-tagged SERS nanoprobe for understanding Cu+ and Cu2+ conversion in cuproptosis processes
Text-Mining URLs:
application/pdf: https://www.nature.com/articles/s41467-024-47549-1.pdf
text/html: https://www.nature.com/articles/s41467-024-47549-1
Alkyne-tagged SERS nanoprobe for understanding Cu+ and Cu2+ conversion in cuproptosis processes
{'application/pdf': 'https://www.nature.com/articles/s41467-024-47549-1.pdf', 'text/html': 'https://www.nature.com/articles/s41467-024-47549-1'}
DOI:  10.3390/molecules29174211
Title: Carbon Dots: A Versatile Platform for Cu2+ Detection, Anti-Counterfeiting, and Bioimaging
Text-Mining URLs:
Carbon Dots: A Versatile Platform for Cu2+ Detection, Anti-Counterfeiting, and Bioimaging
{}
DOI:  No full text available
Error: Client error '404 Not Found' for url 'https://api.crossref.org/works/No'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404
DOI:  10.1016/j.bioactmat.2023.10

Title: Horseradish peroxidase-triggered direct in situ fluorescent immunoassay platform for sensing cardiac troponin I and SARS-CoV-2 nucleocapsid protein in serum
Text-Mining URLs:
text/xml: https://api.elsevier.com/content/article/PII:S0956566321008605?httpAccept=text/xml
text/plain: https://api.elsevier.com/content/article/PII:S0956566321008605?httpAccept=text/plain
Horseradish peroxidase-triggered direct in situ fluorescent immunoassay platform for sensing cardiac troponin I and SARS-CoV-2 nucleocapsid protein in serum
{'text/xml': 'https://api.elsevier.com/content/article/PII:S0956566321008605?httpAccept=text/xml', 'text/plain': 'https://api.elsevier.com/content/article/PII:S0956566321008605?httpAccept=text/plain'}
DOI:  10.3390/nano12060898
Title: Synthesis of Doped/Hybrid Carbon Dots and Their Biomedical Application
Text-Mining URLs:
Synthesis of Doped/Hybrid Carbon Dots and Their Biomedical Application
{}
DOI:  10.3390/ma14061504
Title: Non-Invasive Topical Drug-Delivery System

Title: Molecularly Imprinted Core-Shell CdSe@SiO2/CDs as a Ratiometric Fluorescent Probe for 4-Nitrophenol Sensing
Text-Mining URLs:
text/html: http://link.springer.com/article/10.1186/s11671-018-2440-6/fulltext.html
application/pdf: http://link.springer.com/content/pdf/10.1186/s11671-018-2440-6.pdf
Molecularly Imprinted Core-Shell CdSe@SiO2/CDs as a Ratiometric Fluorescent Probe for 4-Nitrophenol Sensing
{'text/html': 'http://link.springer.com/article/10.1186/s11671-018-2440-6/fulltext.html', 'application/pdf': 'http://link.springer.com/content/pdf/10.1186/s11671-018-2440-6.pdf'}
DOI:  No full text available
Error: Client error '404 Not Found' for url 'https://api.crossref.org/works/No'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404
DOI:  10.1186/s11671-017-2137-2
Title: High-Efficient Excitation-Independent Blue Luminescent Carbon Dots
Text-Mining URLs:
application/pdf: http://link.springer.com/content/pdf/10.1186/s11671-017-2137-2.pdf
text/h

In [43]:
import requests
import csv

def get_opencitations(doi):
    doi_encoded = requests.utils.quote(doi)
    url = f"https://opencitations.net/index/coci/api/v1/citations/{doi_encoded}"
    response = requests.get(url)
    cited_dois = []
    if response.status_code == 200:
        for item in response.json():
            citing_doi = item.get('citing')
            if citing_doi:
                cited_dois.append(citing_doi)
    return cited_dois

def get_semantic_scholar(doi):
    url = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}?fields=citations.paperId,citations.externalIds"
    response = requests.get(url)
    cited_dois = []
    if response.status_code == 200:
        data = response.json()
        citations = data.get('citations', [])
        for citation in citations:
            ext_ids = citation.get('externalIds', {})
            citing_doi = ext_ids.get('DOI')
            if citing_doi:
                cited_dois.append(citing_doi)
    return cited_dois

def get_crossref(doi):
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)
    citation_count = None
    if response.status_code == 200:
        data = response.json()
        citation_count = data['message'].get('is-referenced-by-count')
    return citation_count

def cross_check_citations(doi):
    print(f"Cross-checking DOI: {doi}\n")

    # OpenCitations
    oc_dois = get_opencitations(doi)
    print(f"OpenCitations: {len(oc_dois)} citing DOIs found.")

    # Semantic Scholar
    ss_dois = get_semantic_scholar(doi)
    print(f"Semantic Scholar: {len(ss_dois)} citing DOIs found.")

    # Crossref (gives citation count but no DOIs)
    cr_count = get_crossref(doi)
    print(f"Crossref metadata says: {cr_count} citations (count only, no list).")

    # Optional: compare overlap
    overlap = set(oc_dois) & set(ss_dois)
    print(f"\nOverlap between OpenCitations & Semantic Scholar: {len(overlap)} DOIs")

    # Save to TSV
    with open('citations.tsv', 'w', newline='', encoding='utf-8') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        writer.writerow(['OpenCitations_DOI', 'SemanticScholar_DOI'])
        max_len = max(len(oc_dois), len(ss_dois))
        for i in range(max_len):
            oc_doi = oc_dois[i] if i < len(oc_dois) else ''
            ss_doi = ss_dois[i] if i < len(ss_dois) else ''
            writer.writerow([oc_doi, ss_doi])

    print("\nDOIs saved to citations.tsv")

    return {
        "OpenCitations": oc_dois,
        "SemanticScholar": ss_dois,
        "CrossrefCount": cr_count
    }

doi = "10.1002/anie.201109089"
results = cross_check_citations(doi)


Cross-checking DOI: 10.1002/anie.201109089

OpenCitations: 484 citing DOIs found.
Semantic Scholar: 544 citing DOIs found.
Crossref metadata says: 499 citations (count only, no list).

Overlap between OpenCitations & Semantic Scholar: 332 DOIs

DOIs saved to citations.tsv


In [76]:
import pandas as pd

def cross_check_citations(doi):
    print(f"Cross-checking DOI: {doi}\n")

    # Load your citing DOIs from TSV
    my_df = pd.read_csv('citing_dois.tsv', sep='\t', dtype=str)
    input_dois = set(my_df['DOI'].dropna())

    # OpenCitations
    oc_dois = get_opencitations(doi)
    print(f"OpenCitations: {len(oc_dois)} citing DOIs found.")

    # Semantic Scholar
    ss_dois = get_semantic_scholar(doi)
    print(f"Semantic Scholar: {len(ss_dois)} citing DOIs found.")

    # Crossref count
    cr_count = get_crossref(doi)
    print(f"Crossref metadata says: {cr_count} citations (count only, no list).")

    # Sets
    set_oc = set(oc_dois)
    set_ss = set(ss_dois)

    # Overlap between API sources
    overlap_oc_ss = set_oc & set_ss

    # Overlap with your dataframe DOIs
    overlap_input_oc = input_dois & set_oc
    overlap_input_ss = input_dois & set_ss
    overlap_input_both = input_dois & overlap_oc_ss

    print(f"\nOverlap between OpenCitations & Semantic Scholar: {len(overlap_oc_ss)} DOIs")
    print(f"Overlap between your input DOIs & OpenCitations: {len(overlap_input_oc)} DOIs")
    print(f"Overlap between your input DOIs & Semantic Scholar: {len(overlap_input_ss)} DOIs")
    print(f"DOIs present in ALL THREE (your input, OC, SS): {len(overlap_input_both)} DOIs")

    # Optional: print the DOIs found in all three sources
    """print("\nDOIs in aPMC and OC:")
    for doi in overlap_input_oc:
        print(doi)
        
    print("\nDOIs in PMC and SS:")
    for doi in overlap_input_ss:
        print(doi)"""

    # Save to TSV
    with open('citations.tsv', 'w', newline='', encoding='utf-8') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        writer.writerow(['OpenCitations_DOI', 'SemanticScholar_DOI'])
        max_len = max(len(oc_dois), len(ss_dois))
        for i in range(max_len):
            oc_doi = oc_dois[i] if i < len(oc_dois) else ''
            ss_doi = ss_dois[i] if i < len(ss_dois) else ''
            writer.writerow([oc_doi, ss_doi])

    print("\nDOIs saved to citations.tsv")

    return {
        "OpenCitations": oc_dois,
        "SemanticScholar": ss_dois,
        "CrossrefCount": cr_count,
        "Overlap_Input_OC": list(overlap_input_oc),
        "Overlap_Input_SS": list(overlap_input_ss),
        "Overlap_Input_Both": list(overlap_input_both)
    }


In [77]:
dois = cross_check_citations("10.1002/anie.201109089")
print(dois)

Cross-checking DOI: 10.1002/anie.201109089

OpenCitations: 484 citing DOIs found.
Semantic Scholar: 544 citing DOIs found.
Crossref metadata says: 499 citations (count only, no list).

Overlap between OpenCitations & Semantic Scholar: 332 DOIs
Overlap between your input DOIs & OpenCitations: 46 DOIs
Overlap between your input DOIs & Semantic Scholar: 54 DOIs
DOIs present in ALL THREE (your input, OC, SS): 44 DOIs

DOIs saved to citations.tsv
{'OpenCitations': ['10.1039/d0nj01814g', '10.1021/acs.chemrev.5b00008', '10.1039/c9qm00415g', '10.1007/s00604-018-3043-8', '10.1039/c9na00794f', '10.1039/c9na00168a', '10.1038/s41598-017-06356-z', '10.1038/srep03564', '10.1039/c5an00957j', '10.1021/acsapm.2c02098', '10.1039/c2jm32973e', '10.1016/j.snb.2020.127916', '10.1039/c9ra05689k', '10.1021/jp307308z', '10.1007/978-3-030-38101-1_5', '10.1016/j.matpr.2020.11.417', '10.1016/j.jhazmat.2019.02.008', '10.1016/b978-0-323-47906-6.00001-1', '10.1039/c2cc35966a', '10.1039/c7nr03754f', '10.1016/j.snb.20

In [84]:
my_df = pd.read_csv('citing_dois.tsv', sep='\t', dtype=str)
input_dois = set(my_df['DOI'].dropna())

In [85]:
import requests

def check_open_access(doi):
    url = f"https://api.unpaywall.org/v2/{doi}"
    params = {
        "email": "recheinje@gmail.com"
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        if data.get('is_oa'): 
            link[doi] = data.get('best_oa_location', {}).get('url')
            #print(f"DOI: {doi} is Open Access")
            #print(f"OA URL: {data.get('best_oa_location', {}).get('url')}")
        else:
            print(f"DOI: {doi} is not Open Access")
    else:
        print(f"Error: {response.status_code} for DOI: {doi}")

# Example usage
#doi = "10.3390/nano11051232"  # Replace with any DOI you want to check
link = {}
i = 0

for doi in input_dois:
    check_open_access(doi)


Error: 404 for DOI: No full text available


In [89]:
print(len(input_dois))
print(len(link))

60
59
