In [1]:
from Bio import Entrez
import csv
import os
import xml.etree.ElementTree as ET
import copy

In [7]:

def detect_duplicate_elements(xml_tree):
    """Collect the text of every repeated pmid/pmc identifier in a tree.

    Walks every element reachable through ``xml_tree.iter()`` and looks at
    those whose ``pub-id-type`` attribute is ``'pmid'`` or ``'pmc'``. The
    first time a given text value is encountered it is remembered; every
    later occurrence of the same text is appended to the result.

    Args:
        xml_tree: An object exposing ``iter()`` that yields elements with
            ``attrib`` and ``text`` (e.g. an ``ElementTree`` or ``Element``).

    Returns:
        list: The repeated text values in document order. A value seen
        ``k`` times appears ``k - 1`` times in the list.
    """
    seen_texts = set()
    repeats = []

    for node in xml_tree.iter():
        # Only identifiers typed as pmid or pmc take part in the check.
        if node.attrib.get('pub-id-type') not in ('pmid', 'pmc'):
            continue

        value = node.text
        if value in seen_texts:
            repeats.append(value)
            continue
        seen_texts.add(value)

    return repeats

In [16]:
def detect_duplicate_elements(xml_tree):
    """Split pmid/pmc identifiers in a tree into first occurrences and repeats.

    Every element reachable through ``xml_tree.iter()`` whose ``pub-id-type``
    attribute is ``'pmid'`` or ``'pmc'`` contributes a one-element tuple
    ``(element.text,)``. The first time a text value is seen the tuple goes
    into the "unique" list; every later occurrence goes into the "duplicate"
    list.

    Args:
        xml_tree: An object exposing ``iter()`` that yields elements with
            ``attrib`` and ``text`` (e.g. an ``ElementTree`` or ``Element``).

    Returns:
        tuple: ``(pub_id_tuples, duplicate_elements)`` where both items are
        lists of one-element tuples in document order.
    """
    pub_id_tuples = []
    duplicate_elements = []
    # Set of identifier texts already recorded in pub_id_tuples; replaces the
    # per-element scan over the whole list, so the pass is linear.
    seen_ids = set()

    for element in xml_tree.iter():
        # An element has a single pub-id-type attribute, so one membership
        # test covers both identifier kinds.
        if element.attrib.get('pub-id-type') not in ('pmid', 'pmc'):
            continue

        id_tuple = (element.text,)
        if element.text in seen_ids:
            duplicate_elements.append(id_tuple)
        else:
            seen_ids.add(element.text)
            pub_id_tuples.append(id_tuple)

    return pub_id_tuples, duplicate_elements

In [17]:
# Parse the merged papers XML and run the duplicate scan on the whole tree.
# The two-name unpacking below requires the detect_duplicate_elements variant
# that returns (pub_id_tuples, duplicate_elements); only the duplicates are printed.
# NOTE(review): the path is an absolute, user-specific location — presumably it
# should come from a configurable data directory; confirm before sharing.
tree = ET.parse('/Users/tillohlendorf/Downloads/merged_base_papers_102.xml')
pub_id_tuples, duplicate_elements = detect_duplicate_elements(tree)
print(duplicate_elements)



[('17934449',), ('17934449',), ('31138793',), ('26405015',), ('25456129',), ('25319413',), ('29045828',), ('25464853',), ('28889947',), ('28886380',), ('28889989',), ('21772278',), ('35672519',), ('PMC9213371',), ('11138001',), ('16741571',), ('26272906',), ('26160380',), ('22318520',), ('29858010',), ('19260764',), ('22066012',), ('20064449',), ('20064450',), ('19454668',), ('31138793',), ('26405015',), ('25456129',), ('25319413',), ('30979818',), ('15070758',), ('20072126',), ('14499113',), ('21625598',), ('29045828',), ('33960413',), ('19995950',), ('29941474',), ('25464853',), ('28889947',), ('28886380',), ('10077625',), ('25725099',), ('27068879',), ('28889989',), ('2203532',), ('2203531',), ('19234210',), ('30643267',), ('28580957',), ('24014877',), ('24416730',), ('21789593',), ('10878354',), ('12471145',), ('16732290',), ('16585559',), ('19411637',), ('21807870',), ('24073289',), ('26912039',), ('22045568',), ('29116141',), ('10602462',), ('25376833',), ('21772278',), ('1877158

In [10]:
def check_for_duplicates(xml_file):
    """Report whether any two <article> elements share a pmid or pmc id.

    The file is parsed and every ``<article>`` element is inspected. An
    article counts as a duplicate when one of its ``<article-id>`` values of
    type ``pmid`` or ``pmc`` was already seen on an earlier article.

    Comparing the number of distinct ``<article-id>`` texts with the number
    of articles (the previous approach) is not a valid test: an article
    normally carries several ids, so duplicates were missed, and an article
    without any id was reported as a duplicate.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        set: The text of every ``<article-id>`` element found under any
        ``<article>`` (all id types, unchanged from the original contract).

    Side effects:
        Prints one line stating whether duplicate articles were found.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    article_ids = set()
    # (pub-id-type, value) pairs already claimed by an earlier article.
    seen_keys = set()
    has_duplicates = False

    for article in root.iter('article'):
        article_keys = set()
        for element in article.iter('article-id'):
            article_ids.add(element.text)
            id_type = element.get('pub-id-type')
            # Keep the type in the key so a pmid can never collide with an
            # id of another kind that happens to have the same text.
            if id_type in ('pmid', 'pmc') and element.text:
                article_keys.add((id_type, element.text.strip()))

        if article_keys & seen_keys:
            has_duplicates = True
        seen_keys |= article_keys

    if has_duplicates:
        print("There are duplicate articles.")
    else:
        print("There are no duplicate articles.")
    return article_ids

In [12]:
# Parse the merged papers XML, print whatever detect_duplicate_elements returns,
# then run check_for_duplicates on the same file (it parses the file again itself).
# NOTE(review): the single-name assignment assumes the detect_duplicate_elements
# variant that returns one flat list. This notebook also defines a same-named
# variant returning a 2-tuple; if that one is the active definition,
# `duplicate_elements` here holds the whole tuple — confirm which is intended.
# NOTE(review): the path is an absolute, user-specific location — confirm.
tree = ET.parse('/Users/tillohlendorf/Downloads/merged_base_papers_102.xml')
duplicate_elements = detect_duplicate_elements(tree)
print(duplicate_elements)

article_ids = check_for_duplicates('/Users/tillohlendorf/Downloads/merged_base_papers_102.xml')


['17934449', '17934449', '31138793', '26405015', '25456129', '25319413', '29045828', '25464853', '28889947', '28886380', '28889989', '21772278', '35672519', 'PMC9213371', '11138001', '16741571', '26272906', '26160380', '22318520', '29858010', '19260764', '22066012', '20064449', '20064450', '19454668', '31138793', '26405015', '25456129', '25319413', '30979818', '15070758', '20072126', '14499113', '21625598', '29045828', '33960413', '19995950', '29941474', '25464853', '28889947', '28886380', '10077625', '25725099', '27068879', '28889989', '2203532', '2203531', '19234210', '30643267', '28580957', '24014877', '24416730', '21789593', '10878354', '12471145', '16732290', '16585559', '19411637', '21807870', '24073289', '26912039', '22045568', '29116141', '10602462', '25376833', '21772278', '18771589', '8892630', '28385374', '31554891', '29459403', '29298831', '31578269', '22302935', '18927578', '19438970', '23578005', '28598502', '28452361', '33525048', '11931770', '23200824', '17641034', '313

In [23]:
def check_duplicates(xml_file):
    """Keep the first <article> for each pmid/pmc id and report the rest.

    For every ``<article>`` below the root, the text of its ``<article-id>``
    elements typed ``pmid`` and ``pmc`` is read (if a type occurs more than
    once, the last one wins). An article is a duplicate when its pmid or its
    pmc value already belongs to an article that was kept; duplicates are
    reported on stdout and not stored.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        dict: Maps ``(pmid_value, pmc_value)`` to the kept ``<article>``
        element, in document order. Either tuple item is ``None`` when the
        article has no id of that type.

    Note:
        Articles with neither a pmid nor a pmc are never reported as
        duplicates and all share the key ``(None, None)``, so only the last
        such article remains in the result.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Kept articles keyed by their (pmid, pmc) pair.
    articles_dict = {}
    # Every non-None pmid/pmc value of a kept article. Ids of rejected
    # duplicates are deliberately not added, matching a scan over the keys
    # of articles_dict but without the per-article pass over all keys.
    seen_ids = set()

    for article_elem in root.findall('.//article'):
        pmid_value = None
        pmc_value = None

        for article_id_elem in article_elem.findall('.//article-id'):
            pub_id_type = article_id_elem.get('pub-id-type')
            if pub_id_type == 'pmid':
                pmid_value = article_id_elem.text
            elif pub_id_type == 'pmc':
                pmc_value = article_id_elem.text

        # seen_ids never contains None, so missing ids cannot match.
        if pmid_value in seen_ids or pmc_value in seen_ids:
            print(f"Duplicate found for article with pmid='{pmid_value}' or pmc='{pmc_value}'")
            continue

        articles_dict[(pmid_value, pmc_value)] = article_elem
        seen_ids.update(value for value in (pmid_value, pmc_value) if value is not None)

    return articles_dict

# Run the duplicate check on the merged papers file; the returned dict of kept
# articles, keyed by (pmid, pmc), is stored in articles_dict.
# NOTE(review): the path is an absolute, user-specific location — confirm.
articles_dict = check_duplicates('/Users/tillohlendorf/Downloads/merged_base_papers_102.xml')

Duplicate found for article with pmid='35672519' or pmc='PMC9213371'
Duplicate found for article with pmid='35939623' or pmc='PMC9361767'
Duplicate found for article with pmid='36771343' or pmc='PMC9921660'
Duplicate found for article with pmid='35893909' or pmc='PMC9332067'
Duplicate found for article with pmid='36795699' or pmc='PMC9934417'
Duplicate found for article with pmid='35917163' or pmc='PMC9484756'
Duplicate found for article with pmid='35572572' or pmc='PMC9103485'
Duplicate found for article with pmid='34885050' or pmc='PMC8656578'
Duplicate found for article with pmid='33627512' or pmc='PMC8544895'
Duplicate found for article with pmid='34616012' or pmc='PMC8651779'
Duplicate found for article with pmid='33925452' or pmc='PMC8145447'
Duplicate found for article with pmid='34039428' or pmc='PMC8157445'
Duplicate found for article with pmid='34658896' or pmc='PMC8514721'
Duplicate found for article with pmid='34312385' or pmc='PMC8313584'
Duplicate found for article with p

In [22]:
# Display the dict of kept articles (bare last expression renders its repr).
# NOTE(review): this cell's execution count is lower than that of the cell
# assigning articles_dict, so the shown output may come from an earlier run — confirm.
articles_dict

{('31138793', 'PMC6538646'): <Element 'article' at 0x17d2379c0>,
 ('31628411', 'PMC6800427'): <Element 'article' at 0x17b64c950>,
 ('31681563', 'PMC6797844'): <Element 'article' at 0x1079631a0>,
 ('30673913', 'PMC6344609'): <Element 'article' at 0x1191cf5b0>,
 ('31998276', 'PMC6965490'): <Element 'article' at 0x16b3c50d0>,
 ('31369546', 'PMC6692052'): <Element 'article' at 0x17b186a20>,
 (None, 'PMC9568547'): <Element 'article' at 0x17a8498a0>,
 ('35732736', 'PMC9259500'): <Element 'article' at 0x16bced530>,
 ('35672519', 'PMC9213371'): <Element 'article' at 0x118188090>,
 ('35939623', 'PMC9361767'): <Element 'article' at 0x17c1e56c0>,
 ('36771343', 'PMC9921660'): <Element 'article' at 0x16b0b4d60>,
 ('36334897', 'PMC9672454'): <Element 'article' at 0x1394b4180>,
 ('35893909', 'PMC9332067'): <Element 'article' at 0x17b58d580>,
 ('36882420', 'PMC9992512'): <Element 'article' at 0x138a4c4a0>,
 ('35456965', 'PMC9031264'): <Element 'article' at 0x1196f0e50>,
 ('36795699', 'PMC9934417'): <E