In [4]:
from Bio import Entrez
import csv
import os
import xml.etree.ElementTree as ET
import copy

In [2]:
def _extract_article_ids(article_elem):
    """Return the ``(pmid, pmc)`` pair found in the <article-id> children of an article.

    Each value is the stripped text of the last <article-id> element whose
    ``pub-id-type`` attribute is ``'pmid'`` / ``'pmc'``; it is ``None`` when no
    such element exists or its text is empty.
    """
    pmid_value = None
    pmc_value = None

    for article_id_elem in article_elem.findall('.//article-id'):
        pub_id_type = article_id_elem.get('pub-id-type')
        # Strip surrounding whitespace so that pretty-printed XML does not
        # hide a match; empty text is normalised to None.
        id_text = (article_id_elem.text or '').strip() or None
        if pub_id_type == 'pmid':
            pmid_value = id_text
        elif pub_id_type == 'pmc':
            pmc_value = id_text

    return pmid_value, pmc_value


def check_duplicates(xml_file):
    """Index the <article> elements of an XML file by PMID / PMC ID and flag repeats.

    An article counts as a duplicate when its PMID equals the PMID of an
    article seen earlier in the file, or its PMC ID equals an earlier PMC ID.
    The two ID types are compared separately, so a PMID can never match a
    PMC ID that happens to have the same text.

    Parameters
    ----------
    xml_file : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    dict
        Maps ``(pmid, pmc)`` to the first <article> element carrying those
        IDs, and ``('duplicate', pmid, pmc)`` to a repeated element. A missing
        ID is represented as ``None``.

    Notes
    -----
    * One line is printed per duplicate, numbered from 0.
    * If the same article occurs three or more times, the later copies share
      one ``('duplicate', pmid, pmc)`` key, so only the last copy is kept
      under it.
    * Articles with neither ID are never reported as duplicates; they all map
      to ``(None, None)``, so only the last of them is kept.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Result mapping: ID tuple -> <article> element
    articles_dict = {}

    # IDs seen so far, tracked per ID type for constant-time lookups
    seen_pmids = set()
    seen_pmcs = set()

    # Duplicate counter (used only to number the printed messages)
    duplicate_count = 0

    for article_elem in root.findall('.//article'):
        pmid_value, pmc_value = _extract_article_ids(article_elem)

        duplicate_found = (
            (pmid_value is not None and pmid_value in seen_pmids)
            or (pmc_value is not None and pmc_value in seen_pmcs)
        )

        if duplicate_found:
            print(f"{duplicate_count}: Duplicate found for article with pmid='{pmid_value}' or pmc='{pmc_value}'")
            articles_dict[('duplicate', pmid_value, pmc_value)] = article_elem
            duplicate_count += 1
        else:
            # First occurrence of this article
            articles_dict[(pmid_value, pmc_value)] = article_elem

        # Record the IDs of duplicates as well: a duplicate may introduce an
        # ID (e.g. a PMC ID) that the first copy lacked, and later articles
        # must still be matched against it.
        if pmid_value is not None:
            seen_pmids.add(pmid_value)
        if pmc_value is not None:
            seen_pmcs.add(pmc_value)

    return articles_dict

In [12]:
# Check for duplicates:
# Runs check_duplicates on the merged base-papers XML file and stores the
# returned dict; check_duplicates prints one line per duplicate it finds.
# NOTE(review): the path below is absolute and machine-specific, so this cell
# only runs on the original author's machine unless the path is adjusted.
CRC_base_papers_102_articles_dict = check_duplicates('/Users/tillohlendorf/Downloads/merged_base_papers_102.xml')

0: Duplicate found for article with pmid='35672519' or pmc='PMC9213371'
1: Duplicate found for article with pmid='35939623' or pmc='PMC9361767'
2: Duplicate found for article with pmid='36771343' or pmc='PMC9921660'
3: Duplicate found for article with pmid='35893909' or pmc='PMC9332067'
4: Duplicate found for article with pmid='36795699' or pmc='PMC9934417'
5: Duplicate found for article with pmid='35917163' or pmc='PMC9484756'
6: Duplicate found for article with pmid='35572572' or pmc='PMC9103485'
7: Duplicate found for article with pmid='34885050' or pmc='PMC8656578'
8: Duplicate found for article with pmid='33627512' or pmc='PMC8544895'
9: Duplicate found for article with pmid='34616012' or pmc='PMC8651779'
10: Duplicate found for article with pmid='33925452' or pmc='PMC8145447'
11: Duplicate found for article with pmid='34039428' or pmc='PMC8157445'
12: Duplicate found for article with pmid='34658896' or pmc='PMC8514721'
13: Duplicate found for article with pmid='34312385' or pmc='PMC8313584'

Duplicate found for article with pmid='35672519' or pmc='PMC9213371': confirmed (<article article-type="review-article" dtd-version="1.3"><processing-meta base-tagset="archiving" mathml-version="3.0" table-model="xhtml" tagset-family="jats"><restricted-by>pmc</restricted-by></processing-meta><front><journal-meta><journal-id journal-id-type="nlm-ta">J Mol Med (Berl)</journal-id><journal-id journal-id-type="iso-abbrev">J Mol Med (Berl)</journal-id><journal-title-group><journal-title>Journal of Molecular Medicine (Berlin, Germany)</journal-title></journal-title-group><issn pub-type="ppub">0946-2716</issn><issn pub-type="epub">1432-1440</issn><publisher><publisher-name>Springer Berlin Heidelberg</publisher-name><publisher-loc>Berlin/Heidelberg</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">35672519</article-id><article-id pub-id-type="pmc">PMC9213371</article-id>)

Duplicate found for article with pmid='35939623' or pmc='PMC9361767': confirmed (<article article-type="review-article" dtd-version="1.3" xml:lang="en"><processing-meta base-tagset="archiving" mathml-version="3.0" table-model="xhtml" tagset-family="jats"><restricted-by>pmc</restricted-by></processing-meta><front><journal-meta><journal-id journal-id-type="nlm-ta">Gut Microbes</journal-id><journal-id journal-id-type="iso-abbrev">Gut Microbes</journal-id><journal-title-group><journal-title>Gut Microbes</journal-title></journal-title-group><issn pub-type="ppub">1949-0976</issn><issn pub-type="epub">1949-0984</issn><publisher><publisher-name>Taylor &amp; Francis</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">35939623</article-id><article-id pub-id-type="pmc">PMC9361767</article-id>)

Duplicate found for article with pmid='36771343' or pmc='PMC9921660': confirmed (<article article-type="research-article" dtd-version="1.3" xml:lang="en"><processing-meta base-tagset="archiving" mathml-version="3.0" table-model="xhtml" tagset-family="jats"><restricted-by>pmc</restricted-by></processing-meta><front><journal-meta><journal-id journal-id-type="nlm-ta">Nutrients</journal-id><journal-id journal-id-type="iso-abbrev">Nutrients</journal-id><journal-id journal-id-type="publisher-id">nutrients</journal-id><journal-title-group><journal-title>Nutrients</journal-title></journal-title-group><issn pub-type="epub">2072-6643</issn><publisher><publisher-name>MDPI</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">36771343</article-id><article-id pub-id-type="pmc">PMC9921660</article-id>)

Duplicate found for article with pmid='35893909' or pmc='PMC9332067': confirmed (<article article-type="research-article" dtd-version="1.3" xml:lang="en"><processing-meta base-tagset="archiving" mathml-version="3.0" table-model="xhtml" tagset-family="jats"><restricted-by>pmc</restricted-by></processing-meta><front><journal-meta><journal-id journal-id-type="nlm-ta">Nutrients</journal-id><journal-id journal-id-type="iso-abbrev">Nutrients</journal-id><journal-id journal-id-type="publisher-id">nutrients</journal-id><journal-title-group><journal-title>Nutrients</journal-title></journal-title-group><issn pub-type="epub">2072-6643</issn><publisher><publisher-name>MDPI</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">35893909</article-id><article-id pub-id-type="pmc">PMC9332067</article-id>)

Duplicate found for article with pmid='36795699' or pmc='PMC9934417'
Duplicate found for article with pmid='35917163' or pmc='PMC9484756'

Duplicate found for article with pmid='35572572' or pmc='PMC9103485': confirmed (<article article-type="research-article" dtd-version="1.3" xml:lang="en"><processing-meta base-tagset="archiving" mathml-version="3.0" table-model="xhtml" tagset-family="jats"><restricted-by>pmc</restricted-by></processing-meta><front><journal-meta><journal-id journal-id-type="nlm-ta">Front Immunol</journal-id><journal-id journal-id-type="iso-abbrev">Front Immunol</journal-id><journal-id journal-id-type="publisher-id">Front. Immunol.</journal-id><journal-title-group><journal-title>Frontiers in Immunology</journal-title></journal-title-group><issn pub-type="epub">1664-3224</issn><publisher><publisher-name>Frontiers Media S.A.</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">35572572</article-id><article-id pub-id-type="pmc">PMC9103485</article-id>)

Duplicate found for article with pmid='34885050' or pmc='PMC8656578'
Duplicate found for article with pmid='33627512' or pmc='PMC8544895'
Duplicate found for article with pmid='34616012' or pmc='PMC8651779'
Duplicate found for article with pmid='33925452' or pmc='PMC8145447'
Duplicate found for article with pmid='34039428' or pmc='PMC8157445'

Duplicate found for article with pmid='34658896' or pmc='PMC8514721': confirmed (<article article-type="review-article" dtd-version="1.3" xml:lang="en"><processing-meta base-tagset="archiving" mathml-version="3.0" table-model="xhtml" tagset-family="jats"><restricted-by>pmc</restricted-by></processing-meta><front><journal-meta><journal-id journal-id-type="nlm-ta">Front Pharmacol</journal-id><journal-id journal-id-type="iso-abbrev">Front Pharmacol</journal-id><journal-id journal-id-type="publisher-id">Front. Pharmacol.</journal-id><journal-title-group><journal-title>Frontiers in Pharmacology</journal-title></journal-title-group><issn pub-type="epub">1663-9812</issn><publisher><publisher-name>Frontiers Media S.A.</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">34658896</article-id><article-id pub-id-type="pmc">PMC8514721</article-id>)

Duplicate found for article with pmid='34312385' or pmc='PMC8313584'
Duplicate found for article with pmid='33785070' or pmc='PMC8008609'
Duplicate found for article with pmid='34857933' or pmc='PMC8941000'
Duplicate found for article with pmid='34093985' or pmc='PMC8131268'
Duplicate found for article with pmid='34944797' or pmc='PMC8699173'
Duplicate found for article with pmid='34946037' or pmc='PMC8708753'
Duplicate found for article with pmid='33949105' or pmc='PMC8103097'
Duplicate found for article with pmid='34442780' or pmc='PMC8401523'
Duplicate found for article with pmid='34777363' or pmc='PMC8588834'
Duplicate found for article with pmid='34680258' or pmc='PMC8533893'
Duplicate found for article with pmid='35393532' or pmc='PMC8989999'
Duplicate found for article with pmid='34001868' or pmc='PMC8128913'
Duplicate found for article with pmid='34836426' or pmc='PMC8621464'
Duplicate found for article with pmid='36914891' or pmc='PMC10063449'
Duplicate found for article with pmid='37081886' or pmc='PMC10110899'
Duplicate found for article with pmid='33550886' or pmc='PMC7889221'
Duplicate found for article with pmid='32703946' or pmc='PMC7378201'
Duplicate found for article with pmid='33646117' or pmc='PMC7946422'
Duplicate found for article with pmid='32859898' or pmc='PMC7456324'
Duplicate found for article with pmid='33319778' or pmc='PMC7738495'
Duplicate found for article with pmid='33240188' or pmc='PMC7677513'
Duplicate found for article with pmid='33075129' or pmc='PMC7868734'
Duplicate found for article with pmid='32823573' or pmc='PMC7460440'
Duplicate found for article with pmid='33260635' or pmc='PMC7730263'
Duplicate found for article with pmid='32913197' or pmc='PMC7484793'


What is causing the error (duplicates)?

- Is the error in this section: <article article-type="review-article" dtd-version="1.3" xml:lang="en"><processing-meta base-tagset="archiving" mathml-version="3.0"    table-model="xhtml" tagset-family="jats"><restricted-by>pmc</restricted-by></processing-meta><front><journal-meta><journal-id journal-id-type="nlm-ta"> ?

Probably not: PMC9966613, PMC9319092 and PMC9504724 (each with only a PMC ID and no PMID) contain that exact section as well, but are not duplicated in the set. The same section with 'article-type="research-article"' also produced duplicates.


- Interestingly, none of the duplicated articles has only a PMID or only a PMC ID, which suggests the problem is caused by searching with both ID types. However, some articles with both IDs are not duplicated, so that cannot be the only reason.





In [None]:
# Display the mapping returned by check_duplicates for the merged base-papers
# file: ID tuples -> <article> elements, with repeats keyed as
# ('duplicate', pmid, pmc).
CRC_base_papers_102_articles_dict

32 articles found in PMC9 with 7 duplicates == 25 articles found with the new extraction function, which excludes duplicates

CRC_base_papers_102_articles_dict:
{('31138793', 'PMC6538646'): <Element 'article' at 0x10d255fd0>,
 ('31628411', 'PMC6800427'): <Element 'article' at 0x10dd59260>,
 ('31681563', 'PMC6797844'): <Element 'article' at 0x10ddac8b0>,
 ('30673913', 'PMC6344609'): <Element 'article' at 0x10dd86160>,
 ('31998276', 'PMC6965490'): <Element 'article' at 0x10d364d10>,
 ('31369546', 'PMC6692052'): <Element 'article' at 0x10d5e3f10>,
 (None, 'PMC9568547'): <Element 'article' at 0x10d5367f0>,
 (None, 'PMC9100044'): <Element 'article' at 0x10bbfa020>,
 ('35732736', 'PMC9259500'): <Element 'article' at 0x10cb18bd0>,
 (None, 'PMC9966613'): <Element 'article' at 0x10d78eb60>,
 ('35672519', 'PMC9213371'): <Element 'article' at 0x10d7b0a90>,
 ('duplicate', '35672519', 'PMC9213371'): <Element 'article' at 0x10cd921b0>,
 ('35939623', 'PMC9361767'): <Element 'article' at 0x10ce9f290>,
 ('duplicate', '35939623', 'PMC9361767'): <Element 'article' at 0x10d6c3880>,
 ('36771343', 'PMC9921660'): <Element 'article' at 0x10c97c400>,
 ('duplicate', '36771343', 'PMC9921660'): <Element 'article' at 0x10c205df0>,
 ('36334897', 'PMC9672454'): <Element 'article' at 0x10c264950>,
 ('35893909', 'PMC9332067'): <Element 'article' at 0x10c19f4c0>,
 ('duplicate', '35893909', 'PMC9332067'): <Element 'article' at 0x10c0f44f0>,
 (None, 'PMC9319092'): <Element 'article' at 0x10c057d80>,
 (None, 'PMC9723555'): <Element 'article' at 0x10c3a16c0>,
 (None, 'PMC9017205'): <Element 'article' at 0x10c52e2f0>,
 ('36882420', 'PMC9992512'): <Element 'article' at 0x10c584540>,
 (None, 'PMC9504724'): <Element 'article' at 0x10c4dad90>,
 (None, 'PMC9391071'): <Element 'article' at 0x10bd819e0>,
 ('35456965', 'PMC9031264'): <Element 'article' at 0x10bf204f0>,
 ('36795699', 'PMC9934417'): <Element 'article' at 0x10bfaf380>,
 ('duplicate', '36795699', 'PMC9934417'): <Element 'article' at 0x10c711490>,
 (None, 'PMC9320355'): <Element 'article' at 0x10c7b6110>,
 (None, 'PMC9105816'): <Element 'article' at 0x10dba2660>,
 ('35534699', 'PMC9259489'): <Element 'article' at 0x10d809b20>,
 ('35917163', 'PMC9484756'): <Element 'article' at 0x10cf18f90>,
 ('duplicate', '35917163', 'PMC9484756'): <Element 'article' at 0x10cfece00>,
 (None, 'PMC9207268'): <Element 'article' at 0x10bad4cc0>,
 (None, 'PMC9632171'): <Element 'article' at 0x1182f6f70>,
 ('35572572', 'PMC9103485'): <Element 'article' at 0x1182a4720>,
 ('duplicate', '35572572', 'PMC9103485'): <Element 'article' at 0x1182076f0>,
 ('35437952', 'PMC9081902'): <Element 'article' at 0x1188a2700>,
 ('35050167', 'PMC8778376'): <Element 'article' at 0x10ff6e480>,
 ('34885050', 'PMC8656578'): <Element 'article' at 0x11846b6f0>,
 ('duplicate', '34885050', 'PMC8656578'): <Element 'article' at 0x1185f5440>,
 ('34580292', 'PMC8476619'): <Element 'article' at 0x11855cf90>,
 ('33627512', 'PMC8544895'): <Element 'article' at 0x10fcdee30>,
 ('duplicate', '33627512', 'PMC8544895'): <Element 'article' at 0x11809af20>,
 ('34616012', 'PMC8651779'): <Element 'article' at 0x10fb4f010>,
 ('duplicate', '34616012', 'PMC8651779'): <Element 'article' at 0x10f1a33d0>,
 ('33925452', 'PMC8145447'): <Element 'article' at 0x10ee11580>,
 ('duplicate', '33925452', 'PMC8145447'): <Element 'article' at 0x10ee99080>,
 ('34039428', 'PMC8157445'): <Element 'article' at 0x10ee28b30>,
 ('duplicate', '34039428', 'PMC8157445'): <Element 'article' at 0x10f590ea0>,
 ('34658896', 'PMC8514721'): <Element 'article' at 0x10ecf9990>,
 ('duplicate', '34658896', 'PMC8514721'): <Element 'article' at 0x10bc78d60>,
 ('34312385', 'PMC8313584'): <Element 'article' at 0x10c6b3380>,
 ('duplicate', '34312385', 'PMC8313584'): <Element 'article' at 0x10c86ed90>,
 ('33785070', 'PMC8008609'): <Element 'article' at 0x10ca227f0>,
 ('duplicate', '33785070', 'PMC8008609'): <Element 'article' at 0x10cc05b70>,
 ('34857933', 'PMC8941000'): <Element 'article' at 0x10cce4ef0>,
 ('duplicate', '34857933', 'PMC8941000'): <Element 'article' at 0x10ed87dd0>,
 ('34093985', 'PMC8131268'): <Element 'article' at 0x10ef3ed40>,
 ('duplicate', '34093985', 'PMC8131268'): <Element 'article' at 0x10f00b8d0>,
 (None, 'PMC8855857'): <Element 'article' at 0x10f0e04a0>,
 ('34944797', 'PMC8699173'): <Element 'article' at 0x10f284c20>,
 ('duplicate', '34944797', 'PMC8699173'): <Element 'article' at 0x10f3682c0>,
 ('34946037', 'PMC8708753'): <Element 'article' at 0x10f4338d0>,
 ('duplicate', '34946037', 'PMC8708753'): <Element 'article' at 0x10f6223e0>,
 ('33949105', 'PMC8103097'): <Element 'article' at 0x10f714ef0>,
 ('duplicate', '33949105', 'PMC8103097'): <Element 'article' at 0x10f9102c0>,
 ('34442780', 'PMC8401523'): <Element 'article' at 0x10fa076a0>,
 ('duplicate', '34442780', 'PMC8401523'): <Element 'article' at 0x10faa6480>,
 ('34777363', 'PMC8588834'): <Element 'article' at 0x10fd412b0>,
 ('duplicate', '34777363', 'PMC8588834'): <Element 'article' at 0x10fdc4950>,
 ('34680258', 'PMC8533893'): <Element 'article' at 0x10fe47fb0>,
 ('duplicate', '34680258', 'PMC8533893'): <Element 'article' at 0x10fedf6a0>,
 ('35393532', 'PMC8989999'): <Element 'article' at 0x11817ad90>,
 ('duplicate', '35393532', 'PMC8989999'): <Element 'article' at 0x118633290>,
 ('34001868', 'PMC8128913'): <Element 'article' at 0x1186df790>,
 ('duplicate', '34001868', 'PMC8128913'): <Element 'article' at 0x1186f59e0>,
 ('34836426', 'PMC8621464'): <Element 'article' at 0x1186ffbf0>,
 ('duplicate', '34836426', 'PMC8621464'): <Element 'article' at 0x118944db0>,
 (None, 'PMC10064692'): <Element 'article' at 0x118a6df30>,
 (None, 'PMC10096730'): <Element 'article' at 0x118c28900>,
 (None, 'PMC10025556'): <Element 'article' at 0x119702570>,
 ('36914891', 'PMC10063449'): <Element 'article' at 0x11b068ae0>,
 ('duplicate', '36914891', 'PMC10063449'): <Element 'article' at 0x11b13a390>,
 ('37081886', 'PMC10110899'): <Element 'article' at 0x11b207ba0>,
 ('duplicate', '37081886', 'PMC10110899'): <Element 'article' at 0x11b2a4130>,
 (None, 'PMC10077291'): <Element 'article' at 0x11b324630>,
 ('33550886', 'PMC7889221'): <Element 'article' at 0x11b403ec0>,
 ('duplicate', '33550886', 'PMC7889221'): <Element 'article' at 0x11b4b6520>,
 ('32703946', 'PMC7378201'): <Element 'article' at 0x11b57cb80>,
 ('duplicate', '32703946', 'PMC7378201'): <Element 'article' at 0x11b5fc7c0>,
 ('33646117', 'PMC7946422'): <Element 'article' at 0x11b67c400>,
 ('duplicate', '33646117', 'PMC7946422'): <Element 'article' at 0x11b7540e0>,
 ('32859898', 'PMC7456324'): <Element 'article' at 0x11b7fbd30>,
 ('duplicate', '32859898', 'PMC7456324'): <Element 'article' at 0x11b8a64d0>,
 ('33319778', 'PMC7738495'): <Element 'article' at 0x11b948c70>,
 ('duplicate', '33319778', 'PMC7738495'): <Element 'article' at 0x11bac4ae0>,
 ('33240188', 'PMC7677513'): <Element 'article' at 0x11bc5c8b0>,
 ('duplicate', '33240188', 'PMC7677513'): <Element 'article' at 0x11bd0eb60>,
 ('33075129', 'PMC7868734'): <Element 'article' at 0x11bdc8db0>,
 ('duplicate', '33075129', 'PMC7868734'): <Element 'article' at 0x11be954e0>,
 ('32823573', 'PMC7460440'): <Element 'article' at 0x11bf59c10>,
 ('duplicate', '32823573', 'PMC7460440'): <Element 'article' at 0x11bfb2110>,
 ('33260635', 'PMC7730263'): <Element 'article' at 0x11c01a660>,
 ('duplicate', '33260635', 'PMC7730263'): <Element 'article' at 0x11c0bf010>,
 ('32913197', 'PMC7484793'): <Element 'article' at 0x11c15f9c0>,
 ('duplicate', '32913197', 'PMC7484793'): <Element 'article' at 0x11c1fdf80>,
 ('33469015', 'PMC7815729'): <Element 'article' at 0x11c28c4f0>}

In [10]:
# Check for duplicates based on pmid or pmc values
# Runs check_duplicates on the PMC9 duplicate-test extraction file and stores
# the returned dict.
# NOTE(review): the path below is absolute and machine-specific, so this cell
# only runs on the original author's machine unless the path is adjusted.
PMC9_articles_dict = check_duplicates('/Users/tillohlendorf/Documents/MBT/Module/Systems_BioMedicine/NLP/TFA_repo/sysbiomed_nlp_project/Till/Extract_XML_duplicate_test_PMC9.xml')

In [None]:
# Display the mapping returned by check_duplicates for the PMC9 test file
# (ID tuples -> <article> elements).
PMC9_articles_dict

PMC9_articles_dict: 25 articles
{(None, 'PMC9100044'): <Element 'article' at 0x10d208900>,
 ('35893909', 'PMC9332067'): <Element 'article' at 0x10e6147c0>,
 ('36795699', 'PMC9934417'): <Element 'article' at 0x10e5286d0>,
 ('35456965', 'PMC9031264'): <Element 'article' at 0x10e57fbf0>,
 ('35939623', 'PMC9361767'): <Element 'article' at 0x10e5e69d0>,
 (None, 'PMC9207268'): <Element 'article' at 0x10b7ad350>,
 ('36882420', 'PMC9992512'): <Element 'article' at 0x1195c5b20>,
 (None, 'PMC9568547'): <Element 'article' at 0x10b9099e0>,
 ('35534699', 'PMC9259489'): <Element 'article' at 0x10b9ef290>,
 ('35732736', 'PMC9259500'): <Element 'article' at 0x10dc47f10>,
 (None, 'PMC9319092'): <Element 'article' at 0x10da43ce0>,
 (None, 'PMC9017205'): <Element 'article' at 0x10e3c9620>,
 ('35572572', 'PMC9103485'): <Element 'article' at 0x10d44ee80>,
 (None, 'PMC9105816'): <Element 'article' at 0x10d40ccc0>,
 ('36771343', 'PMC9921660'): <Element 'article' at 0x10e113ab0>,
 ('35437952', 'PMC9081902'): <Element 'article' at 0x10e08d990>,
 (None, 'PMC9632171'): <Element 'article' at 0x10e4ab6a0>,
 (None, 'PMC9504724'): <Element 'article' at 0x10b606980>,
 ('35672519', 'PMC9213371'): <Element 'article' at 0x10df567a0>,
 (None, 'PMC9320355'): <Element 'article' at 0x10dfefd30>,
 ('35917163', 'PMC9484756'): <Element 'article' at 0x10e25c0e0>,
 ('36334897', 'PMC9672454'): <Element 'article' at 0x10d0b44a0>,
 (None, 'PMC9723555'): <Element 'article' at 0x10d02f600>,
 (None, 'PMC9966613'): <Element 'article' at 0x10d105080>,
 (None, 'PMC9391071'): <Element 'article' at 0x10d1dd8f0>}