In [1]:
import os
import re
import copy
import pandas as pd
import warnings

# Reading the .csv files with parsed PMIDs triggers pandas' DtypeWarning;
# ignore that warning category so it is not shown.
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)

In [2]:
# Directory containing the .txt files with PMIDs, typed at the prompt
# (surrounding whitespace is stripped).
# Enter it with its trailing path separator: file names are joined onto it later.
PMID_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\


In [3]:
# Root directory holding the .csv files with articles parsed from the XMLs, typed at the
# prompt (surrounding whitespace is stripped). It is walked recursively later on.
parsedDFs_input = input().strip()

 E:\sciAbstractsProject_parsedDFs\


In [4]:
# Combine all PMIDs in a single list.
# Each input file is read line by line; every line becomes one entry with its trailing
# whitespace removed. Files are processed in the order listed, so the combined list keeps
# the same ordering as reading them one after another.
pmid_filenames = [
    "pmids_until_19991231.txt",
    "pmids_20000101_until_20091231.txt",
    "pmids_20100101_until_20191231.txt",
    "pmids_20200101_until_20240701.txt",
]

articles_pmids = []
for pmid_filename in pmid_filenames:
    # os.path.join yields a valid path whether or not PMID_input ends with a separator,
    # unlike plain string concatenation.
    with open(os.path.join(PMID_input, pmid_filename)) as pmid_file:
        # Extend the combined list directly; the per-file intermediate lists are not kept.
        articles_pmids.extend(line.rstrip() for line in pmid_file)

In [5]:
# Total number of PMIDs read from the input files (repeated PMIDs are still counted here)
len(articles_pmids)

37845132

In [6]:
# Split the combined PMIDs in two: the set of distinct values, and a list holding every
# repeat occurrence (each appearance of a PMID after its first one), in encounter order.
unique_pmids = set()
duplicated_pmids = []

for candidate_pmid in articles_pmids:
    if candidate_pmid not in unique_pmids:
        # First time this PMID is seen
        unique_pmids.add(candidate_pmid)
        continue
    # Already seen: record the repeat
    duplicated_pmids.append(candidate_pmid)

In [7]:
# Number of distinct PMIDs
len(unique_pmids)

37544208

In [8]:
# Number of repeat occurrences: every appearance of a PMID after its first one is counted,
# so a PMID present three times contributes two entries here
len(duplicated_pmids)

300924

In [9]:
# Save lists of unique and duplicated PMIDs as .txt files (one PMID per line).
# "E:" + os.sep anchors the path at the root of the drive. On Windows,
# os.path.join("E:", "PMIDsInitialParsing", ...) gives the drive-relative path
# "E:PMIDsInitialParsing\...", which resolves against the current directory of drive E:
# rather than against its root.
output_dir = os.path.join("E:" + os.sep, "PMIDsInitialParsing")

# unique_pmids is a set, so the line order of unique_PMIDs.txt is arbitrary.
with open(os.path.join(output_dir, "unique_PMIDs.txt"), 'w') as output:
    output.writelines(str(row) + '\n' for row in unique_pmids)

# duplicated_pmids is a list, so its lines keep the order in which repeats were found.
with open(os.path.join(output_dir, "duplicated_PMIDs.txt"), 'w') as output:
    output.writelines(str(row) + '\n' for row in duplicated_pmids)

In [9]:
# Walk the parsed-articles directory tree and collect the full path of every file whose
# name ends in ".csv"; all other files are skipped.
parsed_DFs = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(parsedDFs_input)
    for filename in filenames
    if filename.endswith(".csv")
]

In [None]:
# For each .csv file, extract the values of the 'PMID' column and add them to a list
parsedDFs_PMIDs = []
total_files = len(parsed_DFs)

# enumerate supplies the 1-based position directly; list.index() would rescan the list on
# every iteration and report the first match only if a path appeared twice.
for file_number, DF in enumerate(parsed_DFs, start=1):
    print("Extracting PMIDs from file: " + str(file_number) + "/" + str(total_files))
    # Only the PMID column is used, so skip parsing every other column of the file.
    # NOTE: a file without a 'PMID' column now fails inside read_csv (ValueError)
    # instead of at the column lookup (KeyError).
    df = pd.read_csv(DF, usecols=["PMID"])
    parsedDFs_PMIDs.extend(df["PMID"].values)

Extracting PMIDs from file: 1/382
Extracting PMIDs from file: 2/382


In [14]:
# Split the parsed PMIDs in two: the set of distinct values, and a list holding every
# repeat occurrence. Each value is compared and stored in its str() form.
unique_pmids_parsed_DFs = set()
duplicated_pmids_parsed_DFs = []

for parsed_pmid in parsedDFs_PMIDs:
    pmid_text = str(parsed_pmid)
    if pmid_text not in unique_pmids_parsed_DFs:
        # First time this PMID is seen
        unique_pmids_parsed_DFs.add(pmid_text)
    else:
        # Already seen: record the repeat
        duplicated_pmids_parsed_DFs.append(pmid_text)

In [19]:
# Total number of PMID values extracted from the .csv files (repeats included)
len(parsedDFs_PMIDs)

37844775

In [20]:
# Number of distinct parsed PMIDs
len(unique_pmids_parsed_DFs)

37543881

In [21]:
# Number of repeat occurrences among the parsed PMIDs
len(duplicated_pmids_parsed_DFs)

300894

In [22]:
# Save lists of unique and duplicated parsed PMIDs as .txt files (one PMID per line).
# "E:" + os.sep anchors the path at the root of the drive. On Windows,
# os.path.join("E:", "PMIDsInitialParsing", ...) gives the drive-relative path
# "E:PMIDsInitialParsing\...", which resolves against the current directory of drive E:
# rather than against its root.
parsed_output_dir = os.path.join("E:" + os.sep, "PMIDsInitialParsing")

# unique_pmids_parsed_DFs is a set, so the line order of this file is arbitrary.
with open(os.path.join(parsed_output_dir, "parsed_unique_PMIDs.txt"), 'w') as output:
    output.writelines(str(row) + '\n' for row in unique_pmids_parsed_DFs)

# duplicated_pmids_parsed_DFs is a list, so its lines keep the order repeats were found in.
with open(os.path.join(parsed_output_dir, "parsed_duplicated_PMIDs.txt"), 'w') as output:
    output.writelines(str(row) + '\n' for row in duplicated_pmids_parsed_DFs)

In [23]:
# Missing (not parsed) PMIDs: members of the input PMID set that are absent from the
# parsed PMID set (set subtraction of the two)
notParsedPMIDs = unique_pmids - unique_pmids_parsed_DFs

In [24]:
# Save missing (not parsed) PMIDs, one per line.
# "E:" + os.sep anchors the path at the root of the drive. On Windows,
# os.path.join("E:", "PMIDsInitialParsing", ...) gives the drive-relative path
# "E:PMIDsInitialParsing\...", which resolves against the current directory of drive E:
# rather than against its root.
not_parsed_path = os.path.join("E:" + os.sep, "PMIDsInitialParsing", "not_parsed_PMIDS.txt")

# notParsedPMIDs is a set, so the line order of the file is arbitrary.
with open(not_parsed_path, 'w') as output:
    output.writelines(str(row) + '\n' for row in notParsedPMIDs)

In [26]:
# Number of missing (not parsed) PMIDs
len(notParsedPMIDs)

327

In [27]:
# Display the missing (not parsed) PMIDs
notParsedPMIDs

{'26641866',
 '26641867',
 '26641871',
 '26641872',
 '27270254',
 '27270255',
 '27270256',
 '27845535',
 '27845536',
 '27845537',
 '27845538',
 '27977291',
 '29283619',
 '29283620',
 '29608087',
 '29608088',
 '29902042',
 '29902043',
 '30589330',
 '30920266',
 '30920267',
 '31033592',
 '31512912',
 '31512913',
 '31512914',
 '31613213',
 '31687108',
 '31713074',
 '31815517',
 '31988696',
 '32202837',
 '32202839',
 '32202840',
 '32378917',
 '32525357',
 '32525358',
 '32525359',
 '32525360',
 '32525361',
 '32760179',
 '32869736',
 '32940509',
 '32955288',
 '32955289',
 '32981335',
 '32984194',
 '33483229',
 '33591786',
 '33591787',
 '33591788',
 '34053074',
 '34223521',
 '34410782',
 '34807650',
 '34824687',
 '34852679',
 '34931577',
 '35112295',
 '35225061',
 '35414849',
 '35811169',
 '35838395',
 '35848347',
 '35848356',
 '35848357',
 '35848360',
 '35848374',
 '35850556',
 '35866530',
 '35900525',
 '35944616',
 '35981213',
 '36190112',
 '36222400',
 '36222401',
 '36222402',
 '36222403',

THESE PMIDS ARE NOT ASSOCIATED WITH ANY PAPER IN PUBMED

In [29]:
# Print every duplicated input PMID that is not among the duplicated parsed PMIDs, with
# the running position printed every 10000 entries as a progress marker.
# A set makes each membership test constant-time instead of a scan of the whole list.
duplicated_parsed_lookup = set(duplicated_pmids_parsed_DFs)

# enumerate supplies the position directly; list.index() would rescan the list on every
# iteration and always return the first match, so repeated PMIDs got the wrong position.
for position, el in enumerate(duplicated_pmids):
    if position % 10000 == 0:
        print(position)
    if el not in duplicated_parsed_lookup:
        print(el)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
23316251
23317426
23326896
23596640
23596641
23596642
23596643
100000
29651827
29651831
29651832
29651833
29651920
29651921
29651922
29651923
29651924
30098369
30226401
30387724
30387737
30387758
30387759
30387760
110000
30939070
30939076
30939080
30939081
30939103
30939105
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
37437081
38630970
