In [5]:
pip install linkify-it-py -q

Note: you may need to restart the kernel to use updated packages.


In [6]:
import json
import re
import time

from linkify_it import LinkifyIt
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys
from tqdm import tqdm
import requests
import xmltodict

%reload_ext autoreload
%autoreload 2
sys.path.append('../scripts')
import pbmd_tools as pbmd

In [21]:
pbmd.read_tokens("../.env")
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
PUBMED_TOKEN = os.environ.get("PUBMED_TOKEN")

In [8]:
linkify = (
    LinkifyIt()
    .set({"fuzzy_email": False}) 
)

In [38]:
PMIDs = []
queries = []
for year in range(2009, 2023):
    queries.append(f'(("http"[Title/Abstract])) AND (("{year}/01/01"[Date - Publication] : "{year}/12/31"[Date - Publication]))')
stats = pbmd.get_forges_stat(queries, PMIDs)
len(PMIDs)

100%|██████████| 14/14 [00:28<00:00,  2.04s/it]


47467

In [None]:
db = 'pubmed'
domain = 'https://www.ncbi.nlm.nih.gov/entrez/eutils'
retmode = 'xml'

In [40]:
for PMID in tqdm(PMIDs):
    url = f'{domain}/efetch.fcgi?db={db}&id={PMID}&retmode={retmode}&rettype=abstract&api_key={PUBMED_TOKEN}'  
    try:
        response = requests.get(url)
        if response.status_code == 200:
            with open(f"../data/xml/{PMID}.xml", "wb") as file:
                file.write(response.content)
        else:
            print(f"{PMID} failed :", response.status_code)
    except:
        continue

100%|██████████| 46/46 [00:23<00:00,  1.95it/s]


In [41]:
PMIDs_failed = []

files = os.listdir('../data/xml')
for file in files:
    if file[-4:] != ".xml":
        files.remove(file)

PMIDs_succed = [file.split('.')[0] for file in files]

for id in PMIDs:
    if id not in PMIDs_succed :
        PMIDs_failed.append(id)
print(len(PMIDs))
print(len(PMIDs_succed))
print(len(PMIDs_failed))

47467
47467
0


In [42]:
with_http = 0
without_http = 0

links_stat = {}

for file in files:
    with open(f"../data/xml/{file}", "r") as f:
        summary = xmltodict.parse(f.read())
        abstract = pbmd.get_abstract_from_summary(summary, "../data/log_files/status_all_links.txt")
    
        if abstract != None:

            if linkify.test(abstract):
                for match in linkify.match(abstract):
                    link = match.raw
                    if link.startswith('http://github') or link.startswith('https://github'):
                        with_http += 1
                    elif link.startswith('github'):
                        without_http += 1
                    try:
                        key = link.split('/')[2]
                    except:
                        try:
                            key = link.split('/')[0]
                        except:
                            key = link
                    if key in links_stat:
                        links_stat[key] += 1
                    else:
                        links_stat[key] = 1

In [43]:
len(links_stat)

17726

In [44]:
sum(links_stat.values())

56935

In [45]:
links_stat_lower = {}

for key in links_stat.keys():
    if key.lower() in links_stat_lower:
        links_stat_lower[key.lower()] += links_stat[key]
    else:
        links_stat_lower[key.lower()] = links_stat[key]

In [46]:
len(links_stat_lower)

17532

In [47]:
keys_to_modify = [key for key in links_stat_lower.keys() if key.startswith('www')]

for key in keys_to_modify:
    if key[4:] in links_stat_lower:
        links_stat_lower[key[4:]] += links_stat_lower[key]
    else:
        links_stat_lower[key[4:]] = links_stat_lower[key]
    links_stat_lower[key] = 0
    

In [48]:
keys_to_delete = [key for key, value in links_stat_lower.items() if value == 0]
for key in keys_to_delete:
    del links_stat_lower[key]

In [49]:
len(links_stat_lower)

17038

In [50]:
with open("../data/links_stat_lower.json", "w") as f:
    json.dump(links_stat_lower, f)

In [None]:
'github.com': 1145
'sourceforge.net': 352
'code.google.com': 206
'bitbucket.org': 45
'gitlab.com': 16

In [51]:
dict(sorted(links_stat_lower.items(), key=lambda x: x[1], reverse=True))

{'clinicaltrials.gov': 6128,
 'links.lww.com': 2333,
 'dx.doi.org': 1989,
 'webcitation.org': 1231,
 'github.com': 1145,
 'chictr.org.cn': 1117,
 'elsevier.com': 861,
 'onlinelibrary.wiley.com': 780,
 'diagnosticpathology.diagnomx.eu': 639,
 'ncbi.nlm.nih.gov': 500,
 'anzctr.org.au': 460,
 'isrctn.com': 449,
 'controlled-trials.com': 434,
 'ow.ly': 381,
 'sourceforge.net': 352,
 'trialregister.nl': 343,
 'crd.york.ac.uk': 338,
 'bioconductor.org': 319,
 'umin.ac.jp': 270,
 'biomedcentral.com': 259,
 'cancerres.aacrjournals.org': 252,
 'cran.r-project.org': 252,
 'doi.org': 239,
 'ebi.ac.uk': 220,
 'code.google.com': 205,
 'bit.ly': 195,
 'irct.ir': 188,
 'youtu.be': 186,
 'proteomecentral.proteomexchange.org': 173,
 'rimed.org': 172,
 'radiology.rsna.org': 146,
 'springer.com': 142,
 'cdc.gov': 131,
 'ctri.nic.in': 123,
 'journals.sagepub.com': 111,
 'who.int': 103,
 'ensaiosclinicos.gov.br': 101,
 '': 99,
 'apps.who.int': 94,
 'nt.ars-grin.gov': 92,
 'chictr.org': 88,
 'w3.org': 88,
 

In [9]:
df = pd.read_csv('../data/articles.tsv', sep='\t')
abstracts = df['Abstract'].to_list()

with_http = 0
without_http = 0
links_stat = {}

for abstract in abstracts:
    if abstract != None:

        if linkify.test(str(abstract)):
            for match in linkify.match(str(abstract)):
                link = match.raw
                if link.startswith('http://github') or link.startswith('https://github'):
                    with_http += 1
                elif link.startswith('github') :
                    without_http += 1
                try:
                    key = link.split('/')[2]
                except:
                    try:
                        key = link.split('/')[0]
                    except:
                        key = link
                if key in links_stat:
                    links_stat[key] += 1
                else:
                    links_stat[key] = 1

In [10]:
with_http, without_http 

(10840, 329)