# Current Trends in Bioinformatics Software Development and Archiving

In [1]:
from datetime import datetime
import json
import re
import time

import dotenv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import requests
import warnings
import xmltodict

%reload_ext autoreload
%autoreload 2
import pbmd_tools as pbmd

In [3]:
# Read the API credentials from the process environment.
# NOTE(review): presumably pbmd.read_tokens() is what puts them into os.environ — confirm in pbmd_tools.
pbmd.read_tokens()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
PUBMED_TOKEN = os.getenv("PUBMED_TOKEN")

## 1. PubMed API Entrez Interactions

In [15]:
# E-utilities settings shared by the PubMed search requests.
db = "pubmed"
domain = "https://www.ncbi.nlm.nih.gov/entrez/eutils"
retmode = "json"

# One list of search terms per forge; each list gets one query per publication year.
queries_github, queries_gitlab, queries_sourceforge = [], [], []
queries_googlecode, queries_bitbucket = [], []

# Build the queries for every forge and every year from 2009 through 2022.
for year in range(2009, 2023):
    # Date filter restricting the search to a single calendar year.
    pub_window = f'("{year}/01/01"[Date - Publication] : "{year}/12/31"[Date - Publication])'
    queries_github += [f'((github.com[Title/Abstract])) AND ({pub_window})']
    queries_gitlab += [f'((gitlab.com[Title/Abstract])) AND ({pub_window})']
    queries_sourceforge += [f'((sourceforge.net[Title/Abstract])) AND ({pub_window})']
    queries_googlecode += [f'(googlecode) AND {pub_window}']
    queries_bitbucket += [f'(bitbucket.org[Title/Abstract]) AND {pub_window}']

In [72]:
# Dictionaries storing the number of articles for each forge and each year,
# e.g. {'2009': 0, '2010': 5, '2011': 15, ... }

ESEARCH_RETMAX = 15000  # maximum number of IDs requested per query
QUERY_YEAR_PATTERN = re.compile(r'"(\d{4})/01/01"')  # first day of the queried year


def count_new_pmids(queries, known_ids):
    """Run each esearch query and count the PubMed IDs that were not seen before.

    The same article can be returned for more than one query (its publication
    date can be the electronic or the regular one), so every ID is counted
    only the first time it appears.

    Parameters
    ----------
    queries : list of str
        PubMed search terms, one per publication year. Each term must contain
        a quoted date of the form "YYYY/01/01", from which the year is read.
    known_ids : list of str
        IDs collected so far. New IDs are appended in place, so passing the
        same list to several calls de-duplicates across those calls.

    Returns
    -------
    dict
        Maps the four-digit year (as a string) to the number of new IDs.

    Raises
    ------
    ValueError
        If a query does not contain a recognisable year.
    requests.HTTPError
        If the esearch endpoint answers with an error status.
    """
    seen = set(known_ids)  # set membership is O(1); scanning the list was O(n)
    yearly_counts = {}
    for query in tqdm(queries):
        year_match = QUERY_YEAR_PATTERN.search(query)
        if year_match is None:
            raise ValueError(f"Cannot find the publication year in query: {query}")

        # Let requests encode the search term instead of pasting it into the URL.
        response = requests.get(
            f"{domain}/esearch.fcgi",
            params={"db": db, "retmode": retmode, "retmax": ESEARCH_RETMAX, "term": query},
        )
        response.raise_for_status()
        id_list = response.json()["esearchresult"]["idlist"]

        # dict.fromkeys removes repeats inside one answer while keeping the order.
        new_ids = [pmid for pmid in dict.fromkeys(id_list) if pmid not in seen]
        seen.update(new_ids)
        known_ids.extend(new_ids)
        yearly_counts[year_match.group(1)] = len(new_ids)
    return yearly_counts


PMIDs = []      # IDs of the articles mentioning github.com
PMIDs_all = []  # IDs of the articles mentioning any of the other forges

stats_github = count_new_pmids(queries_github, PMIDs)

# NOTE(review): the four forges below share PMIDs_all, so an article that
# mentions two of them is counted only for the first forge queried — confirm
# this is intended.
stats_bitbucket = count_new_pmids(queries_bitbucket, PMIDs_all)
stats_gitlab = count_new_pmids(queries_gitlab, PMIDs_all)
stats_sourceforge = count_new_pmids(queries_sourceforge, PMIDs_all)
stats_googlecode = count_new_pmids(queries_googlecode, PMIDs_all)

print(f"\n{len(PMIDs)} articles with 'github.com' found in PubMed")

# Check that there are no duplicates. The list is asserted to be unique rather
# than rebuilt from a set, because a set would shuffle the IDs differently on
# every run.
assert len(PMIDs) == len(set(PMIDs)), "duplicate PubMed IDs collected"
print(f"\n{len(PMIDs)} articles with 'github.com' found in PubMed")

100%|██████████| 14/14 [00:10<00:00,  1.34it/s]
100%|██████████| 14/14 [00:08<00:00,  1.57it/s]
100%|██████████| 14/14 [00:08<00:00,  1.63it/s]
100%|██████████| 14/14 [00:08<00:00,  1.58it/s]
100%|██████████| 14/14 [00:09<00:00,  1.54it/s]


10880 articles with 'github.com' found in PubMed

10880 articles with 'github.com' found in PubMed





In [90]:
#saving the statistics to reuse it

# PubMed IDs: plain text, one ID per line.
with open("PMIDs.txt", "w") as pmid_file:
    pmid_file.writelines(f"{PMID}\n" for PMID in PMIDs)

# Yearly counts: one JSON file per forge.
stats_by_file = {
    "stats_github.json": stats_github,
    "stats_gitlab.json": stats_gitlab,
    "stats_sourceforge.json": stats_sourceforge,
    "stats_googlecode.json": stats_googlecode,
    "stats_bitbucket.json": stats_bitbucket,
}
for file_name, yearly_counts in stats_by_file.items():
    with open(file_name, "w") as stats_file:
        json.dump(yearly_counts, stats_file)

In [230]:
# https://pubmed.ncbi.nlm.nih.gov/26262258/ - No DOI in PubMed although there is one in the full text of the article (not from ArXiv), and there are a lot of them
# https://pubmed.ncbi.nlm.nih.gov/28269829/ - they give a wrong link https://github.com/SBU-BMI/imageboxs://github.com/SBU-BMI/imagebox but if you use this link:
# https://github.com/SBU-BMI/imagebox it works. Yet, I am not sure that it is actually what we are looking for, since they also provide another link to github.io
# (also incorrect), and I think it's more likely that their code is there
# PMID = 36789260 - 2 links
#https://github.com/tyqGitHub/TYQ/tree/master/GACNNMDA - ????
#https://github.com/mofradlab - ?????? (PMID 36786404)
# PMID = 26124555 - a space in the link
# PMID = 24324759, 22151646 - no space after link
# PMID = 23849037 - why + in the end ?
# PMID = 36315552 - super smart

In [101]:
# Accumulates one (PMID, pubdate, doi, journal, title, abstract) tuple per fetched article.
results = []

In [131]:
# The PubMed API rate limit is 10 requests per second with a token and 3 requests per second without it.

STATUS_LOG = "status.txt"   # log file handed to every pbmd helper
MAX_FETCH_ATTEMPTS = 2      # one initial try plus one retry
EMPTY_RECORD_PAUSE_S = 2    # pause before re-fetching a record that came back empty


def fetch_article_record(pmid):
    """Fetch the PubMed summary of one article and extract the fields kept for analysis.

    Parameters
    ----------
    pmid : str or int
        PubMed identifier of the article.

    Returns
    -------
    tuple
        (pubdate, doi, journal, title, abstract), each as returned by the
        matching pbmd.get_*_from_summary helper.
    """
    summary = pbmd.get_summary(pmid, PUBMED_TOKEN, STATUS_LOG)
    abstract = pbmd.get_abstract_from_summary(summary, STATUS_LOG)
    pubdate = pbmd.get_pubdate_from_summary(summary, STATUS_LOG)
    title = pbmd.get_title_from_summary(summary, STATUS_LOG)
    journal = pbmd.get_journal_from_summary(summary, STATUS_LOG)
    doi = pbmd.get_doi_from_summary(summary, STATUS_LOG)
    return pubdate, doi, journal, title, abstract


failed_pmids = []  # PMIDs skipped because every attempt raised

# NOTE(review): `pm` is not defined in the visible part of this notebook. The
# saved output (59 items) suggests it held the PMIDs being re-fetched; for a
# fresh run it presumably has to be the full `PMIDs` list — confirm.
for PMID in tqdm(pm):
    record = None
    for _attempt in range(MAX_FETCH_ATTEMPTS):
        try:
            record = fetch_article_record(PMID)
            break
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C still stops the loop.
            continue
    if record is None:
        failed_pmids.append(PMID)
        continue

    pubdate, doi = record[0], record[1]
    # A record with neither a publication date nor a DOI is treated as a
    # possible API glitch: wait, then fetch it once more.
    if pubdate is None and doi is None:
        time.sleep(EMPTY_RECORD_PAUSE_S)
        try:
            record = fetch_article_record(PMID)
        except Exception:
            # Keep the first, incomplete record instead of aborting the whole loop.
            pass

    results.append((PMID, *record))

if failed_pmids:
    print(f"{len(failed_pmids)} PMIDs could not be fetched: {failed_pmids}")

100%|██████████| 59/59 [01:40<00:00,  1.70s/it]


In [141]:
# Labels for the positional fields of each tuple in `results`.
column_names = dict(enumerate(['PMID', 'PubDate', 'DOI', 'Journal', 'Title', 'Abstract']))

# Build the table, keep the first row for every PMID and renumber the rows.
df = (
    pd.DataFrame.from_records(results)
    .rename(columns=column_names)
    .drop_duplicates(subset='PMID')
    .reset_index(drop=True)
)

In [142]:
# Number of articles (rows) in the table.
df.shape[0]

10880

In [149]:
# Count the rows whose publication date is missing.
missing_pubdate_count = df['PubDate'].isna().sum()
print(f"Number of records without publication date is: {missing_pubdate_count}")

Number of records without publication date is: 59


In [169]:
# Count the records without a publication date whose PMID occurs exactly once
# (keep=False drops every member of a duplicated group).
len(df[df['PubDate'].isnull()].drop_duplicates(subset='PMID', keep=False))

Unnamed: 0,PMID,PubDate,DOI,Journal,Title,Abstract,GitHub_link_raw,GitHub_link_clean,GitHub_owner,GitHub_repo
515,32165788,,10.1021/acs.chemmater.9b02166,Chemistry of materials : a publication of the ...,Accelerated Discovery of Efficient Solar-cell ...,Solar-energy plays an important role in solvin...,github.com/usnistgov/jarvis.,https://github.com/usnistgov/jarvis/,usnistgov,jarvis
591,32165790,,10.1016/j.commatsci.2019.02.006,Computational materials science,Convergence and machine learning predictions o...,"In this work, we developed an automatic conver...",github.com/usnistgov/jarvis.,https://github.com/usnistgov/jarvis/,usnistgov,jarvis
808,36438203,,10.1109/icdm51629.2021.00097,Proceedings. IEEE International Conference on ...,SCEHR: Supervised Contrastive Learning for Cli...,Contrastive learning has demonstrated promisin...,github.com/calvin-zcx/SCEHR.,https://github.com/calvin-zcx/SCEHR/,calvin-zcx,SCEHR
1050,30918410,,10.1038/s41586-019-1091-9,Nature,Author Correction: Universal resilience patter...,"In this Letter, in Fig. 3c and f the Saccharom...",github.com/jianxigao/NuRsE,https://github.com/jianxigao/NuRsE/,jianxigao,NuRsE
1562,33708457,,10.1145/3292500.3330975,KDD : proceedings. International Conference on...,A Free Energy Based Approach for Distance Metr...,We present a reformulation of the distance met...,github.com/kouroshz/fenn.,https://github.com/kouroshz/fenn/,kouroshz,fenn
1597,37193322,,10.24963/ijcai.2022/301,IJCAI : proceedings of the conference,Adapt to Adaptation: Learning Personalization ...,Conventional federated learning (FL) trains on...,github.com/ljaiverson/pFL-APPLE.,https://github.com/ljaiverson/pFL-APPLE/,ljaiverson,pFL-APPLE
1701,35088055,,,Proceedings of machine learning research,Weakly-supervised High-resolution Segmentation...,"In the last few years, deep learning classifie...",github.com/nyukat/GLAM.,https://github.com/nyukat/GLAM/,nyukat,GLAM
1724,36532945,,10.24963/ijcai.2022/498,IJCAI : proceedings of the conference,Stabilizing and Enhancing Link Prediction thro...,Graph neural networks have been widely used fo...,github.com/xinxingwu-uk/DGAE.,https://github.com/xinxingwu-uk/DGAE/,xinxingwu-uk,DGAE
1953,33390682,,,Advances in neural information processing systems,Evaluating Protein Transfer Learning with TAPE.,Machine learning applied to protein sequences ...,github.com/songlab-cal/tape.,https://github.com/songlab-cal/tape/,songlab-cal,tape
1985,37139473,,,Proceedings of machine learning research,Multi Resolution Analysis (MRA) for Approximat...,Transformers have emerged as a preferred model...,github.com/mlpen/mra-attention.,https://github.com/mlpen/mra-attention/,mlpen,mra-attention


In [150]:
# Checkpoint: write the article table as tab-separated text, without the index column.
df.to_csv('articles.tsv', sep='\t', index=False)

## 2. Getting links from the obtained data using regex

In [151]:
# Reload the article table from the tab-separated checkpoint file.
df = pd.read_csv('articles.tsv', sep='\t')

In [152]:
# Extract the link from each abstract, normalise it, then split it into owner and repository.
# Both inputs are cast to str before being handed to the pbmd helpers.
raw_links = df['Abstract'].astype(str).apply(pbmd.get_link_from_abstract)
clean_links = raw_links.astype(str).apply(pbmd.clean_link)

df['GitHub_link_raw'] = raw_links
df['GitHub_link_clean'] = clean_links
df['GitHub_owner'] = clean_links.apply(pbmd.get_owner_from_link)
df['GitHub_repo'] = clean_links.apply(pbmd.get_repo_from_link)

In [162]:
# Rows for which no GitHub owner could be extracted.
missing_owner_count = df['GitHub_owner'].isna().sum()
print(f"Number of records with weird abstracts leading to inability to extract a link: {missing_owner_count}")

Number of records with weird abstracts leading to inability to extract a link: 6


In [163]:
# Rows lacking a repository name, minus the rows that lack an owner.
missing_repo = df['GitHub_repo'].isna().sum()
missing_owner = df['GitHub_owner'].isna().sum()
print(f"Number of records without a repository name: {missing_repo - missing_owner}")

Number of records without a repository name: 251


In [164]:
# Checkpoint: overwrite articles.tsv with the table that now includes the GitHub link columns.
df.to_csv('articles.tsv', sep='\t', index=False)

## 3. GitHub API Interactions

In [165]:
# Reload the article table from the tab-separated checkpoint file.
df = pd.read_csv('articles.tsv', sep='\t')

In [166]:
# Articles that have both a repository name and a publication date.
has_repo_and_date = df['GitHub_repo'].notna() & df['PubDate'].notna()
PMIDs = df.loc[has_repo_and_date, 'PMID'].tolist()
len(PMIDs)

10565

In [9]:
GITHUB_LOG = "gitstat.txt"  # log file shared with pbmd.get_repo_info
# One hour. NOTE(review): presumably the GitHub API rate-limit window — confirm.
RETRY_PAUSE_S = 3600

for PMID in tqdm(PMIDs):
    # Locate the row once instead of re-filtering the whole frame for every field.
    idx = df.index[df['PMID'] == PMID][0]
    owner = df.at[idx, 'GitHub_owner']
    repo = df.at[idx, 'GitHub_repo']
    link = df.at[idx, 'GitHub_link_clean']

    with open(GITHUB_LOG, "a") as f:
        f.write(f"\n\n PMID: {PMID}, GitHub link: {link}")

    info = pbmd.get_repo_info(owner, repo, GITHUB_TOKEN, GITHUB_LOG)

    if not info["status"]:
        # NOTE(review): any falsy status triggers the one-hour pause; whether
        # "status" is falsy only on rate limiting cannot be told from here.
        time.sleep(RETRY_PAUSE_S)
        info = pbmd.get_repo_info(owner, repo, GITHUB_TOKEN, GITHUB_LOG)

    # The result of the (possibly retried) request is stored whatever its status.
    df.loc[idx, "Repo_created_at"] = pbmd.get_repo_date_created(info)
    df.loc[idx, "Repo_updated_at"] = pbmd.get_repo_date_updated(info)
    df.loc[idx, "Fork"] = pbmd.is_fork(info)

100%|██████████████████████████████████████████████████████████████████████████████| 1077/1077 [05:01<00:00,  3.57it/s]


In [10]:
# Articles for which no repository creation date was stored.
no_creation_date = df['Repo_created_at'].isna()
PMIDs = df.loc[no_creation_date, 'PMID'].tolist()
len(PMIDs)

766

In [None]:
# Checkpoint: overwrite articles.tsv with the current table (tab-separated, no index column).
df.to_csv('articles.tsv', sep='\t', index=False)

## 4. Software Heritage API interactions

In [2]:
# Reload the article table from the tab-separated checkpoint file.
df = pd.read_csv('articles.tsv', sep='\t')

In [242]:
# Articles for which a repository name is available.
has_repo = df['GitHub_repo'].notna()
PMIDs = df.loc[has_repo, 'PMID'].tolist()
len(PMIDs)

11914

In [270]:
# Ask Software Heritage about the cleaned GitHub link of every selected article.
for PMID in tqdm(PMIDs):
    article_rows = df[df['PMID'] == PMID]
    repo_url = article_rows['GitHub_link_clean'].values[0]
    swh_info = pbmd.check_is_in_softwh(repo_url)

    # Store both derived fields on the first row matching this PMID.
    row_label = article_rows.index[0]
    df.loc[row_label, "In_SoftWH"] = pbmd.is_in_softwh(swh_info)
    df.loc[row_label, "Archived"] = pbmd.get_date_archived(swh_info)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 15.18it/s]


In [282]:
# Checkpoint: overwrite articles.tsv with the current table (tab-separated, no index column).
df.to_csv('articles.tsv', sep='\t', index=False)

## Unresolved links analysis

In [383]:
# Load the table of unresolved links, keeping only the listed columns.
# NOTE(review): no_info.tsv is not written by any visible cell of this notebook;
# the 'Issue' and 'Correct_link' columns are presumably hand-annotated — confirm provenance.
df = pd.read_csv('no_info.tsv', sep='\t',usecols=['PMID', 'PubDate', 'DOI', 'Journal', 'Title', 'Abstract', 'Issue', 'GitHub_link_clean','Correct_link'])

In [384]:
# Rows whose issue label is exactly 'wrong link'.
wrong_link_count = (df['Issue'] == 'wrong link').sum()
print(f"Number of articles with a wrong link (either a space in the link, or no space after link, etc) : {wrong_link_count}")

Number of articles with a wrong link (either a space in the link, or no space after link, etc) : 58


In [385]:
# Rows whose issue label is exactly 'renamed'.
renamed_count = (df['Issue'] == 'renamed').sum()
print(f"Number of articles with a renamed repository : {renamed_count}")

Number of articles with a renamed repository : 11


In [386]:
# A repository counts as deleted when either the owner or the repository itself is gone.
deleted_count = df['Issue'].isin(['owner deleted', 'repo deleted']).sum()
print(f"Number of articles with a deleted repository : {deleted_count}")

Number of articles with a deleted repository : 146


Resolving unresolved but existing links

In [387]:
# Rows with a corrected link take owner and repository from it;
# all other rows fall back to the cleaned link.
has_correction = df['Correct_link'].notna()
for row_mask, link_column in ((has_correction, 'Correct_link'), (~has_correction, 'GitHub_link_clean')):
    selected_links = df.loc[row_mask, link_column]
    df.loc[row_mask, 'GitHub_owner'] = selected_links.apply(pbmd.get_owner_from_link)
    df.loc[row_mask, 'GitHub_repo'] = selected_links.apply(pbmd.get_repo_from_link)

In [389]:
# Articles for which a repository name could be derived.
has_repo = df['GitHub_repo'].notna()
PMIDs = df.loc[has_repo, 'PMID'].tolist()
len(PMIDs)

215

In [390]:
GITHUB_LOG = "gitstat.txt"  # log file shared with pbmd.get_repo_info
# Pause for an hour after every 5000 requests.
# NOTE(review): presumably matches the GitHub API hourly quota — confirm.
REQUESTS_PER_PAUSE = 5000
PAUSE_S = 3600

for count, PMID in enumerate(tqdm(PMIDs), start=1):
    if count % REQUESTS_PER_PAUSE == 0:
        time.sleep(PAUSE_S)

    # Locate the row once instead of re-filtering the whole frame for every field.
    idx = df.index[df['PMID'] == PMID][0]
    owner = df.at[idx, 'GitHub_owner']
    repo = df.at[idx, 'GitHub_repo']

    # Log the link the owner/repo were derived from: the corrected link when
    # there is one, otherwise the cleaned link (logging 'Correct_link' alone
    # wrote "nan" for every row without a correction).
    corrected = df.at[idx, 'Correct_link']
    link_used = corrected if pd.notna(corrected) else df.at[idx, 'GitHub_link_clean']
    with open(GITHUB_LOG, "a") as f:
        f.write(f"\n\n PMID: {PMID}, GitHub link: {link_used}")

    info = pbmd.get_repo_info(owner, repo, GITHUB_TOKEN, GITHUB_LOG)

    df.loc[idx, "Repo_created_at"] = pbmd.get_repo_date_created(info)
    df.loc[idx, "Repo_updated_at"] = pbmd.get_repo_date_updated(info)
    df.loc[idx, "Fork"] = pbmd.is_fork(info)

100%|████████████████████████████████████████████████████████████████████████████████| 215/215 [01:05<00:00,  3.30it/s]


In [391]:
# Articles that have a corrected link.
has_correction = df['Correct_link'].notna()
PMIDs = df.loc[has_correction, 'PMID'].tolist()
len(PMIDs)

69

In [392]:
# Ask Software Heritage about the corrected link of every selected article.
for PMID in tqdm(PMIDs):
    article_rows = df[df['PMID'] == PMID]
    corrected_url = article_rows['Correct_link'].values[0]
    swh_info = pbmd.check_is_in_softwh(corrected_url)

    # Store both derived fields on the first row matching this PMID.
    row_label = article_rows.index[0]
    df.loc[row_label, "In_SoftWH"] = pbmd.is_in_softwh(swh_info)
    df.loc[row_label, "Archived"] = pbmd.get_date_archived(swh_info)

100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:13<00:00,  5.03it/s]


In [393]:
# Articles without a corrected link.
lacks_correction = df['Correct_link'].isna()
PMIDs = df.loc[lacks_correction, 'PMID'].tolist()
len(PMIDs)

146

In [394]:
# Ask Software Heritage about the cleaned GitHub link of every selected article.
for PMID in tqdm(PMIDs):
    article_rows = df[df['PMID'] == PMID]
    cleaned_url = article_rows['GitHub_link_clean'].values[0]
    swh_info = pbmd.check_is_in_softwh(cleaned_url)

    # Store both derived fields on the first row matching this PMID.
    row_label = article_rows.index[0]
    df.loc[row_label, "In_SoftWH"] = pbmd.is_in_softwh(swh_info)
    df.loc[row_label, "Archived"] = pbmd.get_date_archived(swh_info)

100%|████████████████████████████████████████████████████████████████████████████████| 146/146 [00:56<00:00,  2.60it/s]


In [396]:
# Rows whose 'Fork' value equals 1.
fork_count = (df['Fork'] == 1).sum()
print(f"Repositories that are forks : {fork_count}")

Repositories that are forks : 0


In [397]:
# Rows whose 'In_SoftWH' value equals 0.
not_archived_count = (df['In_SoftWH'] == 0).sum()
print(f"Not in SoftWH : {not_archived_count}")

Not in SoftWH : 118


In [398]:
# Rows whose 'In_SoftWH' value equals 1.
archived_count = (df['In_SoftWH'] == 1).sum()
print(f"In SoftWH : {archived_count}")

In SoftWH : 97


In [401]:
# Rows without a corrected link whose 'In_SoftWH' value equals 1.
# Both conditions are combined into one mask: chaining df[mask_a][mask_b]
# makes pandas reindex the second mask and emit a UserWarning.
deleted_but_archived = df['Correct_link'].isna() & (df['In_SoftWH'] == 1)
print(f"Were deleted but archived in SoftWH : {deleted_but_archived.sum()}")

Were deleted but archived in SoftWH : 49


  print(f"Were deleted but archived in SoftWH : {len(df[df['Correct_link'].isna()][df['In_SoftWH'] == 1])}")
