# Current Trends in Bioinformatics Software Development and Archiving

### Import libraries

In [5]:
import json
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys
from tqdm import tqdm
import requests
import xmltodict

%reload_ext autoreload
%autoreload 2
sys.path.append('../scripts')
import pbmd_tools as pbmd

In [6]:
# Record the execution environment so that the results below can be traced back to it.
%load_ext watermark
# Python implementation/version and machine description
%watermark --python --machine
# Versions of jupyterlab, of the imported packages, and of watermark itself
%watermark --packages jupyterlab --iversions --watermark
# conda environment name
%watermark --conda

Python implementation: CPython
Python version       : 3.9.16
IPython version      : 8.11.0

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 5.19.0-45-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

jupyterlab: 3.5.3

re        : 2.2.1
numpy     : 1.23.5
pandas    : 1.5.3
sys       : 3.9.16 (main, Mar  8 2023, 14:00:05) 
[GCC 11.2.0]
xmltodict : 0.12.0
matplotlib: 3.7.1
requests  : 2.28.2
json      : 2.0.9

Watermark: 2.3.1

conda environment: bioinfosoft



### Import tokens

In [7]:
# Load the API tokens: read_tokens is given the ../.env file, then both tokens are looked up in the process environment.
# NOTE(review): presumably read_tokens exports the entries of that file into os.environ — confirm in pbmd_tools.
pbmd.read_tokens("../.env")
# os.environ.get returns None for a missing variable, so an absent token is not reported here.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
PUBMED_TOKEN = os.environ.get("PUBMED_TOKEN")

## 1. PubMed API Entrez Interactions

First of all, we are going to explore PubMed to find out how many publications there are for each of the 5 forges.

In [10]:
# One PubMed query per forge and per publication year, from 2009 to 2022 inclusive.
# Every query restricts the search to a single calendar year through the same date window.
date_windows = [
    f'"{year}/01/01"[Date - Publication] : "{year}/12/31"[Date - Publication]'
    for year in range(2009, 2023)
]

# NOTE(review): the gitlab and googlecode queries mix OR and AND without explicit grouping,
# so they depend on how PubMed orders Boolean operators — confirm this matches the intent.
queries_github = [
    f'((github.com[Title/Abstract])) AND (({window}))'
    for window in date_windows
]
queries_gitlab = [
    f'((https://gitlab[Title/Abstract])) OR ((http://gitlab[Title/Abstract])) OR ((gitlab.[Title/Abstract])) AND (({window}))'
    for window in date_windows
]
queries_sourceforge = [
    f'((sourceforge.net[Title/Abstract])) AND (({window}))'
    for window in date_windows
]
queries_googlecode = [
    f'((googlecode) OR ("code.google.com") AND ({window}))'
    for window in date_windows
]
queries_bitbucket = [
    f'((bitbucket.org[Title/Abstract]) AND ({window}))'
    for window in date_windows
]

In [12]:
# Dictionaries storing the number of articles for each forge and each year,
# example: {'2009': 0, '2010': 5, '2011': 15, ... }

# Lists handed to get_forges_stat as its second argument.
# NOTE(review): presumably get_forges_stat appends the PMIDs matched by the queries to that list — confirm in pbmd_tools.
PMIDs = []
# NOTE(review): shared by the four non-GitHub forges and not read again in the visible part of the notebook.
PMIDs_all = []

# One call per forge; only the GitHub call receives `PMIDs`, the list used by the later sections.
stats_github = pbmd.get_forges_stat(queries_github, PMIDs)
stats_gitlab = pbmd.get_forges_stat(queries_gitlab, PMIDs_all)
stats_sourceforge = pbmd.get_forges_stat(queries_sourceforge, PMIDs_all)
stats_googlecode = pbmd.get_forges_stat(queries_googlecode, PMIDs_all)
stats_bitbucket = pbmd.get_forges_stat(queries_bitbucket, PMIDs_all)

print(f"\n{len(PMIDs)} articles with 'github.com' found in PubMed")

100%|██████████| 14/14 [00:08<00:00,  1.66it/s]
100%|██████████| 14/14 [00:08<00:00,  1.63it/s]
100%|██████████| 14/14 [00:08<00:00,  1.57it/s]
100%|██████████| 14/14 [00:09<00:00,  1.53it/s]


10861 articles with 'github.com' found in PubMed





In [13]:
# Persist the GitHub PMIDs and the per-forge statistics so that another notebook can reuse them.

# One PMID per line.
with open("../data/PMIDs.txt", "w") as pmid_file:
    pmid_file.writelines(str(PMID) + "\n" for PMID in PMIDs)

# One JSON file per forge, named ../data/stats_<forge>.json.
stats_by_forge = {
    "github": stats_github,
    "gitlab": stats_gitlab,
    "sourceforge": stats_sourceforge,
    "googlecode": stats_googlecode,
    "bitbucket": stats_bitbucket,
}
for forge, forge_stats in stats_by_forge.items():
    with open(f"../data/stats_{forge}.json", "w") as stats_file:
        json.dump(forge_stats, stats_file)

In [230]:
# https://pubmed.ncbi.nlm.nih.gov/26262258/ - No DOI in PubMed although there is one in the full text of the article (not from ArXiv), and there are a lot of them
# https://pubmed.ncbi.nlm.nih.gov/28269829/ - they give a wrong link https://github.com/SBU-BMI/imageboxs://github.com/SBU-BMI/imagebox but if you use this link:
# https://github.com/SBU-BMI/imagebox it works. Yet, I am not sure that it is actually what we are looking for, since they also provide another link to github.io
# (also incorrect), and I think it is more likely that their code is there
# PMID = 36789260 - 2 links
# https://github.com/tyqGitHub/TYQ/tree/master/GACNNMDA - ????
# https://github.com/mofradlab - ?????? (PMID 36786404)
# PMID = 26124555 - a space in the link
# PMID = 24324759, 22151646 - no space after link
# PMID = 23849037 - why + in the end ?
# PMID = 36315552 - super smart

Next, we will use the PubMed API to gather information about each article, such as the publication date, the DOI, the abstract, the title and the journal. We will then analyse this information.

In [14]:
# Accumulates one (PMID, pubdate, doi, journal, title, abstract) tuple per article fetched from PubMed.
results = []

In [None]:
# The PubMed API rate limit is 10 requests per second with a token and 3 requests per second without one.

# Log file shared by every pbmd helper called in this cell.
STATUS_LOG = "../data/log_files/status.txt"
# One initial request plus one retry for each article.
SUMMARY_ATTEMPTS = 2

# Fetch the summary of every article and append one record per article to `results`.
for PMID in tqdm(PMIDs):
    for _attempt in range(SUMMARY_ATTEMPTS):
        try:
            summary = pbmd.get_summary(PMID, PUBMED_TOKEN, STATUS_LOG)
        except Exception:
            # `Exception` rather than a bare `except`, so that a kernel interrupt
            # (KeyboardInterrupt) still stops this long-running loop.
            continue
        break
    else:
        # Every attempt failed: skip this article instead of aborting the whole run.
        continue

    abstract = pbmd.get_abstract_from_summary(summary, STATUS_LOG)
    pubdate = pbmd.get_pubdate_from_summary(summary, STATUS_LOG)
    title = pbmd.get_title_from_summary(summary, STATUS_LOG)
    journal = pbmd.get_journal_from_summary(summary, STATUS_LOG)
    doi = pbmd.get_doi_from_summary(summary, STATUS_LOG)

    results.append((PMID, pubdate, doi, journal, title, abstract))

  0%|▎                                                                            | 49/10858 [00:42<2:15:43,  1.33it/s]

In [141]:
# Turn the collected tuples into a table with named columns, keeping a single row per PMID.
column_names = {0: 'PMID', 1: 'PubDate', 2: 'DOI', 3: 'Journal', 4: 'Title', 5: 'Abstract'}
df = (
    pd.DataFrame.from_records(results)
    .rename(columns=column_names)
    .drop_duplicates(subset='PMID')
    .reset_index(drop=True)
)

In [149]:
# Count the rows whose publication date is missing.
n_without_pubdate = df['PubDate'].isnull().sum()
print(f"Number of records without publication date is: {n_without_pubdate}")

Number of records without publication date is: 59


In [22]:
# Checkpoint the article metadata as a tab-separated file (index column omitted).
df.to_csv('../data/articles.tsv', sep='\t', index=False)

## 2. Getting links from the obtained data using regex

In [88]:
# Reload the article table from the tab-separated checkpoint file.
df = pd.read_csv('../data/articles.tsv', sep='\t')

In [89]:
# Derive the GitHub link of each article from its abstract, then the owner and repository from that link.
# astype(str) makes sure the helpers always receive a string (a missing value becomes the text 'nan').
df['GitHub_link_raw'] = df['Abstract'].astype(str).apply(pbmd.get_link_from_abstract)
df['GitHub_link_clean'] = df['GitHub_link_raw'].astype(str).apply(pbmd.clean_link)
# Owner and repository name are both parsed from the cleaned link.
df['GitHub_owner'] = df['GitHub_link_clean'].apply(pbmd.get_owner_from_link)
df['GitHub_repo'] = df['GitHub_link_clean'].apply(pbmd.get_repo_from_link)

In [90]:
# Rows without an owner are the ones for which no link could be extracted.
n_without_owner = df['GitHub_owner'].isna().sum()
print(f"Number of records with weird abstracts leading to inability to extract a link: {n_without_owner}")

Number of records with weird abstracts leading to inability to extract a link: 6


In [91]:
# Rows lacking a repository name, excluding the rows that lack an owner.
n_without_repo = df['GitHub_repo'].isna().sum()
n_without_owner = df['GitHub_owner'].isna().sum()
print(f"Number of records without a repository name: {n_without_repo - n_without_owner}")

Number of records without a repository name: 251


In [164]:
# Overwrite the checkpoint file; the table now also holds the GitHub link, owner and repository columns.
df.to_csv('../data/articles.tsv', sep='\t', index=False)

## 3. GitHub API Interactions

In [4]:
# Reload the article table from the checkpoint file before querying the GitHub API.
df = pd.read_csv('../data/articles.tsv', sep='\t')

In [111]:
# PMIDs of the articles for which a repository name is available.
has_repo = df['GitHub_repo'].notna()
PMIDs = df.loc[has_repo, 'PMID'].to_list()
len(PMIDs)

10623

In [113]:
# Query the GitHub API for every article and store the repository dates and fork flag in `df`.
GITSTAT_LOG = "../data/log_files/gitstat.txt"

for PMID in tqdm(PMIDs):

    # Resolve the row once, before any request. The row label must be known on every
    # path: previously it was only set when the first request succeeded, so after a
    # failed request the results were written to the row of the previous article.
    idx = df.index[df['PMID'] == PMID][0]
    owner = df.loc[idx, 'GitHub_owner']
    repo = df.loc[idx, 'GitHub_repo']

    with open(GITSTAT_LOG, "a") as f:
        f.write(f"\n\n PMID: {PMID}, GitHub link: {df.loc[idx, 'GitHub_link_clean']}")

    info = pbmd.get_repo_info(owner, repo, GITHUB_TOKEN, GITSTAT_LOG)

    if not info["status"]:
        # NOTE(review): a falsy status is presumably the API rate limit being hit — confirm in pbmd_tools.
        # Wait one hour, then retry the request once.
        time.sleep(3600)
        info = pbmd.get_repo_info(owner, repo, GITHUB_TOKEN, GITSTAT_LOG)

    df.loc[idx, "Repo_created_at"] = pbmd.get_repo_date_created(info)
    df.loc[idx, "Repo_updated_at"] = pbmd.get_repo_date_updated(info)
    df.loc[idx, "Fork"] = pbmd.is_fork(info)


100%|██████████| 905/905 [05:39<00:00,  2.67it/s]


In [None]:
# PMIDs of the articles for which no repository creation date is recorded.
lacks_creation_date = df['Repo_created_at'].isna()
PMIDs = df.loc[lacks_creation_date, 'PMID'].to_list()
len(PMIDs)

758

In [115]:
# Overwrite the checkpoint file; the table now also holds the repository creation/update dates and the fork flag.
df.to_csv('../data/articles.tsv', sep='\t', index=False)

## 4. Software Heritage API interactions

In [74]:
# Reload the article table from the checkpoint file before querying Software Heritage.
df = pd.read_csv('../data/articles.tsv', sep='\t')

In [5]:
# PMIDs of the articles for which a GitHub owner is available.
has_owner = df['GitHub_owner'].notna()
PMIDs = df.loc[has_owner, 'PMID'].to_list()
len(PMIDs)

10847

In [6]:
# Ask Software Heritage about the repository of every article and store the answer in `df`.
# One initial request plus one retry for each article.
SOFTWH_ATTEMPTS = 2

for PMID in tqdm(PMIDs):

    # Resolve the row once; the same label is used to read the link and to write the results.
    idx = df.index[df['PMID'] == PMID][0]
    link = df.loc[idx, 'GitHub_link_clean']

    for _attempt in range(SOFTWH_ATTEMPTS):
        try:
            info = pbmd.check_is_in_softwh(link)
        except Exception:
            # `Exception` rather than a bare `except`, so that a kernel interrupt
            # (KeyboardInterrupt) still stops this long-running loop.
            continue
        break
    else:
        # Every attempt failed: leave this row untouched and move on to the next article.
        continue

    df.loc[idx, "In_SoftWH"] = pbmd.is_in_softwh(info)
    df.loc[idx, "Archived"] = pbmd.get_date_archived(info)

100%|██████████| 10847/10847 [19:47<00:00,  9.13it/s]  


In [57]:
# Overwrite the checkpoint file; the table now also holds the In_SoftWH and Archived columns.
df.to_csv('../data/articles.tsv', sep='\t', index=False)

## Unresolved links analysis

The repositories that we were unable to access via the extracted links were analysed manually to determine the reason. The following reasons were found:
1. Error in the link
2. Deleted repository or user
3. Renamed repository

In [None]:
# Table of the unresolved repositories: 'Issue' holds the reason and 'Correct_link' the corrected URL when there is one.
# NOTE(review): this file is read from the notebook's own directory, unlike the other data files under ../data — confirm the location.
df1 = pd.read_csv('no_info2.tsv', sep='\t',usecols=['PMID', 'PubDate', 'DOI', 'Journal', 'Title', 'Abstract', 'Issue', 'GitHub_link_clean','Correct_link'])

In [384]:
# Rows whose issue is recorded as a wrong link.
n_wrong_link = (df1['Issue'] == 'wrong link').sum()
print(f"Number of articles with a wrong link (either a space in the link, or no space after link, etc) : {n_wrong_link}")

Number of articles with a wrong link (either a space in the link, or no space after link, etc) : 58


In [385]:
# Rows whose issue is recorded as a renamed repository.
n_renamed = (df1['Issue'] == 'renamed').sum()
print(f"Number of articles with a renamed repository : {n_renamed}")

Number of articles with a renamed repository : 11


In [386]:
# Rows whose issue is a deleted owner or a deleted repository, counted together.
n_deleted = df1['Issue'].isin(['owner deleted', 'repo deleted']).sum()
print(f"Number of articles with a deleted repository : {n_deleted}")

Number of articles with a deleted repository : 146


Resolving unresolved but existing links

In [387]:
# Derive the owner and repository name of every unresolved article.
# The masks must come from `df1` itself: the rows being updated belong to `df1`, and the
# article table `df` is a different frame that is never given a 'Correct_link' column here.
has_correct_link = df1['Correct_link'].notna()
lacks_correct_link = ~has_correct_link

# Rows with a manually corrected link: parse that link.
df1.loc[has_correct_link, 'GitHub_owner'] = df1.loc[has_correct_link, 'Correct_link'].apply(pbmd.get_owner_from_link)
df1.loc[has_correct_link, 'GitHub_repo'] = df1.loc[has_correct_link, 'Correct_link'].apply(pbmd.get_repo_from_link)

# Remaining rows: fall back to the link originally extracted from the abstract.
df1.loc[lacks_correct_link, 'GitHub_owner'] = df1.loc[lacks_correct_link, 'GitHub_link_clean'].apply(pbmd.get_owner_from_link)
df1.loc[lacks_correct_link, 'GitHub_repo'] = df1.loc[lacks_correct_link, 'GitHub_link_clean'].apply(pbmd.get_repo_from_link)

In [389]:
# PMIDs of the unresolved articles for which a repository name is available.
has_repo = df1['GitHub_repo'].notna()
PMIDs = df1.loc[has_repo, 'PMID'].to_list()
len(PMIDs)

215

In [390]:
# Query the GitHub API for every unresolved article and store the repository dates and fork flag in `df1`.
GITSTAT_LOG = "gitstat.txt"

for count, PMID in enumerate(tqdm(PMIDs), start=1):
    # Pause for one hour after every 5000 requests.
    # NOTE(review): presumably meant to stay within the GitHub API hourly quota — confirm.
    if count % 5000 == 0:
        time.sleep(3600)

    # The row label must be looked up in `df1`: a mask built on the article table `df`
    # belongs to another frame and does not line up with `df1.index`.
    idx = df1.index[df1['PMID'] == PMID][0]
    owner = df1.loc[idx, 'GitHub_owner']
    repo = df1.loc[idx, 'GitHub_repo']

    # Log the corrected link when there is one and the extracted link otherwise,
    # instead of writing "nan" for the rows that have no corrected link.
    correct_link = df1.loc[idx, 'Correct_link']
    link = correct_link if pd.notna(correct_link) else df1.loc[idx, 'GitHub_link_clean']

    with open(GITSTAT_LOG, "a") as f:
        f.write(f"\n\n PMID: {PMID}, GitHub link: {link}")

    info = pbmd.get_repo_info(owner, repo, GITHUB_TOKEN, GITSTAT_LOG)

    df1.loc[idx, "Repo_created_at"] = pbmd.get_repo_date_created(info)
    df1.loc[idx, "Repo_updated_at"] = pbmd.get_repo_date_updated(info)
    df1.loc[idx, "Fork"] = pbmd.is_fork(info)

100%|████████████████████████████████████████████████████████████████████████████████| 215/215 [01:05<00:00,  3.30it/s]


In [391]:
# PMIDs of the unresolved articles that have a corrected link.
has_correct_link = df1['Correct_link'].notna()
PMIDs = df1.loc[has_correct_link, 'PMID'].to_list()
len(PMIDs)

69

In [392]:
# Ask Software Heritage about the corrected link of each selected article and store the answer in `df1`.
for PMID in tqdm(PMIDs):
    # Row label of the article, used both to read its link and to write the results.
    idx = df1.index[df1['PMID'] == PMID][0]

    info = pbmd.check_is_in_softwh(df1.loc[idx, 'Correct_link'])

    df1.loc[idx, "In_SoftWH"] = pbmd.is_in_softwh(info)
    df1.loc[idx, "Archived"] = pbmd.get_date_archived(info)

100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [00:13<00:00,  5.03it/s]


In [393]:
# PMIDs of the unresolved articles that have no corrected link.
lacks_correct_link = df1['Correct_link'].isna()
PMIDs = df1.loc[lacks_correct_link, 'PMID'].to_list()
len(PMIDs)

146

In [394]:
# Ask Software Heritage about the originally extracted link of each selected article and store the answer in `df1`.
for PMID in tqdm(PMIDs):
    # Row label of the article, used both to read its link and to write the results.
    idx = df1.index[df1['PMID'] == PMID][0]

    info = pbmd.check_is_in_softwh(df1.loc[idx, 'GitHub_link_clean'])

    df1.loc[idx, "In_SoftWH"] = pbmd.is_in_softwh(info)
    df1.loc[idx, "Archived"] = pbmd.get_date_archived(info)

100%|████████████████████████████████████████████████████████████████████████████████| 146/146 [00:56<00:00,  2.60it/s]


In [396]:
# Rows whose Fork value equals 1.
n_forks = (df1['Fork'] == 1).sum()
print(f"Repositories that are forks : {n_forks}")

Repositories that are forks : 0


In [397]:
# Rows whose In_SoftWH value equals 0.
n_not_archived = (df1['In_SoftWH'] == 0).sum()
print(f"Not in SoftWH : {n_not_archived}")

Not in SoftWH : 118


In [None]:
# Rows whose In_SoftWH value equals 1.
n_archived = (df1['In_SoftWH'] == 1).sum()
print(f"In SoftWH : {n_archived}")

In SoftWH : 97


In [None]:
# Count the rows that have no corrected link and whose In_SoftWH value equals 1.
# Both conditions are combined into a single mask: chaining df1[mask_a][mask_b] applies a mask
# built on the full frame to an already filtered frame, which makes pandas reindex the mask
# and emit a "Boolean Series key will be reindexed" UserWarning.
deleted_but_archived = df1['Correct_link'].isna() & (df1['In_SoftWH'] == 1)
print(f"Were deleted but archived in SoftWH : {deleted_but_archived.sum()}")

Were deleted but archived in SoftWH : 49


  print(f"Were deleted but archived in SoftWH : {len(df[df['Correct_link'].isna()][df['In_SoftWH'] == 1])}")
