In [1]:
from datetime import datetime
import time
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tqdm as tqdm_module
import requests
import re
import json
import pbmd_tools as pbmd

In [2]:
db = 'pubmed'
domain = 'https://www.ncbi.nlm.nih.gov/entrez/eutils'
nresults = 100
query = '"github.com"[Title/Abstract] NOT "github.com"[Title]'
retmode = 'json'

In [3]:
queryLinkSearch = f'{domain}/esearch.fcgi?db={db}&retmax={nresults}&retmode={retmode}&term={query}'
response = requests.get(queryLinkSearch)
pubmedJson = response.json()
results = []
for paperId in pubmedJson["esearchresult"]["idlist"]:
    queryLinkSummary = f'{domain}/esummary.fcgi?db={db}&id={paperId}&retmode={retmode}'
    results.append({'paperId': paperId, 'metadata': requests.get(queryLinkSummary).json()})

In [4]:
df1 = pd.DataFrame.from_records(results)
df1["result"] = df1["metadata"].apply(lambda x: x["result"])
df1["PMID"] = df1["paperId"]
df1["PubDate"] = [d[k]["sortpubdate"] for d in df1["result"] for k in d.keys() if k != "uids"]
df1["PubDate"] = df1["PubDate"].apply(lambda x: datetime.strptime(x, "%Y/%m/%d %H:%M").strftime("%Y-%m-%d") if x else None)
df1["DOI"] = [d[k]["articleids"][i]["value"] for d in df1["result"] for k in d.keys() if k != "uids" for i in range(len(d[k]["articleids"])) if d[k]["articleids"][i]["idtype"] == "doi"] 
df1["Journal"] = [d[k]["fulljournalname"] for d in df1["result"] for k in d.keys() if k != "uids"]
df1["Title"] = [d[k]["title"] for d in df1["result"] for k in d.keys() if k != "uids"]
df1 = df1.drop(["paperId", "result", "metadata"], axis=1)
df1

Unnamed: 0,PMID,PubDate,DOI,Journal,Title
0,36760999,2023-01-24,10.3389/fgene.2023.1082032,Frontiers in genetics,Identification of functional gene modules by i...
1,36759692,2023-02-09,10.1038/s41598-023-29320-6,Scientific reports,Deep consistency-preserving hash auto-encoders...
2,36759336,2023-02-09,10.1093/bib/bbad044,Briefings in bioinformatics,HiConfidence: a novel approach uncovering the ...
3,36756726,2023-02-09,10.1111/1755-0998.13765,Molecular ecology resources,Long amplicon HiFi sequencing for mitochondria...
4,36756173,2022-10-23,10.1016/j.csbj.2022.10.016,Computational and structural biotechnology jou...,Multi-task deep autoencoder to predict Alzheim...
...,...,...,...,...,...
95,36711471,2023-01-20,10.1101/2023.01.17.524477,bioRxiv : the preprint server for biology,Combining protein sequences and structures wit...
96,36710930,2023-01-13,10.3389/fnmol.2022.1037565,Frontiers in molecular neuroscience,Inferring cell developmental stage-specific ln...
97,36710872,2023-01-23,10.7717/peerj.14706,PeerJ,A clustering method for small scRNA-seq data b...
98,36709790,2023-01-26,10.1016/j.ymeth.2023.01.006,"Methods (San Diego, Calif.)",Predicting latent lncRNA and cancer metastatic...


In [5]:
def get_abstract(PMID):
    queryLinkSearch = f'{domain}/efetch.fcgi?db={db}&id={PMID}&retmax={nresults}&retmode={retmode}&rettype=abstract'
    response = requests.get(queryLinkSearch)
    return response.text

In [None]:
rate_limit = 3
request_count = 0

for id_iter in df1["PMID"]:  
    
    if request_count >= rate_limit:
        
        time.sleep(1)
        request_count = 0
        
    df1.loc[df1["PMID"] == id_iter,"Abstract"] = get_abstract(id_iter)
    
    request_count += 1
df1

Unnamed: 0,PMID,PubDate,DOI,Journal,Title,Abstract
0,36760999,2023-01-24,10.3389/fgene.2023.1082032,Frontiers in genetics,Identification of functional gene modules by i...,Front Genet. 2023 Jan 24;14:1082032. doi: 10.3...
1,36759692,2023-02-09,10.1038/s41598-023-29320-6,Scientific reports,Deep consistency-preserving hash auto-encoders...,Sci Rep. 2023 Feb 9;13(1):2316. doi: 10.1038/s...
2,36759336,2023-02-09,10.1093/bib/bbad044,Briefings in bioinformatics,HiConfidence: a novel approach uncovering the ...,Brief Bioinform. 2023 Feb 9:bbad044. doi: 10.1...
3,36756726,2023-02-09,10.1111/1755-0998.13765,Molecular ecology resources,Long amplicon HiFi sequencing for mitochondria...,Mol Ecol Resour. 2023 Feb 9. doi: 10.1111/1755...
4,36756173,2022-10-23,10.1016/j.csbj.2022.10.016,Computational and structural biotechnology jou...,Multi-task deep autoencoder to predict Alzheim...,Comput Struct Biotechnol J. 2022 Oct 23;20:576...
...,...,...,...,...,...,...
95,36711471,2023-01-20,10.1101/2023.01.17.524477,bioRxiv : the preprint server for biology,Combining protein sequences and structures wit...,bioRxiv. 2023 Jan 20:2023.01.17.524477. doi: 1...
96,36710930,2023-01-13,10.3389/fnmol.2022.1037565,Frontiers in molecular neuroscience,Inferring cell developmental stage-specific ln...,Front Mol Neurosci. 2023 Jan 13;15:1037565. do...
97,36710872,2023-01-23,10.7717/peerj.14706,PeerJ,A clustering method for small scRNA-seq data b...,PeerJ. 2023 Jan 23;11:e14706. doi: 10.7717/pee...
98,36709790,2023-01-26,10.1016/j.ymeth.2023.01.006,"Methods (San Diego, Calif.)",Predicting latent lncRNA and cancer metastatic...,Methods. 2023 Jan 26;211:1-9. doi: 10.1016/j.y...


In [None]:
def get_link(df, PMID):
    
    regex = ["https:\/[^ ]*[./]github.com[^\n .,)]*"]
    links_with_point = ''
    
    for rgx in regex:
        if (links_with_point == '') and re.search(rgx, str(df.loc[df["PMID"] == PMID, "Abstract"].values[0]), re.IGNORECASE):
                links_with_point = re.findall(rgx, str(df.loc[df["PMID"] == PMID, "Abstract"].values[0]), re.IGNORECASE)
    links = []
    for link in links_with_point :
        links.append(link) 
        
    return links

In [None]:
df1

Unnamed: 0,PMID,PubDate,DOI,Journal,Title,Abstract
0,36760999,2023-01-24,10.3389/fgene.2023.1082032,Frontiers in genetics,Identification of functional gene modules by i...,Front Genet. 2023 Jan 24;14:1082032. doi: 10.3...
1,36759692,2023-02-09,10.1038/s41598-023-29320-6,Scientific reports,Deep consistency-preserving hash auto-encoders...,Sci Rep. 2023 Feb 9;13(1):2316. doi: 10.1038/s...
2,36759336,2023-02-09,10.1093/bib/bbad044,Briefings in bioinformatics,HiConfidence: a novel approach uncovering the ...,Brief Bioinform. 2023 Feb 9:bbad044. doi: 10.1...
3,36756726,2023-02-09,10.1111/1755-0998.13765,Molecular ecology resources,Long amplicon HiFi sequencing for mitochondria...,Mol Ecol Resour. 2023 Feb 9. doi: 10.1111/1755...
4,36756173,2022-10-23,10.1016/j.csbj.2022.10.016,Computational and structural biotechnology jou...,Multi-task deep autoencoder to predict Alzheim...,Comput Struct Biotechnol J. 2022 Oct 23;20:576...
...,...,...,...,...,...,...
95,36711471,2023-01-20,10.1101/2023.01.17.524477,bioRxiv : the preprint server for biology,Combining protein sequences and structures wit...,bioRxiv. 2023 Jan 20:2023.01.17.524477. doi: 1...
96,36710930,2023-01-13,10.3389/fnmol.2022.1037565,Frontiers in molecular neuroscience,Inferring cell developmental stage-specific ln...,Front Mol Neurosci. 2023 Jan 13;15:1037565. do...
97,36710872,2023-01-23,10.7717/peerj.14706,PeerJ,A clustering method for small scRNA-seq data b...,PeerJ. 2023 Jan 23;11:e14706. doi: 10.7717/pee...
98,36709790,2023-01-26,10.1016/j.ymeth.2023.01.006,"Methods (San Diego, Calif.)",Predicting latent lncRNA and cancer metastatic...,Methods. 2023 Jan 26;211:1-9. doi: 10.1016/j.y...


In [None]:
def clean_link(link_array):
    links = []
    for link in link_array:
        if link != "":
            if not link.startswith("https://"):
                link = "https://" + link
            if link[-2] == ")":
                link = link[:-2]
            if link[-1] == ")" or link[-1] == "." or link[-1] == "," or link[-1] == ",":
                link = link[:-1]
        links.append(link)
        
    return links

In [None]:
def get_phrase_with_link(df, PMID):
    
    regex = ["\.[^.]*[./]github[^ ]* [^.]*[^ ]*\.", "\.[^.]*[./]github[^ ]*", "[^.]*github[^ ]*", "[a-zA-Z0-9 .,/:\'\"!?]{101}github[^ ]*[a-zA-Z0-9 .,/:\'\"!?()]{100}"]
    phrase_with_point = ''
    
    for rgx in regex:
        if (phrase_with_point == "") and re.search(rgx, str(df.loc[df["PMID"] == PMID, "Abstract"].values[0]), re.IGNORECASE):
            phrase_with_point = re.findall(rgx, str(df.loc[df["PMID"] == PMID, "Abstract"].values[0]), re.IGNORECASE)
    phrases = []
    for phrase in phrase_with_point :
        phrase = phrase[2:]
        phrases.append(phrase)
        
    return ' '.join(phrases)

In [None]:
for id_iter in df1["PMID"]:
    
    df1.loc[df1["PMID"] == id_iter,"GitHub_link_raw"] = ", ".join(get_link(df1, id_iter))
    df1.loc[df1["PMID"] == id_iter,"GitHub_link_clean"] = ", ".join(clean_link(df1.loc[df1["PMID"] == id_iter,"GitHub_link_raw"]))
    df1.loc[df1["PMID"] == id_iter, "Phrase"] = str(get_phrase_with_link(df1, id_iter))

df1 = df1[["PMID", "PubDate", "DOI", "GitHub_link_raw", "GitHub_link_clean", "Journal", "Title", "Phrase", "Abstract"]]
pd.options.display.max_colwidth = None
df1

Unnamed: 0,PMID,PubDate,DOI,GitHub_link_raw,GitHub_link_clean,Journal,Title,Phrase,Abstract
0,36760999,2023-01-24,10.3389/fgene.2023.1082032,https://github.com/free1234hm/CLAM,https://github.com/free1234hm/CLAM,Frontiers in genetics,Identification of functional gene modules by integrating multi-omics data and known molecular interactions.,"We implemented Correlation-based Local Approximation of Membership as \na user-friendly application available at https://github.com/free1234hm/CLAM.\n\nCopyright © 2023 Chen, Han, Li, Li, Zhang and Zhu.","Front Genet. 2023 Jan 24;14:1082032. doi: 10.3389/fgene.2023.1082032. \neCollection 2023.\n\nIdentification of functional gene modules by integrating multi-omics data and \nknown molecular interactions.\n\nChen X(1)(2), Han M(2), Li Y(3), Li X(2), Zhang J(2), Zhu Y(1)(2).\n\nAuthor information:\n(1)Basic Medical School, Anhui Medical University, Hefei, China.\n(2)National Center for Protein Sciences (Beijing), Beijing Proteome Research \nCenter, Beijing Institute of Lifeomics, Beijing, China.\n(3)Central Research Laboratory, Peking Union Medical College Hospital, Chinese \nAcademy of Medical Sciences and Peking Union Medical College, Beijing, China.\n\nMulti-omics data integration has emerged as a promising approach to identify \npatient subgroups. However, in terms of grouping genes (or gene products) into \nco-expression modules, data integration methods suffer from two main drawbacks. \nFirst, most existing methods only consider genes or samples measured in all \ndifferent datasets. Second, known molecular interactions (e.g., transcriptional \nregulatory interactions, protein-protein interactions and biological pathways) \ncannot be utilized to assist in module detection. Herein, we present a novel \ndata integration framework, Correlation-based Local Approximation of Membership \n(CLAM), which provides two methodological innovations to address these \nlimitations: 1) constructing a trans-omics neighborhood matrix by integrating \nmulti-omics datasets and known molecular interactions, and 2) using a local \napproximation procedure to define gene modules from the matrix. Applying \nCorrelation-based Local Approximation of Membership to human colorectal cancer \n(CRC) and mouse B-cell differentiation multi-omics data obtained from The Cancer \nGenome Atlas (TCGA), Clinical Proteomics Tumor Analysis Consortium (CPTAC), Gene \nExpression Omnibus (GEO) and ProteomeXchange database, we demonstrated its \nsuperior ability to recover biologically relevant modules and gene ontology (GO) \nterms. Further investigation of the colorectal cancer modules revealed numerous \ntranscription factors and KEGG pathways that played crucial roles in colorectal \ncancer progression. Module-based survival analysis constructed four \nsurvival-related networks in which pairwise gene correlations were significantly \ncorrelated with colorectal cancer patient survival. Overall, the series of \nevaluations demonstrated the great potential of Correlation-based Local \nApproximation of Membership for identifying modular biomarkers for complex \ndiseases. We implemented Correlation-based Local Approximation of Membership as \na user-friendly application available at https://github.com/free1234hm/CLAM.\n\nCopyright © 2023 Chen, Han, Li, Li, Zhang and Zhu.\n\nDOI: 10.3389/fgene.2023.1082032\nPMCID: PMC9902936\nPMID: 36760999\n\nConflict of interest statement: The authors declare that the research was \nconducted in the absence of any commercial or financial relationships that could \nbe construed as a potential conflict of interest.\n\n"
1,36759692,2023-02-09,10.1038/s41598-023-29320-6,https://github.com/Socrates023/DCPHA,https://github.com/Socrates023/DCPHA,Scientific reports,Deep consistency-preserving hash auto-encoders for neuroimage cross-modal retrieval.,We make code and models publicly available: \nhttps://github.com/Socrates023/DCPHA .,"Sci Rep. 2023 Feb 9;13(1):2316. doi: 10.1038/s41598-023-29320-6.\n\nDeep consistency-preserving hash auto-encoders for neuroimage cross-modal \nretrieval.\n\nWang X(1), Zeng X(2).\n\nAuthor information:\n(1)College of Computer Science and Technology, Chongqing University of Posts and \nTelecommunications, Chongqing, 400065, China.\n(2)College of Computer Science and Technology, Chongqing University of Posts and \nTelecommunications, Chongqing, 400065, China. zengxh@cqupt.edu.cn.\n\nCross-modal hashing is an efficient method to embed high-dimensional \nheterogeneous modal feature descriptors into a consistency-preserving Hamming \nspace with low-dimensional. Most existing cross-modal hashing methods have been \nable to bridge the heterogeneous modality gap, but there are still two \nchallenges resulting in limited retrieval accuracy: (1) ignoring the continuous \nsimilarity of samples on manifold; (2) lack of discriminability of hash codes \nwith the same semantics. To cope with these problems, we propose a Deep \nConsistency-Preserving Hash Auto-encoders model, called DCPHA, based on the \nmulti-manifold property of the feature distribution. Specifically, DCPHA \nconsists of a pair of asymmetric auto-encoders and two semantics-preserving \nattention branches working in the encoding and decoding stages, respectively. \nWhen the number of input medical image modalities is greater than 2, the encoder \nis a multiple pseudo-Siamese network designed to extract specific modality \nfeatures of different medical image modalities. In addition, we define the \ncontinuous similarity of heterogeneous and homogeneous samples on Riemann \nmanifold from the perspective of multiple sub-manifolds, respectively, and the \ntwo constraints, i.e., multi-semantic consistency and multi-manifold \nsimilarity-preserving, are embedded in the learning of hash codes to obtain \nhigh-quality hash codes with consistency-preserving. The extensive experiments \nshow that the proposed DCPHA has the most stable and state-of-the-art \nperformance. We make code and models publicly available: \nhttps://github.com/Socrates023/DCPHA .\n\n© 2023. The Author(s).\n\nDOI: 10.1038/s41598-023-29320-6\nPMID: 36759692\n\n"
2,36759336,2023-02-09,10.1093/bib/bbad044,https://github.com/victorykobets/HiConfidence,https://github.com/victorykobets/HiConfidence,Briefings in bioinformatics,HiConfidence: a novel approach uncovering the biological signal in Hi-C data affected by technical biases.,The method is freely available \nat GitHub: https://github.com/victorykobets/HiConfidence.\n\n© The Author(s) 2023.,"Brief Bioinform. 2023 Feb 9:bbad044. doi: 10.1093/bib/bbad044. Online ahead of \nprint.\n\nHiConfidence: a novel approach uncovering the biological signal in Hi-C data \naffected by technical biases.\n\nKobets VA(1), Ulianov SV(2)(3), Galitsyna AA(1)(2)(4), Doronin SA(5), Mikhaleva \nEA(5), Gelfand MS(1)(4), Shevelyov YY(5), Razin SV(2)(3), Khrameeva EE(1).\n\nAuthor information:\n(1)Skolkovo Institute of Science and Technology, Moscow, 121205, Russia.\n(2)Institute of Gene Biology, Russian Academy of Sciences, Moscow, 119334, \nRussia.\n(3)Faculty of Biology, M.V. Lomonosov Moscow State University, Moscow, 119992, \nRussia.\n(4)A.A. Kharkevich Institute for Information Transmission Problems, Russian \nAcademy of Sciences, Moscow, 127051, Russia.\n(5)Institute of Molecular Genetics of National Research Centre ""Kurchatov \nInstitute"", Moscow, 123182, Russia.\n\nThe chromatin interaction assays, particularly Hi-C, enable detailed studies of \ngenome architecture in multiple organisms and model systems, resulting in a \ndeeper understanding of gene expression regulation mechanisms mediated by \nepigenetics. However, the analysis and interpretation of Hi-C data remain \nchallenging due to technical biases, limiting direct comparisons of datasets \nobtained in different experiments and laboratories. As a result, removing biases \nfrom Hi-C-generated chromatin contact matrices is a critical data analysis step. \nOur novel approach, HiConfidence, eliminates biases from the Hi-C data by \nweighing chromatin contacts according to their consistency between replicates so \nthat low-quality replicates do not substantially influence the result. The \nalgorithm is effective for the analysis of global changes in chromatin \nstructures such as compartments and topologically associating domains. We apply \nthe HiConfidence approach to several Hi-C datasets with significant technical \nbiases, that could not be analyzed effectively using existing methods, and \nobtain meaningful biological conclusions. In particular, HiConfidence aids in \nthe study of how changes in histone acetylation pattern affect chromatin \norganization in Drosophila melanogaster S2 cells. The method is freely available \nat GitHub: https://github.com/victorykobets/HiConfidence.\n\n© The Author(s) 2023. Published by Oxford University Press.\n\nDOI: 10.1093/bib/bbad044\nPMID: 36759336\n\n"
3,36756726,2023-02-09,10.1111/1755-0998.13765,https://github.com/Caizf-script/HQGR,https://github.com/Caizf-script/HQGR,Molecular ecology resources,Long amplicon HiFi sequencing for mitochondrial DNA genomes.,"The High Quality Reads Generator \n(HQGR) software is provided to facilitate data analyses, which is publicly \naccessible on GitHub (https://github.com/Caizf-script/HQGR). Our long amplicon \nHiFi sequencing pipeline can also be applied in various target enrichment \nstrategies for small genomes and candidate genes.","Mol Ecol Resour. 2023 Feb 9. doi: 10.1111/1755-0998.13765. Online ahead of \nprint.\n\nLong amplicon HiFi sequencing for mitochondrial DNA genomes.\n\nCai ZF(#)(1)(2), Hu JY(#)(3), Yin TT(#)(2), Wang D(3), Shen QK(2), Ma C(2)(4), \nOu DQ(5), Xu MM(2)(4), Shi X(2)(4), Li QL(1)(2), Wu RN(2), Ajuma L(2)(4), Adeola \nAC(2), Zhang YP(1)(2)(4), Peng MS(2)(4).\n\nAuthor information:\n(1)State Key Laboratory for Conservation and Utilization of Bio-resources in \nYunnan, Yunnan University, Kunming, China.\n(2)State Key Laboratory of Genetic Resources and Evolution & Yunnan Laboratory \nof Molecular Biology of Domestic Animals, Kunming Institute of Zoology, Chinese \nAcademy of Sciences, Kunming, China.\n(3)School of Software, Yunnan University, Kunming, China.\n(4)University of Chinese Academy of Sciences, Beijing, China.\n(5)Department of Anesthesiology, First Affiliated Hospital of Kunming Medical \nUniversity, Kunming, China.\n(#)Contributed equally\n\nLong-read sequencing technology is a powerful approach with application in \nvarious genetic and genomic researches. Herein, we developed the pipeline for \nlong amplicon high-fidelity (HiFi) sequencing and then applied it for sequencing \nmitochondrial DNA (mtDNA) genomes from pools of 79 Tibetan Mastiffs. We \namplified the mtDNA genome with long-range PCR using two pairs of primers. Two \nrounds of circular consensus sequencing (CCS) were conducted and their accuracy \nwas evaluated. The results indicate that the second round of CCS can improve the \naccuracy of HiFi reads. In addition, the analysis of 79 high-quality mtDNA \ngenomes shows the Tibetan Mastiffs from outside of the Tibetan Plateau \nexperienced hybridization with other dogs. The High Quality Reads Generator \n(HQGR) software is provided to facilitate data analyses, which is publicly \naccessible on GitHub (https://github.com/Caizf-script/HQGR). Our long amplicon \nHiFi sequencing pipeline can also be applied in various target enrichment \nstrategies for small genomes and candidate genes.\n\nThis article is protected by copyright. All rights reserved.\n\nDOI: 10.1111/1755-0998.13765\nPMID: 36756726\n\n"
4,36756173,2022-10-23,10.1016/j.csbj.2022.10.016,https://github.com/lichen-lab/MTAE,https://github.com/lichen-lab/MTAE,Computational and structural biotechnology journal,Multi-task deep autoencoder to predict Alzheimer's disease progression using temporal DNA methylation data in peripheral blood.,\nAvailability:: https://github.com/lichen-lab/MTAE.\n\n© 2022 The Author(s).,"Comput Struct Biotechnol J. 2022 Oct 23;20:5761-5774. doi: \n10.1016/j.csbj.2022.10.016. eCollection 2022.\n\nMulti-task deep autoencoder to predict Alzheimer's disease progression using \ntemporal DNA methylation data in peripheral blood.\n\nChen L(1), Saykin AJ(2), Yao B(3), Zhao F(1); Alzheimer’s Disease Neuroimaging \nInitiative (ADNI).\n\nAuthor information:\n(1)Department of Biostatistics, University of Florida, Gainesville, FL 32603, \nUnited States.\n(2)Department of Radiology and Imaging Sciences, Indiana University School of \nMedicine, Indianapolis, IN 46202, United States.\n(3)Department of Human Genetics, Emory University, Atlanta, GA 30322, United \nStates.\n\nTraditional approaches for diagnosing Alzheimer's disease (AD) such as brain \nimaging and cerebrospinal fluid are invasive and expensive. It is desirable to \ndevelop a useful diagnostic tool by exploiting biomarkers obtained from \nperipheral tissues due to their noninvasive and easily accessible \ncharacteristics. However, the capacity of using DNA methylation data in \nperipheral blood for predicting AD progression is rarely known. It is also \nchallenging to develop an efficient prediction model considering the complex and \nhigh-dimensional DNA methylation data in a longitudinal study. Here, we develop \ntwo multi-task deep autoencoders, which are based on the convolutional \nautoencoder and long short-term memory autoencoder to learn the compressed \nfeature representation by jointly minimizing the reconstruction error and \nmaximizing the prediction accuracy. By benchmarking on longitudinal DNA \nmethylation data collected from the peripheral blood in Alzheimer's Disease \nNeuroimaging Initiative, we demonstrate that the proposed multi-task deep \nautoencoders outperform state-of-the-art machine learning approaches for both \npredicting AD progression and reconstructing the temporal DNA methylation \nprofiles. In addition, the proposed multi-task deep autoencoders can predict AD \nprogression accurately using only the historical DNA methylation data and the \nperformance is further improved by including all temporal DNA methylation data. \nAvailability:: https://github.com/lichen-lab/MTAE.\n\n© 2022 The Author(s).\n\nDOI: 10.1016/j.csbj.2022.10.016\nPMCID: PMC9619306\nPMID: 36756173\n\nConflict of interest statement: The authors declare that they have no known \ncompeting financial interests or personal relationships that could have appeared \nto influence the work reported in this paper.\n\n"
...,...,...,...,...,...,...,...,...,...
95,36711471,2023-01-20,10.1101/2023.01.17.524477,https://github.com/jianlin-cheng/TransFun,https://github.com/jianlin-cheng/TransFun,bioRxiv : the preprint server for biology,Combining protein sequences and structures with transformers and equivariant graph neural networks to predict protein function.,AVAILABILITY: The source code of TransFun is available at \nhttps://github.com/jianlin-cheng/TransFun.\nCONTACT: chengji@missouri.edu.,"bioRxiv. 2023 Jan 20:2023.01.17.524477. doi: 10.1101/2023.01.17.524477. \nPreprint.\n\nCombining protein sequences and structures with transformers and equivariant \ngraph neural networks to predict protein function.\n\nBoadu F, Cao H, Cheng J.\n\nMOTIVATION: Millions of protein sequences have been generated by numerous genome \nand transcriptome sequencing projects. However, experimentally determining the \nfunction of the proteins is still a time consuming, low-throughput, and \nexpensive process, leading to a large protein sequence-function gap. Therefore, \nit is important to develop computational methods to accurately predict protein \nfunction to fill the gap. Even though many methods have been developed to use \nprotein sequences as input to predict function, much fewer methods leverage \nprotein structures in protein function prediction because there was lack of \naccurate protein structures for most proteins until recently.\nRESULTS: We developed TransFun - a method using a transformer-based protein \nlanguage model and 3D-equivariant graph neural networks to distill information \nfrom both protein sequences and structures to predict protein function. It \nextracts feature embeddings from protein sequences using a pre-trained protein \nlanguage model (ESM) via transfer learning and combines them with 3D structures \nof proteins predicted by AlphaFold2 through equivariant graph neural networks. \nBenchmarked on the CAFA3 test dataset and a new test dataset, TransFun \noutperforms several state-of-the-art methods, indicating the language model and \n3D-equivariant graph neural networks are effective methods to leverage protein \nsequences and structures to improve protein function prediction. Combining \nTransFun predictions and sequence similarity-based predictions can further \nincrease prediction accuracy.\nAVAILABILITY: The source code of TransFun is available at \nhttps://github.com/jianlin-cheng/TransFun.\nCONTACT: chengji@missouri.edu.\n\nDOI: 10.1101/2023.01.17.524477\nPMCID: PMC9882282\nPMID: 36711471\n\n"
96,36710930,2023-01-13,10.3389/fnmol.2022.1037565,https://github.com/linxi159/CDSlncR,https://github.com/linxi159/CDSlncR,Frontiers in molecular neuroscience,Inferring cell developmental stage-specific lncRNA regulation in the developing human neocortex with CDSlncR.,"CDSlncR is available at https://github.com/linxi159/CDSlncR.\n\nCopyright © 2023 Huang, Ma and Zhang.","Front Mol Neurosci. 2023 Jan 13;15:1037565. doi: 10.3389/fnmol.2022.1037565. \neCollection 2022.\n\nInferring cell developmental stage-specific lncRNA regulation in the developing \nhuman neocortex with CDSlncR.\n\nHuang M(1)(2), Ma J(1)(3), Zhang J(3).\n\nAuthor information:\n(1)Department of Automation, Xiamen University, Xiamen, China.\n(2)Department of Computer Science, University of Tsukuba, Tsukuba, Japan.\n(3)School of Engineering, Dali University, Dali, China.\n\nNoncoding RNAs (ncRNAs) occupy ~98% of the transcriptome in human, and are \nusually not translated into proteins. Among ncRNAs, long non-coding RNAs \n(lncRNAs, >200 nucleotides) are important regulators to modulate gene \nexpression, and are involved in many biological processes (e.g., cell \ndevelopment). To study lncRNA regulation, many computational approaches or tools \nhave been proposed by using bulk transcriptomics data. Nevertheless, previous \nbulk data-driven methods are mostly limited to explore the lncRNA regulation \nregarding all of cells, instead of the lncRNA regulation specific to cell \ndevelopmental stages. Fortunately, recent advance in single-cell sequencing data \nhas provided a way to investigate cell developmental stage-specific lncRNA \nregulation. In this work, we present a novel computational method, CDSlncR (Cell \nDevelopmental Stage-specific lncRNA regulation), which combines putative \nlncRNA-target binding information with single-cell transcriptomics data to infer \ncell developmental stage-specific lncRNA regulation. For each cell developmental \nstage, CDSlncR constructs a cell developmental stage-specific lncRNA regulatory \nnetwork in the cell developmental stage. To illustrate the effectiveness of \nCDSlncR, we apply CDSlncR into single-cell transcriptomics data of the \ndeveloping human neocortex for exploring lncRNA regulation across different \nhuman neocortex developmental stages. Network analysis shows that the lncRNA \nregulation is unique in each developmental stage of human neocortex. As a case \nstudy, we also perform particular analysis on the cell developmental \nstage-specific lncRNA regulation related to 18 known lncRNA biomarkers in autism \nspectrum disorder. Finally, the comparison result indicates that CDSlncR is an \neffective method for predicting cell developmental stage-specific lncRNA \ntargets. CDSlncR is available at https://github.com/linxi159/CDSlncR.\n\nCopyright © 2023 Huang, Ma and Zhang.\n\nDOI: 10.3389/fnmol.2022.1037565\nPMCID: PMC9880432\nPMID: 36710930\n\nConflict of interest statement: The authors declare that the research was \nconducted in the absence of any commercial or financial relationships that could \nbe construed as a potential conflict of interest.\n\n"
97,36710872,2023-01-23,10.7717/peerj.14706,https://github.com/ningzilan/SSWD,https://github.com/ningzilan/SSWD,PeerJ,A clustering method for small scRNA-seq data based on subspace and weighted distance.,SSWD can be \ndownloaded at https://github.com/ningzilan/SSWD.\n\n©2023 Ning et al.,"PeerJ. 2023 Jan 23;11:e14706. doi: 10.7717/peerj.14706. eCollection 2023.\n\nA clustering method for small scRNA-seq data based on subspace and weighted \ndistance.\n\nNing Z(1)(2), Dai Z(1), Zhang H(2), Chen Y(1), Yuan Z(1).\n\nAuthor information:\n(1)Hunan Engineering & Technology Research Centre for Agricultural Big Data \nAnalysis & Decision-Making, Hunan Agricultural University, Changsha, Hunan, \nChina.\n(2)Hunan Agricultural University, College of Information and Intelligence, \nChangsha, Hunan, China.\n\nBACKGROUND: Identifying the cell types using unsupervised methods is essential \nfor scRNA-seq research. However, conventional similarity measures introduce \nchallenges to single-cell data clustering because of the high dimensional, high \nnoise, and high dropout.\nMETHODS: We proposed a clustering method for small ScRNA-seq data based on \nSubspace and Weighted Distance (SSWD), which follows the assumption that the \nsets of gene subspace composed of similar density-distributing genes can better \ndistinguish cell groups. To accurately capture the intrinsic relationship among \ncells or genes, a new distance metric that combines Euclidean and Pearson \ndistance through a weighting strategy was proposed. The relative \nCalinski-Harabasz (CH) index was used to estimate the cluster numbers instead of \nthe CH index because it is comparable across degrees of freedom.\nRESULTS: We compared SSWD with seven prevailing methods on eight publicly \nscRNA-seq datasets. The experimental results show that the SSWD has better \nclustering accuracy and the partitioning ability of cell groups. SSWD can be \ndownloaded at https://github.com/ningzilan/SSWD.\n\n©2023 Ning et al.\n\nDOI: 10.7717/peerj.14706\nPMCID: PMC9879162\nPMID: 36710872 [Indexed for MEDLINE]\n\nConflict of interest statement: The authors declare there are no competing \ninterests.\n\n"
98,36709790,2023-01-26,10.1016/j.ymeth.2023.01.006,https://github.com/zhuyuan-cug/VGAE-LCME,https://github.com/zhuyuan-cug/VGAE-LCME,"Methods (San Diego, Calif.)",Predicting latent lncRNA and cancer metastatic event associations via variational graph auto-encoder.,The source code and \ndata are available at https://github.com/zhuyuan-cug/VGAE-LCME.\n\nCopyright © 2023 Elsevier Inc.,"Methods. 2023 Jan 26;211:1-9. doi: 10.1016/j.ymeth.2023.01.006. Online ahead of \nprint.\n\nPredicting latent lncRNA and cancer metastatic event associations via \nvariational graph auto-encoder.\n\nZhu Y(1), Zhang F(2), Zhang S(3), Yi M(4).\n\nAuthor information:\n(1)School of Automation, China University of Geosciences, 388 Lumo Road, \nHongshan District, 430074, Wuhan, Hubei, China; Hubei Key Laboratory of Advanced \nControl and Intelligent Automation for Complex Systems, 388 Lumo Road, Hongshan \nDistrict, 430074, Wuhan, Hubei, China; Engineering Research Center of \nIntelligent Technology for Geo-Exploration, 388 Lumo Road, Hongshan District, \n430074, Wuhan, Hubei, China.\n(2)School of Mathematics and Physics, China University of Geosciences, 388 Lumo \nRoad, Hongshan District, 430074, Wuhan, Hubei, China.\n(3)College of Life Science and Health, Wuhan University of Science and \nTechnology, 974 Heping Avenue, Qingshan District, 430081, Wuhan, Hubei, China. \nElectronic address: shihuazhang@wust.edu.cn.\n(4)School of Mathematics and Physics, China University of Geosciences, 388 Lumo \nRoad, Hongshan District, 430074, Wuhan, Hubei, China. Electronic address: \nmingyi@cug.edu.cn.\n\nLong non-coding RNA (lncRNA) are shown to be closely associated with cancer \nmetastatic events (CME, e.g., cancer cell invasion, intravasation, \nextravasation, proliferation) that collaboratively accelerate malignant cancer \nspread and cause high mortality rate in patients. Clinical trials may accurately \nuncover the relationships between lncRNAs and CMEs; however, it is \ntime-consuming and expensive. With the accumulation of data, there is an urgent \nneed to find efficient ways to identify these relationships. Herein, a graph \nembedding representation-based predictor (VGEA-LCME) for exploring latent \nlncRNA-CME associations is introduced. In VGEA-LCME, a heterogeneous combined \nnetwork is constructed by integrating similarity and linkage matrix that can \nmaintain internal and external characteristics of networks, and a variational \ngraph auto-encoder serves as a feature generator to represent arbitrary lncRNA \nand CME pair. The final robustness predicted result is obtained by ensemble \nclassifier strategy via cross-validation. Experimental comparisons and \nliterature verification show better remarkable performance of VGEA-LCME, \nalthough the similarities between CMEs are challenging to calculate. In \naddition, VGEA-LCME can further identify organ-specific CMEs. To the best of our \nknowledge, this is the first computational attempt to discover the potential \nrelationships between lncRNAs and CMEs. It may provide support and new insight \nfor guiding experimental research of metastatic cancers. The source code and \ndata are available at https://github.com/zhuyuan-cug/VGAE-LCME.\n\nCopyright © 2023 Elsevier Inc. All rights reserved.\n\nDOI: 10.1016/j.ymeth.2023.01.006\nPMID: 36709790\n\nConflict of interest statement: Declaration of Competing Interest The authors \ndeclare that there are no conflict of interests, we do not have any possible \nconflicts of interest.\n\n"


In [19]:
df1.to_csv('C:/Users/nadia/pubmed_results.tsv', sep='\t', index=False,columns=df1.columns[:-1].tolist())

In [None]:
#https://github.com/free1234hm/CLAM

owner = "free1234hm"
repo = "CLAM"
url = f"https://api.github.com/repos/{owner}/{repo}"

response = requests.get(url)
if response.status_code == 200:
    repository_info = response.json()
    created_at = repository_info["created_at"]
    updated_at = repository_info["updated_at"]
created_at, updated_at   

('2022-10-27T15:54:53Z', '2022-10-31T19:16:00Z')

In [None]:
#https://github.com/tyqGitHub/TYQ/tree/master/GACNNMDA - ????

In [None]:
link = "https://github.com/free1234hm/CLAM"

owner = re.findall("\/[^\/.]+/", str(link))[0][1:-1]
repo = re.findall("\/[^\/.]+$", str(link))[0][1:-1]
url = f"https://api.github.com/repos/{owner}/{repo}"

response = requests.get(url)
if response.status_code == 200:
    repository_info = response.json()
    created_at = repository_info["created_at"]
    updated_at = repository_info["updated_at"]
else:
    created_at = None
    updated_at = None

df1.loc[df1["PMID"] == "36760999","Repo_created_at"], df1.loc[df1["PMID"] == "36760999","Repo_updated_at"] = created_at, updated_at

df1.loc[df1["PMID"] == "36760999","Repo_created_at"], df1.loc[df1["PMID"] == "36760999","Repo_updated_at"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.loc[df1["PMID"] == "36760999","Repo_created_at"], df1.loc[df1["PMID"] == "36760999","Repo_updated_at"] = created_at, updated_at
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.loc[df1["PMID"] == "36760999","Repo_created_at"], df1.loc[df1["PMID"] == "36760999","Repo_updated_at"] = created_at, updated_at


(0   NaN
 Name: Repo_created_at, dtype: float64,
 0   NaN
 Name: Repo_updated_at, dtype: float64)

In [None]:
def get_repo_info(link):
    
    owner = re.findall("\/[^\/.]+/", str(link))[0][1:-1]
    repo = re.findall("\/[^\/.]+$", str(link))[0][1:-1]
    url = f"https://api.github.com/repos/{owner}/{repo}"
    
    response = requests.get(url)
    if response.status_code == 200:
        repository_info = response.json()
        created_at = repository_info["created_at"]
        updated_at = repository_info["updated_at"]
        return created_at, updated_at
    else:
        return None, None

In [None]:
get_repo_info(df1.loc[df1["PMID"] == "36760999","GitHub_link_clean"])

(None, None)

In [None]:
for id_iter in df1["PMID"]:
    
    df1.loc[df1["PMID"] == id_iter,"Repo_created_at"], df1.loc[df1["PMID"] == id_iter,"Repo_updated_at"] = get_repo_info(df1.loc[df1["PMID"] == id_iter,"GitHub_link_clean"])
df1

Unnamed: 0,PMID,PubDate,DOI,GitHub_link_raw,GitHub_link_clean,Journal,Title,Phrase,Abstract,Repo_created_at,Repo_updated_at
0,36760999,2023-01-24,10.3389/fgene.2023.1082032,https://github.com/free1234hm/CLAM,https://github.com/free1234hm/CLAM,Frontiers in genetics,Identification of functional gene modules by integrating multi-omics data and known molecular interactions.,"We implemented Correlation-based Local Approximation of Membership as \na user-friendly application available at https://github.com/free1234hm/CLAM.\n\nCopyright © 2023 Chen, Han, Li, Li, Zhang and Zhu.","Front Genet. 2023 Jan 24;14:1082032. doi: 10.3389/fgene.2023.1082032. \neCollection 2023.\n\nIdentification of functional gene modules by integrating multi-omics data and \nknown molecular interactions.\n\nChen X(1)(2), Han M(2), Li Y(3), Li X(2), Zhang J(2), Zhu Y(1)(2).\n\nAuthor information:\n(1)Basic Medical School, Anhui Medical University, Hefei, China.\n(2)National Center for Protein Sciences (Beijing), Beijing Proteome Research \nCenter, Beijing Institute of Lifeomics, Beijing, China.\n(3)Central Research Laboratory, Peking Union Medical College Hospital, Chinese \nAcademy of Medical Sciences and Peking Union Medical College, Beijing, China.\n\nMulti-omics data integration has emerged as a promising approach to identify \npatient subgroups. However, in terms of grouping genes (or gene products) into \nco-expression modules, data integration methods suffer from two main drawbacks. \nFirst, most existing methods only consider genes or samples measured in all \ndifferent datasets. Second, known molecular interactions (e.g., transcriptional \nregulatory interactions, protein-protein interactions and biological pathways) \ncannot be utilized to assist in module detection. Herein, we present a novel \ndata integration framework, Correlation-based Local Approximation of Membership \n(CLAM), which provides two methodological innovations to address these \nlimitations: 1) constructing a trans-omics neighborhood matrix by integrating \nmulti-omics datasets and known molecular interactions, and 2) using a local \napproximation procedure to define gene modules from the matrix. Applying \nCorrelation-based Local Approximation of Membership to human colorectal cancer \n(CRC) and mouse B-cell differentiation multi-omics data obtained from The Cancer \nGenome Atlas (TCGA), Clinical Proteomics Tumor Analysis Consortium (CPTAC), Gene \nExpression Omnibus (GEO) and ProteomeXchange database, we demonstrated its \nsuperior ability to recover biologically relevant modules and gene ontology (GO) \nterms. Further investigation of the colorectal cancer modules revealed numerous \ntranscription factors and KEGG pathways that played crucial roles in colorectal \ncancer progression. Module-based survival analysis constructed four \nsurvival-related networks in which pairwise gene correlations were significantly \ncorrelated with colorectal cancer patient survival. Overall, the series of \nevaluations demonstrated the great potential of Correlation-based Local \nApproximation of Membership for identifying modular biomarkers for complex \ndiseases. We implemented Correlation-based Local Approximation of Membership as \na user-friendly application available at https://github.com/free1234hm/CLAM.\n\nCopyright © 2023 Chen, Han, Li, Li, Zhang and Zhu.\n\nDOI: 10.3389/fgene.2023.1082032\nPMCID: PMC9902936\nPMID: 36760999\n\nConflict of interest statement: The authors declare that the research was \nconducted in the absence of any commercial or financial relationships that could \nbe construed as a potential conflict of interest.\n\n",,
1,36759692,2023-02-09,10.1038/s41598-023-29320-6,https://github.com/Socrates023/DCPHA,https://github.com/Socrates023/DCPHA,Scientific reports,Deep consistency-preserving hash auto-encoders for neuroimage cross-modal retrieval.,We make code and models publicly available: \nhttps://github.com/Socrates023/DCPHA .,"Sci Rep. 2023 Feb 9;13(1):2316. doi: 10.1038/s41598-023-29320-6.\n\nDeep consistency-preserving hash auto-encoders for neuroimage cross-modal \nretrieval.\n\nWang X(1), Zeng X(2).\n\nAuthor information:\n(1)College of Computer Science and Technology, Chongqing University of Posts and \nTelecommunications, Chongqing, 400065, China.\n(2)College of Computer Science and Technology, Chongqing University of Posts and \nTelecommunications, Chongqing, 400065, China. zengxh@cqupt.edu.cn.\n\nCross-modal hashing is an efficient method to embed high-dimensional \nheterogeneous modal feature descriptors into a consistency-preserving Hamming \nspace with low-dimensional. Most existing cross-modal hashing methods have been \nable to bridge the heterogeneous modality gap, but there are still two \nchallenges resulting in limited retrieval accuracy: (1) ignoring the continuous \nsimilarity of samples on manifold; (2) lack of discriminability of hash codes \nwith the same semantics. To cope with these problems, we propose a Deep \nConsistency-Preserving Hash Auto-encoders model, called DCPHA, based on the \nmulti-manifold property of the feature distribution. Specifically, DCPHA \nconsists of a pair of asymmetric auto-encoders and two semantics-preserving \nattention branches working in the encoding and decoding stages, respectively. \nWhen the number of input medical image modalities is greater than 2, the encoder \nis a multiple pseudo-Siamese network designed to extract specific modality \nfeatures of different medical image modalities. In addition, we define the \ncontinuous similarity of heterogeneous and homogeneous samples on Riemann \nmanifold from the perspective of multiple sub-manifolds, respectively, and the \ntwo constraints, i.e., multi-semantic consistency and multi-manifold \nsimilarity-preserving, are embedded in the learning of hash codes to obtain \nhigh-quality hash codes with consistency-preserving. The extensive experiments \nshow that the proposed DCPHA has the most stable and state-of-the-art \nperformance. We make code and models publicly available: \nhttps://github.com/Socrates023/DCPHA .\n\n© 2023. The Author(s).\n\nDOI: 10.1038/s41598-023-29320-6\nPMID: 36759692\n\n",,
2,36759336,2023-02-09,10.1093/bib/bbad044,https://github.com/victorykobets/HiConfidence,https://github.com/victorykobets/HiConfidence,Briefings in bioinformatics,HiConfidence: a novel approach uncovering the biological signal in Hi-C data affected by technical biases.,The method is freely available \nat GitHub: https://github.com/victorykobets/HiConfidence.\n\n© The Author(s) 2023.,"Brief Bioinform. 2023 Feb 9:bbad044. doi: 10.1093/bib/bbad044. Online ahead of \nprint.\n\nHiConfidence: a novel approach uncovering the biological signal in Hi-C data \naffected by technical biases.\n\nKobets VA(1), Ulianov SV(2)(3), Galitsyna AA(1)(2)(4), Doronin SA(5), Mikhaleva \nEA(5), Gelfand MS(1)(4), Shevelyov YY(5), Razin SV(2)(3), Khrameeva EE(1).\n\nAuthor information:\n(1)Skolkovo Institute of Science and Technology, Moscow, 121205, Russia.\n(2)Institute of Gene Biology, Russian Academy of Sciences, Moscow, 119334, \nRussia.\n(3)Faculty of Biology, M.V. Lomonosov Moscow State University, Moscow, 119992, \nRussia.\n(4)A.A. Kharkevich Institute for Information Transmission Problems, Russian \nAcademy of Sciences, Moscow, 127051, Russia.\n(5)Institute of Molecular Genetics of National Research Centre ""Kurchatov \nInstitute"", Moscow, 123182, Russia.\n\nThe chromatin interaction assays, particularly Hi-C, enable detailed studies of \ngenome architecture in multiple organisms and model systems, resulting in a \ndeeper understanding of gene expression regulation mechanisms mediated by \nepigenetics. However, the analysis and interpretation of Hi-C data remain \nchallenging due to technical biases, limiting direct comparisons of datasets \nobtained in different experiments and laboratories. As a result, removing biases \nfrom Hi-C-generated chromatin contact matrices is a critical data analysis step. \nOur novel approach, HiConfidence, eliminates biases from the Hi-C data by \nweighing chromatin contacts according to their consistency between replicates so \nthat low-quality replicates do not substantially influence the result. The \nalgorithm is effective for the analysis of global changes in chromatin \nstructures such as compartments and topologically associating domains. We apply \nthe HiConfidence approach to several Hi-C datasets with significant technical \nbiases, that could not be analyzed effectively using existing methods, and \nobtain meaningful biological conclusions. In particular, HiConfidence aids in \nthe study of how changes in histone acetylation pattern affect chromatin \norganization in Drosophila melanogaster S2 cells. The method is freely available \nat GitHub: https://github.com/victorykobets/HiConfidence.\n\n© The Author(s) 2023. Published by Oxford University Press.\n\nDOI: 10.1093/bib/bbad044\nPMID: 36759336\n\n",,
3,36756726,2023-02-09,10.1111/1755-0998.13765,https://github.com/Caizf-script/HQGR,https://github.com/Caizf-script/HQGR,Molecular ecology resources,Long amplicon HiFi sequencing for mitochondrial DNA genomes.,"The High Quality Reads Generator \n(HQGR) software is provided to facilitate data analyses, which is publicly \naccessible on GitHub (https://github.com/Caizf-script/HQGR). Our long amplicon \nHiFi sequencing pipeline can also be applied in various target enrichment \nstrategies for small genomes and candidate genes.","Mol Ecol Resour. 2023 Feb 9. doi: 10.1111/1755-0998.13765. Online ahead of \nprint.\n\nLong amplicon HiFi sequencing for mitochondrial DNA genomes.\n\nCai ZF(#)(1)(2), Hu JY(#)(3), Yin TT(#)(2), Wang D(3), Shen QK(2), Ma C(2)(4), \nOu DQ(5), Xu MM(2)(4), Shi X(2)(4), Li QL(1)(2), Wu RN(2), Ajuma L(2)(4), Adeola \nAC(2), Zhang YP(1)(2)(4), Peng MS(2)(4).\n\nAuthor information:\n(1)State Key Laboratory for Conservation and Utilization of Bio-resources in \nYunnan, Yunnan University, Kunming, China.\n(2)State Key Laboratory of Genetic Resources and Evolution & Yunnan Laboratory \nof Molecular Biology of Domestic Animals, Kunming Institute of Zoology, Chinese \nAcademy of Sciences, Kunming, China.\n(3)School of Software, Yunnan University, Kunming, China.\n(4)University of Chinese Academy of Sciences, Beijing, China.\n(5)Department of Anesthesiology, First Affiliated Hospital of Kunming Medical \nUniversity, Kunming, China.\n(#)Contributed equally\n\nLong-read sequencing technology is a powerful approach with application in \nvarious genetic and genomic researches. Herein, we developed the pipeline for \nlong amplicon high-fidelity (HiFi) sequencing and then applied it for sequencing \nmitochondrial DNA (mtDNA) genomes from pools of 79 Tibetan Mastiffs. We \namplified the mtDNA genome with long-range PCR using two pairs of primers. Two \nrounds of circular consensus sequencing (CCS) were conducted and their accuracy \nwas evaluated. The results indicate that the second round of CCS can improve the \naccuracy of HiFi reads. In addition, the analysis of 79 high-quality mtDNA \ngenomes shows the Tibetan Mastiffs from outside of the Tibetan Plateau \nexperienced hybridization with other dogs. The High Quality Reads Generator \n(HQGR) software is provided to facilitate data analyses, which is publicly \naccessible on GitHub (https://github.com/Caizf-script/HQGR). Our long amplicon \nHiFi sequencing pipeline can also be applied in various target enrichment \nstrategies for small genomes and candidate genes.\n\nThis article is protected by copyright. All rights reserved.\n\nDOI: 10.1111/1755-0998.13765\nPMID: 36756726\n\n",,
4,36756173,2022-10-23,10.1016/j.csbj.2022.10.016,https://github.com/lichen-lab/MTAE,https://github.com/lichen-lab/MTAE,Computational and structural biotechnology journal,Multi-task deep autoencoder to predict Alzheimer's disease progression using temporal DNA methylation data in peripheral blood.,\nAvailability:: https://github.com/lichen-lab/MTAE.\n\n© 2022 The Author(s).,"Comput Struct Biotechnol J. 2022 Oct 23;20:5761-5774. doi: \n10.1016/j.csbj.2022.10.016. eCollection 2022.\n\nMulti-task deep autoencoder to predict Alzheimer's disease progression using \ntemporal DNA methylation data in peripheral blood.\n\nChen L(1), Saykin AJ(2), Yao B(3), Zhao F(1); Alzheimer’s Disease Neuroimaging \nInitiative (ADNI).\n\nAuthor information:\n(1)Department of Biostatistics, University of Florida, Gainesville, FL 32603, \nUnited States.\n(2)Department of Radiology and Imaging Sciences, Indiana University School of \nMedicine, Indianapolis, IN 46202, United States.\n(3)Department of Human Genetics, Emory University, Atlanta, GA 30322, United \nStates.\n\nTraditional approaches for diagnosing Alzheimer's disease (AD) such as brain \nimaging and cerebrospinal fluid are invasive and expensive. It is desirable to \ndevelop a useful diagnostic tool by exploiting biomarkers obtained from \nperipheral tissues due to their noninvasive and easily accessible \ncharacteristics. However, the capacity of using DNA methylation data in \nperipheral blood for predicting AD progression is rarely known. It is also \nchallenging to develop an efficient prediction model considering the complex and \nhigh-dimensional DNA methylation data in a longitudinal study. Here, we develop \ntwo multi-task deep autoencoders, which are based on the convolutional \nautoencoder and long short-term memory autoencoder to learn the compressed \nfeature representation by jointly minimizing the reconstruction error and \nmaximizing the prediction accuracy. By benchmarking on longitudinal DNA \nmethylation data collected from the peripheral blood in Alzheimer's Disease \nNeuroimaging Initiative, we demonstrate that the proposed multi-task deep \nautoencoders outperform state-of-the-art machine learning approaches for both \npredicting AD progression and reconstructing the temporal DNA methylation \nprofiles. In addition, the proposed multi-task deep autoencoders can predict AD \nprogression accurately using only the historical DNA methylation data and the \nperformance is further improved by including all temporal DNA methylation data. \nAvailability:: https://github.com/lichen-lab/MTAE.\n\n© 2022 The Author(s).\n\nDOI: 10.1016/j.csbj.2022.10.016\nPMCID: PMC9619306\nPMID: 36756173\n\nConflict of interest statement: The authors declare that they have no known \ncompeting financial interests or personal relationships that could have appeared \nto influence the work reported in this paper.\n\n",,
...,...,...,...,...,...,...,...,...,...,...,...
95,36711471,2023-01-20,10.1101/2023.01.17.524477,https://github.com/jianlin-cheng/TransFun,https://github.com/jianlin-cheng/TransFun,bioRxiv : the preprint server for biology,Combining protein sequences and structures with transformers and equivariant graph neural networks to predict protein function.,AVAILABILITY: The source code of TransFun is available at \nhttps://github.com/jianlin-cheng/TransFun.\nCONTACT: chengji@missouri.edu.,"bioRxiv. 2023 Jan 20:2023.01.17.524477. doi: 10.1101/2023.01.17.524477. \nPreprint.\n\nCombining protein sequences and structures with transformers and equivariant \ngraph neural networks to predict protein function.\n\nBoadu F, Cao H, Cheng J.\n\nMOTIVATION: Millions of protein sequences have been generated by numerous genome \nand transcriptome sequencing projects. However, experimentally determining the \nfunction of the proteins is still a time consuming, low-throughput, and \nexpensive process, leading to a large protein sequence-function gap. Therefore, \nit is important to develop computational methods to accurately predict protein \nfunction to fill the gap. Even though many methods have been developed to use \nprotein sequences as input to predict function, much fewer methods leverage \nprotein structures in protein function prediction because there was lack of \naccurate protein structures for most proteins until recently.\nRESULTS: We developed TransFun - a method using a transformer-based protein \nlanguage model and 3D-equivariant graph neural networks to distill information \nfrom both protein sequences and structures to predict protein function. It \nextracts feature embeddings from protein sequences using a pre-trained protein \nlanguage model (ESM) via transfer learning and combines them with 3D structures \nof proteins predicted by AlphaFold2 through equivariant graph neural networks. \nBenchmarked on the CAFA3 test dataset and a new test dataset, TransFun \noutperforms several state-of-the-art methods, indicating the language model and \n3D-equivariant graph neural networks are effective methods to leverage protein \nsequences and structures to improve protein function prediction. Combining \nTransFun predictions and sequence similarity-based predictions can further \nincrease prediction accuracy.\nAVAILABILITY: The source code of TransFun is available at \nhttps://github.com/jianlin-cheng/TransFun.\nCONTACT: chengji@missouri.edu.\n\nDOI: 10.1101/2023.01.17.524477\nPMCID: PMC9882282\nPMID: 36711471\n\n",,
96,36710930,2023-01-13,10.3389/fnmol.2022.1037565,https://github.com/linxi159/CDSlncR,https://github.com/linxi159/CDSlncR,Frontiers in molecular neuroscience,Inferring cell developmental stage-specific lncRNA regulation in the developing human neocortex with CDSlncR.,"CDSlncR is available at https://github.com/linxi159/CDSlncR.\n\nCopyright © 2023 Huang, Ma and Zhang.","Front Mol Neurosci. 2023 Jan 13;15:1037565. doi: 10.3389/fnmol.2022.1037565. \neCollection 2022.\n\nInferring cell developmental stage-specific lncRNA regulation in the developing \nhuman neocortex with CDSlncR.\n\nHuang M(1)(2), Ma J(1)(3), Zhang J(3).\n\nAuthor information:\n(1)Department of Automation, Xiamen University, Xiamen, China.\n(2)Department of Computer Science, University of Tsukuba, Tsukuba, Japan.\n(3)School of Engineering, Dali University, Dali, China.\n\nNoncoding RNAs (ncRNAs) occupy ~98% of the transcriptome in human, and are \nusually not translated into proteins. Among ncRNAs, long non-coding RNAs \n(lncRNAs, >200 nucleotides) are important regulators to modulate gene \nexpression, and are involved in many biological processes (e.g., cell \ndevelopment). To study lncRNA regulation, many computational approaches or tools \nhave been proposed by using bulk transcriptomics data. Nevertheless, previous \nbulk data-driven methods are mostly limited to explore the lncRNA regulation \nregarding all of cells, instead of the lncRNA regulation specific to cell \ndevelopmental stages. Fortunately, recent advance in single-cell sequencing data \nhas provided a way to investigate cell developmental stage-specific lncRNA \nregulation. In this work, we present a novel computational method, CDSlncR (Cell \nDevelopmental Stage-specific lncRNA regulation), which combines putative \nlncRNA-target binding information with single-cell transcriptomics data to infer \ncell developmental stage-specific lncRNA regulation. For each cell developmental \nstage, CDSlncR constructs a cell developmental stage-specific lncRNA regulatory \nnetwork in the cell developmental stage. To illustrate the effectiveness of \nCDSlncR, we apply CDSlncR into single-cell transcriptomics data of the \ndeveloping human neocortex for exploring lncRNA regulation across different \nhuman neocortex developmental stages. Network analysis shows that the lncRNA \nregulation is unique in each developmental stage of human neocortex. As a case \nstudy, we also perform particular analysis on the cell developmental \nstage-specific lncRNA regulation related to 18 known lncRNA biomarkers in autism \nspectrum disorder. Finally, the comparison result indicates that CDSlncR is an \neffective method for predicting cell developmental stage-specific lncRNA \ntargets. CDSlncR is available at https://github.com/linxi159/CDSlncR.\n\nCopyright © 2023 Huang, Ma and Zhang.\n\nDOI: 10.3389/fnmol.2022.1037565\nPMCID: PMC9880432\nPMID: 36710930\n\nConflict of interest statement: The authors declare that the research was \nconducted in the absence of any commercial or financial relationships that could \nbe construed as a potential conflict of interest.\n\n",,
97,36710872,2023-01-23,10.7717/peerj.14706,https://github.com/ningzilan/SSWD,https://github.com/ningzilan/SSWD,PeerJ,A clustering method for small scRNA-seq data based on subspace and weighted distance.,SSWD can be \ndownloaded at https://github.com/ningzilan/SSWD.\n\n©2023 Ning et al.,"PeerJ. 2023 Jan 23;11:e14706. doi: 10.7717/peerj.14706. eCollection 2023.\n\nA clustering method for small scRNA-seq data based on subspace and weighted \ndistance.\n\nNing Z(1)(2), Dai Z(1), Zhang H(2), Chen Y(1), Yuan Z(1).\n\nAuthor information:\n(1)Hunan Engineering & Technology Research Centre for Agricultural Big Data \nAnalysis & Decision-Making, Hunan Agricultural University, Changsha, Hunan, \nChina.\n(2)Hunan Agricultural University, College of Information and Intelligence, \nChangsha, Hunan, China.\n\nBACKGROUND: Identifying the cell types using unsupervised methods is essential \nfor scRNA-seq research. However, conventional similarity measures introduce \nchallenges to single-cell data clustering because of the high dimensional, high \nnoise, and high dropout.\nMETHODS: We proposed a clustering method for small ScRNA-seq data based on \nSubspace and Weighted Distance (SSWD), which follows the assumption that the \nsets of gene subspace composed of similar density-distributing genes can better \ndistinguish cell groups. To accurately capture the intrinsic relationship among \ncells or genes, a new distance metric that combines Euclidean and Pearson \ndistance through a weighting strategy was proposed. The relative \nCalinski-Harabasz (CH) index was used to estimate the cluster numbers instead of \nthe CH index because it is comparable across degrees of freedom.\nRESULTS: We compared SSWD with seven prevailing methods on eight publicly \nscRNA-seq datasets. The experimental results show that the SSWD has better \nclustering accuracy and the partitioning ability of cell groups. SSWD can be \ndownloaded at https://github.com/ningzilan/SSWD.\n\n©2023 Ning et al.\n\nDOI: 10.7717/peerj.14706\nPMCID: PMC9879162\nPMID: 36710872 [Indexed for MEDLINE]\n\nConflict of interest statement: The authors declare there are no competing \ninterests.\n\n",,
98,36709790,2023-01-26,10.1016/j.ymeth.2023.01.006,https://github.com/zhuyuan-cug/VGAE-LCME,https://github.com/zhuyuan-cug/VGAE-LCME,"Methods (San Diego, Calif.)",Predicting latent lncRNA and cancer metastatic event associations via variational graph auto-encoder.,The source code and \ndata are available at https://github.com/zhuyuan-cug/VGAE-LCME.\n\nCopyright © 2023 Elsevier Inc.,"Methods. 2023 Jan 26;211:1-9. doi: 10.1016/j.ymeth.2023.01.006. Online ahead of \nprint.\n\nPredicting latent lncRNA and cancer metastatic event associations via \nvariational graph auto-encoder.\n\nZhu Y(1), Zhang F(2), Zhang S(3), Yi M(4).\n\nAuthor information:\n(1)School of Automation, China University of Geosciences, 388 Lumo Road, \nHongshan District, 430074, Wuhan, Hubei, China; Hubei Key Laboratory of Advanced \nControl and Intelligent Automation for Complex Systems, 388 Lumo Road, Hongshan \nDistrict, 430074, Wuhan, Hubei, China; Engineering Research Center of \nIntelligent Technology for Geo-Exploration, 388 Lumo Road, Hongshan District, \n430074, Wuhan, Hubei, China.\n(2)School of Mathematics and Physics, China University of Geosciences, 388 Lumo \nRoad, Hongshan District, 430074, Wuhan, Hubei, China.\n(3)College of Life Science and Health, Wuhan University of Science and \nTechnology, 974 Heping Avenue, Qingshan District, 430081, Wuhan, Hubei, China. \nElectronic address: shihuazhang@wust.edu.cn.\n(4)School of Mathematics and Physics, China University of Geosciences, 388 Lumo \nRoad, Hongshan District, 430074, Wuhan, Hubei, China. Electronic address: \nmingyi@cug.edu.cn.\n\nLong non-coding RNA (lncRNA) are shown to be closely associated with cancer \nmetastatic events (CME, e.g., cancer cell invasion, intravasation, \nextravasation, proliferation) that collaboratively accelerate malignant cancer \nspread and cause high mortality rate in patients. Clinical trials may accurately \nuncover the relationships between lncRNAs and CMEs; however, it is \ntime-consuming and expensive. With the accumulation of data, there is an urgent \nneed to find efficient ways to identify these relationships. Herein, a graph \nembedding representation-based predictor (VGEA-LCME) for exploring latent \nlncRNA-CME associations is introduced. In VGEA-LCME, a heterogeneous combined \nnetwork is constructed by integrating similarity and linkage matrix that can \nmaintain internal and external characteristics of networks, and a variational \ngraph auto-encoder serves as a feature generator to represent arbitrary lncRNA \nand CME pair. The final robustness predicted result is obtained by ensemble \nclassifier strategy via cross-validation. Experimental comparisons and \nliterature verification show better remarkable performance of VGEA-LCME, \nalthough the similarities between CMEs are challenging to calculate. In \naddition, VGEA-LCME can further identify organ-specific CMEs. To the best of our \nknowledge, this is the first computational attempt to discover the potential \nrelationships between lncRNAs and CMEs. It may provide support and new insight \nfor guiding experimental research of metastatic cancers. The source code and \ndata are available at https://github.com/zhuyuan-cug/VGAE-LCME.\n\nCopyright © 2023 Elsevier Inc. All rights reserved.\n\nDOI: 10.1016/j.ymeth.2023.01.006\nPMID: 36709790\n\nConflict of interest statement: Declaration of Competing Interest The authors \ndeclare that there are no conflict of interests, we do not have any possible \nconflicts of interest.\n\n",,
