In [1]:
import polars as pl # type: ignore

In [2]:
# Download 2024/03/31
# re-download 2024/09/17
# https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2pubmed.gz

In [3]:
# Download 2024/03/31
# re-download 2024/09/17
gene2pubmed = pl.read_csv(
    "../Data/Data_ncbi/gene2pubmed",
    separator="\t"
).rename(
    {
        "#tax_id": "taxonomy id",
        "GeneID": "gene id",
        "PubMed_ID": "pubmed id"
    }
)

display(gene2pubmed.head())

taxonomy id,gene id,pubmed id
i64,i64,i64
24,67441593,1779750
24,67441593,16753031
24,67441593,21620858
24,67441594,21219854
24,67441594,30366028


In [4]:
gene2pubmed_human = gene2pubmed.filter(
    pl.col("taxonomy id") == 9606
).group_by(
    "taxonomy id", "gene id"
).agg(
    pl.col("pubmed id").n_unique().alias("pubmed id count"),
    pl.col("pubmed id").sort().unique().alias("pubmed ids list")
    # pl.col("pubmed id").cast(pl.Utf8).map_elements(lambda x: ", ".join(x), return_dtype=pl.Utf8).alias("pubmed ids list")
).sort(
    "pubmed id count", 
    descending=True
)


# gene2pubmed_human.write_csv("gene2pubmed_human.tsv", separator="\t")
display(gene2pubmed_human)

taxonomy id,gene id,pubmed id count,pubmed ids list
i64,i64,u32,list[i64]
9606,7157,11544,"[1088347, 1303181, … 39207627]"
9606,1956,6650,"[1281549, 1301150, … 39205824]"
9606,7124,6425,"[1279199, 1310388, … 39201392]"
9606,3569,5450,"[1113021, 1291290, … 39231832]"
9606,7422,5207,"[1312256, 1417831, … 39174921]"
…,…,…,…
9606,127273790,1,[30033119]
9606,106736465,1,[25326701]
9606,129662445,1,[32094911]
9606,129937677,1,[35858748]


In [5]:
gene2ensembl = pl.read_csv(
    "../Data/Data_ensembl/ensembl2ncbigeneid.tsv",
    separator="\t"
).rename(
    {
        "NCBI gene (formerly Entrezgene) ID": "gene id"
    }
)

display(gene2ensembl.head(10))

Gene stable ID,HGNC symbol,gene id
str,str,i64
"""ENSG00000210049""","""MT-TF""",
"""ENSG00000211459""","""MT-RNR1""",
"""ENSG00000210077""","""MT-TV""",
"""ENSG00000210082""","""MT-RNR2""",
"""ENSG00000209082""","""MT-TL1""",
"""ENSG00000198888""","""MT-ND1""",4535.0
"""ENSG00000210100""","""MT-TI""",
"""ENSG00000210107""","""MT-TQ""",
"""ENSG00000210112""","""MT-TM""",
"""ENSG00000198763""","""MT-ND2""",4536.0


In [7]:
symbol2pubmed = gene2ensembl.join(
    gene2pubmed_human,
    on="gene id",
    how="left",
    coalesce=True
).sort(
    "pubmed id count",
    descending=True
)

symbol2pubmed_filter = symbol2pubmed.filter(
    (pl.col("pubmed id count").is_not_null()) &
    (pl.col("HGNC symbol") == "USPL1")
)

# .with_columns(
#     pl.col("pubmed ids list").map_elements(
#         lambda x: ",".join(map(str, x)),
#         return_dtype=pl.String
#     ).alias("pubmed ids list")
# )

print(symbol2pubmed_filter.group_by(["HGNC symbol"]).len().sum())
display(symbol2pubmed_filter)

shape: (1, 2)
┌─────────────┬─────┐
│ HGNC symbol ┆ len │
│ ---         ┆ --- │
│ str         ┆ u32 │
╞═════════════╪═════╡
│ null        ┆ 1   │
└─────────────┴─────┘


Gene stable ID,HGNC symbol,gene id,taxonomy id,pubmed id count,pubmed ids list
str,str,i64,i64,u32,list[i64]
"""ENSG00000132952""","""USPL1""",10208,9606,24,"[12477932, 14702039, … 36724073]"


In [36]:
no_coresspondence = symbol2pubmed.filter(
    pl.col("pubmed id count").is_null()
).drop(
    "pubmed ids list",
    "pubmed id count",
    "taxonomy id"
)

display(no_coresspondence)

Gene stable ID,HGNC symbol,gene id
str,str,i64
"""ENSG00000210049""","""MT-TF""",
"""ENSG00000211459""","""MT-RNR1""",
"""ENSG00000210077""","""MT-TV""",
"""ENSG00000210082""","""MT-RNR2""",
"""ENSG00000209082""","""MT-TL1""",
…,…,…
"""ENSG00000232679""","""LINC01705""",105372950
"""ENSG00000200033""","""RNU6-403P""",
"""ENSG00000228437""","""LINC02474""",
"""ENSG00000229463""","""LYST-AS1""",
