In [1]:
import requests
import json
import time
import polars as pl

## Create Gene List

In [3]:
# Load the "up" and "down" HN5 rice gene tables (tab-separated) and keep
# only the GENEID column of each as a one-column DataFrame.
upgenelist = (
    pl.read_csv(
        "../Data/07_extract_gene/HN5_genelist_rice_2505/HN5_genes_up_rice.tsv",
        separator="\t",
    )
    .select(pl.col("GENEID"))
)

downgenelist = (
    pl.read_csv(
        "../Data/07_extract_gene/HN5_genelist_rice_2505/HN5_genes_down_rice.tsv",
        separator="\t",
    )
    .select(pl.col("GENEID"))
)

# Preview the first five gene IDs of each list.
display(upgenelist.head(5), downgenelist.head(5))

GENEID
str
"""Os04g0107900"""
"""Os01g0136100"""
"""Os02g0259900"""
"""Os02g0259850"""
"""Os03g0245800"""


GENEID
str
"""Os01g0192900"""
"""Os01g0248701"""
"""Os01g0813800"""
"""Os01g0940700"""
"""Os01g0949900"""


In [4]:
# Export both gene-ID lists as header-less text files (one ID per line)
# into the goatools working directory.
upgenelist.write_csv(
    file="../Data/09_goatools/HN5_genes_up_rice.txt",
    include_header=False,
    separator="\t",
)

downgenelist.write_csv(
    file="../Data/09_goatools/HN5_genes_down_rice.txt",
    include_header=False,
    separator="\t",
)

&nbsp;

&nbsp;

&nbsp;

## Create Gene-Gene Ontology association file

In [5]:
# Gene -> GO annotation table (retrieved from Ensembl Plants release 58).
# Pipeline: keep the gene ID and GO accession columns, drop rows without a
# GO accession, sort by gene ID, then remove duplicate (gene, GO) pairs
# while keeping the sorted row order.
all_gene_GOA = (
    pl.read_csv(
        "../Data/09_goatools/ensembl/rice_go_annotation_r58.tsv",
        separator="\t",
    )
    .select(["Gene stable ID", "GO term accession"])
    .drop_nulls(subset="GO term accession")
    .sort("Gene stable ID")
    .unique(
        subset=["Gene stable ID", "GO term accession"],
        maintain_order=True,
    )
)

display(all_gene_GOA)

Gene stable ID,GO term accession
str,str
"""Os01g0100100""","""GO:0090630"""
"""Os01g0100100""","""GO:0005096"""
"""Os01g0100300""","""GO:0005506"""
"""Os01g0100300""","""GO:0020037"""
"""Os01g0100300""","""GO:0016705"""
…,…
"""gene-rps8""","""GO:0003735"""
"""gene-rps8""","""GO:1990904"""
"""gene-rps8""","""GO:0009507"""
"""gene-rps8""","""GO:0009536"""


In [7]:
# Collapse the long (gene, GO accession) table into one row per gene, with
# that gene's GO accessions joined into a single ";"-separated string.
# The native `str.join` aggregation replaces the previous
# `map_elements(lambda x: ";".join(x))`, which ran a Python callback for
# every group. Null accessions were already removed when `all_gene_GOA`
# was built, so the resulting strings are unchanged.
gene_go_concatenated = all_gene_GOA.group_by(
    "Gene stable ID"
).agg(
    pl.col("GO term accession").str.join(";").alias("GO term accessions")
).sort(
    "Gene stable ID"
)

# Write "<gene ID>\t<GO;GO;...>" lines without a header row
# (presumably the association file consumed by goatools — confirm).
gene_go_concatenated.write_csv(
    "../Data/09_goatools/rice_go_annotation_r58_concatenated.txt",
    separator="\t",
    include_header=False
)

display(gene_go_concatenated)

Gene stable ID,GO term accessions
str,str
"""Os01g0100100""","""GO:0090630;GO:0005096"""
"""Os01g0100300""","""GO:0005506;GO:0020037;GO:00167…"
"""Os01g0100400""","""GO:0016491;GO:0005507;GO:00058…"
"""Os01g0100500""","""GO:0016020"""
"""Os01g0100600""","""GO:0003676"""
…,…
"""gene-rps4""","""GO:0015935;GO:0019843;GO:00057…"
"""gene-rps7""","""GO:0006412;GO:0005840;GO:00159…"
"""gene-rps7-2""","""GO:0006412;GO:0005840;GO:00159…"
"""gene-rps7-3""","""GO:0006412;GO:0015935;GO:00057…"


In [6]:
# NOTE(review): this whole cell is disabled (commented out), so it has no
# effect when the notebook runs. As written it would read the Biomart table
# "rice_all_genelist.tsv" and export two header-less gene-ID lists: every
# "Gene stable ID", and only those whose "Gene type" is "protein_coding".
# NOTE(review): `write_csv` is called with a file path, so the two names
# assigned from it would presumably be None rather than data — confirm
# before re-enabling, or delete this cell if the outputs are not needed.
# rice_all_gene_list = pl.read_csv(
#     "../Data/Biomart/rice_all_genelist.tsv",
#     separator="\t"
# )

# rice_all_genelist_txt = rice_all_gene_list.select(
#     "Gene stable ID"
# ).write_csv(
#     "../Data/Biomart/rice_all_genelist.txt",
#     separator="\t",
#     include_header=False
# )

# protein_coding_txt = rice_all_gene_list.filter(
#     pl.col("Gene type") == "protein_coding"
# ).select(
#     "Gene stable ID"
# ).write_csv(
#     "../Data/Biomart/rice_all_genelist_protein_coding.txt",
#     separator="\t",
#     include_header=False
# )