# Assign GOSlim domain to each GOSlim accession

## Background

GOSlim data exported from Ensembl BioMart does not include the GO domain (e.g., cellular_component) of each term, so this notebook uses the QuickGO API to assign a domain to each GOSlim accession.

In [1]:
import requests
import time
import os
#import pandas as pd
import polars as pl #Replace with pandas

## Get domain function through QuickGO API

In [2]:
def get_domain(accession):
    """Look up the GO domain (aspect) of an accession via the QuickGO search API.

    Parameters
    ----------
    accession : str
        GO identifier to search for, e.g. "GO:0005739".

    Returns
    -------
    str or None
        The ``aspect`` field of the first search hit (for example
        "cellular_component"). None is returned when the request fails, the
        response body is not valid JSON, the search has no hits, or the hit
        carries no ``aspect`` field.
    """
    search_url = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/search"
    # Let requests build the query string so the accession is URL-encoded.
    query_params = {"query": accession, "limit": 1, "page": 1}
    try:
        r = requests.get(
            search_url,
            params=query_params,
            headers={"Accept": "application/json"},
            timeout=30,
        )
        r.raise_for_status()
        # Decode inside the try block: a non-JSON body should give None, not a crash.
        # Older requests versions raise a plain ValueError here, hence the extra except type.
        data = r.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"API request error: {e}")
        return None

    # Tolerate payloads without a "results" list instead of raising KeyError.
    results = data.get("results") if isinstance(data, dict) else None
    if not results:
        return None
    first_hit = results[0]
    return first_hit.get("aspect") if isinstance(first_hit, dict) else None

## MOUSE

In [4]:
# Load the mouse GOSlim export from Ensembl BioMart (Ensembl release 110).
# infer_schema_length=0 makes polars read every column as a string.
ensembl_goslim_mouse = pl.read_csv(
    "../data/biomart_goslim/biomart_mouse_goslim_R110.tsv",
    separator="\t",
    infer_schema_length=0,
)

# Remove duplicated rows, keeping the first occurrence and the original row order.
# No subset is passed, so rows are compared across all columns.
ensembl_goslim_mouse_unique = ensembl_goslim_mouse.unique(
    maintain_order=True,
    keep="first",
)
display(ensembl_goslim_mouse_unique)

Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),GOSlim GOA Accession(s),GOSlim GOA Description
str,str,str,str,str,str
"""ENSMUSG0000006…","""MT""","""1""","""68""","""GO:0060090""","""molecular adap…"
"""ENSMUSG0000006…","""MT""","""1""","""68""","""GO:0003723""","""RNA binding"""
"""ENSMUSG0000006…","""MT""","""1""","""68""","""GO:0043226""","""organelle"""
"""ENSMUSG0000006…","""MT""","""1""","""68""","""GO:0005739""","""mitochondrion"""
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0005198""","""structural mol…"
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0065003""","""protein-contai…"
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0042254""","""ribosome bioge…"
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0043226""","""organelle"""
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0005739""","""mitochondrion"""
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0005840""","""ribosome"""


In [5]:
# Keep one row per GOSlim accession, then keep only the accession column,
# so each term is sent to get_domain a single time.
unique_goslim_terms_mouse = (
    ensembl_goslim_mouse_unique
    .unique(subset=["GOSlim GOA Accession(s)"])
    .select("GOSlim GOA Accession(s)")
)
display(unique_goslim_terms_mouse)

GOSlim GOA Accession(s)
str
"""GO:0005783"""
"""GO:0006914"""
"""GO:0065003"""
"""GO:0007031"""
"""GO:0120274"""
"""GO:0005198"""
"""GO:0042393"""
"""GO:0098631"""
"""GO:0034330"""
"""GO:0006281"""


In [6]:
# Polars counterpart of the pandas idiom
#   mouse_goslim_df["GOSlim_domain"] = mouse_goslim_df["GOSlim GOA Accession(s)"].apply(get_domain)
# Reference: https://docs.pola.rs/py-polars/html/reference/series/api/polars.Series.map_elements.html
mouse_goslim_df = unique_goslim_terms_mouse.with_columns(
    unique_goslim_terms_mouse["GOSlim GOA Accession(s)"]
    .map_elements(get_domain, return_dtype=str)
    .alias("GOSlim_domain")
)
# Pause for 10 seconds once the lookups for this species have finished.
time.sleep(10)

display(mouse_goslim_df)

GOSlim GOA Accession(s),GOSlim_domain
str,str
"""GO:0005783""","""cellular_compo…"
"""GO:0006914""","""biological_pro…"
"""GO:0065003""","""biological_pro…"
"""GO:0007031""","""biological_pro…"
"""GO:0120274""","""molecular_func…"
"""GO:0005198""","""molecular_func…"
"""GO:0042393""","""molecular_func…"
"""GO:0098631""","""molecular_func…"
"""GO:0034330""","""biological_pro…"
"""GO:0006281""","""biological_pro…"


In [7]:
# Polars counterpart of
#   pd.merge(ensembl_goslim_mouse_unique, mouse_goslim_df, how='left', on='GOSlim GOA Accession(s)')
# Left join: every gene/term row is kept and gains a GOSlim_domain column.
mouse_merged_goslim_domain = ensembl_goslim_mouse_unique.join(
    mouse_goslim_df,
    how="left",
    on="GOSlim GOA Accession(s)",
)
display(mouse_merged_goslim_domain)

Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
str,str,str,str,str,str,str
"""ENSMUSG0000006…","""MT""","""1""","""68""","""GO:0060090""","""molecular adap…","""molecular_func…"
"""ENSMUSG0000006…","""MT""","""1""","""68""","""GO:0003723""","""RNA binding""","""molecular_func…"
"""ENSMUSG0000006…","""MT""","""1""","""68""","""GO:0043226""","""organelle""","""cellular_compo…"
"""ENSMUSG0000006…","""MT""","""1""","""68""","""GO:0005739""","""mitochondrion""","""cellular_compo…"
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0005198""","""structural mol…","""molecular_func…"
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0065003""","""protein-contai…","""biological_pro…"
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0042254""","""ribosome bioge…","""biological_pro…"
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0043226""","""organelle""","""cellular_compo…"
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0005739""","""mitochondrion""","""cellular_compo…"
"""ENSMUSG0000006…","""MT""","""70""","""1024""","""GO:0005840""","""ribosome""","""cellular_compo…"


## HUMAN

In [8]:
# Load the human GOSlim export from Ensembl BioMart (Ensembl release 110).
# infer_schema_length=0 is passed so that no column types are inferred from the data.
ensembl_goslim_human = pl.read_csv(
    "../data/biomart_goslim/biomart_human_goslim_R110.tsv",
    separator="\t",
    infer_schema_length=0,
)

# Remove duplicated rows (compared across all columns), keeping the first
# occurrence and the original row order.
ensembl_goslim_human_unique = ensembl_goslim_human.unique(
    maintain_order=True,
    keep="first",
)
display(ensembl_goslim_human_unique)

Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),GOSlim GOA Accession(s),GOSlim GOA Description
str,str,str,str,str,str
"""ENSG0000024348…","""1""","""29554""","""31109""","""GO:0031047""","""gene silencing…"
"""ENSG0000028433…","""1""","""30366""","""30503""","""GO:0031047""","""gene silencing…"
"""ENSG0000018609…","""1""","""65419""","""71585""","""GO:0023052""","""signaling"""
"""ENSG0000018609…","""1""","""65419""","""71585""","""GO:0060089""","""molecular tran…"
"""ENSG0000018609…","""1""","""65419""","""71585""","""GO:0005886""","""plasma membran…"
"""ENSG0000018609…","""1""","""65419""","""71585""","""GO:0050877""","""nervous system…"
"""ENSG0000022262…","""1""","""157784""","""157887""","""GO:0003723""","""RNA binding"""
"""ENSG0000022262…","""1""","""157784""","""157887""","""GO:0065003""","""protein-contai…"
"""ENSG0000022262…","""1""","""157784""","""157887""","""GO:0016071""","""mRNA metabolic…"
"""ENSG0000022262…","""1""","""157784""","""157887""","""GO:0043226""","""organelle"""


In [9]:
# Keep one row per GOSlim accession, then keep only the accession column,
# so each term is sent to get_domain a single time.
unique_goslim_terms_human = (
    ensembl_goslim_human_unique
    .unique(subset=["GOSlim GOA Accession(s)"])
    .select("GOSlim GOA Accession(s)")
)
display(unique_goslim_terms_human)

GOSlim GOA Accession(s)
str
"""GO:0003013"""
"""GO:0140313"""
"""GO:0005615"""
"""GO:0055086"""
"""GO:0043226"""
"""GO:0016874"""
"""GO:0006520"""
"""GO:0012501"""
"""GO:0007010"""
"""GO:0045182"""


In [10]:
# Polars counterpart of the pandas idiom
#   human_goslim_df["GOSlim_domain"] = human_goslim_df["GOSlim GOA Accession(s)"].apply(get_domain)
human_goslim_df = unique_goslim_terms_human.with_columns(
    unique_goslim_terms_human["GOSlim GOA Accession(s)"]
    .map_elements(get_domain, return_dtype=str)
    .alias("GOSlim_domain")
)
# Pause for 10 seconds once the lookups for this species have finished.
time.sleep(10)
display(human_goslim_df)

GOSlim GOA Accession(s),GOSlim_domain
str,str
"""GO:0003013""","""biological_pro…"
"""GO:0140313""","""molecular_func…"
"""GO:0005615""","""cellular_compo…"
"""GO:0055086""","""biological_pro…"
"""GO:0043226""","""cellular_compo…"
"""GO:0016874""","""molecular_func…"
"""GO:0006520""","""biological_pro…"
"""GO:0012501""","""biological_pro…"
"""GO:0007010""","""biological_pro…"
"""GO:0045182""","""molecular_func…"


In [12]:
# Polars counterpart of
#   pd.merge(ensembl_goslim_human_unique, human_goslim_df, how="left", on="GOSlim GOA Accession(s)")
# Left join: every gene/term row is kept and gains a GOSlim_domain column.
human_merged_goslim_domain = ensembl_goslim_human_unique.join(
    human_goslim_df,
    how="left",
    on="GOSlim GOA Accession(s)",
)
display(human_merged_goslim_domain)

Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
str,str,str,str,str,str,str
"""ENSG0000024348…","""1""","""29554""","""31109""","""GO:0031047""","""gene silencing…","""biological_pro…"
"""ENSG0000028433…","""1""","""30366""","""30503""","""GO:0031047""","""gene silencing…","""biological_pro…"
"""ENSG0000018609…","""1""","""65419""","""71585""","""GO:0023052""","""signaling""","""biological_pro…"
"""ENSG0000018609…","""1""","""65419""","""71585""","""GO:0060089""","""molecular tran…","""molecular_func…"
"""ENSG0000018609…","""1""","""65419""","""71585""","""GO:0005886""","""plasma membran…","""cellular_compo…"
"""ENSG0000018609…","""1""","""65419""","""71585""","""GO:0050877""","""nervous system…","""biological_pro…"
"""ENSG0000022262…","""1""","""157784""","""157887""","""GO:0003723""","""RNA binding""","""molecular_func…"
"""ENSG0000022262…","""1""","""157784""","""157887""","""GO:0065003""","""protein-contai…","""biological_pro…"
"""ENSG0000022262…","""1""","""157784""","""157887""","""GO:0016071""","""mRNA metabolic…","""biological_pro…"
"""ENSG0000022262…","""1""","""157784""","""157887""","""GO:0043226""","""organelle""","""cellular_compo…"


## RICE

In [14]:
# Load the rice GOSlim export from Ensembl BioMart (Ensembl release 56).
# infer_schema_length=0 is passed so that no column types are inferred from the data.
ensembl_goslim_rice = pl.read_csv(
    "../data/biomart_goslim/biomart_rice_goslim_R56.tsv",
    separator="\t",
    infer_schema_length=0,
)

# Remove duplicated rows (compared across all columns), keeping the first
# occurrence and the original row order.
ensembl_goslim_rice_unique = ensembl_goslim_rice.unique(
    maintain_order=True,
    keep="first",
)
display(ensembl_goslim_rice_unique)

Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),GOSlim GOA Accession(s),GOSlim GOA Description
str,str,str,str,str,str
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0006810""","""transport"""
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0008150""","""biological_pro…"
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0009987""","""cellular proce…"
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0003674""","""molecular_func…"
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0030234""","""enzyme regulat…"
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0065009""","""regulation of …"
"""Os01g0100300""","""1""","""11372""","""12284""","""GO:0003674""","""molecular_func…"
"""Os01g0100300""","""1""","""11372""","""12284""","""GO:0003824""","""catalytic acti…"
"""Os01g0100300""","""1""","""11372""","""12284""","""GO:0005488""","""binding"""
"""Os01g0100400""","""1""","""12721""","""15685""","""GO:0003674""","""molecular_func…"


In [15]:
# Keep one row per GOSlim accession, then keep only the accession column,
# so each term is sent to get_domain a single time.
unique_goslim_terms_rice = (
    ensembl_goslim_rice_unique
    .unique(subset=["GOSlim GOA Accession(s)"])
    .select("GOSlim GOA Accession(s)")
)
display(unique_goslim_terms_rice)

GOSlim GOA Accession(s)
str
"""GO:0008135"""
"""GO:0000003"""
"""GO:0003700"""
"""GO:0005575"""
"""GO:0040007"""
"""GO:0005737"""
"""GO:0045182"""
"""GO:0005739"""
"""GO:0009628"""
"""GO:0009606"""


In [16]:
# Call get_domain on every distinct rice accession and store the result
# in a new "GOSlim_domain" string column.
rice_goslim_df = unique_goslim_terms_rice.with_columns(
    unique_goslim_terms_rice["GOSlim GOA Accession(s)"]
    .map_elements(get_domain, return_dtype=str)
    .alias("GOSlim_domain")
)
# Pause for 10 seconds once the lookups for this species have finished.
time.sleep(10)

display(rice_goslim_df)

GOSlim GOA Accession(s),GOSlim_domain
str,str
"""GO:0008135""","""molecular_func…"
"""GO:0000003""","""biological_pro…"
"""GO:0003700""","""molecular_func…"
"""GO:0005575""","""cellular_compo…"
"""GO:0040007""","""biological_pro…"
"""GO:0005737""","""cellular_compo…"
"""GO:0045182""","""molecular_func…"
"""GO:0005739""","""cellular_compo…"
"""GO:0009628""","""biological_pro…"
"""GO:0009606""","""biological_pro…"


In [17]:
# Left join: every gene/term row is kept and gains a GOSlim_domain column.
rice_merged_goslim_domain = ensembl_goslim_rice_unique.join(
    rice_goslim_df,
    how="left",
    on="GOSlim GOA Accession(s)",
)

display(rice_merged_goslim_domain)

Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
str,str,str,str,str,str,str
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0006810""","""transport""","""biological_pro…"
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0008150""","""biological_pro…","""biological_pro…"
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0009987""","""cellular proce…","""biological_pro…"
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0003674""","""molecular_func…","""molecular_func…"
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0030234""","""enzyme regulat…","""molecular_func…"
"""Os01g0100100""","""1""","""2983""","""10815""","""GO:0065009""","""regulation of …","""biological_pro…"
"""Os01g0100300""","""1""","""11372""","""12284""","""GO:0003674""","""molecular_func…","""molecular_func…"
"""Os01g0100300""","""1""","""11372""","""12284""","""GO:0003824""","""catalytic acti…","""molecular_func…"
"""Os01g0100300""","""1""","""11372""","""12284""","""GO:0005488""","""binding""","""molecular_func…"
"""Os01g0100400""","""1""","""12721""","""15685""","""GO:0003674""","""molecular_func…","""molecular_func…"


## Save the results

In [19]:
# Write each species' annotated table next to its input file as a tab-separated file.
mouse_merged_goslim_domain.write_csv(
    "../data/biomart_goslim/biomart_mouse_goslim_R110_domain.tsv",
    separator="\t",
)
human_merged_goslim_domain.write_csv(
    "../data/biomart_goslim/biomart_human_goslim_R110_domain.tsv",
    separator="\t",
)
rice_merged_goslim_domain.write_csv(
    "../data/biomart_goslim/biomart_rice_goslim_R56_domain.tsv",
    separator="\t",
)