In [1]:
import pandas as pd
import requests
import time
import os

In [2]:
def get_domain(accession):
    requestURL = f"https://www.ebi.ac.uk/QuickGO/services/ontology/go/search?query={accession}&limit=1&page=1"
    try:
        r = requests.get(requestURL, headers={"Accept": "application/json"}, timeout=10)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
        return None
    data = r.json()
    return data['results'][0]['aspect'] if data['results'] else None

### HUMAN

In [2]:
ensembl_goslim_human = pd.read_csv("../data/biomart_goslim/biomart_human_goslim_R110.tsv", sep="\t", low_memory=False)
ensembl_goslim_human_unique = ensembl_goslim_human.drop_duplicates(subset=['Gene stable ID', 'GOSlim GOA Accession(s)', 'GOSlim GOA Description']) # debug
display(ensembl_goslim_human_unique)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),GOSlim GOA Accession(s),GOSlim GOA Description
0,ENSG00000243485,1,29554,31109,GO:0031047,gene silencing by RNA
1,ENSG00000284332,1,30366,30503,GO:0031047,gene silencing by RNA
2,ENSG00000186092,1,65419,71585,GO:0023052,signaling
3,ENSG00000186092,1,65419,71585,GO:0060089,molecular transducer activity
4,ENSG00000186092,1,65419,71585,GO:0005886,plasma membrane
...,...,...,...,...,...,...
581396,ENSG00000292372,Y,57207346,57212230,GO:0007010,cytoskeleton organization
581397,ENSG00000292372,Y,57207346,57212230,GO:0008092,cytoskeletal protein binding
581398,ENSG00000292372,Y,57207346,57212230,GO:0031410,cytoplasmic vesicle
581399,ENSG00000292372,Y,57207346,57212230,GO:0043226,organelle


In [10]:
unique_goslim_terms_human = ensembl_goslim_human_unique["GOSlim GOA Accession(s)"].unique()
human_goslim_df = pd.DataFrame(unique_goslim_terms_human,columns=["GOSlim GOA Accession(s)"])
display(human_goslim_df)

Unnamed: 0,GOSlim GOA Accession(s)
0,GO:0031047
1,GO:0023052
2,GO:0060089
3,GO:0005886
4,GO:0050877
...,...
132,GO:0009975
133,GO:0071941
134,GO:0071554
135,GO:0007568


In [11]:
human_goslim_df["GOSlim_domain"] = human_goslim_df["GOSlim GOA Accession(s)"].apply(get_domain)
time.sleep(5)

display(human_goslim_df)

Unnamed: 0,GOSlim GOA Accession(s),GOSlim_domain
0,GO:0031047,biological_process
1,GO:0023052,biological_process
2,GO:0060089,molecular_function
3,GO:0005886,cellular_component
4,GO:0050877,biological_process
...,...,...
132,GO:0009975,molecular_function
133,GO:0071941,biological_process
134,GO:0071554,biological_process
135,GO:0007568,biological_process


In [14]:
human_merged_goslim_domain = pd.merge(
    ensembl_goslim_human_unique,
    human_goslim_df,
    how="left",
    on="GOSlim GOA Accession(s)"
)

display(human_merged_goslim_domain)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,ENSG00000243485,1,29554,31109,GO:0031047,gene silencing by RNA,biological_process
1,ENSG00000284332,1,30366,30503,GO:0031047,gene silencing by RNA,biological_process
2,ENSG00000186092,1,65419,71585,GO:0023052,signaling,biological_process
3,ENSG00000186092,1,65419,71585,GO:0060089,molecular transducer activity,molecular_function
4,ENSG00000186092,1,65419,71585,GO:0005886,plasma membrane,cellular_component
...,...,...,...,...,...,...,...
193923,ENSG00000292372,Y,57207346,57212230,GO:0007010,cytoskeleton organization,biological_process
193924,ENSG00000292372,Y,57207346,57212230,GO:0008092,cytoskeletal protein binding,molecular_function
193925,ENSG00000292372,Y,57207346,57212230,GO:0031410,cytoplasmic vesicle,cellular_component
193926,ENSG00000292372,Y,57207346,57212230,GO:0043226,organelle,cellular_component


### RICE

In [15]:
ensembl_goslim_rice = pd.read_csv("../data/biomart_goslim/biomart_rice_goslim_R56.tsv", sep="\t")
ensembl_goslim_rice_unique = ensembl_goslim_rice.drop_duplicates(subset=['Gene stable ID', 'GOSlim GOA Accession(s)', 'GOSlim GOA Description']) # debug
display(ensembl_goslim_rice_unique)

  ensembl_goslim_rice = pd.read_csv("../data/biomart_goslim/biomart_rice_goslim_R56.tsv", sep="\t")


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),GOSlim GOA Accession(s),GOSlim GOA Description
0,Os01g0100100,1,2983,10815,GO:0006810,transport
1,Os01g0100100,1,2983,10815,GO:0008150,biological_process
2,Os01g0100100,1,2983,10815,GO:0009987,cellular process
3,Os01g0100100,1,2983,10815,GO:0003674,molecular_function
4,Os01g0100100,1,2983,10815,GO:0030234,enzyme regulator activity
...,...,...,...,...,...,...
258565,gene-rps19,Pt,134200,134481,GO:0003674,molecular_function
258566,gene-rps19,Pt,134200,134481,GO:0005198,structural molecule activity
258567,gene-rps19,Pt,134200,134481,GO:0005488,binding
258568,gene-rps19,Pt,134200,134481,GO:0003723,RNA binding


In [16]:
unique_goslim_terms_rice = ensembl_goslim_rice_unique["GOSlim GOA Accession(s)"].unique()
rice_goslim_df = pd.DataFrame(unique_goslim_terms_rice,columns=["GOSlim GOA Accession(s)"])
display(rice_goslim_df)

Unnamed: 0,GOSlim GOA Accession(s)
0,GO:0006810
1,GO:0008150
2,GO:0009987
3,GO:0003674
4,GO:0030234
...,...
92,GO:0015979
93,GO:0007267
94,GO:0019825
95,GO:0009835


In [17]:
rice_goslim_df["GOSlim_domain"] = rice_goslim_df["GOSlim GOA Accession(s)"].apply(get_domain)
time.sleep(5)

display(rice_goslim_df)

Unnamed: 0,GOSlim GOA Accession(s),GOSlim_domain
0,GO:0006810,biological_process
1,GO:0008150,biological_process
2,GO:0009987,biological_process
3,GO:0003674,molecular_function
4,GO:0030234,molecular_function
...,...,...
92,GO:0015979,biological_process
93,GO:0007267,biological_process
94,GO:0019825,molecular_function
95,GO:0009835,biological_process


In [19]:
rice_merged_goslim_domain = pd.merge(
    ensembl_goslim_rice_unique,
    rice_goslim_df,
    how="left",
    on="GOSlim GOA Accession(s)"
)

display(rice_merged_goslim_domain)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,Os01g0100100,1,2983,10815,GO:0006810,transport,biological_process
1,Os01g0100100,1,2983,10815,GO:0008150,biological_process,biological_process
2,Os01g0100100,1,2983,10815,GO:0009987,cellular process,biological_process
3,Os01g0100100,1,2983,10815,GO:0003674,molecular_function,molecular_function
4,Os01g0100100,1,2983,10815,GO:0030234,enzyme regulator activity,molecular_function
...,...,...,...,...,...,...,...
219494,gene-rps19,Pt,134200,134481,GO:0003674,molecular_function,molecular_function
219495,gene-rps19,Pt,134200,134481,GO:0005198,structural molecule activity,molecular_function
219496,gene-rps19,Pt,134200,134481,GO:0005488,binding,molecular_function
219497,gene-rps19,Pt,134200,134481,GO:0003723,RNA binding,molecular_function


### Save the results

In [None]:
human_merged_goslim_domain.to_csv("../data/biomart_goslim/biomart_human_goslim_R110_domain.tsv", sep="\t", index=False)
rice_merged_goslim_domain.to_csv("../data/biomart_goslim/biomart_rice_goslim_R56_domain.tsv", sep="\t", index=False)