## Combine all human and rice genes by their shared GOSlim terms (version 0.5)

In [1]:
from pathlib import Path

#import pandas as pd
#import dask.dataframe as dd
import polars as pl

# Species-pair code; it is interpolated into the output directory name
# (../data/GOSlim_merge_<code>/) used by the cells that write TSV files.
organism1_vs_organism2 = 'HR' # human vs rice

In [2]:
# Both BioMart exports are tab-separated. infer_schema_length=0 makes polars
# skip type inference, so every column is loaded as a string.
# NOTE(review): "R110" / "R56" in the file names look like Ensembl release
# numbers -- confirm against the data provenance.
tsv_read_options = dict(separator="\t", infer_schema_length=0, low_memory=False)

goslim_all_human = pl.read_csv(
    "../data/biomart_goslim/biomart_human_goslim_R110_domain.tsv",
    **tsv_read_options,
)
goslim_all_rice = pl.read_csv(
    "../data/biomart_goslim/biomart_rice_goslim_R56_domain.tsv",
    **tsv_read_options,
)

# Keep only the gene identifier and the three GOSlim annotation columns.
columns_of_interest = [
    "Gene stable ID",
    "GOSlim GOA Accession(s)",
    "GOSlim GOA Description",
    "GOSlim_domain",
]
goslim_all_human_filtered = goslim_all_human.select(columns_of_interest)
goslim_all_rice_filtered = goslim_all_rice.select(columns_of_interest)

display(goslim_all_human_filtered, goslim_all_rice_filtered)

Gene stable ID,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
str,str,str,str
"""ENSG0000024348…","""GO:0031047""","""gene silencing…","""biological_pro…"
"""ENSG0000028433…","""GO:0031047""","""gene silencing…","""biological_pro…"
"""ENSG0000018609…","""GO:0023052""","""signaling""","""biological_pro…"
"""ENSG0000018609…","""GO:0060089""","""molecular tran…","""molecular_func…"
"""ENSG0000018609…","""GO:0005886""","""plasma membran…","""cellular_compo…"
"""ENSG0000018609…","""GO:0050877""","""nervous system…","""biological_pro…"
"""ENSG0000022262…","""GO:0003723""","""RNA binding""","""molecular_func…"
"""ENSG0000022262…","""GO:0065003""","""protein-contai…","""biological_pro…"
"""ENSG0000022262…","""GO:0016071""","""mRNA metabolic…","""biological_pro…"
"""ENSG0000022262…","""GO:0043226""","""organelle""","""cellular_compo…"


Gene stable ID,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
str,str,str,str
"""Os01g0100100""","""GO:0006810""","""transport""","""biological_pro…"
"""Os01g0100100""","""GO:0008150""","""biological_pro…","""biological_pro…"
"""Os01g0100100""","""GO:0009987""","""cellular proce…","""biological_pro…"
"""Os01g0100100""","""GO:0003674""","""molecular_func…","""molecular_func…"
"""Os01g0100100""","""GO:0030234""","""enzyme regulat…","""molecular_func…"
"""Os01g0100100""","""GO:0065009""","""regulation of …","""biological_pro…"
"""Os01g0100300""","""GO:0003674""","""molecular_func…","""molecular_func…"
"""Os01g0100300""","""GO:0003824""","""catalytic acti…","""molecular_func…"
"""Os01g0100300""","""GO:0005488""","""binding""","""molecular_func…"
"""Os01g0100400""","""GO:0003674""","""molecular_func…","""molecular_func…"


## Merge the human and rice dataframes on their common GOSlim columns

In [3]:
# Pair every human gene with every rice gene that carries the same GOSlim
# annotation (same accession, description and domain).
goslim_keys = ["GOSlim GOA Accession(s)", "GOSlim GOA Description", "GOSlim_domain"]

merged_df_goslim = (
    goslim_all_human_filtered
    # The right-hand "Gene stable ID" clashes with the left one and therefore
    # receives the "_rice" suffix.
    .join(goslim_all_rice_filtered, on=goslim_keys, how="inner", suffix="_rice")
    .rename({"Gene stable ID": "Gene stable ID_human"})
    # Put the GOSlim columns first, followed by the two gene identifiers.
    .select(goslim_keys + ["Gene stable ID_human", "Gene stable ID_rice"])
)

display(merged_df_goslim)

GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
str,str,str,str,str
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000023733…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016257…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000017602…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016008…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016997…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000012705…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000021591…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016007…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000019778…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016007…","""Os01g0100300"""


### Count the number of human–rice gene pairs per GOSlim term

In [4]:
# One row per GOSlim term; "count" is the number of merged rows
# (human gene x rice gene pairs) that carry that term.
term_columns = ["GOSlim GOA Accession(s)", "GOSlim GOA Description"]
counted_df = merged_df_goslim.group_by(term_columns).count()

# Largest groups first.
sorted_df = counted_df.sort(by="count", descending=True)
display(sorted_df)

GOSlim GOA Accession(s),GOSlim GOA Description,count
str,str,u32
"""GO:0003824""","""catalytic acti…",62834292
"""GO:0005634""","""nucleus""",50586355
"""GO:0005886""","""plasma membran…",14121523
"""GO:0036211""","""protein modifi…",11265607
"""GO:0016740""","""transferase ac…",10856023
"""GO:0016787""","""hydrolase acti…",8674587
"""GO:0005829""","""cytosol""",7811118
"""GO:0048856""","""anatomical str…",7216908
"""GO:0003677""","""DNA binding""",5819761
"""GO:0003723""","""RNA binding""",4482192


### Debugging: sanity-check the merge and write the merged table

In [5]:
# Sanity checks on the merge, followed by writing the merged table to disk.

# Unique GOSlim accessions present in each input table.
unique_accessions_h = set(goslim_all_human_filtered['GOSlim GOA Accession(s)'].unique())
unique_accessions_r = set(goslim_all_rice_filtered['GOSlim GOA Accession(s)'].unique())

# Accessions shared by both species, and those shared accessions that did not
# survive the join. The join also keys on description and domain, so a shared
# accession can still be missing from the merged table.
common_accessions = unique_accessions_r.intersection(unique_accessions_h)
unique_accessions_merged = set(merged_df_goslim["GOSlim GOA Accession(s)"].unique())
missing_accessions = common_accessions.difference(unique_accessions_merged)

# is_duplicated() compares whole rows (all five columns), so this flags
# repeated (GOSlim term, human gene, rice gene) rows -- it does not test the
# gene-ID pair on its own.
duplicates = merged_df_goslim.is_duplicated()  # pandas equivalent: DataFrame.duplicated()
unique_combinations = not duplicates.any()  # True when no row occurs more than once

# Report
print(f"Number of unique accessions in human: {len(unique_accessions_h)}")
print(f"Number of unique accessions in rice: {len(unique_accessions_r)}")
print(f"Number of common accessions: {len(common_accessions)}")

if missing_accessions:
    print(f"Number of missing accessions: {len(missing_accessions)}")
    print(f"Missing accessions: {missing_accessions}")
else:
    print("Number of missing accessions: 0")

# The label now states what is actually checked (full-row uniqueness).
print(f"Are all rows (GOSlim term, 'Gene stable ID_human', 'Gene stable ID_rice') unique?: {unique_combinations}")

# Write output; create the species-pair directory first so that the cell also
# works on a fresh checkout where it does not exist yet.
output_dir = Path(f"../data/GOSlim_merge_{organism1_vs_organism2}")
output_dir.mkdir(parents=True, exist_ok=True)
merged_df_goslim.write_csv(output_dir / "GOslim_merge_common_goslim_correspondence_all.tsv",
                           separator="\t")

Number of unique accessions in human: 137
Number of unique accessions in rice: 97
Number of common accessions: 34
Number of missing accessions: 0
Are the combinations of 'Gene stable ID_rice' and 'Gene stable ID_human' unique?: True


### Extract one dataset per GOSlim domain (BP, CC, MF)

In [11]:
def extract_goslim_domain(correspondence: pl.DataFrame, domain: str) -> pl.DataFrame:
    """Return the rows of `correspondence` whose GOSlim_domain contains `domain`.

    `str.contains` performs a substring (regex) match on the column, so
    `domain` is matched anywhere inside the GOSlim_domain value.
    """
    return correspondence.filter(pl.col("GOSlim_domain").str.contains(domain))


# One table per GO domain.
cellular_component = extract_goslim_domain(merged_df_goslim, "cellular_component")
molecular_function = extract_goslim_domain(merged_df_goslim, "molecular_function")
biological_process = extract_goslim_domain(merged_df_goslim, "biological_process")

# Write output; create the species-pair directory first so that the cell also
# works on a fresh checkout where it does not exist yet.
output_dir = Path(f"../data/GOSlim_merge_{organism1_vs_organism2}")
output_dir.mkdir(parents=True, exist_ok=True)

# File-name suffix -> table; the file names match the ones used previously.
domain_tables = {
    "cc": cellular_component,
    "mf": molecular_function,
    "bp": biological_process,
}
for suffix, domain_df in domain_tables.items():
    domain_df.write_csv(output_dir / f"GOslim_merge_common_goslim_correspondence_{suffix}.tsv",
                        separator="\t")

display(cellular_component, molecular_function, biological_process)

GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
str,str,str,str,str
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000018609…","""Os01g0100400"""
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000028473…","""Os01g0100400"""
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000028466…","""Os01g0100400"""
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000018758…","""Os01g0100400"""
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000018815…","""Os01g0100400"""
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000018689…","""Os01g0100400"""
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000018682…","""Os01g0100400"""
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000016257…","""Os01g0100400"""
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000022405…","""Os01g0100400"""
"""GO:0005886""","""plasma membran…","""cellular_compo…","""ENSG0000016996…","""Os01g0100400"""


GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
str,str,str,str,str
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000023733…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016257…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000017602…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016008…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016997…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000012705…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000021591…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016007…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000019778…","""Os01g0100300"""
"""GO:0003824""","""catalytic acti…","""molecular_func…","""ENSG0000016007…","""Os01g0100300"""


GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
str,str,str,str,str
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000022405…","""Os01g0100900"""
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000014952…","""Os01g0100900"""
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000015787…","""Os01g0100900"""
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000009702…","""Os01g0100900"""
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000004923…","""Os01g0100900"""
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000017160…","""Os01g0100900"""
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000019879…","""Os01g0100900"""
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000020462…","""Os01g0100900"""
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000016249…","""Os01g0100900"""
"""GO:0006629""","""lipid metaboli…","""biological_pro…","""ENSG0000012748…","""Os01g0100900"""
