In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
import dask.dataframe as dd

In [2]:
goslim_all_human = dd.read_csv('../data/biomart_goslim/biomart_human_goslim_R110_domain.tsv', sep='\t',
                               dtype={'Gene stable ID': 'object'}, 
                               low_memory=False)
goslim_all_rice = dd.read_csv('../data/biomart_goslim/biomart_rice_goslim_R56_domain.tsv', sep='\t',
                              dtype={'Gene stable ID': 'object'},
                              low_memory=False)
columns_of_interest = ['Gene stable ID', 'GOSlim GOA Accession(s)', 'GOSlim GOA Description', 'GOSlim_domain']

goslim_all_human_filtered = goslim_all_human[columns_of_interest]
goslim_all_rice_filtered = goslim_all_rice[columns_of_interest]

display(goslim_all_human_filtered.compute())
display(goslim_all_rice_filtered.compute())

Unnamed: 0,Gene stable ID,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,ENSG00000243485,GO:0031047,gene silencing by RNA,biological_process
1,ENSG00000284332,GO:0031047,gene silencing by RNA,biological_process
2,ENSG00000186092,GO:0023052,signaling,biological_process
3,ENSG00000186092,GO:0060089,molecular transducer activity,molecular_function
4,ENSG00000186092,GO:0005886,plasma membrane,cellular_component
...,...,...,...,...
193923,ENSG00000292372,GO:0007010,cytoskeleton organization,biological_process
193924,ENSG00000292372,GO:0008092,cytoskeletal protein binding,molecular_function
193925,ENSG00000292372,GO:0031410,cytoplasmic vesicle,cellular_component
193926,ENSG00000292372,GO:0043226,organelle,cellular_component


Unnamed: 0,Gene stable ID,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,Os01g0100100,GO:0006810,transport,biological_process
1,Os01g0100100,GO:0008150,biological_process,biological_process
2,Os01g0100100,GO:0009987,cellular process,biological_process
3,Os01g0100100,GO:0003674,molecular_function,molecular_function
4,Os01g0100100,GO:0030234,enzyme regulator activity,molecular_function
...,...,...,...,...
219494,gene-rps19,GO:0003674,molecular_function,molecular_function
219495,gene-rps19,GO:0005198,structural molecule activity,molecular_function
219496,gene-rps19,GO:0005488,binding,molecular_function
219497,gene-rps19,GO:0003723,RNA binding,molecular_function


In [3]:
# merge dataframe based on common column 'GOSlim GOA Accession(s)'
merged_df_goslim = dd.merge(
    goslim_all_human_filtered, 
    goslim_all_rice_filtered,
    left_on=['GOSlim GOA Accession(s)', 'GOSlim GOA Description', 'GOSlim_domain'],
    right_on=['GOSlim GOA Accession(s)','GOSlim GOA Description', 'GOSlim_domain'],
    how='inner',
    suffixes=('_rice', '_human')
)

# extract necessary columns
merged_df_goslim = merged_df_goslim[['GOSlim GOA Accession(s)',
                                     'GOSlim GOA Description',
                                     'GOSlim_domain',
                                     'Gene stable ID_human',
                                     'Gene stable ID_rice'
                                     ]]

merged_df_goslim_pd = merged_df_goslim.compute()

# Remove duplicate rows based on specific columns
merged_df_goslim_pd.drop_duplicates(inplace=True)

# for debugging purpose
unique_accessions_r = set(goslim_all_human_filtered['GOSlim GOA Accession(s)'].compute().unique())
unique_accessions_h = set(goslim_all_rice_filtered['GOSlim GOA Accession(s)'].compute().unique())
common_accessions = unique_accessions_r.intersection(unique_accessions_h)
unique_accessions_merged = set(merged_df_goslim_pd['GOSlim GOA Accession(s)'].unique())
missing_accessions = common_accessions.difference(unique_accessions_merged)
duplicates = merged_df_goslim_pd.duplicated(subset=['GOSlim GOA Accession(s)','Gene stable ID_rice', 'Gene stable ID_human'], keep=False)
unique_combinations = not duplicates.any()
print(f"Number of common accessions: {len(common_accessions)}")
print(f"Number of missing accessions: {len(missing_accessions)}")
print(f"Missing accessions: {missing_accessions}")
print(f"各'Gene stable ID_rice'と'Gene stable ID_human'の組み合わせはユニークですか？: {unique_combinations}")

merged_df_goslim_pd.to_csv("../data/GOslim_merge/GOslim_merge_common_goslim_correspondence_all_up.tsv", sep='\t', index=False)

display(merged_df_goslim_pd)

Number of common accessions: 34
Number of missing accessions: 0
Missing accessions: set()
各'Gene stable ID_rice'と'Gene stable ID_human'の組み合わせはユニークですか？: True


Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
0,GO:0005886,plasma membrane,cellular_component,Os01g0100400,ENSG00000186092
1,GO:0005886,plasma membrane,cellular_component,Os01g0103600,ENSG00000186092
2,GO:0005886,plasma membrane,cellular_component,Os01g0104000,ENSG00000186092
3,GO:0005886,plasma membrane,cellular_component,Os01g0108000,ENSG00000186092
4,GO:0005886,plasma membrane,cellular_component,Os01g0110100,ENSG00000186092
...,...,...,...,...,...
205962886,GO:0045182,translation regulator activity,molecular_function,Os12g0507200,ENSG00000205916
205962887,GO:0045182,translation regulator activity,molecular_function,Os12g0541500,ENSG00000205916
205962888,GO:0045182,translation regulator activity,molecular_function,Os12g0607100,ENSG00000205916
205962889,GO:0045182,translation regulator activity,molecular_function,Os12g0617100,ENSG00000205916


### Extract dataset based on GOslim domain (BP, CC, MF)

In [4]:
cellular_component = merged_df_goslim_pd[merged_df_goslim_pd['GOSlim_domain'] == 'cellular_component']
molecular_function = merged_df_goslim_pd[merged_df_goslim_pd['GOSlim_domain'] == 'molecular_function']
biological_process = merged_df_goslim_pd[merged_df_goslim_pd['GOSlim_domain'] == 'biological_process']

cellular_component.to_csv("../data/GOslim_merge/GOslim_merge_common_goslim_correspondence_cc_up.tsv", sep='\t', index=False)
molecular_function.to_csv("../data/GOslim_merge/GOslim_merge_common_goslim_correspondence_mf_up.tsv", sep='\t', index=False)
biological_process.to_csv("../data/GOslim_merge/GOslim_merge_common_goslim_correspondence_bp_up.tsv", sep='\t', index=False)

display(cellular_component)
display(molecular_function)
display(biological_process)

Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
0,GO:0005886,plasma membrane,cellular_component,Os01g0100400,ENSG00000186092
1,GO:0005886,plasma membrane,cellular_component,Os01g0103600,ENSG00000186092
2,GO:0005886,plasma membrane,cellular_component,Os01g0104000,ENSG00000186092
3,GO:0005886,plasma membrane,cellular_component,Os01g0108000,ENSG00000186092
4,GO:0005886,plasma membrane,cellular_component,Os01g0110100,ENSG00000186092
...,...,...,...,...,...
205657875,GO:0005777,peroxisome,cellular_component,Os12g0162900,ENSG00000101986
205657876,GO:0005777,peroxisome,cellular_component,Os12g0178700,ENSG00000101986
205657877,GO:0005777,peroxisome,cellular_component,Os12g0233300,ENSG00000101986
205657878,GO:0005777,peroxisome,cellular_component,Os12g0257000,ENSG00000101986


Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
14121523,GO:0003723,RNA binding,molecular_function,Os01g0100700,ENSG00000222623
14121524,GO:0003723,RNA binding,molecular_function,Os01g0101150,ENSG00000222623
14121525,GO:0003723,RNA binding,molecular_function,Os01g0101300,ENSG00000222623
14121526,GO:0003723,RNA binding,molecular_function,Os01g0101600,ENSG00000222623
14121527,GO:0003723,RNA binding,molecular_function,Os01g0118100,ENSG00000222623
...,...,...,...,...,...
205962886,GO:0045182,translation regulator activity,molecular_function,Os12g0507200,ENSG00000205916
205962887,GO:0045182,translation regulator activity,molecular_function,Os12g0541500,ENSG00000205916
205962888,GO:0045182,translation regulator activity,molecular_function,Os12g0607100,ENSG00000205916
205962889,GO:0045182,translation regulator activity,molecular_function,Os12g0617100,ENSG00000205916


Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
87167021,GO:0036211,protein modification process,biological_process,Os01g0104000,ENSG00000188976
87167022,GO:0036211,protein modification process,biological_process,Os01g0104100,ENSG00000188976
87167023,GO:0036211,protein modification process,biological_process,Os01g0104600,ENSG00000188976
87167024,GO:0036211,protein modification process,biological_process,Os01g0106700,ENSG00000188976
87167025,GO:0036211,protein modification process,biological_process,Os01g0106800,ENSG00000188976
...,...,...,...,...,...
205923506,GO:0006091,generation of precursor metabolites and energy,biological_process,gene-psbB,ENSG00000160211
205923507,GO:0006091,generation of precursor metabolites and energy,biological_process,gene-ndhF,ENSG00000160211
205923508,GO:0006091,generation of precursor metabolites and energy,biological_process,gene-ndhD,ENSG00000160211
205923509,GO:0006091,generation of precursor metabolites and energy,biological_process,gene-psaC,ENSG00000160211
