In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
import dask.dataframe as dd

organism1_vs_organism2 = 'MR' # mouse vs rice

In [3]:
goslim_all_mouse = dd.read_csv('../data/biomart_goslim/biomart_mouse_goslim_R110_domain.tsv', sep='\t',
                               dtype={'Gene stable ID': 'object'}, 
                               low_memory=False)
goslim_all_rice = dd.read_csv('../data/biomart_goslim/biomart_rice_goslim_R56_domain.tsv', sep='\t',
                              dtype={'Gene stable ID': 'object'},
                              low_memory=False)
columns_of_interest = ['Gene stable ID', 'GOSlim GOA Accession(s)', 'GOSlim GOA Description', 'GOSlim_domain']

goslim_all_mouse_filtered = goslim_all_mouse[columns_of_interest]
goslim_all_rice_filtered = goslim_all_rice[columns_of_interest]

display(goslim_all_mouse_filtered.compute())
display(goslim_all_rice_filtered.compute())

Unnamed: 0,Gene stable ID,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,ENSMUSG00000064336,GO:0060090,molecular adaptor activity,molecular_function
1,ENSMUSG00000064336,GO:0003723,RNA binding,molecular_function
2,ENSMUSG00000064336,GO:0043226,organelle,cellular_component
3,ENSMUSG00000064336,GO:0005739,mitochondrion,cellular_component
4,ENSMUSG00000064337,GO:0005198,structural molecule activity,molecular_function
...,...,...,...,...
170875,ENSMUSG00000015335,GO:0036211,protein modification process,biological_process
170876,ENSMUSG00000015335,GO:0023052,signaling,biological_process
170877,ENSMUSG00000015335,GO:0048856,anatomical structure development,biological_process
170878,ENSMUSG00000015335,GO:0034330,cell junction organization,biological_process


Unnamed: 0,Gene stable ID,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,Os01g0100100,GO:0006810,transport,biological_process
1,Os01g0100100,GO:0008150,biological_process,biological_process
2,Os01g0100100,GO:0009987,cellular process,biological_process
3,Os01g0100100,GO:0003674,molecular_function,molecular_function
4,Os01g0100100,GO:0030234,enzyme regulator activity,molecular_function
...,...,...,...,...
219494,gene-rps19,GO:0003674,molecular_function,molecular_function
219495,gene-rps19,GO:0005198,structural molecule activity,molecular_function
219496,gene-rps19,GO:0005488,binding,molecular_function
219497,gene-rps19,GO:0003723,RNA binding,molecular_function


In [4]:
# merge dataframe based on common column 'GOSlim GOA Accession(s)'
merged_df_goslim = dd.merge(
    goslim_all_mouse_filtered, 
    goslim_all_rice_filtered,
    left_on=['GOSlim GOA Accession(s)', 'GOSlim GOA Description', 'GOSlim_domain'],
    right_on=['GOSlim GOA Accession(s)','GOSlim GOA Description', 'GOSlim_domain'],
    how='inner',
    suffixes=('_rice', '_human')
)

# extract necessary columns
merged_df_goslim = merged_df_goslim[['GOSlim GOA Accession(s)',
                                     'GOSlim GOA Description',
                                     'GOSlim_domain',
                                     'Gene stable ID_human',
                                     'Gene stable ID_rice'
                                     ]]

merged_df_goslim_pd = merged_df_goslim.compute()

# Remove duplicate rows based on specific columns
merged_df_goslim_pd.drop_duplicates(inplace=True)

# for debugging purpose
unique_accessions_r = set(goslim_all_mouse_filtered['GOSlim GOA Accession(s)'].compute().unique())
unique_accessions_h = set(goslim_all_rice_filtered['GOSlim GOA Accession(s)'].compute().unique())
common_accessions = unique_accessions_r.intersection(unique_accessions_h)
unique_accessions_merged = set(merged_df_goslim_pd['GOSlim GOA Accession(s)'].unique())
missing_accessions = common_accessions.difference(unique_accessions_merged)
duplicates = merged_df_goslim_pd.duplicated(subset=['GOSlim GOA Accession(s)','Gene stable ID_rice', 'Gene stable ID_human'], keep=False)
unique_combinations = not duplicates.any()
print(f"Number of common accessions: {len(common_accessions)}")
print(f"Number of missing accessions: {len(missing_accessions)}")
print(f"Missing accessions: {missing_accessions}")
print(f"各'Gene stable ID_rice'と'Gene stable ID_human'の組み合わせはユニークですか？: {unique_combinations}")

merged_df_goslim_pd.to_csv(f"../data/GOSlim_merge_{organism1_vs_organism2}/GOslim_merge_common_goslim_correspondence_all.tsv", sep='\t', index=False)

display(merged_df_goslim_pd)

Number of common accessions: 34
Number of missing accessions: 0
Missing accessions: set()
各'Gene stable ID_rice'と'Gene stable ID_human'の組み合わせはユニークですか？: True


Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
0,GO:0003723,RNA binding,molecular_function,Os01g0100700,ENSMUSG00000064336
1,GO:0003723,RNA binding,molecular_function,Os01g0101150,ENSMUSG00000064336
2,GO:0003723,RNA binding,molecular_function,Os01g0101300,ENSMUSG00000064336
3,GO:0003723,RNA binding,molecular_function,Os01g0101600,ENSMUSG00000064336
4,GO:0003723,RNA binding,molecular_function,Os01g0118100,ENSMUSG00000064336
...,...,...,...,...,...
183104384,GO:0030312,external encapsulating structure,cellular_component,Os12g0514500,ENSMUSG00000026938
183104385,GO:0030312,external encapsulating structure,cellular_component,Os12g0530100,ENSMUSG00000026938
183104386,GO:0030312,external encapsulating structure,cellular_component,Os12g0546800,ENSMUSG00000026938
183104387,GO:0030312,external encapsulating structure,cellular_component,Os12g0563700,ENSMUSG00000026938


### Extract dataset based on GOslim domain (BP, CC, MF)

In [6]:
cellular_component = merged_df_goslim_pd[merged_df_goslim_pd['GOSlim_domain'] == 'cellular_component']
molecular_function = merged_df_goslim_pd[merged_df_goslim_pd['GOSlim_domain'] == 'molecular_function']
biological_process = merged_df_goslim_pd[merged_df_goslim_pd['GOSlim_domain'] == 'biological_process']

cellular_component.to_csv(f"../data/GOSlim_merge_{organism1_vs_organism2}/GOslim_merge_common_goslim_correspondence_cc.tsv", sep='\t', index=False)
molecular_function.to_csv(f"../data/GOSlim_merge_{organism1_vs_organism2}/GOslim_merge_common_goslim_correspondence_mf.tsv", sep='\t', index=False)
biological_process.to_csv(f"../data/GOSlim_merge_{organism1_vs_organism2}/GOslim_merge_common_goslim_correspondence_bp.tsv", sep='\t', index=False)

display(cellular_component, molecular_function, biological_process)

Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
3071712,GO:0005739,mitochondrion,cellular_component,Os01g0105800,ENSMUSG00000064336
3071713,GO:0005739,mitochondrion,cellular_component,Os01g0118000,ENSMUSG00000064336
3071714,GO:0005739,mitochondrion,cellular_component,Os01g0120300,ENSMUSG00000064336
3071715,GO:0005739,mitochondrion,cellular_component,Os01g0120400,ENSMUSG00000064336
3071716,GO:0005739,mitochondrion,cellular_component,Os01g0141600,ENSMUSG00000064336
...,...,...,...,...,...
183104384,GO:0030312,external encapsulating structure,cellular_component,Os12g0514500,ENSMUSG00000026938
183104385,GO:0030312,external encapsulating structure,cellular_component,Os12g0530100,ENSMUSG00000026938
183104386,GO:0030312,external encapsulating structure,cellular_component,Os12g0546800,ENSMUSG00000026938
183104387,GO:0030312,external encapsulating structure,cellular_component,Os12g0563700,ENSMUSG00000026938


Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
0,GO:0003723,RNA binding,molecular_function,Os01g0100700,ENSMUSG00000064336
1,GO:0003723,RNA binding,molecular_function,Os01g0101150,ENSMUSG00000064336
2,GO:0003723,RNA binding,molecular_function,Os01g0101300,ENSMUSG00000064336
3,GO:0003723,RNA binding,molecular_function,Os01g0101600,ENSMUSG00000064336
4,GO:0003723,RNA binding,molecular_function,Os01g0118100,ENSMUSG00000064336
...,...,...,...,...,...
182930233,GO:0008289,lipid binding,molecular_function,Os12g0555100,ENSMUSG00000019715
182930234,GO:0008289,lipid binding,molecular_function,Os12g0555200,ENSMUSG00000019715
182930235,GO:0008289,lipid binding,molecular_function,Os12g0555300,ENSMUSG00000019715
182930236,GO:0008289,lipid binding,molecular_function,Os12g0555500,ENSMUSG00000019715


Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_human,Gene stable ID_rice
63914407,GO:0006091,generation of precursor metabolites and energy,biological_process,Os01g0110700,ENSMUSG00000064341
63914408,GO:0006091,generation of precursor metabolites and energy,biological_process,Os01g0118000,ENSMUSG00000064341
63914409,GO:0006091,generation of precursor metabolites and energy,biological_process,Os01g0147900,ENSMUSG00000064341
63914410,GO:0006091,generation of precursor metabolites and energy,biological_process,Os01g0174900,ENSMUSG00000064341
63914411,GO:0006091,generation of precursor metabolites and energy,biological_process,Os01g0190400,ENSMUSG00000064341
...,...,...,...,...,...
162934448,GO:0005975,carbohydrate metabolic process,biological_process,Os12g0616900,ENSMUSG00000027207
162934449,GO:0005975,carbohydrate metabolic process,biological_process,Os12g0632700,ENSMUSG00000027207
162934450,GO:0005975,carbohydrate metabolic process,biological_process,Os12g0633000,ENSMUSG00000027207
162934451,GO:0005975,carbohydrate metabolic process,biological_process,Os12g0641400,ENSMUSG00000027207
