## Discription

### Background
Circos plotの描画の処理内で行われている結合をこちらのファイルで行うことを目的に作成しました

In [1]:
import collections
import csv
import datetime
import os
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# output directory
organism1_vs_organism2 = "MR" # human vs rice
direction = "down"
now = datetime.datetime.now()
circos_dir = f'../data/circos_{direction}_{organism1_vs_organism2}_{now.strftime("%y%m")}'
no_annotation = f'../data/{circos_dir}/no_annotation_{direction}'
os.makedirs(circos_dir, exist_ok=True)
os.makedirs(no_annotation, exist_ok=True)

## MOUSE_DOWN

1. Create a TSV file to plot genes with high HN-score on the Circos plot

2. Extract GOSlim corresponding to the list of up-regulated genes

In [5]:
# 1. Create a TSV file to plot genes with high HN-score on the Circos plot
#organism = "human" (H)
df_m_score = pd.read_csv(f"../data/mouse_annotation/mouse_position_{direction}.tsv", sep='\t')
df_m_score_subset = df_m_score[['Gene stable ID', # create a subset
                        'Chromosome/scaffold name', 
                        'start2 (bp)',
                        'end2 (bp)',
                        'HN-score(HN5)']].copy()
df_m_score_subset['Chromosome/scaffold name'] = 'M_' + df_m_score_subset['Chromosome/scaffold name'].astype(str)
df_m_score_subset['organism'] = 'mouse'# assign the tag

df_m_score_subset.to_csv(f"../data/{circos_dir}/mouse_position_{direction}_assign.tsv", sep='\t', index=False)

display(df_m_score_subset)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),HN-score(HN5),organism
0,ENSMUSG00000039936,M_4,149733625,149787028,-15,mouse
1,ENSMUSG00000047671,M_4,116983991,116985935,-15,mouse
2,ENSMUSG00000051378,M_11,102796355,102815950,-15,mouse
3,ENSMUSG00000000440,M_6,115337912,115467360,-15,mouse
4,ENSMUSG00000046949,M_13,34148670,34172426,-15,mouse
...,...,...,...,...,...,...
254,ENSMUSG00000074141,M_7,44465811,44490233,-30,mouse
255,ENSMUSG00000025652,M_9,108743687,108744631,-31,mouse
256,ENSMUSG00000057072,M_1,190768836,190775139,-31,mouse
257,ENSMUSG00000030858,M_7,130927673,130931245,-33,mouse


In [6]:
# 2. Extract GOSlim corresponding to the list of up-regulated genes
goslim_all_mouse = pd.read_csv('../data/biomart_goslim/biomart_mouse_goslim_R110_domain.tsv', sep='\t',
                               dtype={'Gene stable ID': 'object'}, 
                               low_memory=False)
columns_of_interest_mouse = ['Gene stable ID', 
                             'GOSlim GOA Accession(s)', 
                             'GOSlim GOA Description', 
                             'GOSlim_domain']
goslim_all_mouse_filtered = goslim_all_mouse[columns_of_interest_mouse]

extract_df_m_goslim = pd.merge(
    df_m_score_subset, 
    goslim_all_mouse_filtered, 
    on='Gene stable ID', 
    how='inner'
    )
extract_df_m_goslim = extract_df_m_goslim.drop_duplicates(subset=['Gene stable ID',
                                                                  'GOSlim GOA Accession(s)', 
                                                                  'GOSlim GOA Description'], keep='first').copy()

# Extract genes with no annotations
extract_df_m_goslim_with_indicator = pd.merge(
    df_m_score_subset, 
    goslim_all_mouse_filtered,
    left_on='Gene stable ID',
    right_on='Gene stable ID',
    how='left',
    indicator=True # add indicator column to show the difference
)

# extract_df_h_goslim_with_indicator
left_only_rows = extract_df_m_goslim_with_indicator[extract_df_m_goslim_with_indicator['_merge'] == 'left_only']

extract_df_m_goslim.to_csv(f"../data/{circos_dir}/mouse_goslim_{direction}.tsv", sep='\t', index=False)
left_only_rows.to_csv(f"../data/{no_annotation}/mouse_position_down_no_annotation.tsv", sep='\t', index=False)

display(goslim_all_mouse_filtered, extract_df_m_goslim, left_only_rows)

Unnamed: 0,Gene stable ID,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,ENSMUSG00000064336,GO:0060090,molecular adaptor activity,molecular_function
1,ENSMUSG00000064336,GO:0003723,RNA binding,molecular_function
2,ENSMUSG00000064336,GO:0043226,organelle,cellular_component
3,ENSMUSG00000064336,GO:0005739,mitochondrion,cellular_component
4,ENSMUSG00000064337,GO:0005198,structural molecule activity,molecular_function
...,...,...,...,...
170875,ENSMUSG00000015335,GO:0036211,protein modification process,biological_process
170876,ENSMUSG00000015335,GO:0023052,signaling,biological_process
170877,ENSMUSG00000015335,GO:0048856,anatomical structure development,biological_process
170878,ENSMUSG00000015335,GO:0034330,cell junction organization,biological_process


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),HN-score(HN5),organism,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,ENSMUSG00000039936,M_4,149733625,149787028,-15,mouse,GO:0003824,catalytic activity,molecular_function
1,ENSMUSG00000039936,M_4,149733625,149787028,-15,mouse,GO:0016740,transferase activity,molecular_function
2,ENSMUSG00000039936,M_4,149733625,149787028,-15,mouse,GO:0006629,lipid metabolic process,biological_process
3,ENSMUSG00000039936,M_4,149733625,149787028,-15,mouse,GO:0023052,signaling,biological_process
4,ENSMUSG00000039936,M_4,149733625,149787028,-15,mouse,GO:0048870,cell motility,biological_process
...,...,...,...,...,...,...,...,...,...
1555,ENSMUSG00000089798,M_5,107682575,107699408,-35,mouse,GO:0022414,reproductive process,biological_process
1556,ENSMUSG00000089798,M_5,107682575,107699408,-35,mouse,GO:0007059,chromosome segregation,biological_process
1557,ENSMUSG00000089798,M_5,107682575,107699408,-35,mouse,GO:0043226,organelle,cellular_component
1558,ENSMUSG00000089798,M_5,107682575,107699408,-35,mouse,GO:0005694,chromosome,cellular_component


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),HN-score(HN5),organism,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,_merge
64,ENSMUSG00000057047,M_2,173561208,173563882,-15,mouse,,,,left_only
76,ENSMUSG00000025257,M_X,150787578,150799291,-15,mouse,,,,left_only
217,ENSMUSG00000072739,M_14,19010486,19020394,-15,mouse,,,,left_only
258,ENSMUSG00000079580,M_17,29744981,29771359,-15,mouse,,,,left_only
271,ENSMUSG00000095681,M_14,17376458,17398785,-15,mouse,,,,left_only
272,ENSMUSG00000096372,M_14,43268863,43273797,-15,mouse,,,,left_only
296,ENSMUSG00000090872,M_14,17799888,17805576,-15,mouse,,,,left_only
306,ENSMUSG00000094021,M_14,15423279,15433254,-15,mouse,,,,left_only
340,ENSMUSG00000097271,M_5,33787269,33789436,-16,mouse,,,,left_only
373,ENSMUSG00000081906,M_11,83535872,83536447,-16,mouse,,,,left_only


## RICE_down

1. Create a TSV file to plot genes with high HN-score on the Circos plot

2. Extract GOSlim corresponding to the list of up-regulated genes

In [7]:
# 1. Create a TSV file to plot genes with high HN-score on the Circos plot
df_r_score = pd.read_csv(f'../data/rice_annotation/rice_position_{direction}.tsv', sep='\t')
df_r_score_subset = df_r_score[['Gene stable ID', # create a subset
                        'Chromosome/scaffold name', 
                        'start1 (bp)',
                        'end1 (bp)',
                        'HN-score(HN5)']].copy()
df_r_score_subset['Chromosome/scaffold name'] = 'R_' + df_r_score_subset['Chromosome/scaffold name'].astype(str)
df_r_score_subset['organism'] = 'rice'# assign the tag

df_r_score_subset.to_csv(f"../data/{circos_dir}/rice_position_{direction}_assign.tsv", sep='\t', index=False)

display(df_r_score_subset)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),HN-score(HN5),organism
0,Os01g0136300,R_1,1955181,1955914,-40,rice
1,Os05g0588225,R_5,29307170,29307793,-40,rice
2,Os03g0358800,R_3,13915940,13917546,-40,rice
3,Os06g0157900,R_6,2947702,2948085,-40,rice
4,Os11g0439600,R_11,14390405,14398122,-40,rice
...,...,...,...,...,...,...
379,Os01g0952800,R_1,41971444,41978093,-173,rice
380,Os07g0142100,R_7,2175193,2175719,-178,rice
381,Os03g0307200,R_3,10926469,10927729,-182,rice
382,Os07g0142200,R_7,2176824,2177640,-189,rice


In [8]:
# 2. Extract GOSlim corresponding to the list of up-regulated genes
goslim_all_rice = pd.read_csv('../data/biomart_goslim/biomart_rice_goslim_R56_domain.tsv', sep='\t',
                               dtype={'Gene stable ID': 'object'}, 
                               low_memory=False)
columns_of_interest_rice = ['Gene stable ID',
                            'GOSlim GOA Accession(s)', 
                            'GOSlim GOA Description', 
                            'GOSlim_domain']
goslim_all_rice_filtered = goslim_all_rice[columns_of_interest_rice]

extract_df_r_goslim = pd.merge(
    df_r_score_subset, 
    goslim_all_rice_filtered, 
    on='Gene stable ID', 
    how='inner'
    )

extract_df_r_goslim = extract_df_r_goslim.drop_duplicates(subset=['Gene stable ID',
                                                                    'GOSlim GOA Accession(s)', 
                                                                    'GOSlim GOA Description'], keep='first').copy()

# Extract genes with no annotations
extract_df_r_goslim_with_indicator = pd.merge(
    df_r_score_subset, 
    goslim_all_rice_filtered,
    left_on='Gene stable ID',
    right_on='Gene stable ID',
    how='left',
    indicator=True # add indicator column to show the difference
)

# extract_df_r_goslim_with_indicator
left_only_rows = extract_df_r_goslim_with_indicator[extract_df_r_goslim_with_indicator['_merge'] == 'left_only']

extract_df_r_goslim.to_csv(f"../data/{circos_dir}/rice_goslim_{direction}.tsv", sep='\t', index=False)
left_only_rows.to_csv(f"../data/{no_annotation}/rice_position_{direction}_no_annotation.tsv", sep='\t', index=False)

display(goslim_all_rice_filtered, extract_df_r_goslim, left_only_rows)

Unnamed: 0,Gene stable ID,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,Os01g0100100,GO:0006810,transport,biological_process
1,Os01g0100100,GO:0008150,biological_process,biological_process
2,Os01g0100100,GO:0009987,cellular process,biological_process
3,Os01g0100100,GO:0003674,molecular_function,molecular_function
4,Os01g0100100,GO:0030234,enzyme regulator activity,molecular_function
...,...,...,...,...
219494,gene-rps19,GO:0003674,molecular_function,molecular_function
219495,gene-rps19,GO:0005198,structural molecule activity,molecular_function
219496,gene-rps19,GO:0005488,binding,molecular_function
219497,gene-rps19,GO:0003723,RNA binding,molecular_function


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),HN-score(HN5),organism,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,Os03g0358800,R_3,13915940,13917546,-40,rice,GO:0005575,cellular_component,cellular_component
1,Os03g0358800,R_3,13915940,13917546,-40,rice,GO:0005622,intracellular anatomical structure,cellular_component
2,Os03g0358800,R_3,13915940,13917546,-40,rice,GO:0003674,molecular_function,molecular_function
3,Os03g0358800,R_3,13915940,13917546,-40,rice,GO:0003824,catalytic activity,molecular_function
4,Os03g0358800,R_3,13915940,13917546,-40,rice,GO:0016740,transferase activity,molecular_function
...,...,...,...,...,...,...,...,...,...
2028,Os03g0307300,R_3,10929507,10930895,-196,rice,GO:0016740,transferase activity,molecular_function
2029,Os03g0307300,R_3,10929507,10930895,-196,rice,GO:0008150,biological_process,biological_process
2030,Os03g0307300,R_3,10929507,10930895,-196,rice,GO:0009987,cellular process,biological_process
2031,Os03g0307300,R_3,10929507,10930895,-196,rice,GO:0008152,metabolic process,biological_process


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),HN-score(HN5),organism,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,_merge
0,Os01g0136300,R_1,1955181,1955914,-40,rice,,,,left_only
1,Os05g0588225,R_5,29307170,29307793,-40,rice,,,,left_only
24,Os07g0600200,R_7,24487821,24488449,-40,rice,,,,left_only
25,Os08g0178650,R_8,4597056,4597381,-40,rice,,,,left_only
32,Os02g0565150,R_2,21457944,21458344,-40,rice,,,,left_only
...,...,...,...,...,...,...,...,...,...,...
2124,Os01g0647200,R_1,26086062,26088337,-165,rice,,,,left_only
2132,Os02g0594166,R_2,23008659,23011091,-170,rice,,,,left_only
2133,Os01g0608101,R_1,24004176,24006793,-171,rice,,,,left_only
2157,Os07g0142100,R_7,2175193,2175719,-178,rice,,,,left_only


## COMBINED_UP( MOUSE_DOWN + RICE_DOWN )

1. chromosome position + gene position + HN-score

2. GOslim terms corresponding to the list of up-regulated genes

In [9]:
# 1. chromosome position + gene position + HN-score
combined_df_score = pd.concat([df_m_score_subset, 
                               df_r_score_subset])
combined_df_score.reset_index(drop=True, inplace=True)
combined_df_score['Gene start (bp)'] = combined_df_score['start1 (bp)'].combine_first(combined_df_score['start2 (bp)'])
combined_df_score['Gene end (bp)'] = combined_df_score['end1 (bp)'].combine_first(combined_df_score['end2 (bp)'])
combined_df_score.drop(['start1 (bp)', 'start2 (bp)', 'end1 (bp)', 'end2 (bp)'], axis=1, inplace=True)

combined_df_score.to_csv(f"../data/{circos_dir}/combined_position_{direction}.tsv", sep='\t', index=False)

display(combined_df_score)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,HN-score(HN5),organism,Gene start (bp),Gene end (bp)
0,ENSMUSG00000039936,M_4,-15,mouse,149733625.0,149787028.0
1,ENSMUSG00000047671,M_4,-15,mouse,116983991.0,116985935.0
2,ENSMUSG00000051378,M_11,-15,mouse,102796355.0,102815950.0
3,ENSMUSG00000000440,M_6,-15,mouse,115337912.0,115467360.0
4,ENSMUSG00000046949,M_13,-15,mouse,34148670.0,34172426.0
...,...,...,...,...,...,...
638,Os01g0952800,R_1,-173,rice,41971444.0,41978093.0
639,Os07g0142100,R_7,-178,rice,2175193.0,2175719.0
640,Os03g0307200,R_3,-182,rice,10926469.0,10927729.0
641,Os07g0142200,R_7,-189,rice,2176824.0,2177640.0


In [10]:
merged_df_goslim = pd.merge(
    extract_df_m_goslim, 
    extract_df_r_goslim, 
    left_on=['GOSlim GOA Accession(s)', 'GOSlim GOA Description', 'GOSlim_domain'],
    right_on=['GOSlim GOA Accession(s)', 'GOSlim GOA Description', 'GOSlim_domain'],
    how='inner',
    suffixes=('_mouse', '_rice')
)

merged_df_goslim = merged_df_goslim[['GOSlim GOA Accession(s)',
                                     'GOSlim GOA Description',
                                     'GOSlim_domain',
                                     'Gene stable ID_rice',
                                     'Gene stable ID_mouse',
                                     'Chromosome/scaffold name_rice',
                                     'start1 (bp)',
                                     'end1 (bp)',
                                     'Chromosome/scaffold name_mouse',
                                     'start2 (bp)',
                                     'end2 (bp)',
                                     'HN-score(HN5)_rice',
                                     'HN-score(HN5)_mouse']]

# Remove duplicate rows based on specific columns
merged_df_goslim.drop_duplicates(inplace=True)

display(merged_df_goslim)

Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_rice,Gene stable ID_mouse,Chromosome/scaffold name_rice,start1 (bp),end1 (bp),Chromosome/scaffold name_mouse,start2 (bp),end2 (bp),HN-score(HN5)_rice,HN-score(HN5)_mouse
0,GO:0003824,catalytic activity,molecular_function,Os03g0358800,ENSMUSG00000039936,R_3,13915940,13917546,M_4,149733625,149787028,-40,-15
1,GO:0003824,catalytic activity,molecular_function,Os11g0439600,ENSMUSG00000039936,R_11,14390405,14398122,M_4,149733625,149787028,-40,-15
2,GO:0003824,catalytic activity,molecular_function,Os01g0891100,ENSMUSG00000039936,R_1,38750313,38750768,M_4,149733625,149787028,-40,-15
3,GO:0003824,catalytic activity,molecular_function,Os05g0458600,ENSMUSG00000039936,R_5,22533536,22536087,M_4,149733625,149787028,-40,-15
4,GO:0003824,catalytic activity,molecular_function,Os02g0706900,ENSMUSG00000039936,R_2,29224218,29226240,M_4,149733625,149787028,-40,-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16810,GO:0030312,external encapsulating structure,cellular_component,Os02g0783000,ENSMUSG00000021624,R_2,33221378,33223595,M_13,102830066,102876137,-45,-16
16811,GO:0030312,external encapsulating structure,cellular_component,Os06g0193200,ENSMUSG00000021624,R_6,4700512,4703042,M_13,102830066,102876137,-45,-16
16812,GO:0030312,external encapsulating structure,cellular_component,Os09g0472900,ENSMUSG00000021624,R_9,18067436,18068411,M_13,102830066,102876137,-47,-16
16813,GO:0030312,external encapsulating structure,cellular_component,Os01g0284500,ENSMUSG00000021624,R_1,10167946,10168997,M_13,102830066,102876137,-52,-16


In [11]:
# for debugging purpose
unique_accessions_r = set(extract_df_r_goslim['GOSlim GOA Accession(s)'].unique())
unique_accessions_m = set(extract_df_m_goslim['GOSlim GOA Accession(s)'].unique())
common_accessions = unique_accessions_r.intersection(unique_accessions_m)
unique_accessions_merged = set(merged_df_goslim['GOSlim GOA Accession(s)'].unique())
missing_accessions = common_accessions.difference(unique_accessions_merged)
duplicates = merged_df_goslim.duplicated(subset=['GOSlim GOA Accession(s)', 'Gene stable ID_rice', 'Gene stable ID_mouse'], keep=False)
unique_combinations = not duplicates.any()
print(f"Number of common accessions: {len(common_accessions)}")
print(f"Number of missing accessions: {len(missing_accessions)}")
print(f"Missing accessions: {missing_accessions}")
print(f"各'Gene stable ID_rice'と'Gene stable ID_mouse'の組み合わせはユニークですか？: {unique_combinations}")

#Count the occurrences of each 'GOSlim GOA Accession(s)'
accession_counter = Counter(merged_df_goslim['GOSlim GOA Accession(s)'])

#Sort the DataFrame based on the counts
merged_df_goslim['count'] = merged_df_goslim['GOSlim GOA Accession(s)'].map(accession_counter)
merged_df_goslim_sorted = merged_df_goslim.sort_values('count', ascending=False).drop('count', axis=1)
merged_df_goslim_sorted.reset_index(drop=True, inplace=True)
merged_df_goslim_sorted.to_csv(f"../data/{circos_dir}/combined_goslim_{direction}.tsv", sep='\t', index=False)

display(merged_df_goslim_sorted)

Number of common accessions: 26
Number of missing accessions: 0
Missing accessions: set()
各'Gene stable ID_rice'と'Gene stable ID_mouse'の組み合わせはユニークですか？: True


Unnamed: 0,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_rice,Gene stable ID_mouse,Chromosome/scaffold name_rice,start1 (bp),end1 (bp),Chromosome/scaffold name_mouse,start2 (bp),end2 (bp),HN-score(HN5)_rice,HN-score(HN5)_mouse
0,GO:0003824,catalytic activity,molecular_function,Os03g0358800,ENSMUSG00000039936,R_3,13915940,13917546,M_4,149733625,149787028,-40,-15
1,GO:0003824,catalytic activity,molecular_function,Os10g0527601,ENSMUSG00000071724,R_10,20493427,20494200,M_15,76178548,76181096,-55,-17
2,GO:0003824,catalytic activity,molecular_function,Os04g0493400,ENSMUSG00000071724,R_4,24687753,24689297,M_15,76178548,76181096,-61,-17
3,GO:0003824,catalytic activity,molecular_function,Os01g0591300,ENSMUSG00000071724,R_1,23113647,23120516,M_15,76178548,76181096,-60,-17
4,GO:0003824,catalytic activity,molecular_function,Os02g0240300,ENSMUSG00000071724,R_2,7948543,7949997,M_15,76178548,76181096,-60,-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16810,GO:0005764,lysosome,cellular_component,Os02g0715000,ENSMUSG00000020323,R_2,29667743,29669572,M_10,79617308,79626795,-41,-17
16811,GO:0005764,lysosome,cellular_component,Os01g0613500,ENSMUSG00000020323,R_1,24343518,24345172,M_10,79617308,79626795,-75,-17
16812,GO:0005764,lysosome,cellular_component,Os01g0971400,ENSMUSG00000020323,R_1,42855657,42857462,M_10,79617308,79626795,-41,-17
16813,GO:0005635,nuclear envelope,cellular_component,Os12g0236100,ENSMUSG00000037997,R_12,7441389,7446142,M_6,127423803,127471224,-81,-16


## Merge GOslim terms and line enrichment results

In [12]:
enrichment = pd.read_csv(f'../data/line_enrichment_{direction}_MR_2312/goslim_correspondence_q_values_{direction}_MR/goslim_correspondence_fold_enrichment_p_q_{direction}.tsv', sep='\t')

merged_df_goslim_sorted["GOSlim"] = merged_df_goslim_sorted['GOSlim GOA Accession(s)'] + ": " + merged_df_goslim_sorted['GOSlim GOA Description']

merged_df_goslim_sorted = merged_df_goslim_sorted[['GOSlim',
                                                    'GOSlim GOA Accession(s)',
                                                    'GOSlim GOA Description',
                                                    'GOSlim_domain',
                                                    'Gene stable ID_rice',
                                                    'Gene stable ID_mouse',
                                                    'Chromosome/scaffold name_rice',
                                                    'start1 (bp)',
                                                    'end1 (bp)',
                                                    'Chromosome/scaffold name_mouse',
                                                    'start2 (bp)',
                                                    'end2 (bp)',
                                                    'HN-score(HN5)_rice',
                                                    'HN-score(HN5)_mouse']]


merged_df_goslim_enrichment = pd.merge(
    merged_df_goslim_sorted,
    enrichment[['GOSlim', f'counts_{direction}', 'fold_enrichment', 'p_value', 'q_value']],
    on='GOSlim',
    how='left'
)
merged_df_goslim_enrichment.sort_values('fold_enrichment', ascending=False, inplace=True)
merged_df_goslim_enrichment.reset_index(drop=True, inplace=True)
merged_df_goslim_enrichment.to_csv(f"../data/{circos_dir}/combined_goslim_enrichment_{direction}.tsv", sep='\t', index=False)

display(enrichment, merged_df_goslim_sorted, merged_df_goslim_enrichment)

Unnamed: 0,GOSlim,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,counts_down,counts_all,down_ratio,all_ratio,fold_enrichment,p_value,q_value
0,GO:0005576: extracellular region,GO:0005576,extracellular region,cellular_component,841,2069023,0.050015,0.0113,4.426216,1.028298e-268,1.7481060000000002e-267
1,GO:0005764: lysosome,GO:0005764,lysosome,cellular_component,6,27234,0.000357,0.000149,2.399063,0.04206618,0.1589167
2,GO:0008289: lipid binding,GO:0008289,lipid binding,molecular_function,56,257535,0.00333,0.001406,2.367847,1.063385e-08,6.02585e-08
3,GO:0005215: transporter activity,GO:0005215,transporter activity,molecular_function,308,1687420,0.018317,0.009216,1.987602,8.451898000000001e-28,7.184113e-27
4,GO:0005975: carbohydrate metabolic process,GO:0005975,carbohydrate metabolic process,biological_process,102,655182,0.006066,0.003578,1.695275,5.39e-07,2.618e-06
5,GO:0003677: DNA binding,GO:0003677,DNA binding,molecular_function,736,4990802,0.04377,0.027257,1.605866,5.5727339999999995e-34,6.315766e-33
6,GO:0003824: catalytic activity,GO:0003824,catalytic activity,molecular_function,7482,56414781,0.44496,0.308102,1.444198,1.3241389999999999e-303,4.502072e-302
7,GO:0016787: hydrolase activity,GO:0016787,hydrolase activity,molecular_function,984,7777545,0.058519,0.042476,1.377699,7.895711e-23,5.369083e-22
8,GO:0006091: generation of precursor metabolite...,GO:0006091,generation of precursor metabolites and energy,biological_process,27,243045,0.001606,0.001327,1.209704,0.1856097,0.6310729
9,GO:0006629: lipid metabolic process,GO:0006629,lipid metabolic process,biological_process,153,1429560,0.009099,0.007807,1.165443,0.03387015,0.1439482


Unnamed: 0,GOSlim,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_rice,Gene stable ID_mouse,Chromosome/scaffold name_rice,start1 (bp),end1 (bp),Chromosome/scaffold name_mouse,start2 (bp),end2 (bp),HN-score(HN5)_rice,HN-score(HN5)_mouse
0,GO:0003824: catalytic activity,GO:0003824,catalytic activity,molecular_function,Os03g0358800,ENSMUSG00000039936,R_3,13915940,13917546,M_4,149733625,149787028,-40,-15
1,GO:0003824: catalytic activity,GO:0003824,catalytic activity,molecular_function,Os10g0527601,ENSMUSG00000071724,R_10,20493427,20494200,M_15,76178548,76181096,-55,-17
2,GO:0003824: catalytic activity,GO:0003824,catalytic activity,molecular_function,Os04g0493400,ENSMUSG00000071724,R_4,24687753,24689297,M_15,76178548,76181096,-61,-17
3,GO:0003824: catalytic activity,GO:0003824,catalytic activity,molecular_function,Os01g0591300,ENSMUSG00000071724,R_1,23113647,23120516,M_15,76178548,76181096,-60,-17
4,GO:0003824: catalytic activity,GO:0003824,catalytic activity,molecular_function,Os02g0240300,ENSMUSG00000071724,R_2,7948543,7949997,M_15,76178548,76181096,-60,-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16810,GO:0005764: lysosome,GO:0005764,lysosome,cellular_component,Os02g0715000,ENSMUSG00000020323,R_2,29667743,29669572,M_10,79617308,79626795,-41,-17
16811,GO:0005764: lysosome,GO:0005764,lysosome,cellular_component,Os01g0613500,ENSMUSG00000020323,R_1,24343518,24345172,M_10,79617308,79626795,-75,-17
16812,GO:0005764: lysosome,GO:0005764,lysosome,cellular_component,Os01g0971400,ENSMUSG00000020323,R_1,42855657,42857462,M_10,79617308,79626795,-41,-17
16813,GO:0005635: nuclear envelope,GO:0005635,nuclear envelope,cellular_component,Os12g0236100,ENSMUSG00000037997,R_12,7441389,7446142,M_6,127423803,127471224,-81,-16


Unnamed: 0,GOSlim,GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,Gene stable ID_rice,Gene stable ID_mouse,Chromosome/scaffold name_rice,start1 (bp),end1 (bp),Chromosome/scaffold name_mouse,start2 (bp),end2 (bp),HN-score(HN5)_rice,HN-score(HN5)_mouse,counts_down,fold_enrichment,p_value,q_value
0,GO:0005576: extracellular region,GO:0005576,extracellular region,cellular_component,Os06g0688200,ENSMUSG00000109523,R_6,28707334,28708349,M_8,70782467,70784237,-77,-17,841,4.426216,1.028298e-268,1.748106e-267
1,GO:0005576: extracellular region,GO:0005576,extracellular region,cellular_component,Os01g0696800,ENSMUSG00000074141,R_1,28835243,28836884,M_7,44465811,44490233,-42,-30,841,4.426216,1.028298e-268,1.748106e-267
2,GO:0005576: extracellular region,GO:0005576,extracellular region,cellular_component,Os06g0306300,ENSMUSG00000030858,R_6,11559570,11560896,M_7,130927673,130931245,-41,-33,841,4.426216,1.028298e-268,1.748106e-267
3,GO:0005576: extracellular region,GO:0005576,extracellular region,cellular_component,Os06g0688200,ENSMUSG00000030858,R_6,28707334,28708349,M_7,130927673,130931245,-77,-33,841,4.426216,1.028298e-268,1.748106e-267
4,GO:0005576: extracellular region,GO:0005576,extracellular region,cellular_component,Os01g0613500,ENSMUSG00000030858,R_1,24343518,24345172,M_7,130927673,130931245,-75,-33,841,4.426216,1.028298e-268,1.748106e-267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16810,GO:0005783: endoplasmic reticulum,GO:0005783,endoplasmic reticulum,cellular_component,Os05g0482400,ENSMUSG00000031444,R_5,23728568,23738372,M_8,13087308,13106676,-49,-15,8,0.054125,1.000000e+00,1.000000e+00
16811,GO:0005783: endoplasmic reticulum,GO:0005783,endoplasmic reticulum,cellular_component,Os05g0482400,ENSMUSG00000022297,R_5,23728568,23738372,M_15,38869429,38901583,-49,-15,8,0.054125,1.000000e+00,1.000000e+00
16812,GO:0005783: endoplasmic reticulum,GO:0005783,endoplasmic reticulum,cellular_component,Os05g0482400,ENSMUSG00000044916,R_5,23728568,23738372,M_2,92212883,92216053,-49,-25,8,0.054125,1.000000e+00,1.000000e+00
16813,GO:0005783: endoplasmic reticulum,GO:0005783,endoplasmic reticulum,cellular_component,Os05g0482400,ENSMUSG00000047694,R_5,23728568,23738372,M_X,97979922,97992623,-49,-19,8,0.054125,1.000000e+00,1.000000e+00
