In [1]:
import pandas as pd
import os

In [2]:
# Run-wide settings: the organism under study and the folder that
# receives the annotated tables written by the cells below.
organism = 'rice'
annotation_dir = f'../data/{organism}_annotation'

# Create the output folder up front; exist_ok=True keeps re-runs from failing
# when the folder is already there.
os.makedirs(annotation_dir, exist_ok=True)

In [3]:
# Upregulated genes.
# NOTE: gene names in this file were manually corrected to the most recent version.
df_up = (
    pd.read_csv(f"../data/HN-score_{organism}/HN5_genes_up_{organism}.tsv", sep="\t")
    .rename(columns={'HN5': 'HN-score(HN5)'})
)

# Downregulated genes, with the same score-column rename.
df_down = (
    pd.read_csv(f"../data/HN-score_{organism}/HN5_genes_down_{organism}.tsv", sep="\t")
    .rename(columns={'HN5': 'HN-score(HN5)'})
)

# Stack both tables (up first, then down) under a fresh 0..n-1 index.
COMBINED = pd.concat([df_up, df_down], ignore_index=True)

display(df_up, df_down, COMBINED)

Unnamed: 0,GENEID,HN-score(HN5)
0,Os04g0107900,253
1,Os01g0136100,246
2,Os02g0259900,238
3,Os03g0245800,237
4,Os03g0277300,236
...,...,...
384,Os05g0373900,42
385,Os04g0301500,42
386,Os05g0367000,42
387,Os01g0971800,42


Unnamed: 0,GENEID,HN-score(HN5)
0,Os09g0567500,-40
1,Os04g0468600,-40
2,Os10g0490400,-40
3,Os01g0891100,-40
4,Os05g0458600,-40
...,...,...
379,Os01g0952800,-173
380,Os07g0142100,-178
381,Os03g0307200,-182
382,Os07g0142200,-189


Unnamed: 0,GENEID,HN-score(HN5)
0,Os04g0107900,253
1,Os01g0136100,246
2,Os02g0259900,238
3,Os03g0245800,237
4,Os03g0277300,236
...,...,...
768,Os01g0952800,-173
769,Os07g0142100,-178
770,Os03g0307200,-182
771,Os07g0142200,-189


## merge the Ensembl biomart data (Chromosome position)

- For drawing the Circos plot

In [4]:
df_ensembl_position = pd.read_csv(f"../data/biomart_goslim/biomart_{organism}_position_R56.tsv", sep="\t", low_memory=False)


def _merge_position_scores(positions, hn_scores, out_path):
    """Join gene positions with HN-scores, sort by score, and save as TSV.

    Parameters
    ----------
    positions : pd.DataFrame
        Table with a "Gene stable ID" column.
    hn_scores : pd.DataFrame
        Table with "GENEID" and "HN-score(HN5)" columns.
    out_path : str
        Destination of the tab-separated output file (written without index).

    Returns
    -------
    pd.DataFrame
        Inner-joined rows, without the "GENEID" column, sorted by
        descending "HN-score(HN5)".
    """
    merged = (
        pd.merge(
            positions,
            hn_scores,
            left_on="Gene stable ID",
            right_on="GENEID",
            how="inner",
        )
        # "GENEID" duplicates "Gene stable ID" after the join.
        .drop(columns=['GENEID'])
        .sort_values(by='HN-score(HN5)', ascending=False)
    )
    merged.to_csv(out_path, sep="\t", index=False)
    return merged


# annotation_dir already starts with "../data/", so files are written straight
# into it (the directory created by os.makedirs) rather than re-prefixing it.

# 1. merged all data
merged_df_position = _merge_position_scores(
    df_ensembl_position, COMBINED, f"{annotation_dir}/{organism}_position_all.tsv"
)

# 2. merged upregulated genes data
merged_df_position_up = _merge_position_scores(
    df_ensembl_position, df_up, f"{annotation_dir}/{organism}_position_up.tsv"
)

# 3. merged downregulated genes data
merged_df_position_down = _merge_position_scores(
    df_ensembl_position, df_down, f"{annotation_dir}/{organism}_position_down.tsv"
)

display(df_ensembl_position, merged_df_position, merged_df_position_up, merged_df_position_down)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand
0,ENSRNA049474694,11,12120216,12120334,1
1,ENSRNA049474700,11,12162560,12162678,1
2,ENSRNA049474705,11,12102085,12102203,1
3,ENSRNA049474711,11,12092425,12092544,1
4,ENSRNA049474719,11,12089726,12089844,1
...,...,...,...,...,...
38973,gene-rps2-2,Mt,184187,185638,-1
38974,gene-nad7,Mt,90731,90873,-1
38975,gene-orf284,Mt,337520,338374,1
38976,gene-mat-r,Mt,315812,317848,-1


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5)
735,Os04g0107900,4,483234,485978,1,253
465,Os01g0136100,1,1948773,1949587,-1,246
367,Os02g0259900,2,9021454,9023102,1,238
559,Os03g0245800,3,7697015,7698027,1,237
152,Os02g0259850,2,9016923,9018833,1,236
...,...,...,...,...,...,...
333,Os01g0952800,1,41971444,41978093,-1,-173
439,Os07g0142100,7,2175193,2175719,1,-178
153,Os03g0307200,3,10926469,10927729,1,-182
123,Os07g0142200,7,2176824,2177640,1,-189


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5)
368,Os04g0107900,4,483234,485978,1,253
229,Os01g0136100,1,1948773,1949587,-1,246
185,Os02g0259900,2,9021454,9023102,1,238
276,Os03g0245800,3,7697015,7698027,1,237
256,Os03g0277300,3,9411494,9416082,-1,236
...,...,...,...,...,...,...
164,Os03g0820400,3,34427704,34428391,1,42
177,Os10g0328600,10,9212944,9216458,-1,42
199,Os01g0971800,1,42874273,42875515,-1,42
230,Os07g0621600,7,25691906,25692138,-1,42


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5)
118,Os01g0136300,1,1955181,1955914,-1,-40
45,Os05g0588225,5,29307170,29307793,-1,-40
232,Os03g0358800,3,13915940,13917546,1,-40
236,Os06g0157900,6,2947702,2948085,1,-40
292,Os11g0439600,11,14390405,14398122,-1,-40
...,...,...,...,...,...,...
169,Os01g0952800,1,41971444,41978093,-1,-173
217,Os07g0142100,7,2175193,2175719,1,-178
83,Os03g0307200,3,10926469,10927729,1,-182
68,Os07g0142200,7,2176824,2177640,1,-189


## merge the Ensembl biomart data (GOslim)

- For running the GO enrichment analysis

In [5]:
df_ensembl = pd.read_csv(f"../data/biomart_goslim/biomart_{organism}_goslim_R56_domain.tsv", sep="\t", low_memory=False)

# Keep one row per (gene, GOslim accession, GOslim description) combination,
# then normalise the coordinate column names.
# NOTE(review): the displayed output still shows "start1 (bp)" / "end1 (bp)",
# so this rename looks like a no-op for that input (rename ignores missing
# labels by default) -- confirm which source column names are intended.
df_ensembl_uniq = (
    df_ensembl
    .drop_duplicates(
        subset=['Gene stable ID', 'GOSlim GOA Accession(s)', 'GOSlim GOA Description'],
        keep='first',
    )
    .rename(columns={'start2 (bp)': 'Gene start (bp)', 'end2 (bp)': 'Gene end (bp)'})
)


def _merge_goslim_scores(annotation, hn_scores, out_path):
    """Join GOslim annotation with HN-scores, sort by score, and save as TSV.

    Parameters
    ----------
    annotation : pd.DataFrame
        Table with a "Gene stable ID" column.
    hn_scores : pd.DataFrame
        Table with "GENEID" and "HN-score(HN5)" columns.
    out_path : str
        Destination of the tab-separated output file (written without index).

    Returns
    -------
    pd.DataFrame
        Inner-joined rows, without the "GENEID" column, sorted by
        descending "HN-score(HN5)".
    """
    merged = (
        pd.merge(
            annotation,
            hn_scores,
            left_on="Gene stable ID",
            right_on="GENEID",
            how="inner",
        )
        # "GENEID" duplicates "Gene stable ID" after the join.
        .drop(columns=['GENEID'])
        .sort_values(by='HN-score(HN5)', ascending=False)
    )
    merged.to_csv(out_path, sep="\t", index=False)
    return merged


# annotation_dir already starts with "../data/", so files are written straight
# into it (the directory created by os.makedirs) rather than re-prefixing it.

# 1. merge all data
merged_df = _merge_goslim_scores(
    df_ensembl_uniq, COMBINED, f"{annotation_dir}/{organism}_annotation.tsv"
)

# 2. merge UP data
merged_df_up = _merge_goslim_scores(
    df_ensembl_uniq, df_up, f"{annotation_dir}/{organism}_annotation_up.tsv"
)

# 3. merge DOWN data
merged_df_down = _merge_goslim_scores(
    df_ensembl_uniq, df_down, f"{annotation_dir}/{organism}_annotation_down.tsv"
)

display(df_ensembl_uniq, merged_df, merged_df_up, merged_df_down)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,Os01g0100100,1,2983,10815,GO:0006810,transport,biological_process
1,Os01g0100100,1,2983,10815,GO:0008150,biological_process,biological_process
2,Os01g0100100,1,2983,10815,GO:0009987,cellular process,biological_process
3,Os01g0100100,1,2983,10815,GO:0003674,molecular_function,molecular_function
4,Os01g0100100,1,2983,10815,GO:0030234,enzyme regulator activity,molecular_function
...,...,...,...,...,...,...,...
219494,gene-rps19,Pt,134200,134481,GO:0003674,molecular_function,molecular_function
219495,gene-rps19,Pt,134200,134481,GO:0005198,structural molecule activity,molecular_function
219496,gene-rps19,Pt,134200,134481,GO:0005488,binding,molecular_function
219497,gene-rps19,Pt,134200,134481,GO:0003723,RNA binding,molecular_function


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,HN-score(HN5)
1853,Os04g0107900,4,483234,485978,GO:0005829,cytosol,cellular_component,253
1843,Os04g0107900,4,483234,485978,GO:0008150,biological_process,biological_process,253
1854,Os04g0107900,4,483234,485978,GO:0003824,catalytic activity,molecular_function,253
1851,Os04g0107900,4,483234,485978,GO:0005622,intracellular anatomical structure,cellular_component,253
1850,Os04g0107900,4,483234,485978,GO:0016020,membrane,cellular_component,253
...,...,...,...,...,...,...,...,...
1475,Os03g0307300,3,10929507,10930895,GO:0008152,metabolic process,biological_process,-196
1474,Os03g0307300,3,10929507,10930895,GO:0009987,cellular process,biological_process,-196
1470,Os03g0307300,3,10929507,10930895,GO:0003674,molecular_function,molecular_function,-196
1471,Os03g0307300,3,10929507,10930895,GO:0003824,catalytic activity,molecular_function,-196


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,HN-score(HN5)
1009,Os04g0107900,4,483234,485978,GO:0016787,hydrolase activity,molecular_function,253
1003,Os04g0107900,4,483234,485978,GO:0005886,plasma membrane,cellular_component,253
995,Os04g0107900,4,483234,485978,GO:0005515,protein binding,molecular_function,253
996,Os04g0107900,4,483234,485978,GO:0005488,binding,molecular_function,253
997,Os04g0107900,4,483234,485978,GO:0008150,biological_process,biological_process,253
...,...,...,...,...,...,...,...,...
292,Os01g0971800,1,42874273,42875515,GO:0008152,metabolic process,biological_process,42
291,Os01g0971800,1,42874273,42875515,GO:0009987,cellular process,biological_process,42
290,Os01g0971800,1,42874273,42875515,GO:0008150,biological_process,biological_process,42
289,Os01g0971800,1,42874273,42875515,GO:0005634,nucleus,cellular_component,42


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start1 (bp),end1 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,HN-score(HN5)
1431,Os08g0201100,8,5864931,5865851,GO:0005575,cellular_component,cellular_component,-40
1862,Os11g0661600,11,26572255,26574116,GO:0030312,external encapsulating structure,cellular_component,-40
1930,Os12g0512100,12,19788367,19791483,GO:0016020,membrane,cellular_component,-40
1929,Os12g0512100,12,19788367,19791483,GO:0005886,plasma membrane,cellular_component,-40
1928,Os12g0512100,12,19788367,19791483,GO:0005575,cellular_component,cellular_component,-40
...,...,...,...,...,...,...,...,...
679,Os03g0307300,3,10929507,10930895,GO:0009058,biosynthetic process,biological_process,-196
673,Os03g0307300,3,10929507,10930895,GO:0003674,molecular_function,molecular_function,-196
674,Os03g0307300,3,10929507,10930895,GO:0003824,catalytic activity,molecular_function,-196
675,Os03g0307300,3,10929507,10930895,GO:0016740,transferase activity,molecular_function,-196
