In [None]:
import pandas as pd
import os

In [None]:
# Notebook-wide settings: the organism being processed and the folder that
# receives every annotation table written by the cells below.
organism = "rice"
annotation_dir = "../data/" + organism + "_annotation"
# Create the output folder (and any missing parents); an existing folder is fine.
os.makedirs(name=annotation_dir, exist_ok=True)

In [None]:
# Upregulated genes.
# Gene names in this file were manually corrected to the most recent version.
df_up = pd.read_csv(
    f"../data/HN-score_{organism}/HN5_genes_up_{organism}.tsv",
    sep="\t",
).rename(columns={'HN5': 'HN-score(HN5)'})

# Downregulated genes.
df_down = pd.read_csv(
    f"../data/HN-score_{organism}/HN5_genes_down_{organism}.tsv",
    sep="\t",
).rename(columns={'HN5': 'HN-score(HN5)'})

# Stack both tables (up first, then down) under a fresh 0..n-1 index.
COMBINED = pd.concat([df_up, df_down], ignore_index=True)

display(df_up, df_down, COMBINED)

## merge the Ensembl biomart data (Chromosome position)

- For drawing the Circos plot

In [None]:
# Ensembl BioMart table holding the chromosome position of each gene.
df_ensembl_position = pd.read_csv(
    f"../data/biomart_goslim/biomart_{organism}_position_R56.tsv",
    sep="\t",
    low_memory=False,
)


def _attach_positions(scores):
    """Join a table of HN-scores onto the BioMart position table.

    Parameters
    ----------
    scores : pandas.DataFrame
        Must provide a ``GENEID`` column (join key) and an
        ``HN-score(HN5)`` column (sort key).

    Returns
    -------
    pandas.DataFrame
        Inner join of ``df_ensembl_position`` with ``scores`` on
        ``Gene stable ID`` == ``GENEID``; the redundant ``GENEID`` column is
        dropped, rows are ordered by ``HN-score(HN5)`` descending, and the
        gene coordinates are renamed to ``start1 (bp)`` / ``end1 (bp)``.
    """
    return (
        pd.merge(
            df_ensembl_position,
            scores,
            left_on="Gene stable ID",
            right_on="GENEID",
            how="inner",
        )
        .drop(columns=["GENEID"])
        .sort_values(by="HN-score(HN5)", ascending=False)
        # start1/end1 (bp) carry the same values as Gene start/end (bp).
        .rename(columns={
            "Gene start (bp)": "start1 (bp)",
            "Gene end (bp)": "end1 (bp)",
        })
    )


# 1. all genes, 2. upregulated genes only, 3. downregulated genes only
merged_df_position = _attach_positions(COMBINED)
merged_df_position_up = _attach_positions(df_up)
merged_df_position_down = _attach_positions(df_down)

chromosome_position = [merged_df_position, merged_df_position_up, merged_df_position_down]

# Save data.
# annotation_dir already contains the "../data/" prefix, so join file names
# onto it directly instead of prefixing "../data/" a second time; this writes
# into exactly the directory created by os.makedirs(annotation_dir).
merged_df_position.to_csv(
    os.path.join(annotation_dir, f"{organism}_position_all.tsv"), sep="\t", index=False
)
merged_df_position_up.to_csv(
    os.path.join(annotation_dir, f"{organism}_position_up.tsv"), sep="\t", index=False
)
merged_df_position_down.to_csv(
    os.path.join(annotation_dir, f"{organism}_position_down.tsv"), sep="\t", index=False
)

display(df_ensembl_position, merged_df_position, merged_df_position_up, merged_df_position_down)

## merge the Ensembl biomart data (GOslim)

- For running the GO enrichment analysis

In [None]:
# Ensembl BioMart table holding the GOslim annotation of each gene.
df_ensembl = pd.read_csv(
    f"../data/biomart_goslim/biomart_{organism}_goslim_R56_domain.tsv",
    sep="\t",
    low_memory=False,
)

# Keep the first row for every (gene, GOslim accession, GOslim description) triple.
# NOTE(review): the rename below maps 'start2 (bp)' -> 'Gene start (bp)' (and
# likewise for the end column), which is the opposite direction of the
# 'Gene start (bp)' -> 'start1 (bp)' rename applied to the position tables.
# DataFrame.rename ignores missing labels, so if the BioMart export has no
# 'start2 (bp)' / 'end2 (bp)' columns this is a silent no-op -- confirm intent.
df_ensembl_uniq = (
    df_ensembl
    .drop_duplicates(
        subset=[
            'Gene stable ID',
            'GOSlim GOA Accession(s)',
            'GOSlim GOA Description',
        ],
        keep='first',
    )
    .rename(columns={
        'start2 (bp)': 'Gene start (bp)',
        'end2 (bp)': 'Gene end (bp)',
    })
)


def _attach_goslim(scores):
    """Join a table of HN-scores onto the de-duplicated GOslim table.

    Parameters
    ----------
    scores : pandas.DataFrame
        Must provide a ``GENEID`` column (join key) and an
        ``HN-score(HN5)`` column (sort key).

    Returns
    -------
    pandas.DataFrame
        Inner join of ``df_ensembl_uniq`` with ``scores`` on
        ``Gene stable ID`` == ``GENEID``; the redundant ``GENEID`` column is
        dropped and rows are ordered by ``HN-score(HN5)`` descending.
    """
    return (
        pd.merge(
            df_ensembl_uniq,
            scores,
            left_on="Gene stable ID",
            right_on="GENEID",
            how="inner",
        )
        .drop(columns=['GENEID'])
        .sort_values(by='HN-score(HN5)', ascending=False)
    )


# 1. all genes, 2. upregulated genes only, 3. downregulated genes only
merged_df = _attach_goslim(COMBINED)
merged_df_up = _attach_goslim(df_up)
merged_df_down = _attach_goslim(df_down)

# Save data.
# annotation_dir already contains the "../data/" prefix, so join file names
# onto it directly instead of prefixing "../data/" a second time; this writes
# into exactly the directory created by os.makedirs(annotation_dir).
merged_df.to_csv(
    os.path.join(annotation_dir, f"{organism}_annotation.tsv"), sep="\t", index=False
)
merged_df_up.to_csv(
    os.path.join(annotation_dir, f"{organism}_annotation_up.tsv"), sep="\t", index=False
)
merged_df_down.to_csv(
    os.path.join(annotation_dir, f"{organism}_annotation_down.tsv"), sep="\t", index=False
)

display(df_ensembl_uniq, merged_df, merged_df_up, merged_df_down)