In [1]:
import pandas as pd
import numpy as np
import datetime
import os

now = datetime.datetime.now()
organism = 'rice'
directory = f'../Data/07_extract_gene/HN5_genelist_{organism}_{now.strftime("%y%m")}'
os.makedirs(directory, exist_ok=True)

# RICE

In [2]:
HNscore = pd.read_csv(
    "../Data/06_HNscore/HN-score_rice_250524/HN-score_rice_250524_HN5_all.tsv", 
    sep='\t'
)
display(HNscore)

Unnamed: 0,GENEID,UP5,DOWN5,UNCHANGED5,HN5,UPcount+1 / DOWNcount+1 5
0,Os04g0107900,263,8,89,255,29.333333
1,Os01g0136100,253,8,99,245,28.222222
2,Os02g0259900,246,8,106,238,27.444444
3,Os02g0259850,245,8,107,237,27.333333
4,Os03g0245800,248,11,101,237,20.750000
...,...,...,...,...,...,...
35686,Os01g0952800,7,180,173,-173,0.044199
35687,Os07g0142100,11,187,162,-176,0.063830
35688,Os03g0307200,4,184,172,-180,0.027027
35689,Os07g0142200,9,195,156,-186,0.051020


In [3]:
# Calculate the number of genes in the top 1% and bottom 1%
top_1_percent_count = int(len(HNscore) * 0.01)
bottom_1_percent_count = top_1_percent_count

# retrieve the threshold values for the top 1% and bottom 1%
top_1_percent_threshold = HNscore["HN5"].nlargest(top_1_percent_count).min()
bottom_1_percent_threshold = HNscore["HN5"].nsmallest(bottom_1_percent_count).max()

# retrieve the top 1% of the data (including additional rows with the same HN-score)
top_genes = HNscore[HNscore["HN5"] >= top_1_percent_threshold]
# retrieve the bottom 1% of the data (including additional rows with the same HN-score)
bottom_genes = HNscore[HNscore["HN5"] <= bottom_1_percent_threshold]

# print the results
print(f"Top 1% threshold: {top_1_percent_threshold}")
print(f"Bottom 1% threshold: {bottom_1_percent_threshold}")
print(f"Number of genes in the top 1%: {len(top_genes)}")
print(f"Number of genes in the bottom 1%: {len(bottom_genes)}")

# save the results to a file
top_genes.to_csv(f'{directory}/HN5_genes_up_{organism}.tsv', sep='\t', index=False)
bottom_genes.to_csv(f'{directory}/HN5_genes_down_{organism}.tsv', sep='\t', index=False)

Top 1% threshold: 42
Bottom 1% threshold: -40
Number of genes in the top 1%: 367
Number of genes in the bottom 1%: 370
