In [1]:
import pandas as pd

In [2]:
# Gene position table exported from Ensembl BioMart
# (Version: Ensembl Plants Genes 56), stored as a tab-separated file.
df_ensembl = pd.read_table("../data/biomart_goslim/biomart_rice_position_R56.tsv")

# Rice HN-score table, also tab-separated.
df_score = pd.read_table("../data/HN-score_rice/HN-score_rice_231005_HN5_all.tsv")

In [3]:
# Outer-join the Ensembl position table with the HN-score table, matching
# "Gene stable ID" (Ensembl side) against "GENEID" (score side). Rows present
# on only one side are kept, with NaN in the columns coming from the other.
df_merge = df_ensembl.merge(
    df_score,
    how="outer",
    left_on="Gene stable ID",
    right_on="GENEID",
)
df_merge

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,GENEID,HN-score(HN5)
0,ENSRNA049474694,11,12120216,12120334,1,,
1,ENSRNA049474700,11,12162560,12162678,1,,
2,ENSRNA049474705,11,12102085,12102203,1,,
3,ENSRNA049474711,11,12092425,12092544,1,,
4,ENSRNA049474719,11,12089726,12089844,1,,
...,...,...,...,...,...,...,...
38973,gene-rps2-2,Mt,184187,185638,-1,gene-rps2-2,18.0
38974,gene-nad7,Mt,90731,90873,-1,gene-nad7,10.0
38975,gene-orf284,Mt,337520,338374,1,gene-orf284,25.0
38976,gene-mat-r,Mt,315812,317848,-1,gene-mat-r,7.0


In [4]:
# Keep only the rows that carry an HN-score: any row whose "HN-score(HN5)"
# is NaN (e.g. an Ensembl entry with no counterpart in the score table) is
# discarded. The result is re-bound to the same name.
df_merge = df_merge.dropna(subset=["HN-score(HN5)"])
df_merge

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,GENEID,HN-score(HN5)
621,Os10g0529700,10,20597074,20598092,1,Os10g0529700,-15.0
622,Os07g0668251,7,28215244,28221994,1,Os07g0668251,-2.0
623,Os09g0489500,9,18868479,18877573,-1,Os09g0489500,-4.0
624,Os09g0502200,9,19433960,19437709,-1,Os09g0502200,-7.0
625,Os11g0584100,11,22068104,22070310,1,Os11g0584100,1.0
...,...,...,...,...,...,...,...
38973,gene-rps2-2,Mt,184187,185638,-1,gene-rps2-2,18.0
38974,gene-nad7,Mt,90731,90873,-1,gene-nad7,10.0
38975,gene-orf284,Mt,337520,338374,1,gene-orf284,25.0
38976,gene-mat-r,Mt,315812,317848,-1,gene-mat-r,7.0


In [5]:
# "GENEID" is the right-hand merge key and is expected to duplicate
# "Gene stable ID". Verify that before discarding it, so that a mismatch
# (for example a scored gene without an Ensembl entry, whose "Gene stable ID"
# is NaN and therefore compares unequal) is reported instead of silently
# losing the only identifier of that row.
# The membership guard keeps the cell safe to re-run: once the column has been
# removed, an unguarded drop would raise KeyError.
if "GENEID" in df_merge.columns:
    id_mismatch = df_merge["Gene stable ID"] != df_merge["GENEID"]
    assert not id_mismatch.any(), (
        f"{int(id_mismatch.sum())} rows have 'Gene stable ID' != 'GENEID'"
    )
    df_merge = df_merge.drop(columns=["GENEID"])
df_merge

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5)
621,Os10g0529700,10,20597074,20598092,1,-15.0
622,Os07g0668251,7,28215244,28221994,1,-2.0
623,Os09g0489500,9,18868479,18877573,-1,-4.0
624,Os09g0502200,9,19433960,19437709,-1,-7.0
625,Os11g0584100,11,22068104,22070310,1,1.0
...,...,...,...,...,...,...
38973,gene-rps2-2,Mt,184187,185638,-1,18.0
38974,gene-nad7,Mt,90731,90873,-1,10.0
38975,gene-orf284,Mt,337520,338374,1,25.0
38976,gene-mat-r,Mt,315812,317848,-1,7.0


In [7]:
# Tab-separated HN5 "up" gene list; the displayed output shows a GENEID and an
# HN5 column.
# NOTE(review): df_up is not referenced by the later cells visible here, which
# read "HN5_genes_up.txt" instead - confirm which source is intended.
df_up = pd.read_table("../data/HN-score_rice/HN5_genelist_rice_2311/HN5_genes_up_rice.tsv")
df_up

Unnamed: 0,GENEID,HN5
0,Os04g0107900,253
1,Os01g0136100,246
2,Os02g0259900,238
3,Os03g0245800,237
4,Os03g0277300,236
...,...,...
384,Os05g0373900,42
385,Os04g0301500,42
386,Os05g0367000,42
387,Os01g0971800,42


In [6]:
# Load the gene IDs listed in HN5_genes_up.txt, one entry per line, with
# surrounding whitespace (including the newline) stripped from each entry.
with open("HN5_genes_up.txt", "r") as gene_file:
    gene_list_up = [raw_line.strip() for raw_line in gene_file]

# Select the merged rows whose "Gene stable ID" occurs in that list
# (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isin.html),
# write them to a TSV file and display them. to_csv is called without
# index=False here, so the row index is written as the first column.
is_up_gene = df_merge["Gene stable ID"].isin(gene_list_up)
df_filtered_up = df_merge.loc[is_up_gene]
df_filtered_up.to_csv("rice_position_up.tsv", sep="\t")
df_filtered_up

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5)


In [7]:
# Load the gene IDs listed in HN5_genes_down.txt, one entry per line, with
# surrounding whitespace (including the newline) stripped from each entry.
with open("HN5_genes_down.txt", "r") as gene_file:
    gene_list_down = [raw_line.strip() for raw_line in gene_file]

# Select the merged rows whose "Gene stable ID" occurs in that list
# (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isin.html),
# write them to a TSV file and display them. to_csv is called without
# index=False here, so the row index is written as the first column.
is_down_gene = df_merge["Gene stable ID"].isin(gene_list_down)
df_filtered_down = df_merge.loc[is_down_gene]
df_filtered_down.to_csv("rice_position_down.tsv", sep="\t")
df_filtered_down

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5)
658,Os10g0555700,10,21835933,21837651,-1,-42.0
659,Os10g0545500,10,21320574,21322000,1,-43.0
665,Os11g0413800,11,12595106,12596860,-1,-49.0
1192,Os10g0576600,10,22964272,22965575,-1,-52.0
1278,Os10g0508700,10,19530118,19532281,-1,-43.0
...,...,...,...,...,...,...
38609,Os01g0216000,1,6340836,6344286,1,-70.0
38614,Os04g0453200,4,22611924,22620016,1,-45.0
38672,Os09g0424300,9,15385723,15389096,-1,-95.0
38750,Os09g0472700,9,18058240,18058985,1,-64.0


In [8]:
# Stack the "up" rows on top of the "down" rows. ignore_index=True renumbers
# the result from 0 instead of carrying over the row labels of the inputs.
df_combined = pd.concat(
    objs=(df_filtered_up, df_filtered_down),
    axis=0,
    ignore_index=True,
)
df_combined

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5)
0,Os10g0419300,10,14746572,14750175,-1,53.0
1,Os12g0147200,12,2312450,2313217,1,80.0
2,Os03g0710500,3,28647086,28649570,1,74.0
3,Os03g0822400,3,34533182,34533828,1,63.0
4,Os01g0849000,1,36484020,36484818,-1,49.0
...,...,...,...,...,...,...
751,Os01g0216000,1,6340836,6344286,1,-70.0
752,Os04g0453200,4,22611924,22620016,1,-45.0
753,Os09g0424300,9,15385723,15389096,-1,-95.0
754,Os09g0472700,9,18058240,18058985,1,-64.0


In [9]:
# Save the combined up/down table as a tab-separated file; index=False leaves
# the row index out of the written file.
df_combined.to_csv(
    "rice_position_all.tsv",
    sep="\t",
    index=False,
)