In [13]:
import pandas as pd
import math

In [14]:
# Human HN-score table.
# Source: https://doi.org/10.6084/m9.figshare.23944935.v2
# The file is comma-separated, which is read_csv's default delimiter.
df = pd.read_csv("./HN-score_human_all.csv")
df

Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,HSPA6,247,2,73,322,245
1,HSPA1A,240,1,81,322,239
2,HSPA1B,233,2,87,322,231
3,DNAJB1,212,2,108,322,210
4,BAG3,182,1,139,322,181
...,...,...,...,...,...,...
19700,N4BP1,3,44,275,322,-41
19701,ZNF784,8,50,264,322,-42
19702,TSSK3,28,71,223,322,-43
19703,HRCT1,7,59,256,322,-52


In [15]:
# Keep only the gene symbol and its HN5 score; the other columns are not
# used by the cells shown in this notebook.
df_filtered = df.loc[:, ["GeneName", "HN5"]]
df_filtered

Unnamed: 0,GeneName,HN5
0,HSPA6,245
1,HSPA1A,239
2,HSPA1B,231
3,DNAJB1,210
4,BAG3,181
...,...,...
19700,N4BP1,-41
19701,ZNF784,-42
19702,TSSK3,-43
19703,HRCT1,-52


In [16]:
# Number of rows in the filtered table; the 1% cut-off size is derived from it.
total_lines = df_filtered.shape[0]
print(f"total_lines: {total_lines}")

total_lines: 19705


In [17]:
# Number of genes that make up 1% of the table.
# Python's round() rounds exact halves to the nearest even integer, which only
# matters when total_lines * 0.01 lands exactly on x.5.
one_percent = round(total_lines * 0.01)

# Rank explicitly instead of trusting the row order of the input CSV: head() on
# its own returns the highest scores only if the file is already sorted by HN5
# in descending order. The sort is stable, so tied scores keep their existing
# relative order and the result (rows and index labels) is unchanged when the
# file is already sorted.
# NOTE(review): genes tied with the last selected HN5 value can fall on either
# side of the cut-off; confirm that a fixed row count is the intended rule.
df_top1 = (
    df_filtered
    .sort_values("HN5", ascending=False, kind="stable")
    .head(one_percent)
)
df_top1.to_csv("./HN-score_human_top1.csv", index=False)
df_top1

Unnamed: 0,GeneName,HN5
0,HSPA6,245
1,HSPA1A,239
2,HSPA1B,231
3,DNAJB1,210
4,BAG3,181
...,...,...
192,HEY1,51
193,TMEM216,51
194,FAM83C,51
195,FBXW10,51


In [18]:
# Lowest-scoring genes, same row count as the top set (one_percent).
# Rank explicitly instead of trusting the row order of the input CSV: tail() on
# its own returns the lowest scores only if the file is already sorted by HN5
# in descending order. The sort is stable, so tied scores keep their existing
# relative order and the result (rows and index labels) is unchanged when the
# file is already sorted.
# NOTE(review): genes tied with the first selected HN5 value can fall on either
# side of the cut-off; confirm that a fixed row count is the intended rule.
df_down1 = (
    df_filtered
    .sort_values("HN5", ascending=False, kind="stable")
    .tail(one_percent)
)
df_down1.to_csv("./HN-score_human_down1.csv", index=False)
df_down1

Unnamed: 0,GeneName,HN5
19508,KANK1,-14
19509,PYM1,-14
19510,SNN,-14
19511,HMGXB4,-14
19512,NAGS,-14
...,...,...
19700,N4BP1,-41
19701,ZNF784,-42
19702,TSSK3,-43
19703,HRCT1,-52


#### Convert the gene names to Ensembl gene IDs

In [10]:
# g:Profiler conversion result for the UP 1% genes (gene symbol -> Ensembl gene ID).
df2 = pd.read_csv("./gProfiler_hsapiens_2023-10-7 21-55-23.csv")

# Keep the queried symbol (initial_alias), the returned Ensembl gene ID
# (converted_alias) and the gene name reported by g:Profiler (name).
convert_ID = df2.loc[:, ["initial_alias", "converted_alias", "name"]]
convert_ID

Unnamed: 0,initial_alias,converted_alias,name
0,HSPA6,ENSG00000173110,HSPA6
1,HSPA1A,ENSG00000204389,HSPA1A
2,HSPA1B,ENSG00000204388,HSPA1B
3,DNAJB1,ENSG00000132002,DNAJB1
4,BAG3,ENSG00000151929,BAG3
...,...,...,...
192,HEY1,ENSG00000164683,HEY1
193,TMEM216,ENSG00000187049,TMEM216
194,FAM83C,ENSG00000125998,FAM83C
195,FBXW10,ENSG00000171931,FBXW10


In [12]:
# Attach the Ensembl gene ID to every UP 1% gene.
# how="left" (rather than "outer") keeps exactly the rows of df_top1 in their
# HN5 ranking order: an outer join would also add rows for any alias that is
# not in df_top1, and recent pandas releases sort outer-join keys
# lexicographically, which would lose the ranking order.
# Genes without a match still appear, with missing values in the ID columns.
merged_df_top1 = pd.merge(
    df_top1,
    convert_ID,
    left_on="GeneName",
    right_on="initial_alias",
    how="left",
)
# Show the merged table, as the corresponding DOWN 1% cell does.
merged_df_top1

Unnamed: 0,GeneName,HN5,initial_alias,converted_alias,name
0,HSPA6,245,HSPA6,ENSG00000173110,HSPA6
1,HSPA1A,239,HSPA1A,ENSG00000204389,HSPA1A
2,HSPA1B,231,HSPA1B,ENSG00000204388,HSPA1B
3,DNAJB1,210,DNAJB1,ENSG00000132002,DNAJB1
4,BAG3,181,BAG3,ENSG00000151929,BAG3
...,...,...,...,...,...
192,HEY1,51,HEY1,ENSG00000164683,HEY1
193,TMEM216,51,TMEM216,ENSG00000187049,TMEM216
194,FAM83C,51,FAM83C,ENSG00000125998,FAM83C
195,FBXW10,51,FBXW10,ENSG00000171931,FBXW10


In [23]:
# UP 1% table reduced to Ensembl gene ID, HN5 score and gene name.
# NOTE: merged_df_top1 is reassigned further down (merge with the Ensembl
# position table), so this cell must run before that one.
UP1 = merged_df_top1.loc[:, ["converted_alias", "HN5", "name"]]
UP1

Unnamed: 0,converted_alias,HN5,name
0,ENSG00000173110,245,HSPA6
1,ENSG00000204389,239,HSPA1A
2,ENSG00000204388,231,HSPA1B
3,ENSG00000132002,210,DNAJB1
4,ENSG00000151929,181,BAG3
...,...,...,...
192,ENSG00000164683,51,HEY1
193,ENSG00000187049,51,TMEM216
194,ENSG00000125998,51,FAM83C
195,ENSG00000171931,51,FBXW10


In [19]:
# g:Profiler conversion result for the DOWN 1% genes (gene symbol -> Ensembl gene ID).
df3 = pd.read_csv("./gProfiler_hsapiens_2023-10-7 22-33-54.csv")

# Keep the queried symbol (initial_alias), the returned Ensembl gene ID
# (converted_alias) and the gene name reported by g:Profiler (name).
convert_ID_down = df3.loc[:, ["initial_alias", "converted_alias", "name"]]
convert_ID_down

Unnamed: 0,initial_alias,converted_alias,name
0,KANK1,ENSG00000107104,KANK1
1,PYM1,ENSG00000170473,PYM1
2,SNN,ENSG00000184602,SNN
3,HMGXB4,ENSG00000100281,HMGXB4
4,NAGS,ENSG00000161653,NAGS
...,...,...,...
192,N4BP1,ENSG00000102921,N4BP1
193,ZNF784,ENSG00000179922,ZNF784
194,TSSK3,ENSG00000162526,TSSK3
195,HRCT1,ENSG00000196196,HRCT1


In [20]:
# Attach the Ensembl gene ID to every DOWN 1% gene.
# how="left" (rather than "outer") keeps exactly the rows of df_down1 in their
# HN5 ranking order: an outer join would also add rows for any alias that is
# not in df_down1, and recent pandas releases sort outer-join keys
# lexicographically, which would lose the ranking order.
# Genes without a match still appear, with missing values in the ID columns.
merged_df_down1 = pd.merge(
    df_down1,
    convert_ID_down,
    left_on="GeneName",
    right_on="initial_alias",
    how="left",
)
merged_df_down1

Unnamed: 0,GeneName,HN5,initial_alias,converted_alias,name
0,KANK1,-14,KANK1,ENSG00000107104,KANK1
1,PYM1,-14,PYM1,ENSG00000170473,PYM1
2,SNN,-14,SNN,ENSG00000184602,SNN
3,HMGXB4,-14,HMGXB4,ENSG00000100281,HMGXB4
4,NAGS,-14,NAGS,ENSG00000161653,NAGS
...,...,...,...,...,...
192,N4BP1,-41,N4BP1,ENSG00000102921,N4BP1
193,ZNF784,-42,ZNF784,ENSG00000179922,ZNF784
194,TSSK3,-43,TSSK3,ENSG00000162526,TSSK3
195,HRCT1,-52,HRCT1,ENSG00000196196,HRCT1


In [24]:
# DOWN 1% table reduced to Ensembl gene ID, HN5 score and gene name.
# NOTE: merged_df_down1 is reassigned further down (merge with the Ensembl
# position table), so this cell must run before that one.
DOWN1 = merged_df_down1.loc[:, ["converted_alias", "HN5", "name"]]
DOWN1

Unnamed: 0,converted_alias,HN5,name
0,ENSG00000107104,-14,KANK1
1,ENSG00000170473,-14,PYM1
2,ENSG00000184602,-14,SNN
3,ENSG00000100281,-14,HMGXB4
4,ENSG00000161653,-14,NAGS
...,...,...,...
192,ENSG00000102921,-41,N4BP1
193,ENSG00000179922,-42,ZNF784
194,ENSG00000162526,-43,TSSK3
195,ENSG00000196196,-52,HRCT1


In [25]:
# Stack the UP and DOWN tables (UP rows first); ignore_index=True renumbers
# the rows of the result from 0.
combined = pd.concat([UP1, DOWN1], axis=0, ignore_index=True)
combined

Unnamed: 0,converted_alias,HN5,name
0,ENSG00000173110,245,HSPA6
1,ENSG00000204389,239,HSPA1A
2,ENSG00000204388,231,HSPA1B
3,ENSG00000132002,210,DNAJB1
4,ENSG00000151929,181,BAG3
...,...,...,...
389,ENSG00000102921,-41,N4BP1
390,ENSG00000179922,-42,ZNF784
391,ENSG00000162526,-43,TSSK3
392,ENSG00000196196,-52,HRCT1


#### Merge the Ensembl BioMart data

In [21]:
# Gene position table exported from Ensembl BioMart (tab-separated): gene ID,
# chromosome/scaffold, start, end and strand.
# "R110" in the file name presumably refers to the Ensembl release - TODO confirm.
# read_table uses a tab as its default separator.
df_ensembl = pd.read_table("./biomart_human_position_R110.tsv")
df_ensembl

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand
0,ENSG00000210049,MT,577,647,1
1,ENSG00000211459,MT,648,1601,1
2,ENSG00000210077,MT,1602,1670,1
3,ENSG00000210082,MT,1671,3229,1
4,ENSG00000209082,MT,3230,3304,1
...,...,...,...,...,...
70111,ENSG00000236500,1,15614643,15614867,-1
70112,ENSG00000197312,1,15617458,15669044,1
70113,ENSG00000215695,1,15659713,15662033,1
70114,ENSG00000271742,1,15682873,15683128,-1


In [31]:
# 1. All selected genes (UP + DOWN) with their genomic position.
# Inner join: only genes whose Ensembl ID occurs in both tables are kept.
merged_df = df_ensembl.merge(
    combined,
    how="inner",
    left_on="Gene stable ID",
    right_on="converted_alias",
)

# index=False is not passed, so the row index is written as an unnamed first column.
merged_df.to_csv("./human_position_all.tsv", sep="\t")
merged_df

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,converted_alias,HN5,name
0,ENSG00000132952,13,30617693,30660770,1,ENSG00000132952,89,USPL1
1,ENSG00000198743,21,34073578,34106260,1,ENSG00000198743,58,SLC5A3
2,ENSG00000174015,13,45702320,45714559,1,ENSG00000174015,-16,CBY2
3,ENSG00000101463,20,24469629,24666616,1,ENSG00000101463,-14,SYNDIG1
4,ENSG00000215547,20,31257664,31259632,1,ENSG00000215547,83,DEFB115
...,...,...,...,...,...,...,...,...
389,ENSG00000134690,1,37692481,37709719,1,ENSG00000134690,-19,CDCA8
390,ENSG00000116161,1,174999163,175012027,1,ENSG00000116161,52,CACYBP
391,ENSG00000162772,1,212565334,212620777,1,ENSG00000162772,88,ATF3
392,ENSG00000169174,1,55039447,55064852,1,ENSG00000169174,-17,PCSK9


In [32]:
# 2. UP 1% genes with their genomic position.
# Inner join: only genes whose Ensembl ID occurs in both tables are kept.
# NOTE: this reuses the name merged_df_top1, replacing the gene-name/ID merge
# result that UP1 was built from.
merged_df_top1 = df_ensembl.merge(
    UP1,
    how="inner",
    left_on="Gene stable ID",
    right_on="converted_alias",
)

# index=False is not passed, so the row index is written as an unnamed first column.
merged_df_top1.to_csv("./human_position_up.tsv", sep="\t")
merged_df_top1

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,converted_alias,HN5,name
0,ENSG00000132952,13,30617693,30660770,1,ENSG00000132952,89,USPL1
1,ENSG00000198743,21,34073578,34106260,1,ENSG00000198743,58,SLC5A3
2,ENSG00000215547,20,31257664,31259632,1,ENSG00000215547,83,DEFB115
3,ENSG00000139597,13,32400723,32428311,-1,ENSG00000139597,58,N4BP2L1
4,ENSG00000122035,13,27270830,27275192,1,ENSG00000122035,58,RASL11A
...,...,...,...,...,...,...,...,...
192,ENSG00000116741,1,192809039,192812275,1,ENSG00000116741,122,RGS2
193,ENSG00000162616,1,77979175,78017964,1,ENSG00000162616,149,DNAJB4
194,ENSG00000177606,1,58776845,58784048,-1,ENSG00000177606,63,JUN
195,ENSG00000116161,1,174999163,175012027,1,ENSG00000116161,52,CACYBP


In [33]:
# 3. DOWN 1% genes with their genomic position.
# Inner join: only genes whose Ensembl ID occurs in both tables are kept.
# NOTE: this reuses the name merged_df_down1, replacing the gene-name/ID merge
# result that DOWN1 was built from.
merged_df_down1 = df_ensembl.merge(
    DOWN1,
    how="inner",
    left_on="Gene stable ID",
    right_on="converted_alias",
)

# index=False is not passed, so the row index is written as an unnamed first column.
merged_df_down1.to_csv("./human_position_down.tsv", sep="\t")
merged_df_down1

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,converted_alias,HN5,name
0,ENSG00000174015,13,45702320,45714559,1,ENSG00000174015,-16,CBY2
1,ENSG00000101463,20,24469629,24666616,1,ENSG00000101463,-14,SYNDIG1
2,ENSG00000241945,21,44107373,44131181,1,ENSG00000241945,-26,PWP2
3,ENSG00000101452,20,38962299,39039723,1,ENSG00000101452,-21,DHX35
4,ENSG00000132825,20,59936663,59940305,-1,ENSG00000132825,-16,PPP1R3D
...,...,...,...,...,...,...,...,...
192,ENSG00000181773,1,27392622,27395814,1,ENSG00000181773,-21,GPR3
193,ENSG00000117152,1,163068775,163076802,1,ENSG00000117152,-19,RGS4
194,ENSG00000134690,1,37692481,37709719,1,ENSG00000134690,-19,CDCA8
195,ENSG00000169174,1,55039447,55064852,1,ENSG00000169174,-17,PCSK9
