In [1]:
import pandas as pd

In [2]:
# Load the complete human HN-score table and report how many rows it has.
# Data source: https://doi.org/10.6084/m9.figshare.23944935.v2
df = pd.read_csv(
    "../data/HN-score_human/HN-score_human_all.csv",
    sep=",",
)
total_lines = df.shape[0]
print(f"total_lines: {total_lines}")
df

total_lines: 19705


Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,HSPA6,247,2,73,322,245
1,HSPA1A,240,1,81,322,239
2,HSPA1B,233,2,87,322,231
3,DNAJB1,212,2,108,322,210
4,BAG3,182,1,139,322,181
...,...,...,...,...,...,...
19700,N4BP1,3,44,275,322,-41
19701,ZNF784,8,50,264,322,-42
19702,TSSK3,28,71,223,322,-43
19703,HRCT1,7,59,256,322,-52


In [3]:
# Load the list of upregulated genes (tab-separated, same columns as the
# full HN-score table).
df_up = pd.read_csv(
    "../data/HN-score_human/HN5_genelist_human_2311/HN5_genes_up_human.tsv",
    sep="\t",
)
df_up

Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,HSPA6,247,2,73,322,245
1,HSPA1A,240,1,81,322,239
2,HSPA1B,233,2,87,322,231
3,DNAJB1,212,2,108,322,210
4,BAG3,182,1,139,322,181
...,...,...,...,...,...,...
195,FBXW10,52,1,269,322,51
196,SPOCK2,52,1,269,322,51
197,SBSN,53,2,267,322,51
198,PNMA8A,53,2,267,322,51


In [4]:
# Load the list of downregulated genes (tab-separated, same columns as the
# full HN-score table).
df_down = pd.read_csv(
    "../data/HN-score_human/HN5_genelist_human_2311/HN5_genes_down_human.tsv",
    sep="\t",
)
df_down

Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,E2F8,5,19,298,322,-14
1,TRIB3,5,19,298,322,-14
2,TSNARE1,11,25,286,322,-14
3,CLCF1,4,18,300,322,-14
4,PRAMEF17,2,16,304,322,-14
...,...,...,...,...,...,...
199,N4BP1,3,44,275,322,-41
200,ZNF784,8,50,264,322,-42
201,TSSK3,28,71,223,322,-43
202,HRCT1,7,59,256,322,-52


#### Convert the gene names to Ensembl gene IDs

In [5]:
# Load the g:Profiler ID-conversion export and keep only the columns needed
# to map a gene name (initial_alias) to its Ensembl gene ID (converted_alias).
# This table is used for both the upregulated and the downregulated lists.
df2 = pd.read_csv(
    "../data/gprofiler/gProfiler_hsapiens_2023-11-5 14-34-25.csv",
    sep=",",
)
df2_ID = df2[["initial_alias", "converted_alias", "name"]]
df2_ID

Unnamed: 0,initial_alias,converted_alias,name
0,ACCS,ENSG00000110455,ACCS
1,ACTRT3,ENSG00000184378,ACTRT3
2,ADCY10,ENSG00000143199,ADCY10
3,ADGRF1,ENSG00000153292,ADGRF1
4,ADORA2A,ENSG00000128271,ADORA2A
...,...,...,...
399,ZNF624,ENSG00000197566,ZNF624
400,ZNF627,ENSG00000198551,ZNF627
401,ZNF775,ENSG00000196456,ZNF775
402,ZNF784,ENSG00000179922,ZNF784


In [6]:
# Sanity check: show every row whose initial_alias occurs more than once.
# An empty result means no input gene name appears in more than one row.
duplicates = df2_ID.loc[df2_ID.duplicated(subset="initial_alias", keep=False)]
duplicates

Unnamed: 0,initial_alias,converted_alias,name


In [13]:
# Attach Ensembl gene IDs to the upregulated genes (inner join on gene name).
merged_df_up = df_up.merge(
    df2_ID,
    how="inner",
    left_on="GeneName",
    right_on="initial_alias",
)

# Print any upregulated gene that the inner join dropped, i.e. a gene name
# with no matching initial_alias in the conversion table.
gene_names_up = set(df_up["GeneName"])
gene_names_merged = set(merged_df_up["GeneName"])
missing_gene_names = gene_names_up - gene_names_merged
missing_rows = df_up.loc[df_up["GeneName"].isin(missing_gene_names)]
print(missing_rows)

# Keep only the Ensembl ID, the HN5 score and the gene name.
UP = merged_df_up[["converted_alias", "HN5", "name"]]
UP

Empty DataFrame
Columns: [GeneName, up5, dn5, unchange5, all, HN5]
Index: []


Unnamed: 0,converted_alias,HN5,name
0,ENSG00000173110,245,HSPA6
1,ENSG00000204389,239,HSPA1A
2,ENSG00000204388,231,HSPA1B
3,ENSG00000132002,210,DNAJB1
4,ENSG00000151929,181,BAG3
...,...,...,...
195,ENSG00000171931,51,FBXW10
196,ENSG00000107742,51,SPOCK2
197,ENSG00000189001,51,SBSN
198,ENSG00000182013,51,PNMA8A


In [15]:
# Print any downregulated gene that has no entry in the ID-conversion table
# and would therefore be dropped by an inner join with df2_ID.
#
# The comparison is made against df2_ID["initial_alias"] directly, so this
# cell only needs df_down and df2_ID and does not depend on a merged frame
# built in another cell. Names specific to the downregulated list are used so
# the sets computed for the upregulated list are not overwritten.
gene_names_down = set(df_down["GeneName"])
gene_names_converted = set(df2_ID["initial_alias"])
missing_gene_names = gene_names_down - gene_names_converted
missing_rows = df_down[df_down["GeneName"].isin(missing_gene_names)]
print(missing_rows)

Empty DataFrame
Columns: [GeneName, up5, dn5, unchange5, all, HN5]
Index: []


In [14]:
# Attach Ensembl gene IDs to the downregulated genes (inner join on gene name).
merged_df_down = df_down.merge(
    df2_ID,
    how="inner",
    left_on="GeneName",
    right_on="initial_alias",
)

# Keep only the Ensembl ID, the HN5 score and the gene name.
DOWN = merged_df_down[["converted_alias", "HN5", "name"]]
DOWN

Unnamed: 0,converted_alias,HN5,name
0,ENSG00000129173,-14,E2F8
1,ENSG00000101255,-14,TRIB3
2,ENSG00000171045,-14,TSNARE1
3,ENSG00000175505,-14,CLCF1
4,ENSG00000204479,-14,PRAMEF17
...,...,...,...
199,ENSG00000102921,-41,N4BP1
200,ENSG00000179922,-42,ZNF784
201,ENSG00000162526,-43,TSSK3
202,ENSG00000196196,-52,HRCT1


In [16]:
# Stack the UP and DOWN tables into a single frame; ignore_index=True
# discards the original row labels and renumbers the rows from 0.
combined = pd.concat([UP, DOWN], ignore_index=True)
combined

Unnamed: 0,converted_alias,HN5,name
0,ENSG00000173110,245,HSPA6
1,ENSG00000204389,239,HSPA1A
2,ENSG00000204388,231,HSPA1B
3,ENSG00000132002,210,DNAJB1
4,ENSG00000151929,181,BAG3
...,...,...,...
399,ENSG00000102921,-41,N4BP1
400,ENSG00000179922,-42,ZNF784
401,ENSG00000162526,-43,TSSK3
402,ENSG00000196196,-52,HRCT1


#### Merge the Ensembl BioMart data

In [17]:
# Load the Ensembl BioMart export: one row per (gene, GO-slim term) with the
# gene's chromosome/scaffold and start/end coordinates.
#
# The chromosome/scaffold column contains numeric names as well as labels
# such as X and Y, so it is read explicitly as text. Without this, pandas
# infers the type chunk by chunk and can leave a mix of ints and strings in
# the same column.
# NOTE(review): the saved output shows a warning raised on this read,
# presumably a DtypeWarning about mixed types -- confirm it disappears.
df_ensembl = pd.read_csv(
    "../data/biomart_goslim/biomart_human_goslim_R110.tsv",
    sep="\t",
    dtype={"Chromosome/scaffold name": str},
)
df_ensembl

  df_ensembl = pd.read_csv("../data/biomart_goslim/biomart_human_goslim_R110.tsv", sep="\t")


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),GOSlim GOA Accession(s),GOSlim GOA Description
0,ENSG00000243485,1,29554,31109,GO:0031047,gene silencing by RNA
1,ENSG00000284332,1,30366,30503,GO:0031047,gene silencing by RNA
2,ENSG00000186092,1,65419,71585,GO:0023052,signaling
3,ENSG00000186092,1,65419,71585,GO:0060089,molecular transducer activity
4,ENSG00000186092,1,65419,71585,GO:0005886,plasma membrane
...,...,...,...,...,...,...
581396,ENSG00000292372,Y,57207346,57212230,GO:0007010,cytoskeleton organization
581397,ENSG00000292372,Y,57207346,57212230,GO:0008092,cytoskeletal protein binding
581398,ENSG00000292372,Y,57207346,57212230,GO:0031410,cytoplasmic vesicle
581399,ENSG00000292372,Y,57207346,57212230,GO:0043226,organelle


In [18]:
# 1. Annotate every gene in the combined (UP + DOWN) table with its BioMart
#    rows (inner join on the Ensembl gene ID) and save the result as TSV.
merged_df = df_ensembl.merge(
    combined,
    how="inner",
    left_on="Gene stable ID",
    right_on="converted_alias",
)

merged_df.to_csv("../data/human_annotation/human_position.tsv", sep="\t")
merged_df

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,converted_alias,HN5,name
0,ENSG00000204479,1,13389632,13392629,GO:0012501,programmed cell death,ENSG00000204479,-14,PRAMEF17
1,ENSG00000204479,1,13389632,13392629,GO:0006351,DNA-templated transcription,ENSG00000204479,-14,PRAMEF17
2,ENSG00000204479,1,13389632,13392629,GO:0006355,regulation of DNA-templated transcription,ENSG00000204479,-14,PRAMEF17
3,ENSG00000204479,1,13389632,13392629,GO:0030154,cell differentiation,ENSG00000204479,-14,PRAMEF17
4,ENSG00000215695,1,15659713,15662033,GO:0043226,organelle,ENSG00000215695,-26,RSC1A1
...,...,...,...,...,...,...,...,...,...
8619,ENSG00000130822,X,153669733,153689010,GO:0140096,"catalytic activity, acting on a protein",ENSG00000130822,64,PNCK
8620,ENSG00000130822,X,153669733,153689010,GO:0036211,protein modification process,ENSG00000130822,64,PNCK
8621,ENSG00000130822,X,153669733,153689010,GO:0003824,catalytic activity,ENSG00000130822,64,PNCK
8622,ENSG00000130822,X,153669733,153689010,GO:0016740,transferase activity,ENSG00000130822,64,PNCK


In [19]:
# 2. Annotate the upregulated genes with their BioMart rows (inner join on
#    the Ensembl gene ID) and save the result as TSV.
merged_df_top = pd.merge(
    df_ensembl,
    UP,
    left_on="Gene stable ID",
    right_on="converted_alias",
    how="inner",
)

# The output name carries the .tsv extension, matching the sibling outputs
# human_position.tsv and human_position_down.tsv.
# NOTE(review): update any reader of the old extension-less path
# "human_position_up" accordingly.
merged_df_top.to_csv("../data/human_annotation/human_position_up.tsv", sep="\t")
merged_df_top

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,converted_alias,HN5,name
0,ENSG00000177606,1,58776845,58784048,GO:0043226,organelle,ENSG00000177606,63,JUN
1,ENSG00000177606,1,58776845,58784048,GO:0005634,nucleus,ENSG00000177606,63,JUN
2,ENSG00000177606,1,58776845,58784048,GO:0003723,RNA binding,ENSG00000177606,63,JUN
3,ENSG00000177606,1,58776845,58784048,GO:0005654,nucleoplasm,ENSG00000177606,63,JUN
4,ENSG00000177606,1,58776845,58784048,GO:0006351,DNA-templated transcription,ENSG00000177606,63,JUN
...,...,...,...,...,...,...,...,...,...
4843,ENSG00000130822,X,153669733,153689010,GO:0140096,"catalytic activity, acting on a protein",ENSG00000130822,64,PNCK
4844,ENSG00000130822,X,153669733,153689010,GO:0036211,protein modification process,ENSG00000130822,64,PNCK
4845,ENSG00000130822,X,153669733,153689010,GO:0003824,catalytic activity,ENSG00000130822,64,PNCK
4846,ENSG00000130822,X,153669733,153689010,GO:0016740,transferase activity,ENSG00000130822,64,PNCK


In [20]:
# 3. Annotate the downregulated genes with their BioMart rows (inner join on
#    the Ensembl gene ID) and save the result as TSV.
# Note: this rebinds merged_df_down, which previously held the
# gene-name-to-Ensembl-ID merge of df_down.
merged_df_down = df_ensembl.merge(
    DOWN,
    how="inner",
    left_on="Gene stable ID",
    right_on="converted_alias",
)

merged_df_down.to_csv("../data/human_annotation/human_position_down.tsv", sep="\t")
merged_df_down

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,start2 (bp),end2 (bp),GOSlim GOA Accession(s),GOSlim GOA Description,converted_alias,HN5,name
0,ENSG00000204479,1,13389632,13392629,GO:0012501,programmed cell death,ENSG00000204479,-14,PRAMEF17
1,ENSG00000204479,1,13389632,13392629,GO:0006351,DNA-templated transcription,ENSG00000204479,-14,PRAMEF17
2,ENSG00000204479,1,13389632,13392629,GO:0006355,regulation of DNA-templated transcription,ENSG00000204479,-14,PRAMEF17
3,ENSG00000204479,1,13389632,13392629,GO:0030154,cell differentiation,ENSG00000204479,-14,PRAMEF17
4,ENSG00000215695,1,15659713,15662033,GO:0043226,organelle,ENSG00000215695,-26,RSC1A1
...,...,...,...,...,...,...,...,...,...
3771,ENSG00000183479,X,153444473,153470587,GO:0005634,nucleus,ENSG00000183479,-26,TREX2
3772,ENSG00000183479,X,153444473,153470587,GO:0006281,DNA repair,ENSG00000183479,-26,TREX2
3773,ENSG00000183479,X,153444473,153470587,GO:0060090,molecular adaptor activity,ENSG00000183479,-26,TREX2
3774,ENSG00000183479,X,153444473,153470587,GO:0003677,DNA binding,ENSG00000183479,-26,TREX2
