In [1]:
import pandas as pd
import os

In [2]:
# Run settings: the organism to process and the folder that receives the annotation tables.
organism = 'human'
annotation_dir = f'../data/{organism}_annotation'

# Create the output folder now; exist_ok=True makes re-running this cell harmless.
os.makedirs(name=annotation_dir, exist_ok=True)

In [3]:
# HN-score tables for the selected organism
# Data source: https://doi.org/10.6084/m9.figshare.23944935.v2

# Full HN-score table (the "_all" file).
df = pd.read_csv(f"../data/HN-score_{organism}/HN-score_{organism}_all.csv", sep=",")

# Upregulated genes. The gene names in this file were manually corrected to the
# most recent version; GeneName is upper-cased right after loading.
df_up = pd.read_csv(
    f"../data/HN-score_{organism}/HN5_genes_up_{organism}.tsv", sep="\t"
).assign(GeneName=lambda table: table['GeneName'].str.upper())

# Downregulated genes, with GeneName upper-cased the same way.
df_down = pd.read_csv(
    f"../data/HN-score_{organism}/HN5_genes_down_{organism}.tsv", sep="\t"
).assign(GeneName=lambda table: table['GeneName'].str.upper())

display(df, df_up, df_down)

Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,HSPA6,247,2,73,322,245
1,HSPA1A,240,1,81,322,239
2,HSPA1B,233,2,87,322,231
3,DNAJB1,212,2,108,322,210
4,BAG3,182,1,139,322,181
...,...,...,...,...,...,...
19700,N4BP1,3,44,275,322,-41
19701,ZNF784,8,50,264,322,-42
19702,TSSK3,28,71,223,322,-43
19703,HRCT1,7,59,256,322,-52


Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,HSPA6,247,2,73,322,245
1,HSPA1A,240,1,81,322,239
2,HSPA1B,233,2,87,322,231
3,DNAJB1,212,2,108,322,210
4,BAG3,182,1,139,322,181
...,...,...,...,...,...,...
195,FBXW10,52,1,269,322,51
196,SPOCK2,52,1,269,322,51
197,SBSN,53,2,267,322,51
198,PNMA8A,53,2,267,322,51


Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,E2F8,5,19,298,322,-14
1,TRIB3,5,19,298,322,-14
2,TSNARE1,11,25,286,322,-14
3,CLCF1,4,18,300,322,-14
4,PRAMEF17,2,16,304,322,-14
...,...,...,...,...,...,...
199,N4BP1,3,44,275,322,-41
200,ZNF784,8,50,264,322,-42
201,TSSK3,28,71,223,322,-43
202,HRCT1,7,59,256,322,-52


## Convert the gene names to Ensembl gene IDs

In [4]:
# g:Profiler conversion export (gene symbol -> Ensembl gene ID).
df2 = pd.read_csv(
    "../data/gprofiler/gProfiler_hsapiens_2023-11-13 10-00-00.csv",
    sep=",",
)

# Keep only the three columns used below: the queried symbol (initial_alias),
# the converted ID (converted_alias) and the gene name.
df2_ID = df2.loc[:, ["initial_alias", "converted_alias", "name"]]

display(df2_ID)

Unnamed: 0,initial_alias,converted_alias,name
0,HSPA6,ENSG00000173110,HSPA6
1,HSPA1A,ENSG00000204389,HSPA1A
2,HSPA1B,ENSG00000204388,HSPA1B
3,DNAJB1,ENSG00000132002,DNAJB1
4,BAG3,ENSG00000151929,BAG3
...,...,...,...
399,N4BP1,ENSG00000102921,N4BP1
400,ZNF784,ENSG00000179922,ZNF784
401,TSSK3,ENSG00000162526,TSSK3
402,HRCT1,ENSG00000196196,HRCT1


In [5]:
# Get Ensembl gene IDs from the g:Profiler table for the UP and DOWN gene sets


def _attach_ensembl_ids(hn_genes, id_table):
    """Inner-join an HN-score gene table with the g:Profiler ID table.

    Parameters
    ----------
    hn_genes : pandas.DataFrame
        Gene table with at least the columns ``GeneName`` and ``HN5``.
    id_table : pandas.DataFrame
        ID table with the columns ``initial_alias``, ``converted_alias`` and ``name``.

    Returns
    -------
    tuple
        ``(merged, missing_names, missing_rows, scored)``:
        ``merged`` is the inner join on ``GeneName == initial_alias``;
        ``missing_names`` is the set of ``GeneName`` values that did not survive the join;
        ``missing_rows`` are the rows of ``hn_genes`` carrying those names;
        ``scored`` holds ``converted_alias``, ``HN-score(HN5)`` (renamed from ``HN5``) and ``name``.
    """
    merged = pd.merge(
        hn_genes,
        id_table,
        left_on="GeneName",
        right_on="initial_alias",
        how="inner",
    )
    # An inner join silently drops unmatched genes, so collect them for inspection.
    missing_names = set(hn_genes['GeneName']) - set(merged['GeneName'])
    missing_rows = hn_genes[hn_genes['GeneName'].isin(missing_names)]
    scored = merged[["converted_alias", "HN5", "name"]].rename(columns={'HN5': 'HN-score(HN5)'})
    return merged, missing_names, missing_rows, scored


# check duplicates: an alias listed more than once in the ID table produces one
# merged row per listing, so report it instead of leaving the check unused.
duplicates = df2_ID[df2_ID.duplicated(subset=["initial_alias"], keep=False)]
if not duplicates.empty:
    print(f"WARNING: {duplicates['initial_alias'].nunique()} gene name(s) occur more than once in the g:Profiler table")
    display(duplicates)

# upregulated genes
merged_df_up, missing_gene_names, missing_rows_up, UP = _attach_ensembl_ids(df_up, df2_ID)

# downregulated genes
merged_df_down, missing_gene_names_down, missing_rows_down, DOWN = _attach_ensembl_ids(df_down, df2_ID)

# UP rows first, then DOWN rows, with a fresh 0..n-1 index.
COMBINED = pd.concat(
    [UP, DOWN],
    ignore_index=True
)

display(missing_rows_up, missing_rows_down, UP, DOWN, COMBINED)

Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5


Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5


Unnamed: 0,converted_alias,HN-score(HN5),name
0,ENSG00000173110,245,HSPA6
1,ENSG00000204389,239,HSPA1A
2,ENSG00000204388,231,HSPA1B
3,ENSG00000132002,210,DNAJB1
4,ENSG00000151929,181,BAG3
...,...,...,...
195,ENSG00000171931,51,FBXW10
196,ENSG00000107742,51,SPOCK2
197,ENSG00000189001,51,SBSN
198,ENSG00000182013,51,PNMA8A


Unnamed: 0,converted_alias,HN-score(HN5),name
0,ENSG00000129173,-14,E2F8
1,ENSG00000101255,-14,TRIB3
2,ENSG00000171045,-14,TSNARE1
3,ENSG00000175505,-14,CLCF1
4,ENSG00000204479,-14,PRAMEF17
...,...,...,...
199,ENSG00000102921,-41,N4BP1
200,ENSG00000179922,-42,ZNF784
201,ENSG00000162526,-43,TSSK3
202,ENSG00000196196,-52,HRCT1


Unnamed: 0,converted_alias,HN-score(HN5),name
0,ENSG00000173110,245,HSPA6
1,ENSG00000204389,239,HSPA1A
2,ENSG00000204388,231,HSPA1B
3,ENSG00000132002,210,DNAJB1
4,ENSG00000151929,181,BAG3
...,...,...,...
399,ENSG00000102921,-41,N4BP1
400,ENSG00000179922,-42,ZNF784
401,ENSG00000162526,-43,TSSK3
402,ENSG00000196196,-52,HRCT1


## merge the Ensembl biomart data (Chromosome position)

- For drawing the Circos plot

In [6]:
# Ensembl BioMart table with the chromosome position of each gene.
df_ensembl_position = pd.read_csv(f"../data/biomart_goslim/biomart_{organism}_position_R110.tsv", sep="\t", low_memory=False)


def _merge_position_and_save(position_table, scored_genes, out_path):
    """Attach HN-scores to the position table, sort by score and write a TSV.

    Parameters
    ----------
    position_table : pandas.DataFrame
        BioMart table containing a ``Gene stable ID`` column.
    scored_genes : pandas.DataFrame
        Table with ``converted_alias``, ``HN-score(HN5)`` and ``name``.
    out_path : str
        Destination of the tab-separated output file (written without the index).

    Returns
    -------
    pandas.DataFrame
        Inner join on ``Gene stable ID == converted_alias`` without the
        ``converted_alias`` column, sorted by ``HN-score(HN5)`` in descending order.
    """
    merged = (
        pd.merge(
            position_table,
            scored_genes,
            left_on="Gene stable ID",
            right_on="converted_alias",
            how="inner",
        )
        # converted_alias repeats "Gene stable ID" after the join.
        .drop(columns=['converted_alias'])
        .sort_values(by='HN-score(HN5)', ascending=False)
    )
    merged.to_csv(out_path, sep="\t", index=False)
    return merged


# annotation_dir already starts with "../data", so the files go straight into it
# (the directory created in the settings cell) instead of prefixing "../data" again.

# 1. merged all data
merged_df_position = _merge_position_and_save(
    df_ensembl_position, COMBINED, os.path.join(annotation_dir, f"{organism}_position_all.tsv")
)

# 2. merged up genes data
merged_df_position_up = _merge_position_and_save(
    df_ensembl_position, UP, os.path.join(annotation_dir, f"{organism}_position_up.tsv")
)

# 3. merged down genes data
merged_df_position_down = _merge_position_and_save(
    df_ensembl_position, DOWN, os.path.join(annotation_dir, f"{organism}_position_down.tsv")
)

display(df_ensembl_position, merged_df_position, merged_df_position_up, merged_df_position_down)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand
0,ENSG00000210049,MT,577,647,1
1,ENSG00000211459,MT,648,1601,1
2,ENSG00000210077,MT,1602,1670,1
3,ENSG00000210082,MT,1671,3229,1
4,ENSG00000209082,MT,3230,3304,1
...,...,...,...,...,...
70111,ENSG00000236500,1,15614643,15614867,-1
70112,ENSG00000197312,1,15617458,15669044,1
70113,ENSG00000215695,1,15659713,15662033,1
70114,ENSG00000271742,1,15682873,15683128,-1


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5),name
294,ENSG00000173110,1,161524540,161526894,1,245,HSPA6
155,ENSG00000204389,6,31815543,31817946,1,239,HSPA1A
156,ENSG00000204388,6,31827738,31830254,1,231,HSPA1B
203,ENSG00000132002,19,14514769,14560391,-1,210,DNAJB1
88,ENSG00000151929,10,119651380,119677819,1,181,BAG3
...,...,...,...,...,...,...,...
37,ENSG00000102921,16,48538726,48620148,-1,-41,N4BP1
160,ENSG00000179922,19,55620741,55624566,-1,-42,ZNF784
317,ENSG00000162526,1,32351521,32364312,1,-43,TSSK3
66,ENSG00000196196,9,35906202,35907136,1,-52,HRCT1


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5),name
143,ENSG00000173110,1,161524540,161526894,1,245,HSPA6
74,ENSG00000204389,6,31815543,31817946,1,239,HSPA1A
75,ENSG00000204388,6,31827738,31830254,1,231,HSPA1B
97,ENSG00000132002,19,14514769,14560391,-1,210,DNAJB1
38,ENSG00000151929,10,119651380,119677819,1,181,BAG3
...,...,...,...,...,...,...,...
157,ENSG00000187049,11,61392393,61398866,1,51,TMEM216
52,ENSG00000176381,6,166305300,166308448,-1,51,PRR18
51,ENSG00000171931,17,18744026,18779349,1,51,FBXW10
7,ENSG00000125998,20,35285731,35292425,-1,51,FAM83C


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5),name
164,ENSG00000170473,12,55901413,55932618,-1,-14,PYM1
19,ENSG00000184675,X,64185117,64205708,-1,-14,AMER1
124,ENSG00000161653,17,44004622,44009068,1,-14,NAGS
23,ENSG00000100281,22,35257452,35295807,1,-14,HMGXB4
126,ENSG00000132773,1,45340052,45343973,1,-14,TOE1
...,...,...,...,...,...,...,...
18,ENSG00000102921,16,48538726,48620148,-1,-41,N4BP1
82,ENSG00000179922,19,55620741,55624566,-1,-42,ZNF784
165,ENSG00000162526,1,32351521,32364312,1,-43,TSSK3
34,ENSG00000196196,9,35906202,35907136,1,-52,HRCT1


## merge the Ensembl biomart data (GOslim)

- For execution of GO enrichment analysis

In [7]:
# Ensembl BioMart table with the GOslim annotation of each gene.
df_ensembl = pd.read_csv(f"../data/biomart_goslim/biomart_{organism}_goslim_R110_domain.tsv", sep="\t", low_memory=False)

# Keep one row per (gene, GOslim accession, GOslim description) and give the
# start/end columns their final names (rename ignores keys that are not present).
df_ensembl_uniq = (
    df_ensembl
    .drop_duplicates(
        subset=['Gene stable ID', 'GOSlim GOA Accession(s)', 'GOSlim GOA Description'],
        keep='first',
    )
    .rename(columns={'start2 (bp)': 'Gene start(bp)', 'end2 (bp)': 'Gene end(bp)'})
)


def _merge_goslim_and_save(goslim_table, scored_genes, out_path):
    """Attach HN-scores to the GOslim table, sort by score and write a TSV.

    Parameters
    ----------
    goslim_table : pandas.DataFrame
        BioMart GOslim table containing a ``Gene stable ID`` column.
    scored_genes : pandas.DataFrame
        Table with ``converted_alias``, ``HN-score(HN5)`` and ``name``.
    out_path : str
        Destination of the tab-separated output file (written without the index).

    Returns
    -------
    pandas.DataFrame
        Inner join on ``Gene stable ID == converted_alias`` without the
        ``converted_alias`` column, sorted by ``HN-score(HN5)`` in descending order.
    """
    merged = (
        pd.merge(
            goslim_table,
            scored_genes,
            left_on="Gene stable ID",
            right_on="converted_alias",
            how="inner",
        )
        # converted_alias repeats "Gene stable ID" after the join.
        .drop(columns=['converted_alias'])
        .sort_values(by='HN-score(HN5)', ascending=False)
    )
    merged.to_csv(out_path, sep="\t", index=False)
    return merged


# annotation_dir already starts with "../data", so the files go straight into it
# (the directory created in the settings cell) instead of prefixing "../data" again.

# 1. merge all data
merged_df = _merge_goslim_and_save(
    df_ensembl_uniq, COMBINED, os.path.join(annotation_dir, f"{organism}_annotation.tsv")
)

# 2. merge UP data
merged_df_up = _merge_goslim_and_save(
    df_ensembl_uniq, UP, os.path.join(annotation_dir, f"{organism}_annotation_up.tsv")
)

# 3. merge DOWN data
merged_df_down = _merge_goslim_and_save(
    df_ensembl_uniq, DOWN, os.path.join(annotation_dir, f"{organism}_annotation_down.tsv")
)

display(df_ensembl_uniq, merged_df, merged_df_up, merged_df_down)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start(bp),Gene end(bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,ENSG00000243485,1,29554,31109,GO:0031047,gene silencing by RNA,biological_process
1,ENSG00000284332,1,30366,30503,GO:0031047,gene silencing by RNA,biological_process
2,ENSG00000186092,1,65419,71585,GO:0023052,signaling,biological_process
3,ENSG00000186092,1,65419,71585,GO:0060089,molecular transducer activity,molecular_function
4,ENSG00000186092,1,65419,71585,GO:0005886,plasma membrane,cellular_component
...,...,...,...,...,...,...,...
193923,ENSG00000292372,Y,57207346,57212230,GO:0007010,cytoskeleton organization,biological_process
193924,ENSG00000292372,Y,57207346,57212230,GO:0008092,cytoskeletal protein binding,molecular_function
193925,ENSG00000292372,Y,57207346,57212230,GO:0031410,cytoplasmic vesicle,cellular_component
193926,ENSG00000292372,Y,57207346,57212230,GO:0043226,organelle,cellular_component


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start(bp),Gene end(bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,HN-score(HN5),name
211,ENSG00000173110,1,161524540,161526894,GO:0140657,ATP-dependent activity,molecular_function,245,HSPA6
217,ENSG00000173110,1,161524540,161526894,GO:0005634,nucleus,cellular_component,245,HSPA6
223,ENSG00000173110,1,161524540,161526894,GO:0031410,cytoplasmic vesicle,cellular_component,245,HSPA6
221,ENSG00000173110,1,161524540,161526894,GO:0003824,catalytic activity,molecular_function,245,HSPA6
220,ENSG00000173110,1,161524540,161526894,GO:0005615,extracellular space,cellular_component,245,HSPA6
...,...,...,...,...,...,...,...,...,...
907,ENSG00000124575,6,26234212,26234987,GO:0065003,protein-containing complex assembly,biological_process,-71,H1-3
906,ENSG00000124575,6,26234212,26234987,GO:0005694,chromosome,cellular_component,-71,H1-3
905,ENSG00000124575,6,26234212,26234987,GO:0043226,organelle,cellular_component,-71,H1-3
904,ENSG00000124575,6,26234212,26234987,GO:0005198,structural molecule activity,molecular_function,-71,H1-3


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start(bp),Gene end(bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,HN-score(HN5),name
59,ENSG00000173110,1,161524540,161526894,GO:0003824,catalytic activity,molecular_function,245,HSPA6
57,ENSG00000173110,1,161524540,161526894,GO:0005576,extracellular region,cellular_component,245,HSPA6
49,ENSG00000173110,1,161524540,161526894,GO:0140657,ATP-dependent activity,molecular_function,245,HSPA6
51,ENSG00000173110,1,161524540,161526894,GO:0043226,organelle,cellular_component,245,HSPA6
52,ENSG00000173110,1,161524540,161526894,GO:0005856,cytoskeleton,cellular_component,245,HSPA6
...,...,...,...,...,...,...,...,...,...
1689,ENSG00000184205,X,53082367,53088540,GO:0005730,nucleolus,cellular_component,51,TSPYL2
1690,ENSG00000184205,X,53082367,53088540,GO:0023052,signaling,biological_process,51,TSPYL2
1691,ENSG00000184205,X,53082367,53088540,GO:0042393,histone binding,molecular_function,51,TSPYL2
1692,ENSG00000184205,X,53082367,53088540,GO:0005694,chromosome,cellular_component,51,TSPYL2


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start(bp),Gene end(bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,HN-score(HN5),name
0,ENSG00000204479,1,13389632,13392629,GO:0012501,programmed cell death,biological_process,-14,PRAMEF17
476,ENSG00000112984,5,138178719,138187723,GO:0043226,organelle,cellular_component,-14,KIF20A
474,ENSG00000112984,5,138178719,138187723,GO:0003774,cytoskeletal motor activity,molecular_function,-14,KIF20A
473,ENSG00000112984,5,138178719,138187723,GO:0140657,ATP-dependent activity,molecular_function,-14,KIF20A
472,ENSG00000112984,5,138178719,138187723,GO:0008092,cytoskeletal protein binding,molecular_function,-14,KIF20A
...,...,...,...,...,...,...,...,...,...
544,ENSG00000124575,6,26234212,26234987,GO:0005694,chromosome,cellular_component,-71,H1-3
543,ENSG00000124575,6,26234212,26234987,GO:0043226,organelle,cellular_component,-71,H1-3
542,ENSG00000124575,6,26234212,26234987,GO:0005198,structural molecule activity,molecular_function,-71,H1-3
541,ENSG00000124575,6,26234212,26234987,GO:0003677,DNA binding,molecular_function,-71,H1-3
