In [1]:
import pandas as pd
import os

In [2]:
# Settings: the organism to process and the folder that receives the annotation tables.
organism = "mouse"
annotation_dir = f"../data/{organism}_annotation"

# Create the output folder up front; an already existing folder is left as it is.
os.makedirs(annotation_dir, exist_ok=True)

In [3]:
# HN-score tables for the organism chosen in `organism`.
# https://doi.org/10.6084/m9.figshare.23944935.v2
# NOTE(review): this link was carried over from a comment that said "human";
# confirm that it is the right source for the current organism.
df = pd.read_csv(f"../data/HN-score_{organism}/HN-score_{organism}_all.csv")

# Up-regulated genes (gene names in this file were manually corrected to the
# most recent version). Symbols are upper-cased for the later name-based merge.
df_up = (
    pd.read_csv(f"../data/HN-score_{organism}/HN5_genes_up_{organism}.tsv", sep="\t")
    .assign(GeneName=lambda table: table["GeneName"].str.upper())
)

# Down-regulated genes, upper-cased the same way.
df_down = (
    pd.read_csv(f"../data/HN-score_{organism}/HN5_genes_down_{organism}.tsv", sep="\t")
    .assign(GeneName=lambda table: table["GeneName"].str.upper())
)

display(df, df_up, df_down)

Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,Hspa1b,152,3,87,242,149
1,Hspa1a,140,3,99,242,137
2,Hspb1,112,1,129,242,111
3,Gm20481,95,7,140,242,88
4,Atf3,81,3,158,242,78
...,...,...,...,...,...,...
22211,Il4i1,7,37,198,242,-30
22212,Tmem89,8,39,195,242,-31
22213,Spata45,18,49,175,242,-31
22214,Fam24b,14,47,181,242,-33


Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,HSPA1B,152,3,87,242,149
1,HSPA1A,140,3,99,242,137
2,HSPB1,112,1,129,242,111
3,GM20481,95,7,140,242,88
4,ATF3,81,3,158,242,78
...,...,...,...,...,...,...
250,GSTA2,19,6,217,242,13
251,H3C2,34,21,187,242,13
252,CTSK,22,9,211,242,13
253,GM3285,13,0,229,242,13


Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5
0,PDE6H,7,22,213,242,-15
1,F10,1,16,225,242,-15
2,ZFP354B,2,17,223,242,-15
3,PPARG,2,17,223,242,-15
4,ZNHIT3,2,17,223,242,-15
...,...,...,...,...,...,...
261,IL4I1,7,37,198,242,-30
262,TMEM89,8,39,195,242,-31
263,SPATA45,18,49,175,242,-31
264,FAM24B,14,47,181,242,-33


## Convert the gene names to Ensembl gene IDs

In [4]:
# gProfiler conversion result; only the submitted alias, the converted ID and
# the returned gene name are needed downstream.
df2 = pd.read_csv("../data/gprofiler/gProfiler_mmusculus_2023-11-24 16-34-34.csv")
id_columns = ["initial_alias", "converted_alias", "name"]
df2_ID = df2[id_columns]

display(df2_ID)

Unnamed: 0,initial_alias,converted_alias,name
0,1190005I06RIK,ENSMUSG00000043687,1190005I06Rik
1,1700010B08RIK,ENSMUSG00000057047,1700010B08Rik
2,1700028K03RIK,ENSMUSG00000089798,1700028K03Rik
3,1700109H08RIK,ENSMUSG00000008307,1700109H08Rik
4,4930474N05RIK,ENSMUSG00000096405,4930474N05Rik
...,...,...,...
516,ZFP973,ENSMUSG00000078879,Zfp973
517,ZFP974,ENSMUSG00000070709,Zfp974
518,ZFP977,ENSMUSG00000092335,Zfp977
519,ZNHIT3,ENSMUSG00000020526,Znhit3


In [5]:
# Attach Ensembl gene IDs from the gProfiler table to the up- and
# down-regulated HN5 gene lists.


def _merge_with_gprofiler(score_df, id_table):
    """Inner-join an HN5 gene list with the gProfiler ID table.

    Parameters
    ----------
    score_df : pandas.DataFrame
        Gene list with a 'GeneName' column.
    id_table : pandas.DataFrame
        Conversion table with an 'initial_alias' column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``score_df`` whose 'GeneName' equals an 'initial_alias',
        with the ``id_table`` columns appended. Genes without a match are
        dropped (inner join); an alias that occurs several times in
        ``id_table`` produces one output row per occurrence.
    """
    return pd.merge(
        score_df,
        id_table,
        left_on="GeneName",
        right_on="initial_alias",
        how="inner",
    )


def _to_score_table(merged):
    """Keep the ID, score and name columns and rename 'HN5' to 'HN-score(HN5)'."""
    return merged[["converted_alias", "HN5", "name"]].rename(
        columns={"HN5": "HN-score(HN5)"}
    )


# Check duplicates: a repeated alias would multiply rows in the merges below,
# so surface it instead of computing the table and never looking at it.
duplicates = df2_ID[df2_ID.duplicated(subset=["initial_alias"], keep=False)]
if not duplicates.empty:
    print(
        f"WARNING: {duplicates['initial_alias'].nunique()} alias(es) occur more than once "
        "in the gProfiler table; the merges below emit one row per occurrence."
    )
    display(duplicates)

# upregulated genes
merged_df_up = _merge_with_gprofiler(df_up, df2_ID)

# Genes of the input list that did not receive an ID (dropped by the inner join).
gene_names_up = set(df_up['GeneName'])
gene_names_merged = set(merged_df_up['GeneName'])
missing_gene_names = gene_names_up - gene_names_merged
missing_rows_up = df_up[df_up['GeneName'].isin(missing_gene_names)]

UP = _to_score_table(merged_df_up)

# downregulated genes
merged_df_down = _merge_with_gprofiler(df_down, df2_ID)

gene_names_down = set(df_down['GeneName'])
gene_names_merged_down = set(merged_df_down['GeneName'])
missing_gene_names_down = gene_names_down - gene_names_merged_down
missing_rows_down = df_down[df_down['GeneName'].isin(missing_gene_names_down)]

DOWN = _to_score_table(merged_df_down)

# Up- and down-regulated tables stacked into one, with a fresh 0..n-1 index.
COMBINED = pd.concat(
    [UP, DOWN],
    ignore_index=True
)

display(missing_rows_up, missing_rows_down, UP, DOWN, COMBINED)

Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5


Unnamed: 0,GeneName,up5,dn5,unchange5,all,HN5


Unnamed: 0,converted_alias,HN-score(HN5),name
0,ENSMUSG00000090877,149,Hspa1b
1,ENSMUSG00000091971,137,Hspa1a
2,ENSMUSG00000004951,111,Hspb1
3,ENSMUSG00000092609,88,Gm20481
4,ENSMUSG00000026628,78,Atf3
...,...,...,...
250,ENSMUSG00000057933,13,Gsta2
251,ENSMUSG00000069267,13,H3c2
252,ENSMUSG00000028111,13,Ctsk
253,ENSMUSG00000111915,13,Gm3285


Unnamed: 0,converted_alias,HN-score(HN5),name
0,ENSMUSG00000064330,-15,Pde6h
1,ENSMUSG00000031444,-15,F10
2,ENSMUSG00000020335,-15,Zfp354b
3,ENSMUSG00000000440,-15,Pparg
4,ENSMUSG00000020526,-15,Znhit3
...,...,...,...
261,ENSMUSG00000074141,-30,Il4i1
262,ENSMUSG00000025652,-31,Tmem89
263,ENSMUSG00000057072,-31,Spata45
264,ENSMUSG00000030858,-33,Fam24b


Unnamed: 0,converted_alias,HN-score(HN5),name
0,ENSMUSG00000090877,149,Hspa1b
1,ENSMUSG00000091971,137,Hspa1a
2,ENSMUSG00000004951,111,Hspb1
3,ENSMUSG00000092609,88,Gm20481
4,ENSMUSG00000026628,78,Atf3
...,...,...,...
516,ENSMUSG00000074141,-30,Il4i1
517,ENSMUSG00000025652,-31,Tmem89
518,ENSMUSG00000057072,-31,Spata45
519,ENSMUSG00000030858,-33,Fam24b


## merge the Ensembl biomart data (Chromosome position)

- For drawing Circos plot

In [6]:
# Ensembl BioMart table; the merges below key on its 'Gene stable ID' column.
df_ensembl_position = pd.read_csv(
    f"../data/biomart_goslim/biomart_{organism}_position_R110.tsv",
    sep="\t",
    low_memory=False,
)


def _merge_position(position_table, score_table, out_file):
    """Join a score table onto the position table, sort it and write it as TSV.

    Parameters
    ----------
    position_table : pandas.DataFrame
        Table with a 'Gene stable ID' column.
    score_table : pandas.DataFrame
        Table with 'converted_alias' and 'HN-score(HN5)' columns.
    out_file : str
        Path of the tab-separated file to write (without the index).

    Returns
    -------
    pandas.DataFrame
        Inner join of both tables on the gene ID, without the redundant
        'converted_alias' column, sorted by 'HN-score(HN5)' in descending order.
    """
    merged = (
        pd.merge(
            position_table,
            score_table,
            left_on="Gene stable ID",
            right_on="converted_alias",
            how="inner",
        )
        .drop(columns=["converted_alias"])
        .sort_values(by="HN-score(HN5)", ascending=False)
    )
    merged.to_csv(out_file, sep="\t", index=False)
    return merged


# `annotation_dir` already starts with '../data/', so it is used as is: the
# files land in exactly the folder that os.makedirs() created, instead of the
# doubled '../data/../data/...' path.

# 1. merged all data
merged_df_position = _merge_position(
    df_ensembl_position, COMBINED, f"{annotation_dir}/{organism}_position_all.tsv"
)

# 2. merged up genes data
merged_df_position_up = _merge_position(
    df_ensembl_position, UP, f"{annotation_dir}/{organism}_position_up.tsv"
)

# 3. merged down genes data
merged_df_position_down = _merge_position(
    df_ensembl_position, DOWN, f"{annotation_dir}/{organism}_position_down.tsv"
)

display(df_ensembl_position, merged_df_position, merged_df_position_up, merged_df_position_down)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand
0,ENSMUSG00000064336,MT,1,68,1
1,ENSMUSG00000064337,MT,70,1024,1
2,ENSMUSG00000064338,MT,1025,1093,1
3,ENSMUSG00000064339,MT,1094,2675,1
4,ENSMUSG00000064340,MT,2676,2750,1
...,...,...,...,...,...
56936,ENSMUSG00000054766,2,29947390,29962589,1
56937,ENSMUSG00000026785,2,29967696,29981034,1
56938,ENSMUSG00000015335,2,29980956,29983660,-1
56939,ENSMUSG00000039686,2,29987295,30014597,-1


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5),name
351,ENSMUSG00000090877,17,35175412,35178214,-1,149,Hspa1b
352,ENSMUSG00000091971,17,35188166,35191132,-1,137,Hspa1a
425,ENSMUSG00000004951,5,135916773,135918417,1,111,Hspb1
369,ENSMUSG00000092609,17,35188888,35191112,1,88,Gm20481
198,ENSMUSG00000026628,1,190902493,190950236,-1,78,Atf3
...,...,...,...,...,...,...,...
130,ENSMUSG00000074141,7,44465811,44490233,1,-30,Il4i1
195,ENSMUSG00000057072,1,190768836,190775139,1,-31,Spata45
385,ENSMUSG00000025652,9,108743687,108744631,1,-31,Tmem89
333,ENSMUSG00000030858,7,130927673,130931245,-1,-33,Fam24b


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5),name
171,ENSMUSG00000090877,17,35175412,35178214,-1,149,Hspa1b
172,ENSMUSG00000091971,17,35188166,35191132,-1,137,Hspa1a
213,ENSMUSG00000004951,5,135916773,135918417,1,111,Hspb1
183,ENSMUSG00000092609,17,35188888,35191112,1,88,Gm20481
113,ENSMUSG00000026628,1,190902493,190950236,-1,78,Atf3
...,...,...,...,...,...,...,...
211,ENSMUSG00000037887,7,141633227,141649580,-1,13,Dusp8
66,ENSMUSG00000035775,11,99319229,99328976,-1,13,Krt20
65,ENSMUSG00000054034,X,135101697,135104625,-1,13,Tceal5
63,ENSMUSG00000098715,6,48695541,48729578,1,13,Gm28053


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,HN-score(HN5),name
40,ENSMUSG00000039936,4,149733625,149787028,-1,-15,Pik3cd
104,ENSMUSG00000047671,4,116983991,116985935,1,-15,Dynlt4
138,ENSMUSG00000051378,11,102796355,102815950,-1,-15,Kif18b
135,ENSMUSG00000000440,6,115337912,115467360,1,-15,Pparg
39,ENSMUSG00000046949,13,34148670,34172426,1,-15,Nqo2
...,...,...,...,...,...,...,...
50,ENSMUSG00000074141,7,44465811,44490233,1,-30,Il4i1
193,ENSMUSG00000025652,9,108743687,108744631,1,-31,Tmem89
83,ENSMUSG00000057072,1,190768836,190775139,1,-31,Spata45
171,ENSMUSG00000030858,7,130927673,130931245,-1,-33,Fam24b


## merge the Ensembl biomart data (GOslim)

- For execution of GO enrichment analysis

In [7]:
# Ensembl BioMart GOslim table.
df_ensembl = pd.read_csv(
    f"../data/biomart_goslim/biomart_{organism}_goslim_R110_domain.tsv",
    sep="\t",
    low_memory=False,
)

# Keep the first row per (gene, GOslim accession, GOslim description).
# The rename only takes effect when the 'start2 (bp)' / 'end2 (bp)' columns
# are present; rename() ignores missing labels by default.
df_ensembl_uniq = (
    df_ensembl
    .drop_duplicates(
        subset=["Gene stable ID", "GOSlim GOA Accession(s)", "GOSlim GOA Description"],
        keep="first",
    )
    .rename(columns={"start2 (bp)": "Gene start(bp)", "end2 (bp)": "Gene end(bp)"})
)


def _merge_goslim(goslim_table, score_table, out_file):
    """Join a score table onto the GOslim table, sort it and write it as TSV.

    Parameters
    ----------
    goslim_table : pandas.DataFrame
        Table with a 'Gene stable ID' column.
    score_table : pandas.DataFrame
        Table with 'converted_alias' and 'HN-score(HN5)' columns.
    out_file : str
        Path of the tab-separated file to write (without the index).

    Returns
    -------
    pandas.DataFrame
        Inner join of both tables on the gene ID, without the redundant
        'converted_alias' column, sorted by 'HN-score(HN5)' in descending order.
    """
    merged = (
        pd.merge(
            goslim_table,
            score_table,
            left_on="Gene stable ID",
            right_on="converted_alias",
            how="inner",
        )
        .drop(columns=["converted_alias"])
        .sort_values(by="HN-score(HN5)", ascending=False)
    )
    merged.to_csv(out_file, sep="\t", index=False)
    return merged


# `annotation_dir` already starts with '../data/', so it is used as is: the
# files land in exactly the folder that os.makedirs() created, instead of the
# doubled '../data/../data/...' path.

# 1. merge all data
merged_df = _merge_goslim(
    df_ensembl_uniq, COMBINED, f"{annotation_dir}/{organism}_annotation.tsv"
)

# 2. merge UP data
# NOTE: merged_df_up / merged_df_down rebind the names that the gProfiler merge
# cell assigned earlier; after this cell they hold the GOslim-annotated tables.
merged_df_up = _merge_goslim(
    df_ensembl_uniq, UP, f"{annotation_dir}/{organism}_annotation_up.tsv"
)

# 3. merge DOWN data
merged_df_down = _merge_goslim(
    df_ensembl_uniq, DOWN, f"{annotation_dir}/{organism}_annotation_down.tsv"
)

display(df_ensembl_uniq, merged_df, merged_df_up, merged_df_down)

Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain
0,ENSMUSG00000064336,MT,1,68,GO:0060090,molecular adaptor activity,molecular_function
1,ENSMUSG00000064336,MT,1,68,GO:0003723,RNA binding,molecular_function
2,ENSMUSG00000064336,MT,1,68,GO:0043226,organelle,cellular_component
3,ENSMUSG00000064336,MT,1,68,GO:0005739,mitochondrion,cellular_component
4,ENSMUSG00000064337,MT,70,1024,GO:0005198,structural molecule activity,molecular_function
...,...,...,...,...,...,...,...
170875,ENSMUSG00000015335,2,29980956,29983660,GO:0036211,protein modification process,biological_process
170876,ENSMUSG00000015335,2,29980956,29983660,GO:0023052,signaling,biological_process
170877,ENSMUSG00000015335,2,29980956,29983660,GO:0048856,anatomical structure development,biological_process
170878,ENSMUSG00000015335,2,29980956,29983660,GO:0034330,cell junction organization,biological_process


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,HN-score(HN5),name
2469,ENSMUSG00000090877,17,35175412,35178214,GO:0000278,mitotic cell cycle,biological_process,149,Hspa1b
2461,ENSMUSG00000090877,17,35175412,35178214,GO:0016787,hydrolase activity,molecular_function,149,Hspa1b
2471,ENSMUSG00000090877,17,35175412,35178214,GO:0065003,protein-containing complex assembly,biological_process,149,Hspa1b
2468,ENSMUSG00000090877,17,35175412,35178214,GO:0007010,cytoskeleton organization,biological_process,149,Hspa1b
2467,ENSMUSG00000090877,17,35175412,35178214,GO:0006457,protein folding,biological_process,149,Hspa1b
...,...,...,...,...,...,...,...,...,...
3436,ENSMUSG00000089798,5,107682575,107699408,GO:0043226,organelle,cellular_component,-35,1700028K03Rik
3435,ENSMUSG00000089798,5,107682575,107699408,GO:0007059,chromosome segregation,biological_process,-35,1700028K03Rik
3434,ENSMUSG00000089798,5,107682575,107699408,GO:0022414,reproductive process,biological_process,-35,1700028K03Rik
3433,ENSMUSG00000089798,5,107682575,107699408,GO:0140013,meiotic nuclear division,biological_process,-35,1700028K03Rik


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,HN-score(HN5),name
1374,ENSMUSG00000090877,17,35175412,35178214,GO:0030163,protein catabolic process,biological_process,149,Hspa1b
1379,ENSMUSG00000090877,17,35175412,35178214,GO:0140014,mitotic nuclear division,biological_process,149,Hspa1b
1370,ENSMUSG00000090877,17,35175412,35178214,GO:0016787,hydrolase activity,molecular_function,149,Hspa1b
1371,ENSMUSG00000090877,17,35175412,35178214,GO:0005886,plasma membrane,cellular_component,149,Hspa1b
1372,ENSMUSG00000090877,17,35175412,35178214,GO:0005856,cytoskeleton,cellular_component,149,Hspa1b
...,...,...,...,...,...,...,...,...,...
597,ENSMUSG00000054034,X,135101697,135104625,GO:0005634,nucleus,cellular_component,13,Tceal5
598,ENSMUSG00000054034,X,135101697,135104625,GO:0045182,translation regulator activity,molecular_function,13,Tceal5
599,ENSMUSG00000035775,11,99319229,99328976,GO:0005198,structural molecule activity,molecular_function,13,Krt20
600,ENSMUSG00000035775,11,99319229,99328976,GO:0005829,cytosol,cellular_component,13,Krt20


Unnamed: 0,Gene stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),GOSlim GOA Accession(s),GOSlim GOA Description,GOSlim_domain,HN-score(HN5),name
673,ENSMUSG00000059395,13,21651217,21652679,GO:0043226,organelle,cellular_component,-15,Nkapl
1195,ENSMUSG00000007122,1,172037461,172047435,GO:0065003,protein-containing complex assembly,biological_process,-15,Casq1
660,ENSMUSG00000045664,19,5965664,5974844,GO:0005829,cytosol,cellular_component,-15,Cdc42ep2
661,ENSMUSG00000045664,19,5965664,5974844,GO:0002376,immune system process,biological_process,-15,Cdc42ep2
662,ENSMUSG00000045664,19,5965664,5974844,GO:0098542,defense response to other organism,biological_process,-15,Cdc42ep2
...,...,...,...,...,...,...,...,...,...
1455,ENSMUSG00000089798,5,107682575,107699408,GO:0043226,organelle,cellular_component,-35,1700028K03Rik
1456,ENSMUSG00000089798,5,107682575,107699408,GO:0005694,chromosome,cellular_component,-35,1700028K03Rik
1453,ENSMUSG00000089798,5,107682575,107699408,GO:0022414,reproductive process,biological_process,-35,1700028K03Rik
1452,ENSMUSG00000089798,5,107682575,107699408,GO:0140013,meiotic nuclear division,biological_process,-35,1700028K03Rik
