In [1]:
import os
import os.path as osp
import numpy as np
import pandas as pd

from config import PATH_RAW, PATH_PROCESSED

# Show which raw input files are present under PATH_RAW (rendered as the cell output).
os.listdir(PATH_RAW)

['gene_gene_relation.csv',
 'Gene_to_pheno.csv',
 'snps_labeled_230201.csv',
 'snps_list.csv',
 'labeled_230201_clinvar.csv',
 'snps_list_embedding.npy',
 'label_snp_disease.csv',
 'Mapping_snps_genes.csv',
 'labeled_230201_clinvar_embedding.npy',
 'gene_diseases.csv',
 'gene_embeddings.csv',
 'trait_HPO_embedding.csv',
 'trait_relation_HPO_HPO.csv',
 'snps_labeled_230201.npy',
 'disease_traits.csv',
 'gene_coexpression_gene2vec_dim_200_iter_9.txt']

# graph

## embeddings

In [2]:
# Build the gene node features from two embedding tables, both indexed by gene.
#
# NOTE(review): the shape annotations below are kept as originally recorded, but
# they look swapped relative to the file names ('..._dim_200_...' is loaded into
# gene_global_x, which is annotated as 768-wide) -- confirm against the data.

# [24,447 x 200]
gene_local_x = pd.read_csv(osp.join(PATH_RAW, 'gene_embeddings.csv'), index_col='gene')
# [8,622 x 768]
gene_global_x = pd.read_csv(
    osp.join(PATH_RAW, 'gene_coexpression_gene2vec_dim_200_iter_9.txt'),
    # Raw string: '\s' is not a valid Python string escape, so the non-raw
    # literal triggers an invalid-escape warning on current interpreters.
    sep=r'\s+',
    header=None,
    index_col=0,
)

# Standardization (column-wise z-score). The statistics are computed over every
# gene in each table, i.e. before the tables are intersected below.
gene_local_x = (gene_local_x - gene_local_x.mean()) / gene_local_x.std()
gene_global_x = (gene_global_x - gene_global_x.mean()) / gene_global_x.std()

# [8,127 x 968]
# Inner join on the gene index keeps only genes present in both tables.
gene_x = pd.merge(gene_local_x, gene_global_x, left_index=True, right_index=True, how='inner')

# The first `split` columns of the merged frame come from gene_local_x, the rest
# from gene_global_x; write each half to its own file with the shared gene index.
split = gene_local_x.shape[1]
gene_x.iloc[:, :split].to_csv(osp.join(PATH_PROCESSED, 'gene_local_x.csv'))
gene_x.iloc[:, split:].to_csv(osp.join(PATH_PROCESSED, 'gene_global_x.csv'))

In [3]:
# Load the trait embeddings: the 'HPO' identifier becomes the index and only the
# columns col_1 ... col_768 are read.
trait_x = pd.read_csv(
    osp.join(PATH_RAW, 'trait_HPO_embedding.csv'),
    index_col='HPO',
    usecols=['HPO', *(f'col_{i}' for i in range(1, 768+1))],
)

# [8,526 x 768]
# Relabel the feature columns positionally (0, 1, 2, ...) before saving.
trait_x.columns = range(len(trait_x.columns))
trait_x.to_csv(osp.join(PATH_PROCESSED, 'trait_x.csv'))

## relations

In [4]:
# Gene-gene edges.
# [642,150 x 2]
raw_gene_to_gene = pd.read_csv(osp.join(PATH_RAW, 'gene_gene_relation.csv'))

# Keep an edge only when both endpoints have a row in gene_x.
# [171,534 x 2]
gene_to_gene = raw_gene_to_gene.loc[
    lambda edges: edges['gene1'].isin(gene_x.index) & edges['gene2'].isin(gene_x.index)
]
gene_to_gene.to_csv(osp.join(PATH_PROCESSED, 'gene_to_gene.csv'), index=False)

In [5]:
# Gene-trait edges.
# [270,287 x 2]
raw_gene_to_trait = pd.read_csv(osp.join(PATH_RAW, 'Gene_to_pheno.csv'))

# Keep an edge only when the gene has a row in gene_x and the HPO term has a row
# in trait_x.
# [221,916 x 2]
gene_to_trait = raw_gene_to_trait.loc[
    lambda edges: (
        edges['Gene Name'].isin(gene_x.index)
        & edges['HPO'].isin(trait_x.index)
    )
]
gene_to_trait.to_csv(osp.join(PATH_PROCESSED, 'gene_to_trait.csv'), index=False)

In [6]:
# Trait-trait edges.
# [15,300 x 2]
raw_trait_to_trait = pd.read_csv(osp.join(PATH_RAW, 'trait_relation_HPO_HPO.csv'))

# Keep an edge only when both HPO terms have a row in trait_x.
# [7,732 x 2]
gene_free_mask = None  # placeholder removed below; see note
del gene_free_mask
trait_to_trait = raw_trait_to_trait.loc[
    lambda edges: edges['HPO 1'].isin(trait_x.index) & edges['HPO 2'].isin(trait_x.index)
]
trait_to_trait.to_csv(osp.join(PATH_PROCESSED, 'trait_to_trait.csv'), index=False)

In [7]:
# Disease -> list of HPO terms.
# [114,050 x 2]
raw_disease_to_traits = pd.read_csv(
    osp.join(PATH_RAW, 'disease_traits.csv'),
    index_col='disease_index',
)

# Collapse to one entry per disease holding the list of its 'hpo_id' values.
# Note that this rebinds raw_disease_to_traits from the loaded frame to a Series.
# [4,256 x n]
raw_disease_to_traits = (
    raw_disease_to_traits
    .groupby('disease_index')['hpo_id']
    .apply(list)
)

# Keep a disease only if every one of its HPO terms has a row in trait_x.
# [4,256 x n]
disease_to_traits = raw_disease_to_traits.loc[
    lambda per_disease: per_disease.apply(
        lambda hpos: all(hpo in trait_x.index for hpo in hpos)
    )
]

# Nothing is written to disk here; 'disease_to_traits.csv' is saved at the end of
# the notebook from the labelled subset.

In [8]:
# Gene -> disease -> traits.
# [8,164 x 2]
raw_gene_to_disease = pd.read_csv(
    osp.join(PATH_RAW, 'gene_diseases.csv'),
    index_col=None,
    usecols=['Gene Symbol', 'disease_index'],
)

# Keep a pair only when the gene has a row in gene_x and the disease survived the
# trait filter (i.e. is present in disease_to_traits).
# [4,752 x 2]
gene_to_disease = raw_gene_to_disease.loc[
    lambda pairs: (
        pairs['Gene Symbol'].isin(gene_x.index)
        & pairs['disease_index'].isin(disease_to_traits.index)
    )
]

# Attach each disease's HPO list to its gene-disease pair.
# [4,762 x 3]
# NOTE(review): the recorded row count is higher than the pre-merge count above;
# confirm which of the two annotations is stale.
gene_to_traits = gene_to_disease.merge(
    disease_to_traits,
    how='inner',
    on='disease_index',
)[['Gene Symbol', 'disease_index', 'hpo_id']]
gene_to_traits.to_csv(osp.join(PATH_PROCESSED, 'gene_to_traits.csv'), index=False)

# labels

## embeddings

In [9]:
# Variant features: the CSV provides the SNP identifiers (as index) and the .npy
# file provides the feature matrix; rows are paired purely by position.
variant_df = pd.read_csv(
    osp.join(PATH_RAW, 'snps_labeled_230201.csv'),
    index_col='SNPs',
)
variant_arr = np.load(osp.join(PATH_RAW, 'snps_labeled_230201.npy'))

# Positional pairing is only meaningful if both sources have the same row count.
assert variant_arr.shape[0] == variant_df.shape[0], "Mismatch between number of SNPs and feature rows"

# [97,737 x 768]
raw_variant_x = pd.DataFrame(data=variant_arr, index=variant_df.index)
raw_variant_x

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
SNPs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NM_000014.6(A2M):c.829G>A (p.Asp277Asn),-0.010030,-0.004678,-0.003894,-0.029185,-0.012140,0.004721,-0.008216,-0.018250,0.022564,0.003189,...,0.014815,0.029100,0.017228,-0.012612,0.012130,-0.004093,-0.007587,-0.015427,0.008467,-0.019443
NM_000014.6(A2M):c.3092G>A (p.Arg1031Gln),-0.010203,-0.003665,-0.003978,-0.029019,-0.010637,0.004634,-0.009549,-0.017601,0.022589,0.002845,...,0.015914,0.028691,0.017341,-0.012975,0.011548,-0.004160,-0.007452,-0.015280,0.008016,-0.020172
NM_000014.6(A2M):c.2111G>A (p.Arg704His),-0.009717,-0.003764,-0.003818,-0.028815,-0.010637,0.004590,-0.009292,-0.017699,0.022617,0.002831,...,0.015583,0.028711,0.016991,-0.012973,0.011551,-0.004408,-0.007649,-0.015140,0.007790,-0.020036
NM_000014.6(A2M):c.2915G>A (p.Cys972Tyr),-0.009627,-0.003995,-0.004618,-0.028913,-0.010261,0.005093,-0.009371,-0.017756,0.022011,0.002158,...,0.016931,0.029058,0.016150,-0.013060,0.011680,-0.004296,-0.007881,-0.015007,0.007271,-0.021057
NM_000014.6(A2M):c.2998A>G (p.Ile1000Val),-0.010055,-0.003649,-0.004112,-0.028982,-0.010733,0.004663,-0.009242,-0.017738,0.022982,0.003147,...,0.015910,0.028889,0.017318,-0.012829,0.011566,-0.004295,-0.007680,-0.015432,0.008236,-0.020124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NM_001042697.2(ZSWIM7):c.176C>T (p.Ser59Leu),-0.014990,-0.017000,-0.006751,-0.006652,-0.001368,0.000074,0.027775,0.007331,0.041379,0.034610,...,-0.000733,0.013854,0.005506,-0.002614,0.031842,-0.006742,0.003648,-0.011730,0.004335,-0.047161
NM_017975.5(ZWILCH):c.49C>T (p.Leu17Phe),-0.010196,-0.009583,-0.004359,-0.001254,-0.003396,-0.011568,0.021575,-0.013702,0.024417,0.006282,...,0.008871,0.016491,0.028825,-0.013371,0.011086,-0.005522,0.000179,-0.008438,0.003431,-0.026423
NM_001004339.3(ZYG11A):c.371G>A (p.Arg124His),0.001320,0.002179,-0.011384,-0.020258,-0.022669,0.009651,0.023247,-0.022677,0.070586,0.005886,...,0.015946,0.043089,0.047286,-0.013481,0.009090,-0.009097,0.008506,-0.007968,0.028604,-0.032567
NM_003461.5(ZYX):c.578G>A (p.Gly193Asp),0.007548,-0.010813,-0.002046,-0.005040,-0.028927,-0.006940,-0.004344,-0.026326,0.072114,0.024151,...,0.008146,0.039640,0.037723,-0.014253,0.018846,-0.008219,-0.002374,-0.020904,0.019991,-0.010182


## relations

In [10]:
# Variant -> disease labels.
# [16,292 x 2]
raw_variant_to_disease = pd.read_csv(
    osp.join(PATH_RAW, 'label_snp_disease.csv'),
    usecols=['snps', 'disease_index'],
)

# Keep a label only when the variant has a feature row in raw_variant_x and the
# disease is present in disease_to_traits.
# [13,025 x 2]
variant_to_disease = raw_variant_to_disease.loc[
    lambda labels: (
        labels['snps'].isin(raw_variant_x.index)
        & labels['disease_index'].isin(disease_to_traits.index)
    )
]

# Attach each disease's HPO list to its variant-disease label.
# [13,025 x 3]
variant_disease_traits = variant_to_disease.merge(
    disease_to_traits,
    on='disease_index',
    how='inner',
)
variant_disease_traits

Unnamed: 0,snps,disease_index,hpo_id
0,NM_015665.6(AAAS):c.938T>C (p.Val313Ala),869,"[HP:0000846, HP:0001250, HP:0002571, HP:000744..."
1,NM_015665.6(AAAS):c.887C>A (p.Ser296Tyr),869,"[HP:0000846, HP:0001250, HP:0002571, HP:000744..."
2,NM_015665.6(AAAS):c.787T>C (p.Ser263Pro),869,"[HP:0000846, HP:0001250, HP:0002571, HP:000744..."
3,NM_015665.6(AAAS):c.43C>A (p.Gln15Lys),869,"[HP:0000846, HP:0001250, HP:0002571, HP:000744..."
4,NM_005763.4(AASS):c.395G>A (p.Arg132His),2203,"[HP:0025331, HP:0030051, HP:0031867, HP:004028..."
...,...,...,...
13020,NM_014795.4(ZEB2):c.851G>A (p.Cys284Tyr),2152,"[HP:0000020, HP:0000028, HP:0000047, HP:000017..."
13021,NM_001284236.3(ZFYVE16):c.3442G>T (p.Asp1148Tyr),46724,"[HP:0100659, HP:0100761, HP:0100784]"
13022,NM_007129.5(ZIC2):c.1225C>T (p.Arg409Trp),2162,"[HP:0000028, HP:0000079, HP:0000093, HP:000016..."
13023,NM_004773.4(ZNHIT3):c.92C>T (p.Ser31Leu),2836,"[HP:0000174, HP:0000177, HP:0000194, HP:000021..."


In [11]:
# Variant -> gene mapping.
# [583,722 x 2]
raw_variant_to_gene = pd.read_csv(osp.join(PATH_RAW, 'Mapping_snps_genes.csv'))

# Keep a mapping only when the variant carries a disease label and the gene has a
# row in gene_x.
# [10,547 x 2]
variant_to_gene = raw_variant_to_gene.loc[
    lambda mapping: (
        mapping['snps'].isin(variant_to_disease['snps'])
        & mapping['genes'].isin(gene_x.index)
    )
]

# Join labels with their genes, fix the column order and index the result by
# variant before writing the label table.
# [12,914 x 4]
variant_gene_disease_traits = (
    variant_disease_traits
    .merge(variant_to_gene, on='snps', how='inner')
    .loc[:, ['snps', 'genes', 'disease_index', 'hpo_id']]
    .set_index('snps')
)
variant_gene_disease_traits.to_csv(osp.join(PATH_PROCESSED, 'labels.csv'))

## optimization

In [12]:
# Restrict the variant features to variants that appear in the label table; rows
# without a label are dropped (recorded shapes: [97,737 x 768] -> [10,547 x 768]).
is_labelled = raw_variant_x.index.isin(variant_gene_disease_traits.index)
variant_x = raw_variant_x.loc[is_labelled]

# Column-wise z-score, computed over the retained rows only.
variant_x = variant_x.sub(variant_x.mean()).div(variant_x.std())

variant_x.to_csv(osp.join(PATH_PROCESSED, 'variant_x.csv'))

In [13]:
# Reduce the disease -> traits table to the diseases that occur in the label
# table, taking the first 'hpo_id' entry seen for each disease_index.
# Recorded shapes: [4,276 x 2] -> [848 x 2]
#
# NOTE(review): this rebinds `disease_to_traits`, which earlier cells read when
# filtering gene-disease and variant-disease pairs. Re-running those cells after
# this one will filter against the reduced table; use Restart & Run All.
disease_to_traits = (
    variant_gene_disease_traits
    .groupby('disease_index')['hpo_id']
    .first()
)
disease_to_traits.to_csv(osp.join(PATH_PROCESSED, 'disease_to_traits.csv'))