In [1]:
import pathlib

import pandas as pd

In [2]:
# Load the chosen ICD codes. The file is read without a header row and its
# single column is named 'code'.
codes_df = pd.read_csv(
    '../data/phenotypes_meta/chosen_icd_codes.csv',
    names=['code'],
    header=None,
)

codes_df.head(2)

Unnamed: 0,code
0,A09
1,C18


# Heritability information

In [4]:
# Read the gzipped, tab-separated heritability table, keeping only the
# columns that later cells use.
h2_df = pd.read_csv(
    '../data/neale_heritability_estimates/ukb31063_h2_all.02Oct2019.tsv.gz',
    sep='\t',
    compression='gzip',
    usecols=[
        'phenotype', 'sex', 'n_cases', 'n_controls',
        'h2_liability', 'h2_liability_se', 'h2_z', 'h2_p',
    ],
)

h2_df.head(2)

Unnamed: 0,phenotype,sex,n_cases,n_controls,h2_liability,h2_liability_se,h2_z,h2_p
0,100001_irnt,both_sexes,,,0.068818,0.016857,4.082528,2.2e-05
1,100001_raw,both_sexes,,,0.069468,0.016275,4.268457,1e-05


In [8]:
# Restrict the heritability table to the both-sexes rows, keep only the
# phenotypes that appear in the chosen code list (inner join), and retain the
# code plus its heritability statistics.
relevant_h2_df = (
    h2_df
    .loc[lambda h2: h2['sex'] == 'both_sexes']
    .merge(codes_df, left_on='phenotype', right_on='code')
    .filter(items=[
        'code',
        'n_cases',
        'n_controls',
        'h2_liability',
        'h2_liability_se',
        'h2_z',
        'h2_p',
    ])
)

# Persist the per-code heritability table as a TSV without the index column.
relevant_h2_df.to_csv(
    '../data/phenotypes_meta/heritability.tsv',
    index=False,
    sep='\t',
)

relevant_h2_df.head(2)

Unnamed: 0,code,n_cases,n_controls,h2_liability,h2_liability_se,h2_z,h2_p
0,A09,2161.0,359033.0,-0.029687,0.043285,-0.685859,0.753599
1,C18,2226.0,358968.0,0.120329,0.042461,2.833887,0.002299


# Genetic correlation information

In [13]:
# Load the tab-separated table of pairwise genetic correlations.
rg_df = pd.read_csv(
    '../data/phenotypes_meta/genetic_correlation.tsv',
    sep='\t',
)

rg_df.head(2)

Unnamed: 0,p1,p2,rg,se,p
0,A09,C18,-1.013,1.092,0.3535
1,A09,C34,-0.07082,0.6842,0.9176


# Phenotypic correlation information

In [14]:
# Load the tab-separated table of pairwise phenotypic correlations.
rp_df = pd.read_csv(
    '../data/phenotypes_meta/phenotypic_correlation.tsv',
    sep='\t',
)

rp_df.head(2)

Unnamed: 0,p1,p2,rp,cp
0,A09,C18,0.022819,0.022819
1,A09,C34,0.023801,0.023801


# Combine information

In [22]:
# Combine, for every phenotype pair (p1, p2), the genetic correlation (rg),
# the phenotypic correlation (rp) and the liability-scale heritability of each
# member of the pair, then derive
#     coheritability = sqrt(h2_1) * sqrt(h2_2) * rg / rp
coheritability_df = (
    rg_df
    .merge(rp_df, on=['p1', 'p2'])
    # First join attaches the heritability columns for p1 (unsuffixed) ...
    .merge(relevant_h2_df, left_on='p1', right_on='code')
    # ... second join attaches them for p2; the now-overlapping column names
    # are disambiguated as *_1 (p1) and *_2 (p2).
    .merge(relevant_h2_df, left_on='p2', right_on='code', suffixes=['_1', '_2'])
    # Take the square root of each heritability separately rather than of
    # their product. Heritability estimates can be negative; with the product
    # form, two negative estimates multiply to a positive number and yield a
    # spurious real-valued result. Rooting each term individually makes any
    # pair with a negative estimate evaluate to NaN instead.
    .assign(coheritability=lambda df: (
        df['h2_liability_1']**0.5
        * df['h2_liability_2']**0.5
        * df['rg']
        / df['rp']
    ))
    .filter(items=['code_1', 'code_2', 'h2_liability_1', 'h2_liability_2', 'rg', 'rp', 'coheritability'])
)

# Persist the combined table as a TSV without the index column.
coheritability_df.to_csv('../data/phenotypes_meta/coheritability.tsv', sep='\t', index=False)

coheritability_df.head(2)

Unnamed: 0,code_1,code_2,h2_liability_1,h2_liability_2,rg,rp,coheritability
0,A09,C18,-0.029687,0.120329,-1.013,0.022819,
1,A09,C34,-0.029687,0.116966,-0.07082,0.023801,
