In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import tqdm.notebook

# Computing phenotypic correlations

This is loosely based on https://gist.github.com/ce-carey/6480d6544f132829d9579b2a1f1455b4.

We will compute the correlations between residualized phenotypes after adjusting for the GWAS covariates.

# 1. Build outcomes (phenotypes) table

In [2]:
relevant_icd_10_df = pd.read_csv('../data/phenotypes_meta/chosen_icd_codes.csv', header=None, names=['code'])

relevant_icd_10_df.head(2)

Unnamed: 0,code
0,A09
1,C18


In [3]:
# Copy the table to the local directory
# icd_10_df = pd.read_csv('/data1/deep_storage/ukbiobank/ukb_datapulls/ukb23674_icd10_long.txt.gz', 
#                         sep='\t', compression='gzip')

# icd_10_df.to_csv('../data/phenotypes/ukb_raw_icd10_long.tsv.gz', sep='\t', index=False, compression='gzip')

icd_10_df = pd.read_csv('../data/phenotypes/ukb_raw_icd10_long.tsv.gz', sep='\t', compression='gzip')

icd_10_df.head(0)

Unnamed: 0,eid,icd10_code,column_code


In [4]:
all_eid_df = icd_10_df[['eid']].drop_duplicates()

all_eid_df.to_csv('../data/phenotypes/all_eids.tsv', sep='\t', index=False)

all_eid_df.head(0)

Unnamed: 0,eid


In [5]:
relevant_code_occurrences_df = (
    icd_10_df
    .assign(
        code=lambda df: df['icd10_code'].apply(lambda x: x[:3]),
        had_code=1
    )
    .merge(relevant_icd_10_df, on='code')
    .groupby(['eid', 'code'])
    .agg({'had_code': 'max'})
    .reset_index()
    .query('had_code == 1')
    .filter(items=['eid', 'code'])
)

relevant_code_occurrences_df.to_csv('../data/phenotypes/relevant_occurrences_long.tsv', sep='\t', index=False)

relevant_code_occurrences_df.head(0)

Unnamed: 0,eid,code


In [6]:
wide_phenotypes_df = (
    all_eid_df
    .merge(relevant_code_occurrences_df.assign(had=1), on='eid', how='left')
    .pivot_table(index='eid', columns='code', values='had', aggfunc='max', fill_value=0)
    .reset_index()
    .rename(columns={'eid': 'IID'})
)

wide_phenotypes_df.to_csv('../data/phenotypes/relevant_occurrences_wide.tsv', sep='\t', index=False)

wide_phenotypes_df.head(0)

code,IID,A09,C18,C34,C43,C44,C50,C67,D12,D17,...,Z01,Z03,Z08,Z09,Z12,Z42,Z43,Z45,Z47,Z53


# 2. Build covariates table

In [7]:
# # Copy the covariates file to the local directory and add a header
# covar_df = pd.read_csv('/data1/deep_storage/ukbiobank/pt2281/covar_chip/covar_chr1_chip.txt', 
#                        sep='\s+', header=None, usecols=list(range(14)), 
#                        names=['FID', 'IID', 'sex', 'age', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 
#                               'PC6', 'PC7', 'PC8', 'PC9', 'PC10'])

# covar_df.to_csv('../data/phenotypes/raw_covariates.covar', sep='\t', index=False)

covar_df = pd.read_csv('../data/phenotypes/raw_covariates.covar', sep='\t')

covar_df.head(0)

Unnamed: 0,FID,IID,sex,age,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10


In [8]:
gwas_covar_df = (
    covar_df
    .assign(
        age_squared=lambda df: df['age']**2,
        age_sex=lambda df: df['sex'] * df['age'],
        age_squared_sex=lambda df: df['sex'] * df['age_squared']
    )
)

gwas_covar_df.to_csv('../data/phenotypes/gwas_covariates.covar', sep='\t', index=False)

gwas_covar_df.head(0)

Unnamed: 0,FID,IID,sex,age,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,age_squared,age_sex,age_squared_sex


# 3. Compute residualized phenotypes

In [9]:
full_df = wide_phenotypes_df.merge(gwas_covar_df, on='IID').drop(columns=['FID'])

full_df.head(0)

Unnamed: 0,IID,A09,C18,C34,C43,C44,C50,C67,D12,D17,...,PC4,PC5,PC6,PC7,PC8,PC9,PC10,age_squared,age_sex,age_squared_sex


In [10]:
phenotype_codes = relevant_icd_10_df['code'].values.tolist()

X = full_df[['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 
             'age', 'sex', 'age_squared', 'age_sex', 'age_squared_sex']]

resid_df = full_df[['IID'] + phenotype_codes].copy()

for phenotype_code in tqdm.notebook.tqdm(phenotype_codes):
    reg = sm.OLS(full_df[phenotype_code].values, X)
    res = reg.fit()
    resid_df[phenotype_code] = res.resid

resid_df.to_csv('../data/phenotypes/residualized_phenotypes.tsv.gz', sep='\t', index=False, 
                compression='gzip')
    
resid_df.head(0)

  0%|          | 0/142 [00:00<?, ?it/s]

Unnamed: 0,IID,A09,C18,C34,C43,C44,C50,C67,D12,D17,...,Z01,Z03,Z08,Z09,Z12,Z42,Z43,Z45,Z47,Z53


# 4. Compute correlation between residualized phenotypes

In [11]:
correlation_df = (
    resid_df
    .drop(columns=['IID'])
    .corr()
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'p1', 'level_1': 'p2', 0: 'rp'})
    .query('p1 < p2')
)

In [12]:
def normalize(values):
    return (values - values.mean()) / values.std()

In [13]:
covariance_df = (
    resid_df
    .drop(columns=['IID'])
    .apply(normalize, axis=0)
    .cov()
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'p1', 'level_1': 'p2', 0: 'cp'})
    .query('p1 < p2')
)

In [14]:
phenotypic_correlation_df = correlation_df.merge(covariance_df, on=['p1', 'p2'])

phenotypic_correlation_df.to_csv('../data/phenotypes_meta/phenotypic_correlation.tsv', sep='\t', index=False)

phenotypic_correlation_df.head(2)

Unnamed: 0,p1,p2,rp,cp
0,A09,C18,0.022819,0.022819
1,A09,C34,0.023801,0.023801
