In [1]:
import pandas as pd
import os
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# Name of the protein set being processed; it is interpolated into the
# SeqId list that is read and the result file that is written further down.
pqtl_type = 'bag'
# Directory containing the downloaded pQTL summary files (xxx.txt.gz).
data_dir = 'deCODE_pqtl_data_raw'
# Keep only the gzipped summary files. os.listdir returns entries in
# arbitrary order, so sort them to make the processing order (and the
# progress log) reproducible between runs and machines.
data_list = sorted(name for name in os.listdir(data_dir) if name.endswith('txt.gz'))
print(len(data_list))

76


In [3]:
# Directory that receives one filtered table per input file. It doubles as a
# cache: files that already exist there are not recomputed.
sig_dir = 'deCODE_pqtl_data_significant'
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(sig_dir, exist_ok=True)

for i, pqtl_file in enumerate(data_list):
    print(f'{i + 1}: preprocessing on {pqtl_file} ...')
    sig_path = os.path.join(sig_dir, pqtl_file)
    if os.path.exists(sig_path):
        # Already preprocessed in an earlier run; keep the cached result.
        continue

    # Read the pQTL summary data.
    pqtl_data = pd.read_csv(os.path.join(data_dir, pqtl_file), compression='gzip', sep='\t', low_memory=False)

    # The pQTL id is the file name up to the first '.'.
    pqtl_id = pqtl_file.split('.')[0]

    # Drop variants without rsids. Rows without a p-value are dropped as well:
    # they can never pass either threshold below, and a NaN in the input would
    # propagate through the FDR adjustment and blank out the whole column.
    pqtl_data = pqtl_data.dropna(subset=['rsids', 'Pval']).copy()

    # FDR-adjusted p-values; fdrcorrection returns (rejected, adjusted p-values).
    _, fdr_pval = fdrcorrection(pqtl_data['Pval'].to_numpy(), is_sorted=False)
    pqtl_data['FDR'] = fdr_pval

    # Keep SNPs with p-value < 5e-6 or FDR < 0.05.
    is_significant = (pqtl_data['Pval'] < 5e-6) | (pqtl_data['FDR'] < 0.05)
    pqtl_data_sig = pqtl_data[is_significant].copy()
    pqtl_data_sig['id'] = pqtl_id
    pqtl_data_sig.to_csv(sig_path, sep='\t', index=False, compression='gzip')
print('Done')

1: preprocessing on 10546_2_TSPEAR_TSEAR.txt.gz ...
2: preprocessing on 10809_14_KLRB1_KLRB1.txt.gz ...
3: preprocessing on 10818_36_SMPD1_ASM.txt.gz ...
4: preprocessing on 11338_49_BLK_BLK.txt.gz ...
5: preprocessing on 11360_39_RRM1_RRM1.txt.gz ...
6: preprocessing on 11448_34_GALK1_GALK1.txt.gz ...
7: preprocessing on 11617_1_ITGAL_LFA_1_alpha_L_chain.txt.gz ...
8: preprocessing on 11709_29_CPT1B_CPT1B.txt.gz ...
9: preprocessing on 12427_8_CDC25B_MPIP2.txt.gz ...
10: preprocessing on 12553_5_VRK1_VRK1.txt.gz ...
11: preprocessing on 12664_19_PTPN13_PTN13.txt.gz ...
12: preprocessing on 13133_73_LTBP4_LTBP4.txt.gz ...
13: preprocessing on 13463_1_PXDN_PXDN.txt.gz ...
14: preprocessing on 13955_33_DAPK1_DAPK1.txt.gz ...
15: preprocessing on 14069_61_CA4_Carbonic_Anhydrase_IV.txt.gz ...
16: preprocessing on 14136_234_CD93_C1QR1.txt.gz ...
17: preprocessing on 14645_253_GSTA4_GSTA4.txt.gz ...
18: preprocessing on 15347_12_HPX_Hemopexin.txt.gz ...
19: preprocessing on 15437_11_RAMP3_RA

In [4]:
# Table listing the SeqIds to collect for the current pqtl_type.
df = pd.read_csv(f'data/pqtls_deCODE_{pqtl_type}.csv')
pqtl_lists = df['SeqId'].tolist()
# Names of the preprocessed files. os.listdir order is arbitrary and the
# directory may contain stray entries, so keep only the gzipped tables and
# sort them: lookups by position then behave the same on every run.
pqtl_data_list = sorted(
    name for name in os.listdir('deCODE_pqtl_data_significant') if name.endswith('txt.gz')
)

# Filled later with one DataFrame per requested SeqId.
pqtls_bag = []

def find_index(seq_id, file_names=None):
    """Return the position of the file that belongs to ``seq_id``.

    Parameters
    ----------
    seq_id : str
        Identifier to look up, e.g. ``'10546_2'``.
    file_names : sequence of str, optional
        File names to search. Defaults to the module-level ``pqtl_data_list``.

    Returns
    -------
    int or None
        Index of the first matching name, or ``None`` when nothing matches.
    """
    if file_names is None:
        file_names = pqtl_data_list
    # A bare prefix test is ambiguous: '10546_2' would also match
    # '10546_21_...'. Require the id to be followed by a field separator
    # ('_'), the extension dot, or to be the whole name.
    prefixes = (seq_id + '_', seq_id + '.')
    for uid, name in enumerate(file_names):
        if name == seq_id or name.startswith(prefixes):
            return uid
    return None

In [5]:
# Start from an empty bag so that re-running this cell does not append the
# same tables a second time.
pqtls_bag.clear()
for pqtl in pqtl_lists:
    file_idx = find_index(pqtl)
    if file_idx is None:
        # Indexing the list with None would only raise an opaque TypeError;
        # name the SeqId that has no preprocessed file instead.
        raise FileNotFoundError(
            f"no file for SeqId {pqtl!r} found in 'deCODE_pqtl_data_significant'"
        )
    pqtl_file = pqtl_data_list[file_idx]
    df_pqtl_sig = pd.read_csv(os.path.join('deCODE_pqtl_data_significant', pqtl_file), sep='\t', compression='gzip')
    # Keep only the rows that pass the FDR threshold.
    df_pqtl_sig = df_pqtl_sig[df_pqtl_sig['FDR'] < 0.05].copy()
    pqtls_bag.append(df_pqtl_sig)

In [6]:
# Stack the per-protein tables into one frame.
df_sig_all = pd.concat(pqtls_bag, axis=0)
# Derive Seq_id from id by keeping the first two '_'-separated fields:
# 10546_2_TSPEAR_TSEAR -> 10546_2
df_sig_all['Seq_id'] = df_sig_all['id'].str.split('_').str[:2].str.join('_')
# Number of distinct ids present in the combined table.
print(df_sig_all['id'].nunique(dropna=False))
# to_csv does not create missing parent directories, so make sure it exists.
out_dir = 'pqtl_data'
os.makedirs(out_dir, exist_ok=True)
df_sig_all.to_csv(os.path.join(out_dir, f'pqtls_decode_{pqtl_type}.csv'), index=False)

34
