In [1]:
import pandas as pd
import os
from statsmodels.stats.multitest import fdrcorrection

In [12]:
# Tag identifying the pQTL set; it is interpolated into CSV file names further down.
pqtl_type = 'bagm3'
# Directory containing the downloaded summary-statistics files (xxx.txt.gz).
data_dir = 'deCODE_pqtl_data_raw'
# Every pQTL file name ends with txt.gz; any other directory entry is ignored.
data_list = [fname for fname in os.listdir(data_dir) if fname.endswith('txt.gz')]
print(data_list)
print(len(data_list))

['10546_2_TSPEAR_TSEAR.txt.gz', '10809_14_KLRB1_KLRB1.txt.gz', '10818_36_SMPD1_ASM.txt.gz', '11360_39_RRM1_RRM1.txt.gz', '11617_1_ITGAL_LFA_1_alpha_L_chain.txt.gz', '12553_5_VRK1_VRK1.txt.gz', '13133_73_LTBP4_LTBP4.txt.gz', '13955_33_DAPK1_DAPK1.txt.gz', '15347_12_HPX_Hemopexin.txt.gz', '15482_12_A2ML1_A2ML1.txt.gz', '17704_74_HSPH1_HS105.txt.gz', '17786_5_GGPS1_GGPPS.txt.gz', '2558_51_POMC_b_Endorphin.txt.gz', '2855_49_MAPK3_ERK_1.txt.gz', '2993_1_IL18RAP_IL_18_Rb.txt.gz', '3629_60_CDC42BPB_MRCKB.txt.gz', '3813_3_FYN_FYN.txt.gz', '3825_18_MAPK8_MK08.txt.gz', '4374_45_GDF15_MIC_1.txt.gz', '4890_10_POMC_ACTH.txt.gz', '5096_51_KIR3DL2_KI3L2.txt.gz', '5723_4_INSL3_INSL3.txt.gz', '5737_61_SEMA4D_SEM4D.txt.gz', '5854_60_MAPT_tau.txt.gz', '5936_53_TNF_TNF_a.txt.gz', '5939_42_TNFSF12_TWEAK.txt.gz', '6207_10_PSAP_prosaposin.txt.gz', '6252_62_SCGB3A1_Secretoglobin_family_3A_member_1.txt.gz', '6369_82_DEAF1_DEAF1.txt.gz', '6609_22_CNP_CN37.txt.gz', '9094_5_CLEC4C_CLC4C.txt.gz', '9204_33_POMC_Cor

In [13]:
# Reduce every raw summary file to its significant SNPs and cache the result on
# disk, so re-running this cell skips the files that were already processed.
sig_dir = 'deCODE_pqtl_data_significant'
# The cache directory has to exist before the first write on a fresh checkout.
os.makedirs(sig_dir, exist_ok=True)

for i, pqtl_file in enumerate(data_list):
    print(f'{i + 1}: preprocessing on {pqtl_file} ...')
    sig_path = os.path.join(sig_dir, pqtl_file)
    if not os.path.exists(sig_path):
        # preprocessing...
        # read pqtl summary data
        pqtl_data = pd.read_csv(os.path.join(data_dir, pqtl_file), compression='gzip', sep='\t', low_memory=False)

        # pqtl id: the file name up to its first dot
        pqtl_id = pqtl_file.split('.')[0]

        # Remove SNPs without rsids. Rows with a missing p-value are dropped too:
        # they can never pass the thresholds below, and a NaN handed to
        # fdrcorrection would propagate into the adjusted p-value of every row.
        pqtl_data = pqtl_data.dropna(subset=['rsids', 'Pval']).copy()

        # calculate fdr p-value; element [1] of the result holds the adjusted p-values
        fdr_pval = fdrcorrection(pqtl_data['Pval'].to_numpy(), is_sorted=False)
        pqtl_data['FDR'] = fdr_pval[1]

        # select significant SNPs with p-value < 5e-6 | FDR < 0.05
        pqtl_data_sig = pqtl_data[(pqtl_data['Pval'] < 5e-6) | (pqtl_data['FDR'] < 0.05)].copy()
        pqtl_data_sig['id'] = pqtl_id

        # Write to a hidden temporary name and rename afterwards, so an interrupted
        # run never leaves a truncated file that the existence check above would
        # later mistake for a finished cache entry.
        tmp_path = os.path.join(sig_dir, f'.{pqtl_file}.partial')
        pqtl_data_sig.to_csv(tmp_path, sep='\t', index=False, compression='gzip')
        os.replace(tmp_path, sig_path)
    else:
        # if preprocessed, just read from local dir (keeps pqtl_data_sig defined on both paths)
        pqtl_data_sig = pd.read_csv(sig_path, sep='\t', compression='gzip')

print('Done')

1: preprocessing on 10546_2_TSPEAR_TSEAR.txt.gz ...
2: preprocessing on 10809_14_KLRB1_KLRB1.txt.gz ...
3: preprocessing on 10818_36_SMPD1_ASM.txt.gz ...
4: preprocessing on 11360_39_RRM1_RRM1.txt.gz ...
5: preprocessing on 11617_1_ITGAL_LFA_1_alpha_L_chain.txt.gz ...
6: preprocessing on 12553_5_VRK1_VRK1.txt.gz ...
7: preprocessing on 13133_73_LTBP4_LTBP4.txt.gz ...
8: preprocessing on 13955_33_DAPK1_DAPK1.txt.gz ...
9: preprocessing on 15347_12_HPX_Hemopexin.txt.gz ...
10: preprocessing on 15482_12_A2ML1_A2ML1.txt.gz ...
11: preprocessing on 17704_74_HSPH1_HS105.txt.gz ...
12: preprocessing on 17786_5_GGPS1_GGPPS.txt.gz ...
13: preprocessing on 2558_51_POMC_b_Endorphin.txt.gz ...
14: preprocessing on 2855_49_MAPK3_ERK_1.txt.gz ...
15: preprocessing on 2993_1_IL18RAP_IL_18_Rb.txt.gz ...
16: preprocessing on 3629_60_CDC42BPB_MRCKB.txt.gz ...
17: preprocessing on 3813_3_FYN_FYN.txt.gz ...
18: preprocessing on 3825_18_MAPK8_MK08.txt.gz ...
19: preprocessing on 4374_45_GDF15_MIC_1.txt.gz 

In [18]:
# SeqId values listed in the CSV that belongs to the current pqtl_type.
df = pd.read_csv(f'data/pqtls_deCODE_{pqtl_type}.csv')
pqtl_lists = df['SeqId'].tolist()
# Keep only the preprocessed txt.gz files (same filter as for the raw directory)
# and sort them, so stray directory entries are never picked up and the
# positions returned by find_index do not depend on filesystem listing order.
pqtl_data_list = sorted(
    fname for fname in os.listdir('deCODE_pqtl_data_significant') if fname.endswith('txt.gz')
)

# Collects one filtered DataFrame per SeqId.
pqtls_bag = []

def find_index(seq_id):
    """Return the position in ``pqtl_data_list`` of the file that belongs to ``seq_id``.

    A file name belongs to ``seq_id`` when it equals ``seq_id`` or starts with
    ``seq_id`` immediately followed by ``_`` or ``.``. Requiring the separator
    keeps an id such as ``'2558_5'`` from matching
    ``'2558_51_POMC_b_Endorphin.txt.gz'``, which a bare prefix test would accept.

    Parameters
    ----------
    seq_id : str
        Identifier to look up (converted with ``str`` before matching).

    Returns
    -------
    int or None
        Index of the first matching entry of ``pqtl_data_list``, or ``None``
        when no file name matches.
    """
    key = str(seq_id)
    # str.startswith accepts a tuple of alternatives.
    delimited = (key + '_', key + '.')
    for uid, fname in enumerate(pqtl_data_list):
        if fname == key or fname.startswith(delimited):
            return uid
    return None

In [19]:
# Start from an empty list so that re-running this cell does not append the
# same frames a second time.
pqtls_bag = []
for pqtl in pqtl_lists:
    file_idx = find_index(pqtl)
    if file_idx is None:
        # Without this check the lookup below fails with an opaque TypeError.
        raise FileNotFoundError(
            f"no preprocessed file for SeqId {pqtl!r} in 'deCODE_pqtl_data_significant'"
        )
    pqtl_file = pqtl_data_list[file_idx]
    df_pqtl_sig = pd.read_csv(os.path.join('deCODE_pqtl_data_significant', pqtl_file), sep='\t', compression='gzip')
    # Keep only the rows whose FDR value is below 0.05.
    df_pqtl_sig = df_pqtl_sig[df_pqtl_sig['FDR'] < 0.05].copy()
    pqtls_bag.append(df_pqtl_sig)

In [20]:
# concat all pqtls
df_sig_all = pd.concat(pqtls_bag, axis=0)
print(len(pd.unique(df_sig_all['id'])))
df_sig_all.to_csv(f'pqtl_data/pqtls_decode_{pqtl_type}.csv', index=False)

12
