In [2]:
import pandas as pd
import os

In [4]:
# Directory holding the downloaded GWAS summary-statistics files.
gwas_path = 'E:/downloads/gwas'
# os.listdir returns entries in arbitrary order; sort them (case-insensitively)
# so the listing and any loop built from it run in a reproducible order.
gwas_files = sorted(os.listdir(gwas_path), key=str.lower)
print(gwas_files)

['finngen_R9_CARDIAC_ARRHYTM.gz', 'finngen_R9_E4_OBESITY.gz', 'finngen_R9_H7_RETINOPATHYDIAB.gz', 'finngen_R9_I9_CHD.gz', 'finngen_R9_I9_HEARTFAIL.gz', 'finngen_R9_I9_HYPTENS.gz', 'finngen_R9_N14_ACUTERENFAIL.gz', 'finngen_R9_N14_CHRONKIDNEYDIS.gz', 'finngen_R9_N14_GLOMER_NEPHRITIS.gz', 'TAGC_meta-analyses_results_for_asthma_risk', 'TAGC_meta-analyses_results_for_asthma_risk.zip']


In [5]:
# Keep only the directory entries whose name contains 'finngen'.
gwas_files_finngen = list(filter(lambda name: 'finngen' in name, gwas_files))

In [6]:
# Load the TAGC asthma meta-analysis results (tab-separated text).
df_asthma = pd.read_csv(
    'E:/downloads/gwas/TAGC_Multiancestry_and_European-Ancestry_Meta-analyses_Results.tsv',
    low_memory=False,
    sep='\t',
)

In [7]:
# Show the raw column names of the asthma table.
df_asthma.columns

Index(['chr', 'rsid', 'position', 'reference_allele', 'alternate_allele',
       'Multiancestry_beta_fix', 'Multiancestry_se_fix',
       'Multiancestry_pval_fix', 'Multiancestry_beta_rand',
       'Multiancestry_se_rand', 'Multiancestry_pval_rand',
       'Multiancestry_HetQtest', 'Multiancestry_df_HetQtest',
       'Multiancestry_pval_HetQtest', 'European_ancestry_beta_fix',
       'European_ancestry_se_fix', 'European_ancestry_pval_fix',
       'European_ancestry_beta_rand', 'European_ancestry_se_rand',
       'European_ancestry_pval_rand', 'European_ancestry_HetQtest',
       'European_ancestry_df_HetQtest', 'European_ancestry_pval_HetQtest'],
      dtype='object')

In [9]:
# Harmonise the asthma table to the shared layout
# (CHR, BP, SNP, A1, A2, P, BETA, SE) using the `Multiancestry_*_fix`
# statistics; all other columns are dropped by the final selection.
df_asthma = (
    df_asthma
    .rename(columns={
        'rsid': 'SNP',
        'chr': 'CHR',
        'position': 'BP',
        'alternate_allele': 'A1',
        'reference_allele': 'A2',
        'Multiancestry_pval_fix': 'P',
        'Multiancestry_beta_fix': 'BETA',
        'Multiancestry_se_fix': 'SE',
    })
    .loc[:, ['CHR', 'BP', 'SNP', 'A1', 'A2', 'P', 'BETA', 'SE']]
)

In [10]:
# Print the first five rows as a sanity check of the renamed columns.
print(df_asthma.head(n=5))

   CHR       BP        SNP A1 A2         P      BETA        SE
0    1   752566  rs3094315  A  G  0.680173 -0.008309  0.020156
1    1   779322  rs4040617  G  A  0.908569 -0.002831  0.024648
2    1   785050  rs2905062  A  G  0.901650  0.003518  0.028470
3    1   785989  rs2980300  C  T  0.900844  0.003523  0.028272
4    1  1003629  rs4075116  T  C  0.883253  0.002322  0.015811


In [11]:
# Variants with P strictly below 5e-8.
top_hits = df_asthma.loc[df_asthma['P'].lt(5e-8)]

In [12]:
# save to csv
# DataFrame.to_csv does not create missing directories, so build the output
# tree first. 'tophits' is nested inside 'disease', so this single call also
# creates the directory used for the top-hit tables.
os.makedirs('data/gwas_summary/disease/tophits', exist_ok=True)
# Full table: space-separated, gzip-compressed, missing values written as 'NA'.
df_asthma.to_csv('data/gwas_summary/disease/Asthma.txt.gz', index=False, sep=' ', compression='gzip', na_rep='NA')

In [13]:
# Write the significant asthma variants as a plain comma-separated file.
top_hits.to_csv(
    'data/gwas_summary/disease/tophits/Asthma_top.csv',
    na_rep='NA',
    index=False,
)

In [14]:
# Load the FinnGen R9 I9_CHD summary statistics (gzip-compressed, tab-separated).
df_gwas = pd.read_csv(
    'E:/downloads/gwas/finngen_R9_I9_CHD.gz',
    compression='gzip',
    low_memory=False,
    sep='\t',
)

In [15]:
# Show the raw column names of the FinnGen table.
df_gwas.columns

Index(['#chrom', 'pos', 'ref', 'alt', 'rsids', 'nearest_genes', 'pval',
       'mlogp', 'beta', 'sebeta', 'af_alt', 'af_alt_cases', 'af_alt_controls'],
      dtype='object')

In [16]:
# Harmonise the FinnGen header to the shared layout
# (CHR, BP, SNP, A1, A2, P, BETA, SE) plus EAF taken from `af_alt`;
# all other columns are dropped by the final selection.
df_gwas = (
    df_gwas
    .rename(columns={
        'rsids': 'SNP',
        '#chrom': 'CHR',
        'pos': 'BP',
        'alt': 'A1',
        'ref': 'A2',
        'pval': 'P',
        'beta': 'BETA',
        'sebeta': 'SE',
        'af_alt': 'EAF',
    })
    .loc[:, ['CHR', 'BP', 'SNP', 'A1', 'A2', 'P', 'BETA', 'SE', 'EAF']]
)

In [18]:
# Variants with P strictly below 5e-8.
top_hits = df_gwas.loc[df_gwas['P'].lt(5e-8)]

In [19]:
# Export the CHD results: the full table as a space-separated gzip file,
# and the significant subset as a plain CSV.
df_gwas.to_csv(
    'data/gwas_summary/disease/CHD.txt.gz',
    sep=' ',
    compression='gzip',
    na_rep='NA',
    index=False,
)
top_hits.to_csv(
    'data/gwas_summary/disease/tophits/CHD_top.csv',
    na_rep='NA',
    index=False,
)

In [21]:
# Convert every remaining FinnGen file to the shared layout and export
# (a) the full table and (b) the subset with P < 5e-8.

# Source column -> harmonised column; defined once because it does not
# change between files.
finngen_columns = {
    '#chrom': 'CHR',
    'pos': 'BP',
    'rsids': 'SNP',
    'alt': 'A1',
    'ref': 'A2',
    'pval': 'P',
    'beta': 'BETA',
    'sebeta': 'SE',
    'af_alt': 'EAF',
}
output_columns = ['CHR', 'BP', 'SNP', 'A1', 'A2', 'P', 'BETA', 'SE', 'EAF']

# DataFrame.to_csv does not create missing directories; 'tophits' is nested
# inside 'disease', so one call creates both.
os.makedirs('data/gwas_summary/disease/tophits', exist_ok=True)

for f in gwas_files_finngen:
    print('processing:', f)
    # ignore CHD because it is already processed
    if f == 'finngen_R9_I9_CHD.gz':
        continue
    # Strip the input's '.gz' suffix so outputs are named '<name>.txt.gz' and
    # '<name>_top.csv' instead of '<name>.gz.txt.gz' and '<name>.gz_top.csv'.
    phenotype = f[:-len('.gz')] if f.endswith('.gz') else f
    # Read only the columns that are kept; the remaining ones were discarded
    # immediately after loading anyway.
    df_gwas = pd.read_csv(
        f'{gwas_path}/{f}',
        sep='\t',
        low_memory=False,
        compression='gzip',
        usecols=list(finngen_columns),
    )
    df_gwas = df_gwas.rename(columns=finngen_columns)[output_columns]
    top_hits = df_gwas[df_gwas['P'] < 5e-8]
    print('saving results, number of SNPs:', len(df_gwas), 'top hits:', len(top_hits))
    df_gwas.to_csv(f'data/gwas_summary/disease/{phenotype}.txt.gz', index=False, sep=' ', compression='gzip', na_rep='NA')
    top_hits.to_csv(f'data/gwas_summary/disease/tophits/{phenotype}_top.csv', index=False, na_rep='NA')

processing: finngen_R9_CARDIAC_ARRHYTM.gz
saving results, number of SNPs: 20167610 top hits: 4252
processing: finngen_R9_E4_OBESITY.gz
saving results, number of SNPs: 20170233 top hits: 1775
processing: finngen_R9_H7_RETINOPATHYDIAB.gz
saving results, number of SNPs: 20169735 top hits: 24955
processing: finngen_R9_I9_CHD.gz
processing: finngen_R9_I9_HEARTFAIL.gz
saving results, number of SNPs: 20170236 top hits: 315
processing: finngen_R9_I9_HYPTENS.gz
saving results, number of SNPs: 20170234 top hits: 16643
processing: finngen_R9_N14_ACUTERENFAIL.gz
saving results, number of SNPs: 20170075 top hits: 1
processing: finngen_R9_N14_CHRONKIDNEYDIS.gz
saving results, number of SNPs: 20170113 top hits: 780
processing: finngen_R9_N14_GLOMER_NEPHRITIS.gz
saving results, number of SNPs: 20170236 top hits: 4


In [27]:
# Reload the 'GN' summary statistics from the exported, space-separated file.
# NOTE(review): no cell shown here writes 'GN.txt.gz' — presumably a manually
# renamed copy of the FinnGen GLOMER_NEPHRITIS export; confirm.
df_gwas = pd.read_csv(
    'data/gwas_summary/disease/GN.txt.gz',
    compression='gzip',
    low_memory=False,
    sep=' ',
)

In [28]:
# Variants with P strictly below 5e-6.
# NOTE(review): this is looser than the 5e-8 cut-off applied to the other
# traits — presumably deliberate; confirm.
df_top = df_gwas.loc[df_gwas['P'].lt(5e-6)]

In [29]:
# Write the selected GN variants as a plain comma-separated file.
df_top.to_csv(
    'data/gwas_summary/disease/tophits/GN_top.csv',
    na_rep='NA',
    index=False,
)