In [2]:
import sys
sys.path.insert(0,'../model')

import pickle as pkl
import pandas as pd
import numpy as np
from datasets import RealDataset
from models import CoordinatedModel
from scipy.stats import pearsonr
from scipy.stats import hypergeom, binom
from functools import reduce
import scipy.stats

In [10]:
# Input files per phenotype. Each entry of `phenos` is unpacked as the keyword
# arguments of `get_efa_df`, so the names in `_PHENO_FILE_KEYS` must match that
# function's parameters. Every path tuple below follows the key order.
_PHENO_FILE_KEYS = ('gwas_file', 'ann_file', 'efa_file', 'snp_file', 'used_file')

_PHENO_FILE_PATHS = {
    'IGF-1': (
        './snps/igf1.txt',
        './pathways/igf1.hits.closest.core.txt',
        './pkl/igf1_sinarm-igf1-range_top100.pkl',
        './snps/igf1_sinarm-igf1-range_top100.snps',
        './snps/sinarm-igf1-range_top100.txt',
    ),
    'Urate': (
        './snps/urate.txt',
        './pathways/urate.hits.closest.core.txt',
        './pkl/urate_sinarm-urate-range_top100.pkl',
        './snps/urate_sinarm-urate-range_top100.snps',
        './snps/sinarm-urate-range_top100.txt',
    ),
    'Testosterone (Male)': (
        './snps/maletest.txt',
        './pathways/testosterone.males.hits.closest.core.txt',
        './pkl/male_testosterone_sinarm-maletest-range_top100.pkl',
        './snps/male_testosterone_sinarm-maletest-range_top100.snps',
        './snps/sinarm-maletest-range_top100.txt',
    ),
    'Testosterone (Female)': (
        './snps/femaletest.txt',
        './pathways/testosterone.females.hits.closest.core.txt',
        './pkl/female_testosterone_sinarm-femaletest-range_top100.pkl',
        './snps/female_testosterone_sinarm-femaletest-range_top100.snps',
        './snps/sinarm-femaletest-range_top100.txt',
    ),
}

# Phenotype display name -> {parameter name: file path}.
phenos = {
    label: dict(zip(_PHENO_FILE_KEYS, paths))
    for label, paths in _PHENO_FILE_PATHS.items()
}

def enrich(pathway, ref, nsnps, top_snps):
    """Upper-tail hypergeometric p-value for each pathway label.

    For every label in `pathway`, returns P(X >= observed count) where X is
    hypergeometric with population size `nsnps`, `ref[label]` tagged items in
    the population, and `top_snps` draws.

    Parameters
    ----------
    pathway : mapping with `.items()` (dict or pandas Series)
        Label -> observed count among the drawn items.
    ref : mapping indexable by label (dict or pandas Series)
        Label -> count of that label in the whole population.
    nsnps : int
        Population size.
    top_snps : int
        Number of draws.

    Returns
    -------
    pandas.Series (float64), indexed by the labels of `pathway`. Empty when
    `pathway` is empty.

    Raises
    ------
    KeyError
        If a label in `pathway` is missing from `ref`.
    """
    # Use the survival function instead of `1 - cdf`: it is the same quantity,
    # but `1 - cdf` cancels catastrophically for small p-values and can return
    # exactly 0.0 (or rounding noise) for strongly enriched labels.
    res = {
        label: scipy.stats.hypergeom.sf(count - 1, nsnps, ref[label], top_snps)
        for label, count in pathway.items()
    }
    return pd.Series(res, dtype='float64')


def get_efa_df(gwas_file, ann_file, efa_file, snp_file, used_file):
    """Build the per-SNP table joining GWAS statistics, factor scores and annotations.

    Parameters
    ----------
    gwas_file : path
        Tab-separated summary statistics with (at least) the columns
        CHROM, POS, REF, A1, P and BETA.
    ann_file : path
        Tab-separated, header-less annotation file; '.' is read as missing.
        Columns 3, 7, 8 and 9 (0-based) are used as IDX, GENE, PATHWAY, DIST.
    efa_file : path
        Pickle of a fitted model object exposing `pathways` (2-D, at least two
        columns), `beta` and `weights` (2-D).
    snp_file : path
        Header-less file with one SNP label per row.
    used_file : path
        Whitespace-delimited, header-less file whose column 3 (0-based) holds
        the variant id in the form CHROM:POS_REF_A1.

    Returns
    -------
    df : pandas.DataFrame
        One row per (variant, annotation) pair. Variants without an annotation
        row are dropped by the final inner merge.
    ann : pandas.DataFrame
        Annotation rows (IDX, GENE, PATHWAY, DIST) for variants present in the
        SNP table, de-duplicated on (IDX, PATHWAY).
    nsnps : int
        Number of distinct variant ids in `df`.

    Raises
    ------
    ValueError
        If the number of rows in `efa.pathways` differs from the number of
        variants selected through `used_file`.
    """
    df = pd.read_csv(gwas_file, sep='\t')
    # Sex chromosomes are excluded; CHROM is cast to int further down.
    df = df[~df['CHROM'].isin(['X', 'Y'])]
    df = df.sort_values(by='P', kind='stable')
    df['IDX'] = df['CHROM'].astype('str') + ':' + df['POS'].astype('str') + '_' + df['REF'] + '_' + df['A1']

    # Raw string: a bare '\s' inside a normal literal is an invalid escape sequence.
    used = pd.read_csv(used_file, header=None, sep=r'\s+')
    df = used.merge(df, left_on=3, right_on='IDX')

    df['CHROM'] = df['CHROM'].astype('int')
    # Order by (CHROM, POS): sort on POS first, then a stable sort on CHROM.
    df = df.sort_values(by='POS')
    df = df.sort_values(by='CHROM', kind='stable').reset_index(drop=True)
    df = df.loc[:, ['CHROM', 'POS', 'REF', 'A1', 'P', 'BETA', 'IDX']]

    # SECURITY: unpickling executes arbitrary code; only load files you produced yourself.
    with open(efa_file, 'rb') as f:
        efa = pkl.load(f)

    # The factor scores are attached positionally below, so the row counts must agree.
    # NOTE(review): this assumes the model rows are in the (CHROM, POS) order built
    # above - confirm against the code that fitted the model.
    if len(efa.pathways) != len(df):
        raise ValueError(
            'efa.pathways has %d rows but %d SNPs were selected from %s'
            % (len(efa.pathways), len(df), used_file)
        )

    # efa.pathways -= efa.beta/2
    df['U1_orig'] = efa.pathways[:, 0]
    df['U2_orig'] = efa.pathways[:, 1]
    # Flip each row by the sign of beta (beta / |beta|); a zero beta yields NaN.
    efa.pathways *= efa.beta/np.abs(efa.beta)
    df['U1'] = efa.pathways[:, 0]
    df['U2'] = efa.pathways[:, 1]
    # Residuals: each factor minus the mean of the two factors.
    df['U1_res'] = efa.pathways[:, 0] - (efa.pathways[:, 0] + efa.pathways[:, 1])/2
    df['U2_res'] = efa.pathways[:, 1] - (efa.pathways[:, 0] + efa.pathways[:, 1])/2

    snps = pd.read_csv(snp_file, header=None)
    df['SNP'] = snps
    df['LAMBDA'] = efa.weights[0, 1]

    ann = pd.read_csv(ann_file, sep='\t', header=None, na_values='.')
    ann = ann.iloc[:, [3, 7, 8, 9]]
    ann.columns = ['IDX', 'GENE', 'PATHWAY', 'DIST']
    ann = ann.drop_duplicates(subset=['IDX', 'PATHWAY'])
    # Keep only annotation rows whose variant is in the SNP table.
    ann = ann.merge(df, left_on='IDX', right_on='IDX').loc[:, ['IDX', 'GENE', 'PATHWAY', 'DIST']]

    # ann = ann.sort_values(by = 'P')
    # ann = ann.drop_duplicates(subset= ['GENE', 'PATHWAY'])

    df = df.merge(ann, left_on='IDX', right_on='IDX')
    nsnps = len(df['IDX'].unique())
    return df, ann, nsnps


# Phenotype to analyse; must be a key of `phenos`.
pheno = 'IGF-1'
# pheno =  'Urate'
# pheno = 'Testosterone (Male)'
# pheno = 'Testosterone (Female)'
df, ann, nsnps = get_efa_df(**phenos[pheno])


# Only annotations with DIST strictly below this value are counted.
# Set to float('inf') to disable the window.
# NOTE(review): DIST is presumably in base pairs (the label below divides by 1e6
# and appends "mb") - confirm against the annotation pipeline.
gene_dist = 1e5


def _top_pathway_counts(frame, score_col, n_top, max_dist):
    """Count PATHWAY labels among the `n_top` highest-`score_col` rows with DIST < `max_dist`."""
    top = frame.sort_values(by=score_col, ascending=False).head(n_top)
    # Build the mask from `top` itself rather than relying on label alignment
    # with a mask computed on the full frame.
    return top.loc[top['DIST'] < max_dist, 'PATHWAY'].value_counts()


def _format_enrichment(counts, background, n_total, n_top):
    """Render '<hits>/<background> (<p>)' strings, one per label of `background`."""
    hits = counts.reindex(background.index).fillna(0).astype(int).astype(str)
    ratio = hits.str.cat(background.astype(str), join='outer', sep='/')
    p_text = (
        enrich(counts, background, n_total, n_top)
        .reindex(background.index)
        .fillna(1)  # labels absent from the top set get p = 1
        .round(3)
        .astype(str)
        .map('({})'.format)
    )
    return ratio.str.cat(p_text, join='outer', sep=' ')


table_list = []

for top_snps in [15]:
    u1_ann = _top_pathway_counts(df, 'U1', top_snps, gene_dist)
    u2_ann = _top_pathway_counts(df, 'U2', top_snps, gene_dist)
    all_ann = ann[ann['DIST'] < gene_dist].PATHWAY.value_counts()

    enrichment = (
        pd.DataFrame({
            'EF1 ($p$)': _format_enrichment(u1_ann, all_ann, nsnps, top_snps),
            'EF2 ($p$)': _format_enrichment(u2_ann, all_ann, nsnps, top_snps),
        })
        .fillna('0 (1)')
        # pandas >= 2.0 names the value_counts index after the column, so
        # reset_index() would emit 'PATHWAY' instead of 'index'; pin the name
        # so the set_index(['index', 'nsnps']) below works on every version.
        .rename_axis('index')
        .reset_index()
        .assign(nsnps='%d snps' % top_snps)
    )

    table_list.append(enrichment)


# Combine the per-`top_snps` tables, then pivot to one row per pathway with
# (nsnps, factor) column pairs.
table = reduce(lambda df1, df2: df1.merge(df2, how="outer"), table_list)
table = table.set_index(['index', 'nsnps']).stack().unstack([1, 2])
table.index.name = None
table['Window'] = '{}mb'.format(gene_dist/1e6)
table

nsnps,15 snps,15 snps,Window
Unnamed: 0_level_1,EF1 ($p$),EF2 ($p$),Unnamed: 3_level_1
Downstream signaling,2/3 (0.063),0/3 (1.0),0.1mb
Growth hormone secretion,0/4 (1.0),1/4 (0.499),0.1mb
IGF-1 secretion,1/4 (0.499),1/4 (0.499),0.1mb
IGF-1 serum balance,1/7 (0.708),3/7 (0.074),0.1mb
No pathway,0/1 (1.0),0/1 (1.0),0.1mb
Ras signaling,1/5 (0.581),1/5 (0.581),0.1mb


In [187]:
# Rows with the smallest U1_res among the first `top_snps`, restricted to
# annotations closer than `gene_dist`.
# NOTE(review): the saved output lists Solute_transport / SLC2A9 rows, which do
# not appear among the IGF-1 pathways tabulated above - it was apparently
# produced with a different `pheno`; re-run before relying on it.
lowest_u1_res = df.sort_values(by='U1_res', ascending=True).head(top_snps)
lowest_u1_res.loc[lowest_u1_res['DIST'] < gene_dist]

Unnamed: 0,CHROM,POS,REF,A1,P,BETA,IDX,U1_orig,U2_orig,U1,U2,U1_res,U2_res,SNP,LAMBDA,GENE,PATHWAY,DIST
32,4,9721358,G,A,0.0,0.022399,4:9721358_G_A,-0.031017,0.091575,-0.031017,0.091575,-0.061296,0.061296,rs13129134_A,-0.285857,SLC2A9,Solute_transport,0
39,4,10244668,C,T,3.7399999999999997e-44,-0.148939,4:10244668_C_T,0.031469,-0.06592,-0.031469,0.06592,-0.048695,0.048695,rs570500361_T,-0.285857,SLC2A9,Solute_transport,88109
38,4,10160181,A,G,3.6499999999999998e-56,-0.19014,4:10160181_A_G,0.03123,-0.065443,-0.03123,0.065443,-0.048336,0.048336,rs763487892_G,-0.285857,SLC2A9,Solute_transport,3622
34,4,9860930,G,A,2.8300000000000003e-39,-0.151825,4:9860930_G_A,0.018776,-0.038495,-0.018776,0.038495,-0.028635,0.028635,rs557066340_A,-0.285857,SLC2A9,Solute_transport,0
54,6,25807603,A,G,2.81e-210,0.017394,6:25807603_A_G,-0.001466,0.055394,-0.001466,0.055394,-0.02843,0.02843,rs2817188_G,-0.285857,SLC17A1,Solute_transport,0
33,4,9751995,C,T,1.25e-32,-0.133196,4:9751995_C_T,0.016289,-0.036886,-0.016289,0.036886,-0.026587,0.026587,rs528465073_T,-0.285857,SLC2A9,Solute_transport,0


In [175]:
# Emit the enrichment table as LaTeX source. escape=False keeps the "$p$" in
# the "EF1 ($p$)" / "EF2 ($p$)" headers as math mode instead of escaping it.
# NOTE(review): the saved output shows 10/15/20 snps and a 10.0mb window, which
# does not match the `[15]` / `gene_dist = 1e5` settings of the table-building
# cell - re-run before copying it into a manuscript.
latex_source = table.to_latex(escape=False)
print(latex_source)

\begin{tabular}{llllllll}
\toprule
nsnps & \multicolumn{2}{l}{10 snps} & \multicolumn{2}{l}{15 snps} & \multicolumn{2}{l}{20 snps} &  Window \\
{} &     EF1 ($p$) &     EF2 ($p$) &     EF1 ($p$) &      EF2 ($p$) &     EF1 ($p$) & \multicolumn{2}{l}{EF2 ($p$)} \\
\midrule
Purine_metabolism &    0/39 (1.0) &  1/39 (0.995) &  3/39 (0.979) &   3/39 (0.979) &   6/39 (0.89) &   5/39 (0.961) &  10.0mb \\
Solute_transport  &  8/34 (0.003) &    9/34 (0.0) &  9/34 (0.026) &  11/34 (0.001) &  9/34 (0.194) &  13/34 (0.002) &  10.0mb \\
\bottomrule
\end{tabular}



  print(table.to_latex(escape=False))


In [24]:
# Write the merged per-SNP pathway table as a tab-separated file, without the
# row index.
# NOTE(review): the file name is fixed to IGF-1 and does not follow `pheno`;
# change it when exporting another phenotype to avoid overwriting this file.
pathway_tsv = './gwas/igf1_pathways.tsv'
df.to_csv(pathway_tsv, sep='\t', index=False)