In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import subprocess
import logging
from ImmuneGWAS.helpers.getpaths import get_paths, get_sumstats_path
from ImmuneGWAS.helpers import dbsnp, ldlink
import ImmuneGWAS.resources.immune_GWAS as immune_GWAS
from ImmuneGWAS import config
from ImmuneGWAS.variant import Variant

from ImmuneGWAS.resources.eqtlgen import eqtlgen_trans_LDblock_query 
from ImmuneGWAS.resources.eqtlgen import eqtlgen_cis_LDblock_query 
from ImmuneGWAS.resources.tokyo_eqtl import tokyo_eqtl_LDblock_query
from ImmuneGWAS.resources.eqtl_cat import *
from IPython.display import display_html 
from IPython.display import display,HTML

from scipy.cluster import hierarchy

from ImmuneGWAS.helpers import get_gene_symbol  # convert ensembl to gene

In [19]:
# Location of the hits-only table (hg38, LD-pruned, going by the file name).
# NOTE(review): absolute, machine-specific path — consider deriving it from
# get_paths()/config so the notebook runs on other machines.
ldpruned_hits = "/media/cbio3/projects/antton/Immune_cell_GWAS/data/hits_only_table_hg38_LDprunned.txt"

Designing the lookup:

1) Variant object calls trans_df lookup
2) Harmonize according to EA/OA, raise exceptions for multi-allelic SNPs
3) Split trans_df into up and down
4) Return row-scaled heatmap df where cell ordering is hard-coded and rows are resorted for clustering
5) Plotting function for heatmap df
6) Get flow phenotypes
7) Get LDTrait phenotypes
8) Summarize phenotypes into tables with betas (remove duplicate phenotypes etc)
9) Get a single summary cis-eQTL table
10) Return summary report - 2 heatmaps for up and down, cis-eQTLs, phenotypes summarized

In [2]:
# Coordinates and alleles of the query variant, kept as notebook-global names.
rsid = "rs1354034"
chrom = 3
pos = 56815721
EA = 'T'  # presumably the effect allele — confirm against Variant's signature
OA = 'C'  # presumably the other (non-effect) allele — confirm

# Build the Variant object and bind it to the notebook-global name `x`.
x = Variant(rsid, chrom, pos, EA, OA)

In [69]:
# Show the GWAS phenotypes reported for `x`; the recorded output for this
# variant is an empty list.
x.get_gwas_phenotypes()

[]

In [48]:
def multi_column_df_display(list_dfs, cols=3):
    """Render several DataFrames side by side in a borderless HTML grid.

    Parameters
    ----------
    list_dfs : sequence
        Objects exposing ``to_html()`` (e.g. pandas DataFrames). They are laid
        out left-to-right, top-to-bottom.
    cols : int, default 3
        Number of tables per grid row; must be at least 1.

    Raises
    ------
    ValueError
        If ``cols`` is smaller than 1.

    Notes
    -----
    The grid is shown through IPython's ``display(HTML(...))``; nothing is
    returned.
    """
    if cols < 1:
        raise ValueError("cols must be a positive integer")

    html_table = "<table style='width:100%; border:0px'>{content}</table>"
    html_row = "<tr style='border:0px'>{content}</tr>"
    # The doubled braces survive the first .format() so that {content} can be
    # filled in per cell afterwards.
    html_cell = "<td style='width:{width}%;vertical-align:top;border:0px'>{{content}}</td>"
    html_cell = html_cell.format(width=100/cols)

    cells = [html_cell.format(content=df.to_html()) for df in list_dfs]
    # Pad only up to the next multiple of `cols`. The previous formula,
    # cols - (len % cols), appended a whole empty row whenever the number of
    # tables was already a multiple of `cols`.
    cells += (-len(cells) % cols) * [html_cell.format(content="")]
    rows = [
        html_row.format(content="".join(cells[start:start + cols]))
        for start in range(0, len(cells), cols)
    ]
    display(HTML(html_table.format(content="".join(rows))))

def _min_max_scale_rows(expr):
    """Rescale every row of ``expr`` to the [0, 1] range (min-max scaling)."""
    def _scale(row):
        low = row.min()
        span = row.max() - low
        if not span > 0:
            # Constant (or all-NaN) row: avoid dividing by zero, which would
            # put NaN/inf into the clustering below.
            return row * 0.0
        return (row - low) / span

    return expr.apply(_scale, axis=1)


def _order_rows_by_dendrogram(expr):
    """Reorder rows by Ward hierarchical clustering; needs at least two rows."""
    if len(expr) < 2:
        return expr
    links = hierarchy.linkage(expr, optimal_ordering=True, method='ward')
    leaves = hierarchy.dendrogram(links, no_plot=True)['leaves']
    return expr.iloc[leaves, :]


def _rows_for_snp(table, snp_col, rsid, keep_cols, sort_col):
    """Keep rows for ``rsid``, falling back to the first SNP listed in ``table``."""
    if table.empty:
        return pd.DataFrame(columns=keep_cols)
    selected = table[table[snp_col] == rsid]
    if selected.empty:
        # Positional access: the index is not guaranteed to contain label 0.
        fallback_rsid = table[snp_col].iloc[0]
        selected = table[table[snp_col] == fallback_rsid]
    return selected[keep_cols].sort_values(by=sort_col)


def summary_trans(var: Variant):
    """Display a cis/trans-eQTL and phenotype summary for one variant.

    Steps:
      1. Query eQTLGen trans-eQTLs for ``var`` and split the genes by the sign
         of their Z-score into "up" and "down" sets.
      2. Pull those genes from the gene-expression matrix at ``paths['ge_tokyo']``,
         min-max scale every row, put the cell types in a fixed order and
         reorder the rows by hierarchical clustering.
      3. Run the cis-eQTL queries (Tokyo, eQTLGen cis, eQTL Catalogue) and the
         LDtrait lookup, then display one table per resource plus the GWAS
         traits and ``var.get_gwas_phenotypes()``.
      4. Draw a heatmap of the cis-eQTL genes and heatmaps of the up/down
         trans genes.

    Parameters
    ----------
    var : Variant
        Query variant. Every lookup uses this object; the function no longer
        depends on a notebook-global ``x``.

    Returns
    -------
    None
        Everything is shown via ``display``/``print``/matplotlib.
    """
    # --- trans-eQTLs -------------------------------------------------------
    # Copy so the dtype/sign changes below do not touch the query result.
    trans = eqtlgen_trans_LDblock_query(var).copy()
    trans["Zscore"] = trans["Zscore"].astype(float)

    # Sign handling relative to the variant's effect allele.
    # NOTE(review): the Z-scores are flipped when *every* assessed allele
    # already equals var.EA, which looks inverted (a flip would be expected
    # when the assessed allele is var.OA), and rows with mixed alleles are
    # left untouched. Behaviour kept as in the original — confirm the intent.
    if (trans.AssessedAllele == var.EA).all():
        trans["Zscore"] = trans["Zscore"] * -1

    up_genes = trans.loc[trans.Zscore > 0, "Gene"]
    down_genes = trans.loc[trans.Zscore < 0, "Gene"]

    # Ensembl ID -> gene symbol, taken from the trans-eQTL table itself.
    mapper = dict(zip(trans.Gene, trans.GeneSymbol))

    # --- gene-expression matrix --------------------------------------------
    paths = get_paths(root=config.cbio_root)
    ge = pd.read_csv(paths['ge_tokyo'], sep='\t')
    ge = ge.set_index("Gene_id")
    ge = ge.drop("Gene_name", axis=1)
    ge = ge.drop_duplicates()

    # Genes missing from the expression matrix are dropped. A list (unique,
    # order-preserving) is used because .loc does not accept a set indexer in
    # current pandas.
    up_ids = [gene for gene in dict.fromkeys(up_genes) if gene in ge.index]
    down_ids = [gene for gene in dict.fromkeys(down_genes) if gene in ge.index]

    up_expr = ge.loc[up_ids]
    down_expr = ge.loc[down_ids]

    up_expr.index = up_expr.index.map(mapper)
    down_expr.index = down_expr.index.map(mapper)

    # Hard-coded order of cell types according to gene expression similarity
    # on the whole GE matrix.
    ordered_cols = ['Plasmablast', 'LDG', 'Neu', 'pDC', 'mDC', 'CL_Mono', 'Int_Mono',
           'CD16p_Mono', 'NC_Mono', 'Naive_B', 'USM_B', 'DN_B', 'SM_B', 'NK',
           'Mem_CD8', 'EM_CD8', 'TEMRA_CD8', 'Naive_CD8', 'Fr_I_nTreg',
           'Naive_CD4', 'Fr_III_T', 'Fr_II_eTreg', 'CM_CD8', 'Th2', 'Tfh', 'Th17',
           'Mem_CD4', 'Th1']

    # Row-wise scaling, fixed column order, then cluster-based row order.
    up_expr = _min_max_scale_rows(up_expr).loc[:, ordered_cols]
    down_expr = _min_max_scale_rows(down_expr).loc[:, ordered_cols]
    up_expr = _order_rows_by_dendrogram(up_expr)
    down_expr = _order_rows_by_dendrogram(down_expr)

    # --- cis-eQTLs and GWAS traits -----------------------------------------
    # These calls are made for their effect on var.results, which is read
    # back below (their return values were unused in the original as well).
    tokyo_eqtl_LDblock_query(var)
    eqtlgen_cis_LDblock_query(var)
    eqtl_catalogue_LDblock_query_type_restricted_multitype(var)
    ldlink.ldtrait(var)

    # Prefer rows for the query rsid; otherwise narrow down to a single rsid.
    tokyo = _rows_for_snp(
        var.results.tokyo_eqtl(),
        snp_col='Variant_ID',
        rsid=var.rsid,
        keep_cols=['Gene_name', 'Variant_ID', 'Backward_slope', 'Backward_P', 'cell_type'],
        sort_col='Gene_name',
    )

    cat = var.results.eqtl_cat()
    # Compare as strings so the filter works whether `position` was parsed as
    # text or as integers.
    cat = cat[cat.position.astype(str) == str(var.pos)]
    cat = cat[['gene_symbol', 'z', 'study', 'cell_type']]

    gwas_traits = set(var.results.ldtrait()['GWAS Trait'].tolist())

    cisgen = _rows_for_snp(
        var.results.eqtlgen_cis(),
        snp_col='SNP',
        rsid=var.rsid,
        keep_cols=['SNP', 'Zscore', 'Pvalue', 'GeneSymbol'],
        sort_col='GeneSymbol',
    )

    # --- expression of all cis-eQTL genes ----------------------------------
    eqtlgen_ids = var.results.eqtlgen_cis_df['Gene']
    # Strip the version suffix after the dot from the Tokyo gene IDs.
    tokyo_ids = var.results.tokyo_eqtl_df['Gene_id'].apply(
        lambda gene_id: gene_id.split(".")[0]
    )
    catalogue_ids = var.results.eqtl_cat_df['molecular_trait_id']

    cis_ids = pd.concat([eqtlgen_ids, tokyo_ids, catalogue_ids]).dropna().unique()
    cis_ids = [gene for gene in cis_ids if gene in ge.index]
    cis_expr = _min_max_scale_rows(ge.loc[cis_ids]).loc[:, ordered_cols]
    cis_expr.index = cis_expr.index.map(get_gene_symbol)

    # --- report ------------------------------------------------------------
    display(tokyo)
    display(cat)
    display(cisgen)
    print("GWAS Catalog\n")
    display(gwas_traits)
    print("Flow phenos\n")
    display(var.get_gwas_phenotypes())

    if len(cis_expr) > 0:
        # An empty matrix cannot be drawn as a heatmap, so only plot when
        # at least one cis gene is present in the expression matrix.
        fig_cis, ax_cis = plt.subplots()
        sns.heatmap(cis_expr, ax=ax_cis, cbar=False, cmap='inferno', square=True)
        ax_cis.set_title("cis-eQTL genes")

    # One unit of figure height per ten genes, never less than one.
    height = max(1, max(len(up_expr), len(down_expr)) // 10)

    panels = [
        (title, expr)
        for title, expr in (("UP Trans genes", up_expr), ("DOWN Trans genes", down_expr))
        if len(expr) > 0
    ]

    if not panels:
        print("no significant trans-eQTL detected")
        return

    if len(panels) == 2:
        print("both up and down trans")

    # 10 width units per panel: (20, h) for two panels, (10, h) for one.
    fig, axes = plt.subplots(1, len(panels), figsize=(10 * len(panels), height), squeeze=False)
    for ax, (title, expr) in zip(axes[0], panels):
        sns.heatmap(expr, ax=ax, cbar=False, cmap='inferno')
        ax.set_title(title)
        

In [93]:
# Switch the notebook-global query variant: these assignments overwrite the
# earlier rsid/EA/OA/pos/chrom values and rebind `x`.
rsid = 'rs8073060'
EA = 'A'  # presumably the effect allele — confirm against Variant's signature
OA = 'T'  # presumably the other (non-effect) allele — confirm
pos = 35548243
chrom = 17

# Rebuild the Variant object for the new coordinates.
x = Variant(rsid, chrom, pos, EA, OA)

In [103]:
# Read the tab-separated table stored under the 'eqtlgen_trans' path key.
trans = pd.read_csv(get_paths(root=config.cbio_root)['eqtlgen_trans'], sep ='\t')

# Show only the rows whose SNP column equals the rsid of the current variant.
trans[trans.SNP==x.rsid]

Unnamed: 0,Pvalue,SNP,SNPChr,SNPPos,AssessedAllele,OtherAllele,Zscore,Gene,GeneSymbol,GeneChr,GenePos,NrCohorts,NrSamples,FDR,BonferroniP
49949,1.646800e-257,rs8073060,17,35548243,A,T,34.2785,ENSG00000105701,FKBP8,19,18648715,35,31185,0.000000,3.352000e-249
49950,4.050900e-242,rs8073060,17,35548243,A,T,33.2295,ENSG00000158856,EPB49,8,21923272,34,26110,0.000000,8.245500e-234
49951,1.181400e-200,rs8073060,17,35548243,A,T,30.2231,ENSG00000022840,RNF10,12,120993340,35,31185,0.000000,2.404700e-192
49952,1.290300e-149,rs8073060,17,35548243,A,T,26.0518,ENSG00000167671,UBXN6,19,4451407,35,31185,0.000000,2.626400e-141
49953,3.177200e-88,rs8073060,17,35548243,A,T,19.9124,ENSG00000167992,VWCE,11,61044329,35,31185,0.000000,6.467100e-80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50200,5.652200e-06,rs8073060,17,35548243,A,T,4.5391,ENSG00000142784,WDTC1,1,27598058,35,31185,0.030999,1.000000e+00
50201,6.452300e-06,rs8073060,17,35548243,A,T,4.5112,ENSG00000145423,SFRP2,4,154706008,35,31185,0.034630,1.000000e+00
50202,6.519500e-06,rs8073060,17,35548243,A,T,-4.5087,ENSG00000198518,HIST1H4E,6,26205562,35,31185,0.034925,1.000000e+00
50203,7.734700e-06,rs8073060,17,35548243,A,T,4.4724,ENSG00000105698,USF2,19,35765299,35,31185,0.040104,1.000000e+00
