In [1]:
import numpy as np
import ctypes
from scipy.stats import chi2
from statsmodels.sandbox.stats.multicomp import fdrcorrection_twostage as fdr
import os
import argparse
import time
import matplotlib.pyplot as plt
from iotools.dosageparser import DosageParser
from iotools.readVCF import ReadVCF 
from scipy.optimize import minimize
from scipy.special import erfinv
from inference.perm_regulizer_optimizer import OptimizeRegularizer
#from inference.per_snp_optimiser import OptimizeRegularizer
from select_genes_per_snp import linear_reg
import regstat
import regnull
import hist_qq
#from qqplot import plot_hist_qq

In [2]:
def read_expression(filename):
    """Read a tab-separated expression table.

    The first line is a header whose first field is ignored and whose
    remaining fields are returned as donor ids. Every following non-blank
    line holds a gene name in the first field and one float per donor in
    the remaining fields.

    Blank lines (for example a trailing empty line at the end of the file)
    are skipped; otherwise they would contribute an empty row and an empty
    gene name, producing a ragged array.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    donorids : list of str
        Header fields after the first one.
    expression : numpy.ndarray
        One row per data line, one column per value field.
    gene_names : list of str
        First field of every data line, in file order.

    Raises
    ------
    ValueError
        If a value field cannot be converted to float.
    """
    gene_names = list()
    expression = list()
    with open(filename, 'r') as genefile:
        header = genefile.readline()
        donorids = header.strip().split("\t")[1:]
        for line in genefile:
            stripped = line.strip()
            if not stripped:
                # whitespace-only line: nothing to parse
                continue
            linesplit = stripped.split("\t")
            expression.append(np.array(linesplit[1:], dtype=float))
            gene_names.append(linesplit[0])
    expression = np.array(expression)
    return donorids, expression, gene_names


'''def norm_binom(genotype, f):
    genotype = (genotype - (2 * f)) / np.sqrt(2 * f * (1 - f))
    return genotype
'''

def parse_geno(genotype_filename, sample_filename, startsnp, endsnp):
    """Build a ``DosageParser`` for the given files / SNP window and unpack it.

    All four arguments are forwarded unchanged to ``DosageParser``.

    Returns
    -------
    tuple
        ``(dosage, snpinfo, sample_id, nsnps, nsample)`` read from the
        parser's attributes of the same names, in that order.
    """
    parser = DosageParser(genotype_filename, sample_filename, startsnp, endsnp)
    return (
        parser.dosage,
        parser.snpinfo,
        parser.sample_id,
        parser.nsnps,
        parser.nsample,
    )

                                                                                                    #quality check, matching
def quality_check ( sampleids, donorids):
    """Match expression sample ids against genotype donor ids.

    Ids present in both inputs are kept in the order in which they occur in
    ``sampleids``. For each kept id the position of its first occurrence in
    ``donorids`` and in ``sampleids`` is reported.

    Lookups go through dictionaries instead of repeated ``in`` / ``.index``
    scans, so the work is linear in the input sizes rather than quadratic.
    Ids must be hashable.

    Parameters
    ----------
    sampleids : sequence
        Ids of the expression samples.
    donorids : sequence
        Ids of the genotyped donors.

    Returns
    -------
    dosage_indices : list of int
        Positions in ``donorids`` of the shared ids.
    exprsn_indices : list of int
        Positions in ``sampleids`` of the shared ids.
    """
    # setdefault keeps the first position of a repeated id, like list.index
    donor_pos = {}
    for pos, donor in enumerate(donorids):
        donor_pos.setdefault(donor, pos)
    sample_pos = {}
    for pos, sample in enumerate(sampleids):
        sample_pos.setdefault(sample, pos)

    choose_ids     = [x for x in sampleids if x in donor_pos]
    dosage_indices = [donor_pos[i] for i in choose_ids]
    exprsn_indices = [sample_pos[i] for i in choose_ids]
    return dosage_indices, exprsn_indices

def read_distance(filename):
    """Read a tab-separated table of per-sample numeric rows.

    The first line is read and discarded. Every following non-blank line
    holds a sample label in the first field and floats in the remaining
    fields.

    Blank lines (for example a trailing empty line) are skipped; otherwise
    they would add an empty label and an empty row, producing a ragged array.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    distances : numpy.ndarray
        One row per data line.
    samplelist : list of str
        First field of every data line, in file order.

    Raises
    ------
    ValueError
        If a value field cannot be converted to float.
    """
    samplelist = list()
    distances = list()
    with open(filename, 'r') as mfile:
        mfile.readline()  # header line is not used
        for line in mfile:
            stripped = line.strip()
            if not stripped:
                continue
            linestrip = stripped.split("\t")
            samplelist.append(linestrip[0])
            distances.append(np.array(linestrip[1:],dtype=float))
    return np.array(distances), samplelist

In [3]:
# ---- Input files -----------------------------------------------------------
# (no output file prefix is configured in this notebook)
genotype_filename = (
    "data/geno/GTEx_450Indiv_genot_imput_info04_maf01_HWEp1E6_dbSNP135IDs_donorIDs_dosage_chr1.gz"
)
sample_filename = "data/geno/donor_ids.fam"
# Alternative expression input that was tried: "PCA_KNN_Correction/main/expr25.txt"
expression_filename = "Whole_Blood_Analysis.v6p.normalized.expression.txt"
# Trans-eQTL dosage file (opts.trans_genofile); not read by any cell visible here.
transgeno = (
    "data/GTEx_450Indiv_genot_imput_info04_maf01_HWEp1E6_dbSNP135IDs_donorIDs_dosage_filteredTEJAAS_transeqtls.gz"
)

# ---- SNP window forwarded to parse_geno ------------------------------------
startsnp, endsnp = 1, 2000

# ---- Sigma-beta settings ---------------------------------------------------
optim = 0                          # rebound to True by the optimisation cell
user_sigbeta = 0.001636581825407   # value reported when optimisation is skipped

In [4]:
# Load the expression table and the genotype window, then line both up on the
# donors they have in common.
# NOTE: the name `expression` bound here is rebound by `def expression(...)`
# in a later cell, so re-running this cell afterwards replaces that function.
sampleids, expression, gene_names = read_expression(expression_filename)
dosage, snpinfo, donorids, nsnps, nsample = parse_geno(
    genotype_filename, sample_filename, startsnp, endsnp
)
dosage_indices, exprsn_indices = quality_check(sampleids, donorids)

# Genotypes: dosages rounded to the nearest integer, shared donors only.
conv_dosage = np.round(dosage)
geno = conv_dosage[:, dosage_indices]

# Expression: shared donors only (a random-normal stand-in was tried here before).
expr = expression[:, exprsn_indices]

# Standardise every expression row to zero mean / unit standard deviation.
expr_mean = np.mean(expr, axis=1, keepdims=True)
expr_std = np.std(expr, axis=1, keepdims=True)
expr = (expr - expr_mean) / expr_std

# Centre every genotype row on its own mean.
geno_row_mean = np.mean(geno.T, axis=0)
geno = (geno.T - geno_row_mean).T

print ("Completed data loading and processing\n")


low maf  poly alleles  complement alleles 

940   170   270 

Total SNPs read in :  2000
Total SNPs remained :  620

Completed data loading and processing



In [5]:
def permuted_dosage(expr, nsnp = 5000, fmin = 0.1, fmax = 0.9, maketest = False):
    """Simulate random genotype dosages for ``nsnp`` SNPs.

    Only ``expr.shape[1]`` (the number of samples) is read from ``expr``.

    For every SNP an allele frequency is drawn uniformly from
    ``[fmin, fmax)``; genotype counts for 0/1/2 are drawn from a multinomial
    with Hardy-Weinberg proportions and the resulting genotype vector is
    randomly permuted across samples. With ``maketest=True`` every SNP uses
    frequency 0.1 and the fixed counts (279, 54, 5), which only fits inputs
    with 338 samples.

    Returns
    -------
    gtnorm : numpy.ndarray, shape (nsnp, nsample)
        ``(dosage - 2f) / sqrt(2f(1 - f))`` with ``f`` the per-SNP frequency.
    gtcent : numpy.ndarray, shape (nsnp, nsample)
        Dosage minus its per-SNP mean.
    """
    # The uniform draw happens in both modes, so the global RNG stream
    # advances identically whether or not maketest is set.
    f = np.random.uniform(fmin, fmax, nsnp)
    if maketest:
        f = np.repeat(0.1, nsnp)
    nsample = expr.shape[1]

    genotype_values = np.array([0, 1, 2])
    dosage = np.zeros((nsnp, nsample))
    for row, freq in enumerate(f):
        if maketest:
            nfreq = np.array([279, 54, 5])
        else:
            hom_ref = (1 - freq)**2
            het = 2 * freq * (1 - freq)
            hom_alt = freq**2
            mafratios = np.array([hom_ref, het, hom_alt])
            nfreq = np.random.multinomial(nsample, mafratios, size=1)[0]
        # nfreq[k] copies of genotype k, laid out as 0...0 1...1 2...2
        x = np.repeat(genotype_values, nfreq)
        dosage[row, :] = np.random.permutation(x)

    maf2d = f[:, np.newaxis]
    gtnorm = (dosage - (2 * maf2d)) / np.sqrt(2 * maf2d * (1 - maf2d))
    gtcent = dosage - np.mean(dosage, axis = 1, keepdims = True)

    return gtnorm, gtcent


def expression(gt, gx, beta, cfrac = 0.001):
    """Add simulated trans-eQTL effects to an expression matrix.

    Parameters
    ----------
    gt : numpy.ndarray, shape (ntrans, nsample)
        Genotypes of the trans-eQTL SNPs.
    gx : numpy.ndarray, shape (ngene, nsample)
        Baseline expression.
    beta : float
        Standard deviation of the normally distributed effect sizes.
    cfrac : float
        The number of target genes per SNP is drawn from a gamma
        distribution with shape ``ngene * cfrac`` (scale 1), truncated to
        an integer.

    Returns
    -------
    newGX : numpy.ndarray, shape (ngene, nsample)
        ``gx`` plus the simulated effects, with every row standardised to
        zero mean and unit standard deviation.
    nc : numpy.ndarray of int, shape (ntrans,)
        The drawn target counts (before capping at ``ngene``).
    """
    ntrans = gt.shape[0]
    ngene, nsample = gx.shape[0], gx.shape[1]

    # number of target genes drawn for each trans-eQTL
    nc = np.random.gamma(ngene * cfrac, scale = 1.0, size = ntrans).astype(int)

    # is_target[s, g] is True when SNP s acts on gene g
    is_target = np.zeros((ntrans, ngene), dtype = bool)
    for snp, ntarget in enumerate(nc):
        # a SNP cannot target more genes than exist
        picked = np.random.choice(ngene, min(ngene, ntarget), replace = False)
        is_target[snp, picked] = True

    liabilities = np.zeros((ngene, nsample))
    for gene in range(ngene):
        csnps = np.flatnonzero(is_target[:, gene])
        if csnps.shape[0] == 0:
            continue  # no trans-eQTL acts on this gene
        betas = np.random.normal(0, beta, size = csnps.shape[0])
        liabilities[gene, :] = np.dot(gt[csnps, :].T, betas)

    shifted = gx + liabilities
    row_mean = np.mean(shifted, axis = 1).reshape(-1, 1)
    row_std = np.std(shifted, axis = 1).reshape(-1, 1)
    return (shifted - row_mean) / row_std, nc

In [6]:
### Simulation of data
# Builds a random expression matrix Y (ngenes x nsamples) and random genotypes,
# then, when ntrans > 0, adds simulated effects of the last `ntrans` SNPs to Y.

ngenes = 500
nsamples = 200
nsnps = 100            # NOTE: rebinds `nsnps` returned by parse_geno in the data-loading cell
true_sigmabeta = 5.0#0.0006
nmin = 1
cfrac = 1              # expression() draws target counts from gamma(ngenes * cfrac)
nmax = int(ngenes)
# Not read by any other visible cell; kept because it advances the global RNG.
ncausal = np.random.randint(nmin, nmax, nsnps)
fmin = 0.1
fmax = 0.5
ntrans = 20

Y = np.random.randn(ngenes * nsamples).reshape(ngenes, nsamples)

gtnorm, gtcent = permuted_dosage(Y, nsnp = nsnps, fmin = fmin, fmax = fmax)

# Trans-eQTL
# Y is replaced only when effects were actually simulated; previously
# `Y = newgx` ran unconditionally and raised NameError for ntrans == 0.
if ntrans > 0:
    newgx,  nc = expression(gtnorm[-ntrans:, :], Y, beta=true_sigmabeta, cfrac = cfrac)
    Y = newgx

In [7]:
# Sigma-beta optimisation on the simulated data (gtcent, Y).
# `optim` is rebound to True here, overriding the 0 set in the configuration
# cell, so the `else` branch below is not reached as written.
optim = True
if optim:
    # NOTE(review): the message mentions a trans-genotype file, but this cell
    # does not open any file itself; it passes the in-memory arrays below.
    print ("\nSigma beta optimization started. Reading from the provided trans-genotype file.")
    tic = time.time()
    try:

        # Earlier variant that used the first 10 real SNPs and real expression:
        #optimize_sigbeta   = OptimizeRegularizer(geno[:10], expr, sigmax = np.std(geno[:10].T,axis = 0))
        # sigmax is the per-SNP standard deviation of the centred genotypes.
        optimize_sigbeta   = OptimizeRegularizer(gtcent, Y, sigmax = np.std(gtcent.T,axis = 0))
        # The return value of update() is kept in `l`; a later cell displays it.
        l = optimize_sigbeta.update()
        # Bare attribute access: the value is not stored or displayed here
        # (presumably a leftover from interactive inspection).
        optimize_sigbeta.niter
        sigbetas            = optimize_sigbeta.sigmareg
        #print(sigbetas)
        toc = time.time()
        print ("Sigma beta optimization completed in :", toc - tic , "seconds\n")
        print ("Optimized sigma beta value is:" , sigbetas,"\n")
        #del geno, conv_dosage, trans_dosage
    except OSError as e:
        # Only OSError is handled; any other exception propagates unchanged.
        print("\n",e, ". Trans-eQTLs file not provided for Optimization\n")
        raise SystemExit


else:
    print("\n=============================================================")
    print("\nsigma beta optimization not requested")
    # Exact float comparison against 0.006, which the message below calls the default.
    if user_sigbeta == 0.006:
        print ("\nsigma beta not provided. Using default value of 0.006")
    else:
        print("\nsigma beta value provided as : " , user_sigbeta)
    print("=============================================================")



Sigma beta optimization started. Reading from the provided trans-genotype file.
optimizing...
[59841.19757419]
[59841.19785982]
[59841.19757419]
[41796.77137179]
[41796.77137179]
[41796.77160685]
[0.599]
[24079.6119169]
[24079.6119169]
[24079.61195275]
[-1.7116633]
[23148.36660224]
[23148.36660224]
[23148.36663373]
[-2.1275322]
[21416.2786045]
[21416.2786045]
[21416.27863084]
[-3.0175844]
[20086.64434571]
[20086.64434571]
[20086.6443383]
[-4.99704573]
[19930.91257826]
[19930.91257826]
[19930.91257609]
[-4.56239356]
[19927.33117908]
[19927.33117908]
[19927.33118078]
[-4.38224143]
[19923.19344773]
[19923.19344773]
[19923.19344799]
[-4.44564849]
[19923.09327305]
[19923.09327305]
[19923.09327305]
[-4.45693425]
[19923.09317241]
[19923.09317241]
[19923.09317241]
[-4.45730144]
[19923.0931724]
[19923.0931724]
[19923.0931724]
[-4.45730393]
[19923.0931724]
[19923.0931724]
[19923.0931724]
[19923.0931724]
[19923.0931724]
[19923.0931724]
[19923.0931724]
[19923.0931724]
[19923.0931724]
[19923.09317

In [8]:
# Scratch check: natural log of a hard-coded value. Where 0.39248619818021474
# comes from is not recorded in this notebook.
np.log(0.39248619818021474)

-0.9352539062500017

In [9]:
# ## Plot

# fig = plt.figure(figsize = (12, 10))
# ax5 = fig.add_subplot(221)
# ax6 = fig.add_subplot(222)
# ax7 = fig.add_subplot(223)
# ax8 = fig.add_subplot(224)

# nbins = 150
# sizefactor = 1
# hist_qq.plot(ax5, ax6, pvals_ultra, nbins, 'uniform', loc = 0, scale = 1, size = sizefactor)

# plt.tight_layout()
# plt.show()

In [10]:
# NOTE(review): `S` is not assigned in any visible cell; the saved output of
# this cell is a NameError.
print(np.median(np.square(S)))


NameError: name 'S' is not defined

In [None]:
# NOTE(review): `pvals_ultra` is only mentioned in the commented-out plotting
# cell and is not assigned in any visible cell; this cell has no execution count.
print(pvals_ultra)

In [None]:
# NOTE(review): `Rscore` is not assigned in any visible cell; this cell has no
# execution count.
print(Rscore)

In [None]:
#print([x.rsid for x in snpinfo] )

In [None]:
# Displays the full row-centred genotype matrix built in the data-loading cell.
geno

In [None]:
# Prints the positions that would order the entries of `snpinfo` by their `rsid` attribute.
print(np.argsort([x.rsid for x in snpinfo]) )

In [None]:
# NOTE(review): the saved output of the data-loading cell reports 620 SNPs
# remaining; if `snpinfo` holds only those, index 674 is out of range -- confirm.
snpinfo[674]

In [None]:
# NOTE(review): the saved output of the data-loading cell reports 620 SNPs
# remaining; if `geno` has one row per remaining SNP, row 674 does not exist -- confirm.
geno[674]

In [None]:
# Displays `l`, the value returned by optimize_sigbeta.update() in the optimisation cell.
l

In [9]:
# Three points spaced evenly on a base-10 log scale from 10**-4 to 10**1.
np.logspace(-4,1, 3)

array([1.00000000e-04, 3.16227766e-02, 1.00000000e+01])

In [12]:
# Same exponents (-4, -1.5, 1) as the base-10 grid in the previous cell, raised on base e.
np.e**(np.linspace(-4,1, 3))

array([0.01831564, 0.22313016, 2.71828183])