# Initialisation

In [1]:
### Load modules and data

import numpy as np
import itertools as it
import matplotlib.pyplot as plt
import math
from scipy import stats
import pdb
from sklearn import preprocessing
import copy
from unittest import *
import itertools
from bidict import bidict

%matplotlib inline


## Trait simulation

### Explanation

Given genotype data and an LD structure, simulate a trait which is linearly associated with a variant, or a set of variants. Here I generate a large $n \times m$ matrix ($n$ = number of samples, $m$ = number of SNPs), whose elements are the genotypes $0, 1, 2$.

Then, I can choose a set of SNPs, and from these SNPs I generate a trait using a linear model with given coefficients $\beta$, plus zero-mean Gaussian noise whose scale (standard deviation) is set by the parameter $\epsilon$; this noise is the unexplained part of the trait.

Following this, I try to recover these sets of SNPs. I generate a p-value for the association of each SNP with the trait by fitting a separate univariate linear model for each SNP, which, as I understand it, is how summary statistics are generated.

### Implementation

In [4]:
### Sample genotypes

def simulate_genotype(n,m,geno_dist):
    """
    Simulate a genotype matrix for n samples and m SNPs.

    Every entry is drawn independently from the genotypes (0,1,2) with the
    probabilities given in geno_dist, so the same distribution is used for
    every SNP and no correlation between columns is built in.

    Parameters
    ----------
    n : int
        Number of samples (rows of the result).
    m : int
        Number of SNPs (columns of the result).
    geno_dist : sequence of three floats
        Probabilities of genotype 0, 1 and 2 respectively.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n, m) containing only 0.0, 1.0 and 2.0.
    """
    # One vectorised draw replaces n*m separate np.random.choice calls, each of
    # which re-validated geno_dist. Drawing with shape (m, n) and transposing
    # means the values are generated SNP by SNP, as the per-element loop did.
    draws = np.random.choice(a=[0,1,2], size=(m,n), p=geno_dist)
    # Return a C-contiguous float matrix, like the np.zeros-based original.
    return np.ascontiguousarray(draws.T, dtype=float)

###example
# X = simulate_genotype(n=10000,m=30,geno_dist=[0.85,0.1,0.05])

def simulate_traits(X,snp_group,eps=0.5):
    """
    Simulate a trait as a linear combination of the chosen SNPs plus Gaussian noise.

    Parameters
    ----------
    X : numpy.ndarray
        Genotype matrix with one row per sample and one column per SNP.
    snp_group : dict
        Maps column indices of X to linear model coefficients (beta values),
        e.g. {3: 0.9, 5: 0.4, 8: 0.5}.
    eps : float
        Scale (standard deviation) of the zero-mean normal noise added to each
        sample; this is the unexplained part of the trait.

    Returns
    -------
    numpy.ndarray
        1-D array with one trait value per row of X.
    """
    # Materialise the keys as a list: on Python 3, dict views can neither be
    # turned into a numeric array by np.array nor used to index columns.
    snps = list(snp_group.keys())
    # Look the coefficients up by key so they are guaranteed to line up with
    # the selected columns.
    beta = np.array([snp_group[snp] for snp in snps], dtype=float)
    noise = np.random.normal(0, eps, X.shape[0])
    return np.dot(X[:, snps], beta) + noise
    
# examples
# y = simulate_traits(X,eps=0.5,snp_group={3: 5, 9: 3})

def build_linear_models(X,y):
    """
    Fit a separate simple linear regression of the trait y on every SNP column of X.

    Returns a list holding one scipy.stats.linregress result per column, in
    column order.
    """
    fits = []
    n_snps = X.shape[1]
    for col in range(n_snps):
        genotype_column = X[:, col]
        fits.append(stats.linregress(genotype_column, y))
    return fits

# example
# models1 = build_linear_models(X,y)

def calc_effect_sizes(models):
    """
    Compute the standardised effect size, slope / stderr (i.e. beta / se(beta)),
    of each SNP towards the trait.

    models is a list of linear regression results exposing .slope and .stderr;
    the returned list has one ratio per model, in the same order.
    """
    ratios = []
    for fit in models:
        ratios.append(fit.slope / fit.stderr)
    return ratios

# example
# z1 = calc_effect_sizes(models1)



### Example

In [67]:
# Each scenario maps causal SNP column indices to their true beta values;
# every scenario adds one more causal SNP to the previous one.
snp_groups = [{1: 5}, {1: 5, 3: 6}, {1: 5, 3: 6, 15:3}, {1: 5, 3: 6, 15:3, 25:1}]

# Sample size, shared by all scenarios: set once and reused for both the
# genotype simulation and the effect-size scaling below.
n = 10000

# NOTE: no random seed is set, so the printed posteriors vary from run to run.
for g in snp_groups:
    ### simulate genotypes
    X = simulate_genotype(n=n,m=30,geno_dist=[0.85,0.1,0.05])
    ### scale columns
    X = preprocessing.scale(X)

    ### calculate LD matrix (SNP-by-SNP correlation; rowvar=0 treats columns as variables)
    LD_matrix = np.corrcoef(X,rowvar=0)

    ### simulate traits
    y = simulate_traits(X,eps=0.5,snp_group=g)
    ### scale traits
    y = preprocessing.scale(y)

    ### one univariate regression result per SNP
    t_statistics = build_linear_models(X,y)

    beta = [x.slope for x in t_statistics]
    se_beta = [x.stderr for x in t_statistics]

    ### calculate z

    z =  np.divide(beta, se_beta)

    # NOTE(review): z is computed above, but the tuple passed on uses
    # beta*sqrt(n) instead -- confirm this is the intended input.
    simulated_effectsize_data = ([x*np.sqrt(n) for x in beta], LD_matrix, n)

    # calc_variant_set_BFs and calc_posterior are defined elsewhere in the
    # notebook; k and v are passed through unchanged.
    gene_set_BFs = calc_variant_set_BFs(simulated_effectsize_data,k=5,v=0.01)

    gene_set_posteriors = calc_posterior(gene_set_BFs)
    # Formatted print gives the same text under both Python 2 and Python 3.
    print("%s %s" % (g, gene_set_posteriors[0:5]))

{1: 5} [((1,), 0.9050352515170708), ((1, 21), 0.003189029568249363), ((1, 25), 0.00318428426718765), ((1, 19), 0.003161674088441622), ((1, 4), 0.0031344545531106003)]
{1: 5, 3: 6} [((1, 3), 0.9083736228868016), ((1, 3, 13), 0.0031873943086928548), ((1, 3, 10), 0.0031412730321097606), ((1, 3, 26), 0.003138966388610109), ((1, 3, 17), 0.0031380659516668206)]
{1: 5, 3: 6, 15: 3} [((1, 3, 15), 0.9150552356086875), ((1, 3, 12, 15), 0.0031637810581904495), ((1, 3, 4, 15), 0.0031548310641800084), ((1, 3, 15, 19), 0.003154222576982418), ((1, 3, 15, 27), 0.0031541062648322094)]
{1: 5, 3: 6, 25: 1, 15: 3} [((1, 3, 15, 25), 1.0), ((1, 3, 15), 3.0459784689226484e-28), ((1, 3, 9, 15), 1.0914551735029302e-30), ((1, 3, 13, 15), 1.083211641042767e-30), ((1, 3, 15, 21), 1.0756922570942834e-30)]
