In [None]:
import pandas
import pystan
import numpy
import scipy.stats

# Read the Neuron guide table: tab-separated text whose first row holds the
# column names.
NeuronTopGeneGuides = pandas.read_csv(
    "~/sgRNA/tiling/Neuron/NeuronTopGeneGuides.txt",
    header=0,
    sep="\t",
)
NeuronTopGeneGuides.head()

# Re-type the gene column as a categorical so that every gene has an integer
# code available through `.cat.codes`.
NeuronTopGeneGuides["gene"] = NeuronTopGeneGuides["gene"].astype("category")

# Stan program: two-component normal mixture over per-sgRNA observations.
# Each sgRNA i belongs to gene gene_ids[i]; with probability q[gene] its value
# is drawn from the gene-specific component N(mu_g[gene], sigma1), otherwise
# from the shared component N(mu0, sigma0).
#
# gene_ids is used as a 1-based index into q and mu_g, so it is declared with
# lower=1: an id of 0 is rejected when the data are read instead of failing
# later inside the likelihood.
mixture_model = """
data {
  int<lower=1> n_sgRNAs;           // number of sgRNAs (length of x and gene_ids)
  int<lower=1> n_genes;            // number of genes (length of mu_g and q)
  real mu1;                        // prior mean of the per-gene means mu_g
  real<lower=0, upper=1> q0;       // prior mean of q; must be < 1 (see beta prior below)
  real x[n_sgRNAs];                // observed value for each sgRNA
  int<lower=1, upper=n_genes> gene_ids[n_sgRNAs];  // 1-based gene index of each sgRNA
}
parameters {
  real mu_g[n_genes];              // mean of the gene-specific component, one per gene
  real<lower=0, upper=1> q[n_genes];  // weight of the gene-specific component, one per gene
  real<lower=0> sigma_g;           // spread of mu_g around mu1
  real<lower=0> sigma1;            // sd of the gene-specific component (shared by all genes)
  real mu0;                        // mean of the shared component
  real<lower=0> sigma0;            // sd of the shared component
}
model{
  mu_g ~ normal(mu1, sigma_g);
  mu0 ~ normal(0, 0.1);
  sigma0 ~ normal(0, 1);           // half-normal because sigma0 is constrained >= 0
  sigma_g ~ cauchy(0, 1);          // half-Cauchy because sigma_g is constrained >= 0
  // Beta(a, 4) with a = 4*q0/(1 - q0) has mean a/(a + 4) = q0.
  q ~ beta(q0*4/(1 - q0), 4);
  sigma1 ~ cauchy(0, 1);           // half-Cauchy because sigma1 is constrained >= 0
  for (i in 1:n_sgRNAs){
    // log( q * N(x | mu_g, sigma1) + (1 - q) * N(x | mu0, sigma0) )
    target += log_mix(q[gene_ids[i]], 
                      normal_lpdf(x[i] | mu_g[gene_ids[i]], sigma1), 
                      normal_lpdf(x[i] | mu0, sigma0)); 
  }
}
"""

# A gene that is missing in the input gets categorical code -1, which would
# become the invalid Stan index 0 after the +1 shift below. Fail early with a
# clear message instead.
if (NeuronTopGeneGuides['gene'].cat.codes < 0).any():
    raise ValueError("NeuronTopGeneGuides contains sgRNAs with a missing 'gene' value")

# Inputs for the Stan program; the keys must match the names declared in its
# `data {}` block.
sgRNAdata = {'n_sgRNAs' : NeuronTopGeneGuides.shape[0],
             'n_genes' : NeuronTopGeneGuides['gene'].nunique(),
             'mu1' : 1.7,   # prior mean of the per-gene means mu_g
             'q0' : 0.2,    # prior mean of the per-gene mixing weight q
             'x' : NeuronTopGeneGuides['log2fc'],
             'gene_ids' : NeuronTopGeneGuides['gene'].cat.codes + 1 # stan starts counting at 1
            }

# Compile the model and sample from it. A fixed seed makes the draws (and every
# summary derived from them) reproducible across kernel restarts.
neuron_stan_fit = pystan.stan(model_code = mixture_model,
                             data=sgRNAdata, iter=2000, chains=4,
                             seed=20240101)
print(neuron_stan_fit)

# ---- Posterior means ------------------------------------------------------
# Wrap the draws of each parameter in a DataFrame and average over draws
# (axis=0). Per-gene parameters yield one value per gene; the scalar parameters
# yield a length-1 Series.
mu_g_sample = pandas.DataFrame(neuron_stan_fit['mu_g'])
neuron_gene_means = mu_g_sample.mean(axis=0)
q_sample = pandas.DataFrame(neuron_stan_fit['q'])
neuron_mixing = q_sample.mean(axis=0)
sigma1_sample = pandas.DataFrame(neuron_stan_fit['sigma1'])
neuron_sigma1 = sigma1_sample.mean(axis=0)
sigma0_sample = pandas.DataFrame(neuron_stan_fit['sigma0'])
neuron_sigma0 = sigma0_sample.mean(axis=0)
mu0_sample = pandas.DataFrame(neuron_stan_fit['mu0'])
neuron_mu0 = mu0_sample.mean(axis=0)

# ---- Per-sgRNA probability of the gene-specific component -------------------
# Plug the posterior means into the mixture and compute, for every sgRNA,
#   q * N(x | mu_g, sigma1) / (q * N(x | mu_g, sigma1) + (1 - q) * N(x | mu0, sigma0)).
# This is evaluated for all rows at once and in log space, so it is independent
# of the DataFrame's index labels and does not turn into 0/0 = NaN when both
# densities underflow for extreme log2fc values.
gene_ids = NeuronTopGeneGuides['gene'].cat.codes
neuron_gene_idx = gene_ids.values  # 0-based position of each sgRNA's gene
if (neuron_gene_idx < 0).any():
    # code -1 marks a missing gene; indexing with it would silently pick the last gene
    raise ValueError("NeuronTopGeneGuides contains sgRNAs with a missing 'gene' value")
neuron_log2fc = NeuronTopGeneGuides['log2fc'].values.astype(float)

neuron_q_per_guide = neuron_mixing.values[neuron_gene_idx]
neuron_mu_per_guide = neuron_gene_means.values[neuron_gene_idx]

neuron_log_pos = (numpy.log(neuron_q_per_guide)
                  + scipy.stats.norm.logpdf(neuron_log2fc,
                                            neuron_mu_per_guide,
                                            float(neuron_sigma1.iloc[0])))
neuron_log_neg = (numpy.log1p(-neuron_q_per_guide)
                  + scipy.stats.norm.logpdf(neuron_log2fc,
                                            float(neuron_mu0.iloc[0]),
                                            float(neuron_sigma0.iloc[0])))
neuron_sgRNA_probs = numpy.exp(neuron_log_pos
                               - numpy.logaddexp(neuron_log_pos, neuron_log_neg))

NeuronTopGeneGuides['mixture_probs'] = neuron_sgRNA_probs
NeuronTopGeneGuides.head()

# Tab-separated export; the DataFrame index is written as the first column.
NeuronTopGeneGuides.to_csv("NeuronTopGeneGuidesMixtureProbs.txt", sep='\t')




INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_83d9cc82176032cc96b5f1f9bcbe2f0d NOW.
  tree = Parsing.p_module(s, pxd, full_module_name)


In [None]:

# Read the SelfRenewal guide table: tab-separated text whose first row holds
# the column names.
SelfRenewalTopGeneGuides = pandas.read_csv(
    "~/sgRNA/tiling/SelfRenewal/SelfRenewalTopGeneGuides.txt",
    header=0,
    sep="\t",
)
SelfRenewalTopGeneGuides.head()

# Re-type the gene column as a categorical so that every gene has an integer
# code available through `.cat.codes`.
SelfRenewalTopGeneGuides["gene"] = SelfRenewalTopGeneGuides["gene"].astype("category")


# A gene that is missing in the input gets categorical code -1, which would
# become the invalid Stan index 0 after the +1 shift below. Fail early with a
# clear message instead.
if (SelfRenewalTopGeneGuides['gene'].cat.codes < 0).any():
    raise ValueError("SelfRenewalTopGeneGuides contains sgRNAs with a missing 'gene' value")

# Inputs for the Stan program; the keys must match the names declared in its
# `data {}` block.
sgRNAdata = {'n_sgRNAs' : SelfRenewalTopGeneGuides.shape[0],
             'n_genes' : SelfRenewalTopGeneGuides['gene'].nunique(),
             'mu1' : 4.3,   # prior mean of the per-gene means mu_g
             'q0' : 0.5,    # prior mean of the per-gene mixing weight q
             'x' : SelfRenewalTopGeneGuides['log2fc'],
             'gene_ids' : SelfRenewalTopGeneGuides['gene'].cat.codes + 1 # stan starts counting at 1
            }

# neuron_stan_fit was built from the same `mixture_model` program, so its
# compiled model is reused through `fit=` instead of compiling the identical
# C++ code a second time. A fixed seed makes the draws reproducible.
selfrenewal_stan_fit = pystan.stan(fit=neuron_stan_fit,
                             data=sgRNAdata, iter=2000, chains=4,
                             seed=20240101)
print(selfrenewal_stan_fit)

# ---- Posterior means ------------------------------------------------------
# Wrap the draws of each parameter in a DataFrame and average over draws
# (axis=0). Per-gene parameters yield one value per gene; the scalar parameters
# yield a length-1 Series.
mu_g_sample = pandas.DataFrame(selfrenewal_stan_fit['mu_g'])
selfrenewal_gene_means = mu_g_sample.mean(axis=0)
q_sample = pandas.DataFrame(selfrenewal_stan_fit['q'])
selfrenewal_mixing = q_sample.mean(axis=0)
sigma1_sample = pandas.DataFrame(selfrenewal_stan_fit['sigma1'])
selfrenewal_sigma1 = sigma1_sample.mean(axis=0)
sigma0_sample = pandas.DataFrame(selfrenewal_stan_fit['sigma0'])
selfrenewal_sigma0 = sigma0_sample.mean(axis=0)
mu0_sample = pandas.DataFrame(selfrenewal_stan_fit['mu0'])
selfrenewal_mu0 = mu0_sample.mean(axis=0)

# ---- Per-sgRNA probability of the gene-specific component -------------------
# Plug the posterior means into the mixture and compute, for every sgRNA,
#   q * N(x | mu_g, sigma1) / (q * N(x | mu_g, sigma1) + (1 - q) * N(x | mu0, sigma0)).
# This is evaluated for all rows at once and in log space, so it is independent
# of the DataFrame's index labels and does not turn into 0/0 = NaN when both
# densities underflow for extreme log2fc values.
gene_ids = SelfRenewalTopGeneGuides['gene'].cat.codes
selfrenewal_gene_idx = gene_ids.values  # 0-based position of each sgRNA's gene
if (selfrenewal_gene_idx < 0).any():
    # code -1 marks a missing gene; indexing with it would silently pick the last gene
    raise ValueError("SelfRenewalTopGeneGuides contains sgRNAs with a missing 'gene' value")
selfrenewal_log2fc = SelfRenewalTopGeneGuides['log2fc'].values.astype(float)

selfrenewal_q_per_guide = selfrenewal_mixing.values[selfrenewal_gene_idx]
selfrenewal_mu_per_guide = selfrenewal_gene_means.values[selfrenewal_gene_idx]

selfrenewal_log_pos = (numpy.log(selfrenewal_q_per_guide)
                       + scipy.stats.norm.logpdf(selfrenewal_log2fc,
                                                 selfrenewal_mu_per_guide,
                                                 float(selfrenewal_sigma1.iloc[0])))
selfrenewal_log_neg = (numpy.log1p(-selfrenewal_q_per_guide)
                       + scipy.stats.norm.logpdf(selfrenewal_log2fc,
                                                 float(selfrenewal_mu0.iloc[0]),
                                                 float(selfrenewal_sigma0.iloc[0])))
selfrenewal_sgRNA_probs = numpy.exp(selfrenewal_log_pos
                                    - numpy.logaddexp(selfrenewal_log_pos, selfrenewal_log_neg))

SelfRenewalTopGeneGuides['mixture_probs'] = selfrenewal_sgRNA_probs
SelfRenewalTopGeneGuides.head()

# Tab-separated export; the DataFrame index is written as the first column.
SelfRenewalTopGeneGuides.to_csv("SelfRenewalTopGeneGuidesMixtureProbs.txt", sep='\t')