# MAST Model Test
This notebook shows that the Python MAST implementation gives the same results as the original R MAST when the hurdle and normal parts use the same features.

In [1]:
import sys
# Temporarily add the parent directory to sys.path so the project-local
# `models` package can be imported, then remove the entry again so the
# path change does not persist for the rest of the notebook.
sys.path.append("..")
from models.mast import MAST
from models.nb import NB
sys.path.remove("..")

import numpy as np

In [2]:
from scipy.stats import uniform, binom, nbinom
import statsmodels.api as sm

# --- Simulated negative-binomial data -------------------------------------
# All draws below use the global NumPy RNG, so the order of the rvs() calls
# determines the simulated values. Keep it unchanged.
np.random.seed(1)  # fixed seed: makes the example reproducible
nobs = 5000        # number of simulated observations

x1 = binom.rvs(1, 0.6, size=nobs)  # binary (0/1) explanatory variable
x2 = uniform.rvs(size=nobs)        # continuous explanatory variable

# Design matrix: the two covariates plus a constant column.
X = sm.add_constant(np.column_stack((x1, x2)))

# True coefficients, linear predictor and its exponential.
beta = [1.0, 2.0, -1.5]
xb = X.dot(beta)
exb = np.exp(xb)

# Counts drawn with scipy's nbinom(n, p) parameterisation: n=exb, p=theta.
theta = 0.5
nby = nbinom.rvs(exb, theta)

# All-ones column; not used by the cells visible in this notebook.
X_infl = np.ones(nobs)

# Constant per-observation scaling factor of 3.
scaler = np.full(nobs, 3.0)

In [3]:
# MAST receives the same design matrix X both positionally and as exog_infl,
# i.e. both model parts share the same features (see the notebook intro).
mod = MAST(nby,X,exog_infl=X,scaler=scaler)
# NB baseline on the same counts and design matrix.
# NOTE(review): model_path presumably points at the directory holding the
# model definition files used by the 'stan' method - confirm in models/nb.
mod_nb = NB(nby,X,model_path='../models')

In [4]:
# Fit both models. fit() is indexed with [0] to take the first result entry
# (the batch cell later in this notebook iterates over fit()'s return value
# as a sequence of result dicts).
res0=mod.fit(method='statsmodels')[0]
res1=mod_nb.fit(method='stan',seed=1)[0]

In [5]:
res0

{'params': array([ 1.59929214,  4.79999334, -2.43572328,  3.58253395,  2.11198413,
        -1.87332867]),
 'llf_logit': -1366.0177445897789,
 'llf_normal_o': -4598.122493279009,
 'llf_normal': -10594.874425053225,
 'llf': -11960.892169643004,
 'df': 6,
 'aic': 23933.78433928601,
 'cpu_time': 0.009446144104003906,
 'model': 'mast',
 'method': 'statsmodels'}

In [6]:
res1

{'params': array([ 1.00448732,  2.02414839, -1.54176086,  2.19875645]),
 'llf': -11802.47360156414,
 'df': 4,
 'aic': 23612.94720312828,
 'cpu_time': 0.02042388916015625,
 'model': 'nb',
 'method': 'stan'}

In [7]:
# Log-likelihood gap, NB minus MAST: a positive value means NB attains the
# higher log-likelihood on this dataset.
res1['llf']-res0['llf']

158.41856807886506

As expected, MAST fits worse than NB (lower log-likelihood, higher AIC) on a dataset simulated from an NB model.

## Test init

In [8]:
# Refit the same MAST model, this time passing the previously fitted
# parameters as start_params, to exercise the initialisation path.
res2=mod.fit(method='statsmodels', start_params=res0['params'])[0]

In [9]:
res2

{'params': array([ 1.59929214,  4.79999334, -2.43572328,  3.58253395,  2.11198413,
        -1.87332867]),
 'llf_logit': -1366.0177445897789,
 'llf_normal_o': -4598.122493279009,
 'llf_normal': -10594.874425053225,
 'llf': -11960.892169643004,
 'df': 6,
 'aic': 23933.78433928601,
 'cpu_time': 0.0015387535095214844,
 'model': 'mast',
 'method': 'statsmodels'}

## Compare with MAST in R

In [10]:
import anndata as ad
import pandas as pd
import rpy2

In [11]:
from scipy.stats import uniform, binom, nbinom
import statsmodels.api as sm

# --- Simulation settings ----------------------------------------------------
np.random.seed(1)  # reseed so this section is reproducible on its own
nobs = 10000       # number of simulated observations
n_out = 5          # number of simulated response columns
theta = 0.5
beta = [1.0, 2.0, -1.5]

# --- Covariates and design matrix -------------------------------------------
# NOTE: every rvs() call below consumes the global NumPy RNG; their relative
# order (x1, x2, scaler, then the counts) must not change.
x1 = binom.rvs(1, 0.6, size=nobs)  # binary (0/1) explanatory variable
x2 = uniform.rvs(size=nobs)        # continuous explanatory variable
X = sm.add_constant(np.column_stack((x1, x2)))

xb = X.dot(beta)   # linear predictor
exb = np.exp(xb)

# Per-observation scaling factors, uniform draws multiplied by 5.
scaler = 5 * uniform.rvs(size=nobs)

# --- Responses --------------------------------------------------------------
# Y   : raw counts, passed to the Python model together with `scaler`.
# Y_r : log2(1 + count * scaler) as column vectors, passed to R via AnnData.
Y_r = []
Y = []
for i in range(n_out):
    nby = nbinom.rvs(exb, theta)
    y = np.log2(1 + nby * scaler)[:, np.newaxis]
    Y_r.append(y)
    Y.append(nby)

# --- Labels -----------------------------------------------------------------
obs_names = [f'c_{i}' for i in range(nobs)]
f_names = [f'f_{i}' for i in range(len(beta))]
var_names = [f'test{i}' for i in range(n_out)]

# --- Tabular views, indexed by observation name -----------------------------
df = pd.DataFrame(np.hstack(Y_r), index=obs_names, columns=var_names)
dfx = pd.DataFrame(X, index=obs_names, columns=f_names)

# --- AnnData container handed to the R cell ---------------------------------
adata = ad.AnnData(df)
adata.obs_names = obs_names
adata.var_names = var_names
adata.obs[f_names] = dfx         # design-matrix columns as per-observation metadata
adata.var['gene'] = var_names

In [12]:
# batch run
mod = MAST(np.array(Y).T,X,exog_infl=X,scaler=scaler)
res=mod.fit(method='statsmodels')
ws_py=np.array([r['params'] for r in res])

In [13]:
ws_py

array([[ 1.59880073,  4.55987341, -2.37068105,  3.1102041 ,  1.8905374 ,
        -1.72431215],
       [ 1.61578845,  4.54880483, -2.32088689,  3.12581285,  1.88931887,
        -1.72877259],
       [ 1.74699095,  4.71276117, -2.68971672,  3.10482235,  1.90692379,
        -1.74868137],
       [ 1.51575939,  4.69046664, -2.28989738,  3.08324527,  1.90829789,
        -1.6826716 ],
       [ 1.60648462,  4.76416127, -2.39974033,  3.11733033,  1.87007069,
        -1.71223623]])

In [14]:
import anndata2ri
# Turn on anndata2ri's rpy2 conversion so the AnnData object can be handed to
# the %%R cell below via `-i adata`.
anndata2ri.activate()

INFO:rpy2.situation:cffi mode is CFFI_MODE.ANY
INFO:rpy2.situation:R home found: /Library/Frameworks/R.framework/Resources
INFO:rpy2.situation:R library path: 
INFO:rpy2.situation:LD_LIBRARY_PATH: 
INFO:rpy2.rinterface_lib.embedded:Default options to initialize R: rpy2, --quiet, --no-save
INFO:rpy2.rinterface_lib.embedded:R is already initialized. No need to initialize.


In [15]:
%load_ext rpy2.ipython

In [16]:
%%R -i adata -o output
# `adata` comes in from Python (-i); `output` is handed back to Python (-o).
library(MAST)
# Wrap the incoming object as a MAST SingleCellAssay; check_sanity = FALSE
# skips MAST's sanity check on the expression values.
sca = SceToSingleCellAssay(adata, class = "SingleCellAssay", check_sanity = FALSE)
# Fit MAST's zlm on the three design-matrix columns stored in the cell metadata,
# with method = "glm" and ebayes = FALSE (no empirical-Bayes regularisation).
zlm.output <- zlm(~f_0+f_1+f_2, sca, method = "glm", ebayes=FALSE)
# Long-format summary table (includes primerid / component / coef columns);
# logFC = FALSE leaves out the log-fold-change computation.
output<-summary(zlm.output, logFC=FALSE)$datatable



Done!




In [17]:
# Order rows by response (primerid ascending), then by component descending.
# NOTE(review): the descending component order is presumably what lines the R
# coefficients up with the parameter layout of ws_py - confirm against MAST.
output = output.sort_values(by=['primerid', 'component'], ascending=[True, False])
# Keep only rows that carry a coefficient and lay them out one row per response.
ws_R = output.loc[output['coef'].notna(), 'coef'].to_numpy().reshape((n_out, -1))

In [18]:
ws_py

array([[ 1.59880073,  4.55987341, -2.37068105,  3.1102041 ,  1.8905374 ,
        -1.72431215],
       [ 1.61578845,  4.54880483, -2.32088689,  3.12581285,  1.88931887,
        -1.72877259],
       [ 1.74699095,  4.71276117, -2.68971672,  3.10482235,  1.90692379,
        -1.74868137],
       [ 1.51575939,  4.69046664, -2.28989738,  3.08324527,  1.90829789,
        -1.6826716 ],
       [ 1.60648462,  4.76416127, -2.39974033,  3.11733033,  1.87007069,
        -1.71223623]])

In [19]:
ws_R

array([[ 1.59880073,  4.55987341, -2.37068105,  3.1102041 ,  1.8905374 ,
        -1.72431215],
       [ 1.61578845,  4.54880482, -2.32088689,  3.12581285,  1.88931887,
        -1.72877259],
       [ 1.74699095,  4.71276115, -2.68971672,  3.10482235,  1.90692379,
        -1.74868137],
       [ 1.51575939,  4.69046663, -2.28989738,  3.08324527,  1.90829789,
        -1.6826716 ],
       [ 1.60648462,  4.76416122, -2.39974033,  3.11733033,  1.87007069,
        -1.71223623]])

In [20]:
# Make the equivalence claim executable: a fresh Restart & Run All should fail
# here if the Python and R fits ever diverge, instead of relying on a reader
# eyeballing the printed number.
# The shape check guards against the subtraction below silently broadcasting
# if the R table yields an unexpected number of coefficients.
assert ws_py.shape == ws_R.shape, (
    "coefficient tables differ in shape: {} vs {}".format(ws_py.shape, ws_R.shape)
)
# Absolute tolerance only; agreement is expected to within optimiser precision.
np.testing.assert_allclose(ws_py, ws_R, rtol=0, atol=1e-6)

# Total absolute difference across all coefficients (displayed as cell output).
np.sum(np.abs(ws_py-ws_R))

9.719137450403537e-08

This shows that the Python MAST implementation returns the same results as the original R implementation, up to numerical precision (total absolute difference of about 1e-7).