# memento SCVI comparison

In [1]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sparse
import scipy.stats as stats
import pickle as pkl
%matplotlib inline
import itertools

import statsmodels.formula.api as smf
import statsmodels.api as sm
pd.set_option('display.max_rows', 600)

In [2]:
import functools

In [3]:
import sys
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/')
import memento

In [4]:
%load_ext autoreload

In [5]:
%autoreload 2

### Preprocess

In [6]:
data_path = '/data_volume/memento/lupus/'

# Read the filtered count matrix and keep an independent copy of X in a
# 'counts' layer, which the HVG selection below reads from.
adata = sc.read(data_path + './Lupus_study_adjusted_counts_cM_filtered.h5ad')
adata.layers['counts'] = adata.X.copy()

# Snapshot the full-dimension object on .raw before genes are subset away.
adata.raw = adata

# Keep the top 10000 highly variable genes (seurat_v3 flavor, computed on the
# 'counts' layer with 'batch_cov' as the batch key) and subset adata in place.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    batch_key="batch_cov",
    flavor="seurat_v3",
    n_top_genes=10000,
    subset=True,
)

# Persist the HVG-restricted object for the downstream cells.
adata.write(data_path + './lupus_cM_hvg.h5ad')

### Run memento

In [16]:
n_trial = 10

# Shuffled-label experiment: for each trial, permute SLE_status across donors,
# run memento's 1D hypothesis test against the permuted label, and write one
# result table per trial to <data_path>scvi_comparison/.
for trial in range(n_trial):

    # Start every trial from the object on disk; the steps below modify
    # `adata` (obs columns, memento state) in place.
    adata = sc.read(data_path + './lupus_cM_hvg.h5ad')

    # One row per (donor, status) pair. drop=True keeps the cell-barcode index
    # from turning into a stray 'index' column that would otherwise be merged
    # into every cell's obs.
    donor_status = (
        adata.obs[['ind_cov', 'SLE_status']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    # A donor with more than one status would duplicate cells in the merge.
    assert donor_status['ind_cov'].is_unique, \
        'each ind_cov must map to exactly one SLE_status'

    # Permute the labels across donors. Seeding with the trial number makes
    # each trial's permutation reproducible across kernel restarts.
    shuffled_donor_status = donor_status[['ind_cov']].copy()
    shuffled_donor_status['shuffled_SLE_status'] = (
        donor_status['SLE_status'].sample(frac=1, random_state=trial).values
    )

    # A left merge keeps the row order of adata.obs but returns a fresh
    # RangeIndex; restore the original obs index so cell names are not lost.
    obs_with_shuffle = pd.merge(adata.obs, shuffled_donor_status, on='ind_cov', how='left')
    obs_with_shuffle.index = adata.obs.index
    adata.obs = obs_with_shuffle

    # Constant per-cell q, handed to memento through q_column.
    adata.obs['q'] = 0.07
    memento.setup_memento(adata, q_column='q', trim_percent=0.1, filter_mean_thresh=0.01)
    memento.create_groups(adata, label_columns=['shuffled_SLE_status', 'ind_cov'])
    memento.compute_1d_moments(adata, min_perc_group=.7)

    # Design: intercept-only covariate, and a 0/1 treatment that is 1.0 where
    # the shuffled label equals 'SLE'.
    meta_df = memento.get_groups(adata)
    meta_df['intercept'] = 1
    covariate = meta_df[['intercept']]
    treatment = (meta_df[['shuffled_SLE_status']] == 'SLE').astype(float)

    memento.ht_1d_moments(
        adata,
        covariate=covariate,
        treatment=treatment,
        num_boot=5000,
        verbose=1,
        num_cpus=50,
        resample_rep=True,
        approx=True)

    # Add FDR-corrected DE p-values and save this trial's table.
    results = memento.get_1d_ht_result(adata)
    results['de_fdr'] = memento.util._fdrcorrect(results['de_pval'])
    results.to_csv(data_path + f'scvi_comparison/memento_shuffled_{trial}.csv')

[Parallel(n_jobs=50)]: Using backend LokyBackend with 50 concurrent workers.
[Parallel(n_jobs=50)]: Done 100 tasks      | elapsed:    4.3s
[Parallel(n_jobs=50)]: Done 350 tasks      | elapsed:   13.0s
[Parallel(n_jobs=50)]: Done 700 tasks      | elapsed:   24.7s
[Parallel(n_jobs=50)]: Done 1150 tasks      | elapsed:   39.0s
[Parallel(n_jobs=50)]: Done 1700 tasks      | elapsed:   57.0s
[Parallel(n_jobs=50)]: Done 2350 tasks      | elapsed:  1.3min
[Parallel(n_jobs=50)]: Done 3100 tasks      | elapsed:  1.7min
[Parallel(n_jobs=50)]: Done 3950 tasks      | elapsed:  2.2min
[Parallel(n_jobs=50)]: Done 4900 tasks      | elapsed:  2.7min
[Parallel(n_jobs=50)]: Done 5102 out of 5102 | elapsed:  3.0min finished
[Parallel(n_jobs=50)]: Using backend LokyBackend with 50 concurrent workers.
[Parallel(n_jobs=50)]: Done 100 tasks      | elapsed:    4.3s
[Parallel(n_jobs=50)]: Done 350 tasks      | elapsed:   12.9s
[Parallel(n_jobs=50)]: Done 700 tasks      | elapsed:   24.8s
[Parallel(n_jobs=50)]: 

In [9]:
# Manual re-save of the DE table. This cell uses whatever `adata` and `trial`
# the kernel currently holds, so it only works after the loop cell has run
# (at least partially).
results = memento.get_1d_ht_result(adata)
results['de_fdr'] = memento.util._fdrcorrect(
    results['de_pval']
)
results.to_csv(
    data_path + f'scvi_comparison/memento_shuffled_{trial}.csv'
)

In [None]:
data_path = '/data_volume/memento/lupus/'

# For each shuffled trial, count the rows of the scVI result table whose
# 'is_de_fdr_0.05' flag is set.
# NOTE(review): the read path resolves to <data_path>lupus/..., i.e. a nested
# "lupus/lupus/" directory, whereas the memento tables are written to
# <data_path>scvi_comparison/ -- confirm this is the intended location.
num_de = np.zeros(n_trial)
for trial in range(n_trial):
    df = pd.read_csv(
        data_path + f'lupus/shuffled_scvi_result_{trial}.csv',
        index_col=0,
    )
    num_de[trial] = len(df[df['is_de_fdr_0.05']])
