In [1]:
import scanpy as sc
import multigrate
import gdown
import h5py
import numpy as np
from mofapy2.run.entry_point import entry_point

In [2]:
# Ask the inline plotting backend to render figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'

# MOFA 2

## Load the dataset

In [1]:
# Download the expression (RNA) AnnData file and save it as
# hao2020-expressions.h5ad in the working directory.
# NOTE(review): the recorded run reports a 1.8G download; this cell fetches it
# again on every execution, even when the file is already present.
!wget "https://hmgubox2.helmholtz-muenchen.de/index.php/s/r2W5dMJdq6mFMZY/download?path=%2Fseurat-2020&files=expressions.h5ad" -O hao2020-expressions.h5ad

--2021-01-05 04:46:52--  https://hmgubox2.helmholtz-muenchen.de/index.php/s/r2W5dMJdq6mFMZY/download?path=%2Fseurat-2020&files=expressions.h5ad
Resolving localhost (localhost)... 127.0.0.1
Connecting to localhost (localhost)|127.0.0.1|:8085... connected.
Proxy request sent, awaiting response... 200 OK
Length: 1949492332 (1.8G) [application/octet-stream]
Saving to: ‘hao2020-expressions.h5ad’


2021-01-05 04:51:08 (7.26 MB/s) - ‘hao2020-expressions.h5ad’ saved [1949492332/1949492332]



In [3]:
# Load the downloaded expression data into an AnnData object and display its
# summary (161764 cells x 4000 features in the recorded run).
scrna = sc.read_h5ad('hao2020-expressions.h5ad')
scrna

AnnData object with n_obs × n_vars = 161764 × 4000
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'cell_type'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'celltype.l1_colors', 'celltype.l2_colors', 'celltype.l3_colors', 'neighbors'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'
    varm: 'PCs', 'SPCA'
    layers: 'count'
    obsp: 'distances'

In [6]:
# Download the protein (ADT) AnnData file and save it as hao2020-proteins.h5ad
# in the working directory.
# NOTE(review): the recorded run reports an 863M download; this cell fetches it
# again on every execution, even when the file is already present.
!wget "https://hmgubox2.helmholtz-muenchen.de/index.php/s/r2W5dMJdq6mFMZY/download?path=%2Fseurat-2020&files=protein.h5ad" -O hao2020-proteins.h5ad

--2021-01-05 04:51:19--  https://hmgubox2.helmholtz-muenchen.de/index.php/s/r2W5dMJdq6mFMZY/download?path=%2Fseurat-2020&files=protein.h5ad
Resolving localhost (localhost)... 127.0.0.1
Connecting to localhost (localhost)|127.0.0.1|:8085... connected.
Proxy request sent, awaiting response... 200 OK
Length: 904554908 (863M) [application/octet-stream]
Saving to: ‘hao2020-proteins.h5ad’


2021-01-05 04:53:34 (6.42 MB/s) - ‘hao2020-proteins.h5ad’ saved [904554908/904554908]



In [4]:
# Load the downloaded protein data into an AnnData object and display its
# summary (161764 cells x 224 features in the recorded run).
cite = sc.read_h5ad('hao2020-proteins.h5ad')
cite

AnnData object with n_obs × n_vars = 161764 × 224
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'cell_type'
    var: 'features'
    uns: 'celltype.l1_colors', 'celltype.l2_colors', 'celltype.l3_colors'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'
    varm: 'APCA'
    layers: 'count'

## Configure and train the model

In [11]:
# Create the mofapy2 entry point; the cells below use this object to set the
# data, model and training options, then build, run and save the model.
ent = entry_point()


        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        


In [12]:
# Data options: both scaling switches (per group and per view) are turned off.
ent.set_data_options(scale_groups=False, scale_views=False)

In [13]:
# Register the two modalities as views 'scRNA' and 'scADT'. The outer list has
# one entry per view and each inner list one entry per group (a single group
# here).
# `.toarray()` densifies the sparse expression matrix; it replaces the
# deprecated `.A` shorthand, which SciPy documents as an alias for `.toarray()`.
# NOTE(review): no sample names are passed, so the two views are presumably
# paired by row position; this assumes `scrna` and `cite` list the cells in the
# same order -- confirm (e.g. compare their obs_names) before relying on it.
ent.set_data_matrix(
    [[scrna.X.toarray()], [cite.X]],
    views_names=['scRNA', 'scADT'],
)

Features names not provided, using default naming convention:
- feature1_view1, featureD_viewM

Groups names not provided, using default naming convention:
- group1, group2, ..., groupG

Samples names not provided, using default naming convention:
- sample1_group1, sample2_group1, sample1_group2, ..., sampleN_groupG

Successfully loaded view='scRNA' group='group0' with N=161764 samples and D=4000 features...
Successfully loaded view='scADT' group='group0' with N=161764 samples and D=224 features...




In [14]:
# Model options: start from 20 factors, with Automatic Relevance Determination
# (ARD) priors on both the factors and the weights and a spike-and-slab prior
# on the weights.
ent.set_model_options(
    factors=20,
    spikeslab_weights=True,
    ard_factors=True,
    ard_weights=True,
)

Model options:
- Automatic Relevance Determination prior on the factors: True
- Automatic Relevance Determination prior on the weights: True
- Spike-and-slab prior on the factors: False
- Spike-and-slab prior on the weights: True
Likelihoods:
- View 0 (scRNA): gaussian
- View 1 (scADT): gaussian




In [15]:
# Training options: CPU only (gpu_mode=False), fixed seed 1, "fast" convergence
# mode, with the ELBO starting at iteration 1 and reported every iteration.
# NOTE(review): `iter` is presumably the maximum number of iterations, and
# `dropR2` presumably the threshold below which a factor is dropped during
# training (the recorded log goes from 19 to 17 factors). mofapy2 prints a hint
# suggesting drop_factor_threshold = -1 with a-posteriori pruning instead --
# confirm which behaviour is intended.
ent.set_train_options(
    iter=1000,
    convergence_mode="fast",
    startELBO=1,
    freqELBO=1,
    dropR2=0.001,
    gpu_mode=False,
    verbose=False,
    seed=1,
)

Consider training the model with set drop_factor_threshold = -1 and prune them a posteriori


In [16]:
# Build the model from the data, model and training options set above.
ent.build()

In [None]:
# Train the model. This is the expensive step: in the recorded run each
# iteration took roughly 260-360 seconds.
ent.run()



######################################
## Training the model with seed 1 ##
######################################


ELBO before training: -8686136456.83 

Iteration 1: time=302.39, ELBO=478021515.21, deltaELBO=9164157972.040 (105.50326969%), Factors=19
Iteration 2: time=287.33, ELBO=501131439.81, deltaELBO=23109924.593 (0.26605528%), Factors=18
Iteration 3: time=347.39, ELBO=509667279.13, deltaELBO=8535839.323 (0.09826969%), Factors=17
Iteration 4: time=317.93, ELBO=512974113.39, deltaELBO=3306834.264 (0.03807025%), Factors=17
Iteration 5: time=261.19, ELBO=513697924.45, deltaELBO=723811.059 (0.00833295%), Factors=17
Iteration 6: time=323.67, ELBO=513887022.01, deltaELBO=189097.555 (0.00217700%), Factors=17
Iteration 7: time=357.15, ELBO=513990941.70, deltaELBO=103919.692 (0.00119639%), Factors=17
Iteration 8: time=332.98, ELBO=514070018.42, deltaELBO=79076.719 (0.00091038%), Factors=17


In [None]:
# Save the trained model to an HDF5 file; `outfile` is reused below to re-open
# the file.
outfile = 'hao2020-mofa.hdf5'
ent.save(outfile)

In [None]:
# Re-open the saved model strictly read-only. The mode is spelled out because
# h5py's default when it is omitted differs between versions (older releases
# could open the file read/write or create it), and this notebook only reads
# from the file.
# The handle is intentionally left open: later cells read datasets from `f`.
f = h5py.File(outfile, 'r')

In [None]:
# Read every 2-D dataset stored under expectations/Z, join them along axis 1
# and transpose the result.
# NOTE(review): presumably each dataset is (factors x samples) for one group,
# so `z` ends up as (samples x factors) -- confirm with `z.shape`.
z = np.concatenate(
    [group_z[:, :] for group_z in f['expectations']['Z'].values()],
    axis=1,
).T

In [None]:
# Sanity check on the factor matrix; presumably (n_cells, n_factors) -- confirm.
z.shape

In [None]:
# zs = np.concatenate([v[:] for k, v in f["samples"].items()], axis=0).astype(str)
# z = pd.DataFrame(z, index=zs).loc[adata.obs_names.values].to_numpy()

In [None]:
# adata.obsm['X_mofa'] = z
# w = np.concatenate([v[:,:] for k, v in f['expectations']['W'].items()], axis=1).T
# adata.varm['LFs'] = w

In [None]:
# np.concatenate([v[:,:] for k, v in f['expectations']['W'].items()], axis=1).T.shape

In [None]:
# f.keys()

In [None]:
# [v for k, v in f['expectations']['W'].items()]

In [None]:
# Wrap the factor matrix in an AnnData object and attach the cell-type labels
# from `scrna`.
# NOTE(review): `z` is rebound from an array to an AnnData here, so re-running
# this cell alone passes the AnnData back into `sc.AnnData`.
# NOTE(review): `.tolist()` drops the index, so labels are assigned by
# position; this assumes the rows of `z` follow the cell order of `scrna` --
# confirm.
z = sc.AnnData(z)
z.obs['cell_type'] = scrna.obs['cell_type'].tolist()

In [None]:
# Compute the neighbourhood graph on the factor AnnData, then a UMAP embedding
# from it (both with scanpy's default settings).
sc.pp.neighbors(z)
sc.tl.umap(z)

In [None]:
# Plot the UMAP embedding coloured by cell type.
sc.pl.umap(z, color=['cell_type'], ncols=1)

In [None]:
# Run PCA on the factor AnnData, then score the embedding with multigrate's
# metrics helper. Only the label-based metrics are switched on (asw_label,
# nmi_, ari_); every batch metric and isolated_label_asw are disabled.
# The same object `z` is passed as both the first and second argument.
# NOTE(review): batch_key='modality' is passed, but this notebook only adds
# 'cell_type' to z.obs. Presumably the key is ignored while all batch metrics
# are off -- confirm it is not required.
sc.pp.pca(z)
multigrate.metrics.metrics(
    z, z,
    batch_key='modality',
    label_key='cell_type',
    asw_batch=False,
    pcr_batch=False,
    graph_connectivity_batch=False,
    asw_label=True,
    nmi_=True,
    ari_=True,
    isolated_label_asw=False,
    method='MOFA'
)