# After scMM model training, transform the output into a latent representation

In [None]:
import pandas as pd
import scanpy as sc
from scipy import sparse
from scipy.sparse import issparse

def clustering(adata, path):
    """Attach the scMM latent representation to ``adata``.

    Reads the four CSV files found under ``path`` (``lat_train_mean.csv``,
    ``lat_test_mean.csv``, ``t_id.csv`` and ``s_id.csv``), stacks the train
    and test parts, indexes the latent rows by cell id, sorts them by that id
    and stores the resulting matrix in ``adata.obsm['X_scMM']``.

    Despite its name, this function does not run any clustering; the
    neighbors/UMAP/Leiden steps are left commented out below as an optional
    recipe.

    Parameters
    ----------
    adata : AnnData
        Object that receives the embedding. It must provide
        ``adata.obsm['spatial']`` and is modified in place.
        NOTE(review): the latent rows, once sorted by ID, are assumed to be
        in the same order as the observations of ``adata`` -- confirm.
    path : str
        Directory holding the scMM output files. A trailing path separator
        is accepted but no longer required.

    Returns
    -------
    AnnData
        The same ``adata`` object, with ``obsm['X_scMM']`` set.
    """
    import os  # function-scope import keeps this block self-contained

    def read_table(filename):
        # Every scMM output table stores its row index in the first column.
        return pd.read_csv(os.path.join(path, filename), index_col=0)

    # read scMM output
    latent_train = read_table('lat_train_mean.csv')
    latent_test = read_table('lat_test_mean.csv')
    ids_train = read_table('t_id.csv')
    ids_test = read_table('s_id.csv')

    # combine latent representation with cell id
    latent = pd.concat((latent_train, latent_test))
    # The id tables carry a single column named "0"; give it a real name.
    cell_ids = pd.concat((ids_train, ids_test)).rename(columns={"0": "ID"})

    # Column-wise concat pairs each id with its latent row via the CSV row
    # index; afterwards the cell id becomes the index and defines row order.
    latent = pd.concat((cell_ids, latent), axis=1)
    latent = latent.set_index('ID').sort_index()

    latent_adata = sc.AnnData(latent,
                              latent.index.to_frame(),
                              latent.columns.to_frame())
    # Copying the coordinates also makes AnnData check that the number of
    # latent rows matches the number of observations in `adata`.
    latent_adata.obsm['spatial'] = adata.obsm['spatial']

    # add latent representation to adata
    adata.obsm['X_scMM'] = latent_adata.X

    # optional downstream analysis
    # sc.pp.neighbors(latent_adata, n_neighbors=10)
    # sc.tl.umap(latent_adata)
    # sc.tl.leiden(latent_adata, resolution=0.5)

    # sc.pl.umap(latent_adata, color='leiden')
    # sc.pl.embedding(latent_adata, basis='spatial', color='leiden', s=60)

    return adata

for path in ['../../datasets/Human_Lymph_Node_A1/','../../datasets/Human_Lymph_Node_D1/']:
    adata_RNA = sc.read_h5ad(path + '/adata_RNA.h5ad')
    adata_ADT = sc.read_h5ad(path + '/adata_ADT.h5ad')
    # if expression matrix is dense matrix, need to tranform to sparse matrix
    if not issparse(adata_RNA.X):
        adata_RNA.X = sparse.coo_matrix(adata_RNA.X)
    if not issparse(adata_ADT.X):    
        adata_ADT.X = sparse.coo_matrix(adata_ADT.X)
    # RNA
    from scipy.io import mmwrite
    import os
    
    if not os.path.exists(path + "/RNA-seq"):
        os.mkdir(path + "/RNA-seq")             
    RNA_count = adata_RNA.X.copy()
    mmwrite(path + "/RNA-seq/RNA_count.mtx", RNA_count.T)
    
    barcode = pd.DataFrame(index=adata_RNA.obs_names)
    barcode.to_csv(path + '/RNA-seq/barcode.tsv', sep='\t', header=None)
    
    gene = pd.DataFrame(index=adata_RNA.var_names)
    gene.to_csv(path + '/RNA-seq/gene.tsv', sep='\t', header=None)
    
    # ADT
    if not os.path.exists(path + "/CITE-seq"):
        os.mkdir(path + "/CITE-seq")  
    Protein_count = adata_ADT.X.copy()
    mmwrite(path + "/CITE-seq/Protein_count.mtx", Protein_count.T)
    
    barcode = pd.DataFrame(index=adata_ADT.obs_names)
    barcode.to_csv(path + '/CITE-seq/barcode.tsv', sep='\t', header=None)
    
    protein = pd.DataFrame(index=adata_ADT.var_names)
    protein.to_csv(path + '/CITE-seq/protein.tsv', sep='\t', header=None)

    !python 'scMM/main.py' --experiment 'rna_protein' --model 'rna_protein' --obj 'm_elbo_naive_warmup' --batch_size 32 --epochs 50 --deterministic_warmup 25 --lr 1e-4 --latent_dim 64 --num_hidden_layers 2 --r_hidden_dim 100 --p_hidden_dim 30 --dataset_path {path} --learn_prior
    
    train_list = 'rna_protein/'
    
    adata = sc.read_h5ad(path + '/adata_RNA.h5ad')
    adata = clustering(adata, path+train_list)

    result_path=path.replace("datasets","results")
    adata.write_h5ad(result_path + '/adata_scMM.h5ad')

True
../../datasets/Mouse_Thymus5/rna_protein
Loading  data ...
Original data contains 19737 cells x 23375 peaks
Finished loading takes 0.72 min
Loading  data ...
Original data contains 19737 cells x 128 peaks
Finished loading takes 0.05 min
RNA-seq shape is (19737, 23375)
CITE-seq shape is (19737, 128)
Namespace(experiment='rna_protein', model='rna_protein', obj='m_elbo_naive_warmup', llik_scaling=1.0, batch_size=32, epochs=50, lr=0.0001, latent_dim=64, num_hidden_layers=2, r_hidden_dim=100, p_hidden_dim=30, pre_trained='', learn_prior=True, analytics=True, print_freq=0, no_cuda=False, seed=1, dataset_path='../../datasets/Mouse_Thymus5/', r_dim=23375, p_dim=128, deterministic_warmup=25, cuda=True)
====> Epoch: 001 Train loss: 10815.2568
====>             Test loss: 8638.8658
====> Epoch: 002 Train loss: 7438.5307
====>             Test loss: 6624.4599
====> Epoch: 003 Train loss: 6084.4754
====>             Test loss: 5714.7433
====> Epoch: 004 Train loss: 5585.5595
====>             

