In [1]:
# In[]
import sys, os
sys.path.append('../')
sys.path.append('../src/')
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import time

from sklearn.decomposition import PCA
from sklearn.manifold import MDS

import scDART.diffusion_dist as diff
import scDART.dataset as dataset
import scDART.model as model
import scDART.loss as loss
import scDART.train
import scDART.TI as ti
import scDART.benchmark as bmk
import scDART.de_analy as de

from umap import UMAP

import scDART.utils as utils

import scDART.post_align as palign
from scipy.sparse import load_npz

import scanpy as sc

# Pick the compute device: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Default font size for every matplotlib figure drawn in this notebook.
plt.rcParams.update({"font.size": 20})

In [4]:
# Record the Python interpreter version this notebook was executed with.
import sys

sys.version

'3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:26:04) [GCC 10.4.0]'

In [2]:
# Pin the notebook to one GPU when CUDA is available.
#
# The device-selection cell above falls back to the CPU when no GPU is visible,
# so this cell must not call into torch.cuda unconditionally: on a CPU-only
# host torch.cuda.set_device() / current_device() raise.
#
# To restrict which physical GPUs are visible, set the environment variable
# before CUDA is first used, e.g.:
#   import os
#   os.environ['CUDA_VISIBLE_DEVICES'] = '2'
import torch

if torch.cuda.is_available():
    torch.cuda.set_device(0)
    current_gpu = torch.cuda.current_device()
else:
    # No GPU: nothing to select, computation stays on the CPU.
    current_gpu = None

# Displays the active GPU index (nothing is shown when running on the CPU).
current_gpu

0

In [2]:
# In[] scan and find the one with the highest neighborhood overlap score
# Candidate values for each hyperparameter.
seeds, latent_dims = [0, 1, 2], [4, 8, 32]
reg_ds, reg_gs = [1, 10], [0.01, 1, 10]
reg_mmds = [1, 10, 20, 30]

# The configuration used for this run, picked by position from the grids above.
seed = seeds[0]
latent_dim = latent_dims[0]
reg_d = reg_ds[0]
reg_g = reg_gs[1]
# harder to merge, need to make mmd loss larger
reg_mmd = reg_mmds[1]

# Optimisation settings.
learning_rate, n_epochs = 3e-4, 500
use_anchor = False

# Settings forwarded to the diffusion-distance computation and the training call.
ts = [30, 50, 70]
use_potential = True
norm = "l1"

# Seed every random number generator used in the notebook.
print(f"Random seed: {seed}")
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)

Random seed: 0


In [12]:
# RNA preprocessing: pick highly variable genes, normalize, log-transform, and
# export both the selected gene positions and the processed matrix as CSV.
counts_rna = sc.read("/home/xcx/MYBenchmark-datas/P0/RNA/matrix.mtx", cache = True).T

# NOTE(review): this runs on the matrix as loaded, before normalize_total/log1p.
# scanpy's default highly_variable_genes flavor is documented as expecting
# log-transformed data — confirm this ordering is intended.
sc.pp.highly_variable_genes(counts_rna, n_top_genes=1000)

# Column positions of the genes flagged as highly variable. Iterating over the
# underlying boolean array avoids integer-key lookups on a labelled Series,
# which recent pandas versions deprecate.
geneids = [
    position
    for position, is_hvg in enumerate(counts_rna.var.highly_variable.to_numpy())
    if is_hvg
]

df = pd.DataFrame(data=geneids)
df.to_csv("/home/xcx/MYBenchmark-codes/1-scDART/mydata/P0/highvar_geneids.csv", index=False)

# Keep only the selected genes, then scale each cell to 1e4 total and log1p.
counts_rna = counts_rna[:, geneids]
sc.pp.normalize_total(counts_rna, target_sum=1e4)
sc.pp.log1p(counts_rna)

# Densify to a plain ndarray (toarray) rather than the discouraged np.matrix
# that todense() returns.
counts_rna = counts_rna.X.toarray()
df = pd.DataFrame(data=counts_rna)
df.to_csv("/home/xcx/MYBenchmark-codes/1-scDART/mydata/P0/highvar_counts_rna.csv", index=False)

torch.cuda.empty_cache()

In [4]:
# ATAC preprocessing: keep the pre-selected regions, binarize, and export as CSV.
counts_atac = sc.read("/home/xcx/MYBenchmark-datas/P0/ATAC/matrix.mtx", cache = True).T

# Column positions of the regions to keep, read from the first column of a
# tab-separated file with one header row.
hihgvar_regionids = pd.read_csv('/home/xcx/MYBenchmark-codes/1-scDART/mydata/P0/highvar_regionids.csv', sep='\t', header=0).values[:,0]
counts_atac = counts_atac[:, hihgvar_regionids]

# Densify to a plain ndarray (toarray) rather than the discouraged np.matrix
# that todense() returns.
counts_atac = counts_atac.X.toarray()

# binarize the scATAC-Seq count matrix: values below 1 become 0, the rest 1
counts_atac = np.where(counts_atac < 1, 0, 1)

df = pd.DataFrame(data=counts_atac)
df.to_csv("/home/xcx/MYBenchmark-codes/1-scDART/mydata/P0/highvar_counts_atac.csv", index=False)

print(counts_atac.shape)

(5081, 24286)


In [14]:
# Sanity check: dimensions of the binarized, region-filtered ATAC matrix.
counts_atac.shape

(5081, 24286)

In [15]:
# Cell labels. Both modalities use the same label file, so parse it once and
# give the ATAC side its own copy of the array.
label_rna = pd.read_csv('/home/xcx/MYBenchmark-datas/P0/cell_label.csv', index_col=0).to_numpy()[:,0]
label_atac = label_rna.copy()

# Wrap the preprocessed count matrices for PyTorch; no anchor cells are used.
rna_dataset = dataset.dataset(counts = counts_rna, anchor = None)
atac_dataset = dataset.dataset(counts = counts_atac, anchor = None)

# Region-to-gene matrix, loaded as a dense float tensor on the compute device.
# toarray() yields a plain ndarray instead of the discouraged np.matrix.
region2gene = sc.read("/home/xcx/MYBenchmark-codes/1-scDART/mydata/P0/highvar_region2gene.mtx", cache = True).X.toarray()
coarse_reg = torch.FloatTensor(region2gene).to(device)

# A quarter of the larger dataset per batch; never below 1, because DataLoader
# rejects a batch size of 0 (which the plain division gives for < 4 cells).
batch_size = max(1, max(len(rna_dataset), len(atac_dataset)) // 4)

train_rna_loader = DataLoader(rna_dataset, batch_size = batch_size, shuffle = True)
train_atac_loader = DataLoader(atac_dataset, batch_size = batch_size, shuffle = True)

EMBED_CONFIG = {
    # layer widths of the gene-activity network: ATAC features -> RNA features
    'gact_layers': [atac_dataset.counts.shape[1], 1024, 512, rna_dataset.counts.shape[1]],
    # layer widths of the encoder: RNA features -> latent space
    'proj_layers': [rna_dataset.counts.shape[1], 512, 128, latent_dim],
    'learning_rate': learning_rate,
    'n_epochs': n_epochs + 1,
    'use_anchor': use_anchor,
    'reg_d': reg_d,
    'reg_g': reg_g,
    'reg_mmd': reg_mmd,
    'l_dist_type': 'kl',
    'device': device
}

In [24]:
# Reload the cell labels. Both modalities use the same label file, so parse it
# once and give the ATAC side its own copy of the array.
label_rna = pd.read_csv('/home/xcx/MYBenchmark-datas/P0/cell_label.csv', index_col=0).to_numpy()[:,0]
label_atac = label_rna.copy()

In [16]:
# calculate the diffusion distance
# Both modalities share every setting except the dimensionality reduction:
# PCA for RNA, LSI for ATAC.
_diffusion_kwargs = dict(ts = ts, use_potential = use_potential, n_components = 30)
dist_rna = diff.diffu_distance(rna_dataset.counts.numpy(), dr = "pca", **_diffusion_kwargs)
dist_atac = diff.diffu_distance(atac_dataset.counts.numpy(), dr = "lsi", **_diffusion_kwargs)

# Rescale each distance matrix by its own norm, then move it to the compute device.
dist_rna = torch.FloatTensor(dist_rna / np.linalg.norm(dist_rna)).to(device)
dist_atac = torch.FloatTensor(dist_atac / np.linalg.norm(dist_atac)).to(device)

# initialize the model
# Both networks are built from the layer widths in EMBED_CONFIG, with dropout disabled.
gene_act = model.gene_act(
    features = EMBED_CONFIG["gact_layers"],
    dropout_rate = 0.0,
    negative_slope = 0.2,
).to(device)
encoder = model.Encoder(
    features = EMBED_CONFIG["proj_layers"],
    dropout_rate = 0.0,
    negative_slope = 0.2,
).to(device)
model_dict = dict(gene_act = gene_act, encoder = encoder)

# One Adam optimizer per network, both with the same learning rate.
opt_genact = optim.Adam(gene_act.parameters(), lr = learning_rate)
opt_encoder = optim.Adam(encoder.parameters(), lr = learning_rate)
opt_dict = dict(gene_act = opt_genact, encoder = opt_encoder)

torch.cuda.empty_cache()

running time(sec): 164.38278985023499
running time(sec): 191.38158440589905
running time(sec): 165.74893260002136
running time(sec): 165.17644906044006
running time(sec): 160.87509632110596
running time(sec): 165.27909541130066


In [17]:
# Inspect the normalized ATAC distance matrix and check its dimensions.
print(dist_atac)
dist_atac.shape

tensor([[0.0000e+00, 3.2743e-05, 3.6373e-05,  ..., 1.8089e-05, 2.9289e-04,
         2.9873e-05],
        [3.2743e-05, 0.0000e+00, 5.0314e-06,  ..., 1.4818e-05, 3.2491e-04,
         6.0915e-05],
        [3.6373e-05, 5.0314e-06, 0.0000e+00,  ..., 1.8713e-05, 3.2869e-04,
         6.4814e-05],
        ...,
        [1.8089e-05, 1.4818e-05, 1.8713e-05,  ..., 0.0000e+00, 3.1035e-04,
         4.6284e-05],
        [2.9289e-04, 3.2491e-04, 3.2869e-04,  ..., 3.1035e-04, 0.0000e+00,
         2.6514e-04],
        [2.9873e-05, 6.0915e-05, 6.4814e-05,  ..., 4.6284e-05, 2.6514e-04,
         0.0000e+00]], device='cuda:0')


torch.Size([5081, 5081])

In [18]:
# Train the gene-activity network and the encoder, then embed both modalities.
import scDART.train as train
import os
import datetime

# NOTE(review): PyTorch documents PYTORCH_CUDA_ALLOC_CONF as configuring the CUDA
# caching allocator. Earlier cells already moved tensors to `device`, so the
# allocator is presumably initialized by now and this assignment may have no
# effect — TODO confirm, and move it above the first CUDA use if it is needed.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:16"

starttime = datetime.datetime.now()

# training models
train.match_latent(model = model_dict, opts = opt_dict, dist_atac = dist_atac, dist_rna = dist_rna, 
                data_loader_rna = train_rna_loader, data_loader_atac = train_atac_loader, n_epochs = EMBED_CONFIG["n_epochs"], 
                reg_mtx = coarse_reg, reg_d = EMBED_CONFIG["reg_d"], reg_g = EMBED_CONFIG["reg_g"], reg_mmd = EMBED_CONFIG["reg_mmd"], use_anchor = EMBED_CONFIG["use_anchor"], norm = norm, 
                mode = EMBED_CONFIG["l_dist_type"])

# Embed every cell with the trained networks. RNA goes straight through the
# encoder; ATAC is first mapped by the gene-activity network. No gradients are
# recorded inside no_grad(), so the outputs need no extra detach().
with torch.no_grad():
    z_rna = model_dict["encoder"](rna_dataset.counts.to(device)).cpu()
    z_atac = model_dict["encoder"](model_dict["gene_act"](atac_dataset.counts.to(device))).cpu()


# np.save(file = "/home/xcx/results/P0/1-scDART/z_rna_" + str(latent_dim) + "_" + str(reg_d) + "_" + str(reg_g) + "_" + str(reg_mmd) + "_" + str(seed) + "_l1.npy", arr = z_rna.numpy())
# np.save(file = "/home/xcx/results/P0/1-scDART/z_atac_" + str(latent_dim) + "_" + str(reg_d) + "_" + str(reg_g) + "_" + str(reg_mmd) + "_" + str(seed) + "_l1.npy", arr = z_atac.numpy())
# torch.save(model_dict, "/home/xcx/results/P0/1-scDART/model_" + str(latent_dim) + "_" + str(reg_d) + "_" + str(reg_g) + "_" + str(reg_mmd) + "_" + str(seed) + "_l1.pth")

# Wall-clock time for training plus embedding, in whole seconds.
# timedelta.seconds is only the seconds component (whole days are dropped);
# total_seconds() covers the full elapsed duration.
endtime = datetime.datetime.now()
print(int((endtime - starttime).total_seconds()))

epoch:  0
	 mmd loss: 0.103
	 ATAC dist loss: 0.973
	 RNA dist loss: 0.388
	 gene activity loss: 15150.331
	 anchor matching loss: 0.000
epoch:  100
	 mmd loss: 0.092
	 ATAC dist loss: 0.072
	 RNA dist loss: 0.059
	 gene activity loss: 45.939
	 anchor matching loss: 0.000
epoch:  200
	 mmd loss: 0.098
	 ATAC dist loss: 0.068
	 RNA dist loss: 0.054
	 gene activity loss: 19.816
	 anchor matching loss: 0.000
epoch:  300
	 mmd loss: 0.087
	 ATAC dist loss: 0.051
	 RNA dist loss: 0.046
	 gene activity loss: 10.784
	 anchor matching loss: 0.000
epoch:  400
	 mmd loss: 0.083
	 ATAC dist loss: 0.048
	 RNA dist loss: 0.046
	 gene activity loss: 6.519
	 anchor matching loss: 0.000
epoch:  500
	 mmd loss: 0.083
	 ATAC dist loss: 0.045
	 RNA dist loss: 0.047
	 gene activity loss: 4.162
	 anchor matching loss: 0.000
754


In [19]:
# Convert the embeddings to NumPy and write one CSV per modality (no index column).
latent_rna, latent_atac = z_rna.numpy(), z_atac.numpy()

df = pd.DataFrame(data=latent_rna)
df.to_csv("/home/xcx/results/P0/1-scDART/z_rna.csv", index=False)

df = pd.DataFrame(data=latent_atac)
df.to_csv("/home/xcx/results/P0/1-scDART/z_atac.csv", index=False)

In [22]:
# Reload the saved embeddings (written without an index column) as NumPy arrays.
z_rna = pd.read_csv("/home/xcx/results/P0/1-scDART/z_rna.csv", index_col = None).values
z_atac = pd.read_csv("/home/xcx/results/P0/1-scDART/z_atac.csv", index_col = None).values

# Read the labels with index_col=0, as the other label-loading cells in this
# notebook do. With index_col=False the first CSV column stays a data column, so
# [:, 0] would pick a different column than the one used when building the datasets.
# NOTE(review): presumably the first CSV column holds cell ids — confirm against the file.
# Both modalities use the same label file, so it is parsed once and copied.
label_rna = pd.read_csv('/home/xcx/MYBenchmark-datas/P0/cell_label.csv', index_col=0).to_numpy()[:,0]
label_atac = label_rna.copy()