In [None]:
import warnings
warnings.simplefilter(action='ignore')
import scanpy as sc
import torch
import scarches as sca
import numpy as np
import gdown
import pandas as pd
from collections import defaultdict,Counter
import gc
import matplotlib.pyplot as plt
import pickle
# Configure scanpy figures in ONE call. Every call to `sc.set_figure_params`
# re-applies its defaults for the arguments that are not passed, so separate
# calls undo each other (a later call resets `frameon` and `dpi`).
sc.set_figure_params(frameon=False, dpi=200, figsize=(4, 4))
# Compact tensor printing: 3 decimals, no scientific notation, 7 edge items.
torch.set_printoptions(precision=3, sci_mode=False, edgeitems=7)

In [None]:
# Show the current working directory; the relative '../' paths used below are resolved against it.
pwd

In [6]:
# Load the combined AnnData object; `adata_gp` is subset from it further down.
adata = sc.read_h5ad('../MOFA/adata_combined_more_hvg_filtered.h5ad')

In [17]:
# Load the MOFA outputs. Each CSV carries its row labels in the 'Unnamed: 0' column.
# `genes` holds one column per program (sliced and flattened below),
# `factors` is later joined onto adata_gp.obs by cell name.
# NOTE(review): `weights` is not read by any visible cell — confirm it is needed.
weights = pd.read_csv('../MOFA/MOFA_weights.csv', index_col='Unnamed: 0')
genes = pd.read_csv('../MOFA/MOFA_latent_factors_genes.csv', index_col='Unnamed: 0')
factors = pd.read_csv('../MOFA/MOFA_factors.csv', index_col='Unnamed: 0')

In [25]:
# Keep the first 500 rows of the per-program gene table.
genes_selected = genes.iloc[:500]

In [27]:
# Union of all genes that appear in any selected program.
# `dict.fromkeys` de-duplicates while keeping first-appearance order, so the
# gene axis (and therefore the mask and the model input) is identical on every
# kernel restart; iterating a `set` of strings is not order-stable across runs.
all_genes = [
    x
    for x in dict.fromkeys(genes_selected.values.flatten().astype(str))
    if x != 'nan'  # missing entries become the string 'nan' after astype(str)
]
# Gene-by-program membership matrix: 1 where the gene is listed for the program.
binary_matrix = pd.DataFrame(0, index=all_genes, columns=genes_selected.columns)
for column in genes_selected.columns:
    genes_in_program = genes_selected[column].dropna()
    binary_matrix.loc[genes_in_program, column] = 1

In [30]:
# Restrict to the program genes. Take a real copy instead of a view: the
# following cells assign to `.varm`, `.uns` and `.X`, and writing to a view
# makes anndata convert it to a copy implicitly on first assignment.
adata_gp = adata[:, all_genes].copy()

In [31]:
# Attach the gene-by-program membership matrix as a plain array under varm['I'].
adata_gp.varm['I'] = binary_matrix.to_numpy(copy=True)
# Boolean flag per program: True when the program contains more than 12 genes.
# NOTE(review): `select_terms` is not read by any visible cell before it is reassigned.
select_terms = adata_gp.varm['I'].sum(axis=0) > 12

In [34]:
# Record the program names alongside the mask.
adata_gp.uns['terms'] = binary_matrix.columns
# Drop genes that belong to no program (rows of the mask that sum to zero).
adata_gp._inplace_subset_var(adata_gp.varm['I'].sum(axis=1) > 0)

In [None]:
# Display the stored program (term) names.
adata_gp.uns['terms']

In [None]:
# Sanity check on a 10% subsample: are the values in X integer-valued, and what range do they span?
# Assumes X is a sparse matrix (it is densified with .toarray()).
print('Checking X')
subset = sc.pp.subsample(adata_gp, fraction=0.1, copy=True)
raw_counts = subset.X.toarray()
print(f"Are raw counts integers? {np.all(raw_counts == raw_counts.astype(int))}")
print(f"Range of raw counts: {np.min(raw_counts)} to {np.max(raw_counts)}")
print("-" * 50)

In [None]:
# Build an EXPIMAP model on adata_gp, conditioned on the 'batch_covariate' obs
# column, with hidden_layer_sizes [300, 300, 300] and 'nb' reconstruction loss.
# NOTE(review): no mask is passed explicitly; presumably EXPIMAP picks up the
# membership matrix stored in adata_gp.varm['I'] — confirm against the scArches docs.
intr_cvae = sca.models.EXPIMAP(
    adata=adata_gp,
    condition_key='batch_covariate',
    hidden_layer_sizes=[300, 300, 300],
    recon_loss='nb'
)

# Cast X to float32. Note this happens after the model object was created.
adata_gp.X = adata_gp.X.astype('float32')

# Passed as `alpha` to train() below.
ALPHA = 0.7
# Early stopping watches 'val_unweighted_loss' with patience 50; learning-rate
# reduction is enabled with its own patience (13) and factor (0.1).
early_stopping_kwargs = {
    "early_stopping_metric": "val_unweighted_loss",
    "threshold": 0,
    "patience": 50,
    "reduce_lr": True,
    "lr_patience": 13,
    "lr_factor": 0.1,
}
# Train for up to 200 epochs with early stopping enabled and a fixed seed.
intr_cvae.train(
    n_epochs=200,
    alpha_epoch_anneal=50,
    alpha=ALPHA,
    alpha_kl=0.5,
    weight_decay=0.,
    early_stopping_kwargs=early_stopping_kwargs,
    use_early_stopping=True,
    seed=2020
)

In [41]:
# Store the model's latent representation in obsm
# (mean=False / only_active=True are passed to get_latent as-is).
adata_gp.obsm['X_cvae'] = intr_cvae.get_latent(mean=False, only_active=True)
# Keep the term names as a plain list. `list(...)` accepts the pandas Index
# assigned earlier as well as an already-converted list, so re-running this
# cell does not fail the way calling `.tolist()` on a list would.
adata_gp.uns['terms'] = list(adata_gp.uns['terms'])

In [43]:
# Persist the annotated object and the trained model.
# NOTE(review): at this point no UMAP has been computed yet; the object is written again after plotting.
adata_gp.write('../Expimap/int_10_factors.h5ad')
intr_cvae.save('../Expimap/expimap_10_factors')

In [None]:
# Neighbour graph on the EXPIMAP latent space, then a UMAP embedding from that graph.
sc.pp.neighbors(adata_gp, use_rep='X_cvae')
sc.tl.umap(adata_gp)

In [48]:
# Harmonise Level_1 labels, applying one rename at a time in the listed order.
for old_label, new_label in (
    ('NK Cell', 'Natural Killer'),
    ('Ductal Cell/Malignant', 'Malignant'),
    ('Intra-pancreatic Neurons', 'Neuronal Cell'),
):
    adata_gp.obs['Level_1'] = adata_gp.obs['Level_1'].replace(old_label, new_label)

In [None]:
# UMAP coloured by batch, cell type and condition, one panel per row.
# Cells are plotted in a random order (presumably so no group is systematically drawn on top).
plt.rcParams['figure.figsize'] = (6, 6)
random_indices = np.random.permutation(adata_gp.n_obs)
sc.pl.umap(
    adata_gp[random_indices, :],
    color=['batch_covariate', 'Level_1', 'Condition'],
    frameon=False,
    ncols=1,
)

In [47]:
# Write the object again to the same path, now including the neighbours/UMAP results and renamed labels.
adata_gp.write('../Expimap/int_10_factors.h5ad')

In [46]:
# Repeat the integration using donor ID combined with batch as the condition

In [None]:
# Build a combined donor/batch condition column: '<ID>_<batch_covariate>'.
# Both inputs are cast to str first so they can be concatenated.
adata_gp.obs['ID'] = adata_gp.obs['ID'].astype(str)
adata_gp.obs['batch_covariate'] = adata_gp.obs['batch_covariate'].astype(str)
adata_gp.obs['ID_batch_covariate'] = (
    adata_gp.obs['ID'] + '_' + adata_gp.obs['batch_covariate']
).astype('category')
# ID goes back to categorical; batch_covariate is left as str.
adata_gp.obs['ID'] = adata_gp.obs['ID'].astype('category')

In [64]:
# adata_gp.obs.ID.unique()

In [None]:
# Build a second EXPIMAP model on the same adata_gp, this time conditioned on
# the combined 'ID_batch_covariate' column. This rebinds `intr_cvae`, so the
# previously trained model is no longer reachable under that name.
# NOTE(review): no mask is passed explicitly; presumably EXPIMAP picks up the
# membership matrix stored in adata_gp.varm['I'] — confirm against the scArches docs.
intr_cvae = sca.models.EXPIMAP(
    adata=adata_gp,
    condition_key='ID_batch_covariate',
    hidden_layer_sizes=[300, 300, 300],
    recon_loss='nb'
)

# Cast X to float32. Note this happens after the model object was created.
adata_gp.X = adata_gp.X.astype('float32')

# Passed as `alpha` to train() below.
ALPHA = 0.7
# Early stopping watches 'val_unweighted_loss' with patience 50; learning-rate
# reduction is enabled with its own patience (13) and factor (0.1).
early_stopping_kwargs = {
    "early_stopping_metric": "val_unweighted_loss",
    "threshold": 0,
    "patience": 50,
    "reduce_lr": True,
    "lr_patience": 13,
    "lr_factor": 0.1,
}
# Train for up to 200 epochs with early stopping enabled and a fixed seed.
intr_cvae.train(
    n_epochs=200,
    alpha_epoch_anneal=50,
    alpha=ALPHA,
    alpha_kl=0.5,
    weight_decay=0.,
    early_stopping_kwargs=early_stopping_kwargs,
    use_early_stopping=True,
    seed=2020
)

In [67]:
# Overwrite obsm['X_cvae'] with the latent representation of the donor-conditioned model.
adata_gp.obsm['X_cvae'] = intr_cvae.get_latent(mean=False, only_active=True)
# The terms conversion is disabled here; uns['terms'] was already converted to a list in the earlier run.
# adata_gp.uns['terms'] = adata_gp.uns['terms'].tolist()

In [72]:
# Persist the donor-conditioned object. The model save is commented out, so this model is not written to disk.
# NOTE(review): in top-to-bottom order this write happens before neighbours/UMAP are recomputed in the next cell.
adata_gp.write('../Expimap/int_10_factors_donor.h5ad')
# intr_cvae.save('../Expimap/expimap_10_factors_donor')

In [69]:
# Neighbour graph on the donor-conditioned latent space, then a UMAP embedding.
sc.pp.neighbors(adata_gp, use_rep='X_cvae')
sc.tl.umap(adata_gp)
# Persist again now that neighbours/UMAP have been recomputed from the new
# latent space: the next cell reloads this file and plots its UMAP, and the
# copy written before this cell still holds the embedding from the previous model.
adata_gp.write('../Expimap/int_10_factors_donor.h5ad')

In [3]:
# Reload the donor-conditioned object from disk (replaces the in-memory adata_gp).
adata_gp = sc.read_h5ad('../Expimap/int_10_factors_donor.h5ad')

In [None]:
# Large UMAP panels with labels drawn on the data; cells are plotted in a random order.
plt.rcParams['figure.figsize'] = (14, 14)
random_indices = np.random.permutation(adata_gp.n_obs)
sc.pl.umap(
    adata_gp[random_indices, :],
    color=['batch_covariate', 'Level_1', 'Condition'],
    frameon=False,
    ncols=1,
    legend_loc='on data',
    legend_fontsize=8,
    size=3,
)

In [24]:
# Keep cells that are neither flagged as outliers (outlier == '1') nor labelled with an ambiguous Level_1 class.
# NOTE(review): this subset is taken before the later cell joins `factors` onto
# adata_gp.obs — confirm the Factor columns plotted further down are available on it.
cells_to_remove = ['Ambiguous_Immune', 'Ambiguous_Stromal', 'Ambiguous_Epithelial']
adata_filtered = adata_gp[
    (adata_gp.obs.outlier != '1') & ~adata_gp.obs.Level_1.isin(cells_to_remove)
]

In [21]:
# Align the factor table to the cells of adata_gp: rows follow obs_names, and
# cells absent from `factors` get NaN rows.
factors = factors.reindex(adata_gp.obs_names)

In [23]:
# Attach the factor columns to obs. Any factor columns already present are
# dropped first, so re-running this cell refreshes them instead of raising on
# overlapping column names; on a first run nothing is dropped.
adata_gp.obs = adata_gp.obs.drop(columns=factors.columns, errors='ignore').join(factors)

In [None]:
# UMAP of the filtered cells coloured by Level_1, plotted in a random order with labels on the data.
random_indices = np.random.permutation(adata_filtered.n_obs)
sc.pl.umap(
    adata_filtered[random_indices, :],
    color=['Level_1'],
    frameon=False,
    ncols=1,
    size=3,
    legend_loc='on data',
    legend_fontsize=8,
)

In [None]:
# Same UMAP, highlighting only the 'T Cell' group. Reuses `random_indices` from the previous cell.
sc.pl.umap(adata_filtered[random_indices, :], color=['Level_1'], groups='T Cell', frameon=False, ncols=1, size=3, legend_loc='on data', legend_fontsize=8)

In [None]:
# Same UMAP, highlighting only the 'Natural Killer' group. Reuses `random_indices` from an earlier cell.
sc.pl.umap(adata_filtered[random_indices, :], color=['Level_1'], groups='Natural Killer', frameon=False, ncols=1, size=3, legend_loc='on data', legend_fontsize=8)

In [None]:
# Overview of the filtered cells by batch, cell type and condition, with a fresh random plotting order.
random_indices = np.random.permutation(adata_filtered.n_obs)
sc.pl.umap(
    adata_filtered[random_indices, :],
    color=['batch_covariate', 'Level_1', 'Condition'],
    frameon=False,
    ncols=1,
    size=3,
)

In [None]:
# UMAP of the filtered cells coloured by Factor1..Factor10, five panels per row.
# Reuses `random_indices` from the previous cell.
plt.rcParams['figure.figsize'] = (15, 15)
sc.pl.umap(
    adata_filtered[random_indices, :],
    color=[f'Factor{factor_no}' for factor_no in range(1, 11)],
    frameon=False,
    ncols=5,
    size=3,
)

# With 20 latent factors

In [3]:
# Load the outputs of the 20-factor MOFA run. This rebinds `weights`, `genes`
# and `factors`, replacing the tables loaded at the top of the notebook.
weights = pd.read_csv('../MOFA/MOFA_20_Factors_Filtered/MOFA_weights_more_factors_filtered.csv', index_col='Unnamed: 0')
genes = pd.read_csv('../MOFA/MOFA_20_Factors_Filtered/MOFA_latent_factors_genes_more_factors_filtered.csv', index_col='Unnamed: 0')
factors = pd.read_csv('../MOFA/MOFA_20_Factors_Filtered/MOFA_latent_factors_more_factors_filtered.csv', index_col='Unnamed: 0')

In [20]:
# Keep the first 500 rows of the per-program gene table.
genes_selected = genes.iloc[:500]

In [None]:
# Number of distinct entries across all selected programs.
# NOTE(review): missing values are not filtered out here (unlike `all_genes` below), so they may inflate the count.
len(set(genes_selected.values.flatten()))

In [22]:
# Union of all genes that appear in any selected program.
# `dict.fromkeys` de-duplicates while keeping first-appearance order, so the
# gene axis (and therefore the mask and the model input) is identical on every
# kernel restart; iterating a `set` of strings is not order-stable across runs.
all_genes = [
    x
    for x in dict.fromkeys(genes_selected.values.flatten().astype(str))
    if x != 'nan'  # missing entries become the string 'nan' after astype(str)
]
# Gene-by-program membership matrix: 1 where the gene is listed for the program.
binary_matrix = pd.DataFrame(0, index=all_genes, columns=genes_selected.columns)
for column in genes_selected.columns:
    genes_in_program = genes_selected[column].dropna()
    binary_matrix.loc[genes_in_program, column] = 1

In [None]:
# Display the gene-by-program membership matrix.
binary_matrix

In [24]:
# Restrict to the program genes. Take a real copy instead of a view: the
# following cells assign to `.varm`, `.uns` and `.X`, and writing to a view
# makes anndata convert it to a copy implicitly on first assignment.
adata_gp = adata[:, all_genes].copy()

In [None]:
# Display a summary of the subset object.
adata_gp

In [26]:
# Attach the gene-by-program membership matrix as a plain array under varm['I'].
adata_gp.varm['I'] = binary_matrix.to_numpy(copy=True)
# Boolean flag per program: True when the program contains more than 12 genes.
# NOTE(review): `select_terms` is not read by any visible cell before it is reassigned.
select_terms = adata_gp.varm['I'].sum(axis=0) > 12

In [27]:
# Record the program names alongside the mask.
adata_gp.uns['terms'] = binary_matrix.columns
# Drop genes that belong to no program (rows of the mask that sum to zero).
adata_gp._inplace_subset_var(adata_gp.varm['I'].sum(axis=1) > 0)

In [None]:
# Display the stored program (term) names.
adata_gp.uns['terms']

In [None]:
# Sanity check on a 10% subsample: are the values in X integer-valued, and what range do they span?
# Assumes X is a sparse matrix (it is densified with .toarray()).
print('Checking X')
subset = sc.pp.subsample(adata_gp, fraction=0.1, copy=True)
raw_counts = subset.X.toarray()
print(f"Are raw counts integers? {np.all(raw_counts == raw_counts.astype(int))}")
print(f"Range of raw counts: {np.min(raw_counts)} to {np.max(raw_counts)}")
print("-" * 50)

In [None]:
# Build an EXPIMAP model on the 20-factor adata_gp, conditioned on the
# 'batch_covariate' obs column, with hidden_layer_sizes [300, 300, 300] and
# 'nb' reconstruction loss.
# NOTE(review): no mask is passed explicitly; presumably EXPIMAP picks up the
# membership matrix stored in adata_gp.varm['I'] — confirm against the scArches docs.
intr_cvae = sca.models.EXPIMAP(
    adata=adata_gp,
    condition_key='batch_covariate',
    hidden_layer_sizes=[300, 300, 300],
    recon_loss='nb'
)

# Cast X to float32. Note this happens after the model object was created.
adata_gp.X = adata_gp.X.astype('float32')

# Passed as `alpha` to train() below.
ALPHA = 0.7
# Early stopping watches 'val_unweighted_loss' with patience 50; learning-rate
# reduction is enabled with its own patience (13) and factor (0.1).
early_stopping_kwargs = {
    "early_stopping_metric": "val_unweighted_loss",
    "threshold": 0,
    "patience": 50,
    "reduce_lr": True,
    "lr_patience": 13,
    "lr_factor": 0.1,
}
# Train for up to 200 epochs with early stopping enabled and a fixed seed.
intr_cvae.train(
    n_epochs=200,
    alpha_epoch_anneal=50,
    alpha=ALPHA,
    alpha_kl=0.5,
    weight_decay=0.,
    early_stopping_kwargs=early_stopping_kwargs,
    use_early_stopping=True,
    seed=2020
)

In [None]:
# Store the model's latent representation in obsm
# (mean=False / only_active=True are passed to get_latent as-is).
adata_gp.obsm['X_cvae'] = intr_cvae.get_latent(mean=False, only_active=True)
# Keep the term names as a plain list. `list(...)` accepts the pandas Index
# assigned earlier as well as an already-converted list, so re-running this
# cell does not fail the way calling `.tolist()` on a list would.
adata_gp.uns['terms'] = list(adata_gp.uns['terms'])

In [None]:
# Neighbour graph on the EXPIMAP latent space, then a UMAP embedding from that graph.
sc.pp.neighbors(adata_gp, use_rep='X_cvae')
sc.tl.umap(adata_gp)

In [None]:
# Harmonise Level_1 labels, applying one rename at a time in the listed order.
for old_label, new_label in (
    ('NK Cell', 'Natural Killer'),
    ('Ductal Cell/Malignant', 'Malignant'),
    ('Intra-pancreatic Neurons', 'Neuronal Cell'),
):
    adata_gp.obs['Level_1'] = adata_gp.obs['Level_1'].replace(old_label, new_label)

In [None]:
# Persist the 20-factor object and model under their own names. The previous
# paths ('int_10_factors.h5ad' / 'expimap_10_factors') are the ones the
# 10-factor run above writes to, so reusing them here overwrote those results.
adata_gp.write('../Expimap/int_20_factors.h5ad')
intr_cvae.save('../Expimap/expimap_20_factors')

In [None]:
# UMAP coloured by batch, cell type and condition, one panel per row; cells are plotted in a random order.
plt.rcParams['figure.figsize'] = (6, 6)
random_indices = np.random.permutation(adata_gp.n_obs)
sc.pl.umap(
    adata_gp[random_indices, :],
    color=['batch_covariate', 'Level_1', 'Condition'],
    frameon=False,
    ncols=1,
)

In [None]:
# Show the current working directory; the relative '../' paths in this notebook are resolved against it.
pwd

In [None]:
# Shape of the stored latent representation.
adata_gp.obsm['X_cvae'].shape

# With 10 latent factors chosen from the 15 latent factors, and overlapping genes removed

In [2]:
# Load the previously saved object for the selected-factors run (replaces the in-memory adata_gp).
adata_gp = sc.read_h5ad('../Expimap/int_10_factors_selected_norepeatinggenes.h5ad')

In [None]:
# Load the saved EXPIMAP model that goes with this object.
# Note the variable is `intra_cvae`; the sections above used `intr_cvae`.
intra_cvae = sca.models.EXPIMAP.load('../Expimap/expimap_10_factors_selected_norepeatinggenes', adata_gp)

In [None]:
# Number of distinct datasets (a missing value, if any, counts as one entry).
adata_gp.obs['Dataset'].nunique(dropna=False)

In [None]:
# Number of distinct IDs (a missing value, if any, counts as one entry).
adata_gp.obs['ID'].nunique(dropna=False)

In [3]:
# Harmonise Level_1 labels, applying one rename at a time in the listed order.
for old_label, new_label in (
    ('NK Cell', 'Natural Killer'),
    ('Ductal Cell/Malignant', 'Malignant'),
    ('Intra-pancreatic Neurons', 'Neuronal Cell'),
):
    adata_gp.obs['Level_1'] = adata_gp.obs['Level_1'].replace(old_label, new_label)

In [4]:
# Derive a two-level status: cells from the two listed datasets are 'Normal', all others 'PDAC'.
adata_gp.obs['Disease_Status'] = np.where(adata_gp.obs.Dataset.isin(['Steele_Adj_Norm', 'Peng_Normal']), 'Normal', 'PDAC')

In [None]:
# Save one UMAP figure per annotation (scanpy's `save` argument, named '<annotation>.png').
# Cells are plotted in a random order.
sc.set_figure_params(dpi=300)
plt.rcParams['figure.figsize'] = (6, 6)
random_indices = np.random.permutation(adata_gp.n_obs)
for i in ('batch_covariate', 'Level_1', 'Condition', 'Disease_Status'):
    sc.pl.umap(
        adata_gp[random_indices, :],
        color=i,
        frameon=False,
        ncols=2,
        wspace=0.5,
        save=f'{i}.png',
    )

In [None]:
# The same four annotations shown together in a two-column grid; cells are plotted in a random order.
plt.rcParams['figure.figsize'] = (6, 6)
random_indices = np.random.permutation(adata_gp.n_obs)
sc.pl.umap(
    adata_gp[random_indices, :],
    color=['batch_covariate', 'Level_1', 'Condition', 'Disease_Status'],
    frameon=False,
    ncols=2,
    wspace=0.5,
)

In [None]:
# Display the subset of cells labelled 'T Cell' or 'Myeloid Cell' (not assigned to a variable).
adata_gp[adata_gp.obs.Level_1.isin(['T Cell', 'Myeloid Cell'])]

In [138]:
# Leiden clustering at resolution 0.5.
# NOTE(review): no sc.pp.neighbors call is visible in this section, so this presumably relies on
# a neighbour graph stored in the loaded file — confirm.
sc.tl.leiden(adata_gp, resolution=0.5)

In [None]:
plt.rcParams['figure.figsize'] = (20,10)
# Heatmap of four marker genes in T cells vs myeloid cells, read from the
# 'log_norm' layer and scaled per gene, with genes on the rows (swap_axes).
sc.pl.heatmap(
    adata_gp[adata_gp.obs.Level_1.isin(['T Cell', 'Myeloid Cell'])],
    var_names=['CD3D', 'CD3E', 'CD68', 'LYZ'],
    groupby='Level_1',
    layer='log_norm',
    standard_scale='var',
    swap_axes=True,
    figsize=(20, 10),
)

In [None]:
plt.rcParams['figure.figsize'] = (20,10)
# Heatmap of four marker genes in T cells vs myeloid cells, read from the
# 'log_norm' layer and scaled per gene, with genes on the rows (swap_axes).
# NOTE(review): this cell repeats the heatmap call of the cell directly above it.
sc.pl.heatmap(
    adata_gp[adata_gp.obs.Level_1.isin(['T Cell', 'Myeloid Cell'])],
    var_names=['CD3D', 'CD3E', 'CD68', 'LYZ'],
    groupby='Level_1',
    layer='log_norm',
    standard_scale='var',
    swap_axes=True,
    figsize=(20, 10),
)

In [None]:
# One UMAP panel per Level_1 label, each highlighting only that label.
unique_cells = adata_gp.obs.Level_1.unique().tolist()
n_cells = len(unique_cells)
n_cols = 4
# Ceiling division; keep at least one row so plt.subplots always gets a valid grid.
n_rows = max(1, -(-n_cells // n_cols))
# squeeze=False guarantees a 2-D array of axes regardless of the grid shape.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 20), squeeze=False)
axes = axes.flatten()

for ax, cell in zip(axes, unique_cells):
    sc.pl.umap(
        adata_gp,
        color='Level_1',
        groups=cell,
        title=cell,
        frameon=False,
        ax=ax,
        show=False,
        size=3
    )

# Remove the unused trailing axes. Slicing from n_cells does not depend on a
# loop variable surviving the loop above, so an empty label list is handled too.
for unused_ax in axes[n_cells:]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.show()

In [26]:
# fig, ax = plt.subplot(5,5)
# for cell in adata_gp.obs.Level_1.unique().tolist():
#     sc.pl.umap(adata_gp[random_indices, :], color='Level_1', groups=cell, frameon=False, ncols=1, size=3)

In [15]:
# Compute latent directions on the model for adata_gp.
# adata_gp.uns['directions'] is read further down — presumably written by this call; confirm.
intra_cvae.latent_directions(adata=adata_gp)

In [54]:
# Run the model's latent enrichment per Level_1 group, using the directions computed above.
intra_cvae.latent_enrich(groups='Level_1', use_directions=True, adata=adata_gp, )

In [None]:
# Tall figure for the scArches `plot_abs_bfs` plot, laid out in three columns.
plt.rcParams['figure.figsize'] = (15,45)
fig = sca.plotting.plot_abs_bfs(adata_gp, fontsize=7, n_cols=3)

In [None]:
# Show the stored term names as a plain list.
# (The previous `.index()` call with no argument raises for both arrays and lists.)
list(adata_gp.uns['terms'])

In [33]:
# Term names as a plain Python list. `np.asarray(...).tolist()` works whether
# uns['terms'] is an array (as when loaded from file) or already a list.
terms = np.asarray(adata_gp.uns['terms']).tolist()
# Factor1 .. Factor10, and their positions among the stored terms.
select_terms = [f'Factor{i}' for i in range(1, 11)]
idx = [terms.index(term) for term in select_terms]

In [40]:
# Latent scores for every cell, multiplied by the stored directions, restricted to the selected terms.
# NOTE(review): mean=False is passed to get_latent — confirm whether this samples the latent
# space (making the scores vary between runs) rather than returning the mean.
latents = (intra_cvae.get_latent(adata_gp.X, adata_gp.obs['ID_batch_covariate'], mean=False) * adata_gp.uns['directions'])[:, idx]

In [None]:
# Copy each selected latent column into obs as Factor1 .. Factor10, printing each name as it is stored.
for factor_pos in range(10):
    name = f'Factor{factor_pos + 1}'
    print(name)
    adata_gp.obs[name] = latents[:, factor_pos]

In [None]:
plt.rcParams['figure.figsize'] = (7,7)
# Factor3 vs Factor4 scores, highlighting T cells.
sc.pl.scatter(
    adata_gp,
    x='Factor3',
    y='Factor4',
    color='Level_1',
    groups='T Cell',
    size=10,
)

In [None]:
plt.rcParams['figure.figsize'] = (7,7)
# Factor3 vs Factor4 scores, highlighting myeloid cells.
sc.pl.scatter(
    adata_gp,
    x='Factor3',
    y='Factor4',
    color='Level_1',
    groups='Myeloid Cell',
    size=10,
)

In [74]:
# Load the saved gene-by-program membership matrix of the 15-factor run (rebinds `binary_matrix`).
binary_matrix = pd.read_csv('../MOFA/MOFA_15_Factors_6082HVG/binary_matrix.csv', index_col='Unnamed: 0')

In [85]:
# Map each program (column) to the list of genes flagged with 1 in it.
# Built as a comprehension so the notebook-level name `genes` (the MOFA gene
# table loaded earlier) is not overwritten with a plain list as a side effect.
dict_gp = {
    col: binary_matrix[binary_matrix[col] == 1].index.tolist()
    for col in binary_matrix.columns
}

In [92]:
# gseapy provides the `gp.enrichr` calls below.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import gseapy as gp

In [120]:
# Enrichr against KEGG_2021_Human for each program; keep the 20 rows with the highest 'Combined Score'.
dict_df_kegg = {}
for factor_name, gene_list in dict_gp.items():
    enrichment = gp.enrichr(gene_list, gene_sets=['KEGG_2021_Human'])
    ranked = enrichment.results.sort_values(by=['Combined Score'], ascending=False)
    dict_df_kegg[factor_name] = ranked.head(20)

In [None]:
# Top 10 KEGG terms for Factor1.
dict_df_kegg['Factor1'].head(10)

In [123]:
# Enrichr against MSigDB_Hallmark_2020 for each program; keep the 20 rows with the highest 'Combined Score'.
dict_df_msig = {}
for factor_name, gene_list in dict_gp.items():
    enrichment = gp.enrichr(gene_list, gene_sets=['MSigDB_Hallmark_2020'])
    ranked = enrichment.results.sort_values(by=['Combined Score'], ascending=False)
    dict_df_msig[factor_name] = ranked.head(20)

In [None]:
# Top 10 MSigDB Hallmark terms for Factor3.
dict_df_msig['Factor3'].head(10)