# Jupyter Notebook 4: Out of Sample Prediction

Kang Dataset

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sys
import util_loss as ul
#import the package to use
import beta_vae_5
from dentate_features import *
from all_obs_linear_classifier_package import *
import os,glob

In [None]:
# Read the full Kang dataset followed by its train and validation splits,
# in that order, and bind them to the three names used by the later cells.
data, data_train_full, data_validate_full = (
    sc.read(h5ad_file)
    for h5ad_file in (
        "./data/kang_seurat_normalized.h5ad",
        "./data/kang_seurat_normalized_train.h5ad",
        "./data/kang_seurat_normalized_validate.h5ad",
    )
)

# Distinct "cell_type" labels found in the training split.
# Going through a set removes duplicates; the resulting order is arbitrary.
train_labels = data_train_full.obs["cell_type"]
cells = list(set(train_labels))
print(cells)

In [None]:
# Leave-one-cell-type-out training on the Kang data: for every label in
# `cells`, a new model is trained on the train/validation splits with all
# observations of that label removed.

# Hyperparameters shared by every model (also encoded in the model path).
z = 5      # passed as z_dimension
al = 100   # passed as alpha
c = 500    # passed as c_max

for cell in cells:
    # Keep every observation whose label differs from the held-out one.
    # An explicit `!=` comparison replaces the former unary minus on a
    # boolean mask, which NumPy rejects for boolean arrays.
    keep_train = data_train_full.obs["cell_type"] != cell
    keep_validate = data_validate_full.obs["cell_type"] != cell

    data_train_full_temp = data_train_full[keep_train]
    data_validate_full_temp = data_validate_full[keep_validate]
    # Show the remaining label counts so the removal can be checked by eye.
    print(data_train_full_temp.obs["cell_type"].value_counts())

    # Shuffle both subsets with the project helper; train() is then called
    # with shuffle=False.
    data_train_full_temp = ul.shuffle_adata(data_train_full_temp)
    data_validate_full_temp = ul.shuffle_adata(data_validate_full_temp)

    # One model directory per held-out cell type.
    mod_path1 = f"./models_seurat_leave_one/latent{z}_alpha{al}_c{c}_{cell}"
    scg_model = beta_vae_5.C_VAEArithKeras(
        x_dimension=data_train_full.shape[1],
        z_dimension=z,
        model_to_use=mod_path1,
        alpha=al,
        c_max=c,
    )
    scg_model.train(
        data_train_full_temp,
        validation_data=data_validate_full_temp,
        n_epochs=2,
        shuffle=False,
    )


In [None]:
# Manipulating latent space dimensions (Kang dataset)

# Reload the pre-trained model for one held-out cell type.
from simulate_cell import *

# NOTE: this changes the working directory for the rest of the session, so
# relative paths used afterwards resolve against the model directory.
os.chdir("/storage/groups/ml01/workspace/harshita.agarwala/models_seurat_leave_one/")
path = "latent5_alpha100_c500_CD4 Naive T"
cell_to_drop = 'CD4 Naive T'
scg_model = beta_vae_5.C_VAEArithKeras(
    x_dimension=data.shape[1],
    z_dimension=5,
    model_to_use=path,
    alpha=100,
    c_max=500,
)
scg_model.restore_model()

# Subset of the data without the held-out cell type. An explicit `!=`
# comparison replaces the former unary minus on a boolean mask, which NumPy
# rejects for boolean arrays.
data_temp = data[data.obs["cell_type"] != cell_to_drop]
# NOTE(review): data_temp is not used below; the simulation receives the full
# `data` (including the held-out cell type) - confirm this is intended.
simulate_multiple_cell(path=path, data=data, model=scg_model, z_dim=5, feature="cell_type")


In [None]:
# Size of the second axis of the dataset, kept for later use.
x_dim = data.shape[1]

# Rank genes for every "cell_type" group using scanpy's t-test method.
sc.tl.rank_genes_groups(
    data,
    groupby="cell_type",
    method='t-test',
)

In [None]:
# Creating regression plots and UMAPs (Kang dataset)

from seurat_umaps_reg_plots import *

# Directory that holds the trained leave-one-out models.
model_root = "/storage/groups/ml01/workspace/harshita.agarwala/models_seurat_leave_one/"
os.chdir(model_root)

path = "latent5_alpha100_c500_CD4 Naive T"
cell_to_drop = 'CD4 Naive T'
# Distinct cell-type labels of the full dataset (arbitrary order).
cells = list(set(data.obs["cell_type"]))

# Rebuild the model with the hyperparameters encoded in `path`, then restore it.
scg_model = beta_vae_5.C_VAEArithKeras(
    x_dimension=data.shape[1],
    z_dimension=5,
    model_to_use=path,
    alpha=100,
    c_max=500,
)
scg_model.restore_model()

# Generate the plots; the helper's return value is tabulated below with the
# columns "name", "r_sq_all" and "r_sq_100".
heatmap_dir = path + "/gene_heatmaps/"
df_list = generate_simulated_reg_plots(
    path=heatmap_dir,
    actual_data=data,
    clust_typ=cell_to_drop,
    cells=cells,
)
df = pd.DataFrame(df_list, columns=["name", "r_sq_all", "r_sq_100"])

# Switch to the model directory again before writing the CSV
# (presumably the plotting helper changes the working directory - TODO confirm).
os.chdir(model_root)
df.to_csv(path+"/gene_heatmaps/reg_mean.csv", index=False)

Dentate Gyrus

In [None]:
# Load the full Dentate Gyrus dataset together with its train/validation splits.
#
# The two split files have to be loaded here: with these reads commented out,
# data_train_full / data_validate_full would still hold the Kang objects from
# the previous section, so both the label list below and the training loop
# that follows would run on the wrong dataset.
#
# NOTE(review): earlier cells call os.chdir(); these relative paths resolve
# against the current working directory - confirm it is the project root when
# the notebook is run top to bottom.
data = sc.read("./data/dentate_gyrus_normalized.h5ad")
data_train_full = sc.read("./data/dentate_gyrus_normalized_train.h5ad")
data_validate_full = sc.read("./data/dentate_gyrus_normalized_validate.h5ad")

# Distinct "clusters" labels found in the training split (arbitrary order).
cells = list(set(data_train_full.obs["clusters"]))
print(cells)

In [None]:
# Leave-one-cluster-out training on the Dentate Gyrus data: for every label in
# `cells`, a new model is trained on the train/validation splits with all
# observations of that label removed.

# Hyperparameters shared by every model (also encoded in the model path).
z = 5      # passed as z_dimension
al = 100   # passed as alpha
c = 500    # passed as c_max

for cell in cells:
    # Keep every observation whose label differs from the held-out one.
    # An explicit `!=` comparison replaces the former unary minus on a
    # boolean mask, which NumPy rejects for boolean arrays.
    keep_train = data_train_full.obs["clusters"] != cell
    keep_validate = data_validate_full.obs["clusters"] != cell

    data_train_full_temp = data_train_full[keep_train]
    data_validate_full_temp = data_validate_full[keep_validate]
    # Show the remaining label counts so the removal can be checked by eye.
    print(data_train_full_temp.obs["clusters"].value_counts())

    # Shuffle both subsets with the project helper; train() is then called
    # with shuffle=False.
    data_train_full_temp = ul.shuffle_adata(data_train_full_temp)
    data_validate_full_temp = ul.shuffle_adata(data_validate_full_temp)

    # One model directory per held-out cluster.
    mod_path1 = f"./models_dentate_leave_one/latent{z}_alpha{al}_c{c}_{cell}"
    scg_model = beta_vae_5.C_VAEArithKeras(
        x_dimension=data_train_full.shape[1],
        z_dimension=z,
        model_to_use=mod_path1,
        alpha=al,
        c_max=c,
    )
    scg_model.train(
        data_train_full_temp,
        validation_data=data_validate_full_temp,
        n_epochs=2,
        shuffle=False,
    )


In [None]:
# Manipulating latent space dimensions (Dentate Gyrus dataset)

# Reload the pre-trained model for one held-out cluster.
from simulate_cell import *

# NOTE: this changes the working directory for the rest of the session, so
# relative paths used afterwards resolve against the model directory.
os.chdir("/home/icb/harshita.agarwala/models_dentate_leave_one/")
# NOTE(review): this model uses alpha=50 / c_max=30, whereas the training loop
# visible in this notebook uses alpha=100 / c=500 - confirm which model is meant.
path = "latent5_alpha50_c30_Astrocyte"
cell_to_drop = 'Astrocyte'
scg_model = beta_vae_5.C_VAEArithKeras(
    x_dimension=data.shape[1],
    z_dimension=5,
    model_to_use=path,
    alpha=50,
    c_max=30,
)
scg_model.restore_model()

# Subset of the data without the held-out cluster. An explicit `!=`
# comparison replaces the former unary minus on a boolean mask, which NumPy
# rejects for boolean arrays.
data_temp = data[data.obs["clusters"] != cell_to_drop]
# NOTE(review): data_temp is not used below; the simulation receives the full
# `data` (including the held-out cluster) - confirm this is intended.
simulate_multiple_cell(path=path, data=data, model=scg_model, z_dim=5, feature="clusters")


In [None]:
# Size of the second axis of the dataset, kept for later use.
x_dim = data.shape[1]

# Rank genes for every "clusters" group using scanpy's t-test method.
sc.tl.rank_genes_groups(
    data,
    groupby="clusters",
    method='t-test',
)

In [None]:
# Creating regression plots and UMAPs (Dentate Gyrus dataset)

from seurat_umaps_reg_plots import *

# Directory that holds the trained leave-one-out models.
model_root = "/home/icb/harshita.agarwala/models_dentate_leave_one/"
os.chdir(model_root)

path = "latent5_alpha50_c30_Astrocyte"
cell_to_drop = 'Astrocyte'
# Distinct cluster labels of the full dataset (arbitrary order).
cells = list(set(data.obs["clusters"]))

# Rebuild the model with the hyperparameters encoded in `path`, then restore it.
scg_model = beta_vae_5.C_VAEArithKeras(
    x_dimension=data.shape[1],
    z_dimension=5,
    model_to_use=path,
    alpha=50,
    c_max=30,
)
scg_model.restore_model()

# Generate the plots; the helper's return value is tabulated below with the
# columns "name", "r_sq_all" and "r_sq_100".
heatmap_dir = path + "/gene_heatmaps/"
df_list = generate_simulated_reg_plots(
    path=heatmap_dir,
    actual_data=data,
    clust_typ=cell_to_drop,
    cells=cells,
)
df = pd.DataFrame(df_list, columns=["name", "r_sq_all", "r_sq_100"])

# Switch to the model directory again before writing the CSV
# (presumably the plotting helper changes the working directory - TODO confirm).
os.chdir(model_root)
df.to_csv(path+"/gene_heatmaps/reg_mean.csv", index=False)