In [1]:
import glob
import os
import torch

import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix
from sklearn import model_selection
import yaml

from src.da_utils.data_processing import qc_sc
from src.da_utils import data_loading
from src.da_utils.data_loading import load_spatial, get_selected_dir, get_dset_dir

import pandas as pd
import numpy as np


In [2]:
# Experiment to inspect: model name, dataset name, and the sc / st dataset ids
# that together select the results folder.
MODEL_NAME, DSET = "ADDA", "dlpfc"
SC_ID, ST_ID = "GSE144136", "spatialLIBD"

# Directory that holds the per-model YAML config files.
CONFIGS_DIR = "configs"



In [3]:
def get_best_params_file(model_name, dset, sc_id, st_id, configs_dir="configs"):
    """Pick the config with the lowest reverse-validation score and check its checkpoint.

    Every ``reverse_val_best_epoch.csv`` found (recursively) below
    ``model/<model_name>/<dset>/<sc_id>_<st_id>/`` is read and concatenated.
    The row with the smallest ``val`` is selected, the YAML config named in
    its ``config_fname`` column is loaded, and the ``final_model.pth``
    checkpoint belonging to that config is checked against the row's
    ``best_epoch``.

    Args:
        model_name: Model name; used in both the results path and the
            config path.
        dset: Dataset name used in the results path.
        sc_id: First half of the ``<sc_id>_<st_id>`` results folder name.
        st_id: Second half of the ``<sc_id>_<st_id>`` results folder name.
        configs_dir: Directory containing ``<model_name>/<config_fname>``.

    Returns:
        tuple: ``(config_fname, results_df)`` where ``results_df`` is the
        concatenation of all results files that were found.

    Raises:
        ValueError: If no results file matches, if the checkpoint stores
            neither ``"epoch"`` nor ``"iters"``, or if the stored epoch
            differs from the selected row's ``best_epoch``.
    """
    pattern = os.path.join(
        "model", model_name, dset, f"{sc_id}_{st_id}", "**", "reverse_val_best_epoch.csv"
    )

    result_paths = glob.glob(pattern, recursive=True)
    if not result_paths:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No reverse validation results found matching {pattern!r}")

    results_df = pd.concat(
        [pd.read_csv(path, index_col=0) for path in result_paths], axis=0
    )

    # Select by position rather than by label: index labels may repeat across
    # the concatenated files, in which case a label lookup returns several rows.
    best_pos = results_df["val"].reset_index(drop=True).idxmin()
    best_row = results_df.iloc[best_pos]
    config_fname = best_row["config_fname"]

    with open(os.path.join(configs_dir, model_name, config_fname), "r") as f:
        config = yaml.safe_load(f)

    lib_params = config["lib_params"]
    data_params = config["data_params"]
    model_params = config["model_params"]

    # Runs without a manual seed are stored under a "random" path component.
    if "manual_seed" in lib_params:
        lib_seed_path = str(lib_params["manual_seed"])
    else:
        lib_seed_path = "random"

    model_folder = data_loading.get_model_rel_path(
        model_name,
        model_params["model_version"],
        lib_seed_path=lib_seed_path,
        **data_params,
    )

    # An empty path component is ignored by os.path.join, so the "samp_split"
    # level is only present when the config enables it.
    model_path = os.path.join(
        "model",
        model_folder,
        "advtrain",
        "samp_split" if data_params.get("samp_split") else "",
        "final_model.pth",
    )
    # Only the epoch counter is read, so load onto the CPU; this also works on
    # machines without the device the checkpoint was saved from.
    checkpoint = torch.load(model_path, map_location="cpu")

    try:
        epoch = checkpoint["epoch"]
    except KeyError:
        epoch = checkpoint.get("iters")
    if epoch is None:
        raise ValueError(f"Checkpoint {model_path!r} stores neither 'epoch' nor 'iters'")

    if int(epoch) != int(best_row["best_epoch"]):
        raise ValueError("Epoch mismatch")

    return config_fname, results_df

# Look up the best config file name (and the full results table) for the
# experiment selected by the constants defined earlier in the notebook.
get_best_params_file(MODEL_NAME, DSET, SC_ID, ST_ID, CONFIGS_DIR)

'standard_bnfix_adam_beta1_5_samp_split.yml'

{'model': ADDAST(
   (source_encoder): MLP(
     (encoder): Sequential(
       (0): Linear(in_features=360, out_features=1024, bias=True)
       (1): BatchNorm1d(1024, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
       (2): LeakyReLU(negative_slope=0.01)
       (3): Dropout(p=0.5, inplace=False)
       (4): Linear(in_features=1024, out_features=512, bias=True)
       (5): BatchNorm1d(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
       (6): LeakyReLU(negative_slope=0.01)
       (7): Dropout(p=0.5, inplace=False)
       (8): Linear(in_features=512, out_features=64, bias=True)
       (9): ELU(alpha=1.0)
     )
   )
   (clf): MLP(
     (encoder): Sequential(
       (0): Linear(in_features=64, out_features=33, bias=True)
       (1): LogSoftmax(dim=1)
     )
   )
   (target_encoder): MLP(
     (encoder): Sequential(
       (0): Linear(in_features=360, out_features=1024, bias=True)
       (1): BatchNorm1d(1024, eps=0.001, momentum=0.01, affine=Tru

In [3]:
# Resolve the selected-data directory for GSE115746 / spotless_mouse_cortex
# (n_markers=20, all_genes=False) inside the "mouse_cortex" dataset directory.
selected_dir = get_selected_dir(
    dset_dir=get_dset_dir("data", dset="mouse_cortex"),
    sc_id="GSE115746",
    st_id="spotless_mouse_cortex",
    n_markers=20,
    all_genes=False,

)
# Load the spatial data from that directory with scaler_name="standard",
# st_split=False and samp_split=True.
sp_d = load_spatial(selected_dir, scaler_name="standard", st_split=False, samp_split=True)

In [2]:
# Read the unscaled, sample-split DLPFC spatial AnnData (20-marker folder) from disk.
adata_st = sc.read_h5ad("data/dlpfc/preprocessed/GSE144136_spatialLIBD/20markers/unscaled/mat_sp_samp_split_d.h5ad")

In [4]:
# Lazily yield one dense expression matrix per split named in data_loading.SPLITS.
# The conversion method is `toarray()`; `to_array()` does not exist and would
# raise AttributeError as soon as the generator is consumed.
# NOTE(review): assumes adata_st.X is a scipy sparse matrix — confirm.
(
    adata_st[adata_st.obs["split"] == split].X.toarray()
    for split in data_loading.SPLITS
)

Unnamed: 0_level_0,split,sample_id,X,Y,index,key,subject,replicate,Cluster,sum_umi,sum_gene,cell_count,in_tissue,spatialLIBD,array_col,array_row
spot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AAACAACGAATAGTTC-1,train,151507,3276,2514,AAACAACGAATAGTTC-1,151507_AAACAACGAATAGTTC-1,Br5292,1,6,948,727,0,True,L1,16,0
AAACAAGTATCTCCCA-1,train,151507,9178,8520,AAACAAGTATCTCCCA-1,151507_AAACAAGTATCTCCCA-1,Br5292,1,3,4261,2170,3,True,L3,102,50
AAACAATCTACTAGCA-1,train,151507,5133,2878,AAACAATCTACTAGCA-1,151507_AAACAATCTACTAGCA-1,Br5292,1,2,1969,1093,2,True,L1,43,3
AAACACCAATAACTGC-1,train,151507,3462,9581,AAACACCAATAACTGC-1,151507_AAACACCAATAACTGC-1,Br5292,1,5,3368,1896,5,True,WM,19,59
AAACAGCTTTCAGAAG-1,train,151507,2779,7663,AAACAGCTTTCAGAAG-1,151507_AAACAGCTTTCAGAAG-1,Br5292,1,1,2981,1620,3,True,L6,9,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTTCACATCCAGG-1-9,test,151674,5595,9461,TTGTTTCACATCCAGG-1.9,151674_TTGTTTCACATCCAGG-1,Br8100,2,2,4802,2425,5,True,WM,42,58
TTGTTTCATTAGTCTA-1-9,test,151674,4767,9694,TTGTTTCATTAGTCTA-1.9,151674_TTGTTTCATTAGTCTA-1,Br8100,2,2,4800,2489,6,True,WM,30,60
TTGTTTCCATACAACT-1-9,test,151674,4574,7896,TTGTTTCCATACAACT-1.9,151674_TTGTTTCCATACAACT-1,Br8100,2,1,1209,832,4,True,L6,27,45
TTGTTTGTATTACACG-1-5,test,151674,5513,11257,TTGTTTGTATTACACG-1.5,151674_TTGTTTGTATTACACG-1,Br8100,2,1,1307,901,9,True,WM,41,73


In [20]:
# Split the spatial data into one AnnData per sample, then concatenate them
# again so that `sample_id` is rebuilt from the dictionary keys.
adata_st_d = {}
for sid in adata_st.obs["sample_id"].unique():
    # Take a real copy: subsetting returns a view, and mutating a view's obs
    # in place is fragile. Drop the column without `inplace` for the same reason.
    sample_adata = adata_st[adata_st.obs["sample_id"] == sid].copy()
    sample_adata.obs = sample_adata.obs.drop(columns=["sample_id"])
    adata_st_d[sid] = sample_adata
adata_col = ad.concat(adata_st_d, label="sample_id")

In [27]:
# Random 80/20 split of the concatenated spatial data; random_state is fixed
# so the split is repeatable.
x_st_train, x_st_val = model_selection.train_test_split(
                    adata_col, test_size=0.2, random_state=5
                )

In [29]:
# Inspect the obs table of the training part of the split.
x_st_train.obs

Unnamed: 0,X,Y,nCount_RNA,nFeature_RNA,orig.ident,sample_id
spot_7-4,1533.980583,466.019417,60354.0,9732,0,Eng2019_cortex_svz_fov4
spot_7-6,1533.980583,466.019417,18300.0,6374,0,Eng2019_cortex_svz_fov6
spot_6-6,1000.0,1533.980583,5715.0,3176,0,Eng2019_cortex_svz_fov6
spot_7-5,1533.980583,466.019417,20200.0,6750,0,Eng2019_cortex_svz_fov5
spot_9-1,1533.980583,1533.980583,60364.0,9696,0,Eng2019_cortex_svz_fov1
spot_6-2,1000.0,1533.980583,53940.0,9530,0,Eng2019_cortex_svz_fov2
spot_7,1533.980583,466.019417,35974.0,8725,0,Eng2019_cortex_svz_fov0
spot_7-2,1533.980583,466.019417,43486.0,9049,0,Eng2019_cortex_svz_fov2
spot_3,466.019417,1533.980583,15919.0,6288,0,Eng2019_cortex_svz_fov0
spot_2-2,466.019417,1000.0,47721.0,9180,0,Eng2019_cortex_svz_fov2


In [15]:
# Load the GSE115746 AnnData from the mouse_cortex sc_adata folder.
adata = sc.read_h5ad("data/mouse_cortex/sc_adata/GSE115746.h5ad")



KeyError: 'sample_id'

In [3]:
# Number of cells per `cell_type` label.
adata.obs["cell_type"].value_counts()

L5 IT                           2964
Vip                             2679
Sst                             2551
L6 IT                           2173
Pvalb                           2076
Lamp5                           1843
L4                              1352
L6 CT                           1225
L2/3 IT                         1178
L5 PT                            901
NP                               740
Astro                            556
L6b                              473
Sncg                             249
High Intronic                    182
Endo                             166
VLMC                             133
Batch Grouping                   132
Macrophage                       131
Oligo                            131
SMC                              109
Serpinf1                          85
Meis2                             55
Doublet Endo                      41
Doublet VISp L5 NP and L6 CT      35
Peri                              32
CR                                17
D

In [29]:
# Maps a `cell_subclass` label to the set of spot-composition cell types it is
# counted under. An empty set or None means no spot-composition label is listed
# (NOTE(review): presumably those subclasses are excluded — confirm with the
# code that consumes this mapping).
cell_subclass_to_spot_composition = {
    "Astro": {"Astrocytes deep", "Astrocytes superficial"},
    "CR": set(),
    "Batch Grouping": {"Excitatory layer 5/6"},  # all cell clusters are L5
    "L2/3 IT": {"Excitatory layer II", "Excitatory layer 3"},
    "L4": {"Excitatory layer 4"},
    "L5 PT": {"Excitatory layer 5/6"},
    "L5 IT": {"Excitatory layer 5/6"},
    "L6 CT": {"Excitatory layer 5/6"},
    "L6 IT": {"Excitatory layer 5/6"},
    "L6b": {"Excitatory layer 5/6"},
    "NP": {"Excitatory layer 5/6"},  # all NP are L5 or L6
    "Endo": {"Endothelial", "Choroid plexus"},
    "High Intronic": {"Excitatory layer 5/6"},  # all High Intronic are VISp L5 Endou
    ## Doublets; these are cell clusters (mapped in
    ## cell_cluster_cell_type_to_spot_composition below)
    "Peri": {"Endothelial", "Choroid plexus"},
    "SMC": {"Endothelial", "Choroid plexus"},
    "VLMC": {"Endothelial", "Choroid plexus"},
    "Macrophage": {"Microglia"},
    # "We define six subclasses of GABAergic cells: Sst, Pvalb, Vip, Lamp5, Sncg
    # and Serpinf1, and two distinct types: Sst–Chodl and Meis2–Adamts19
    # (Fig. 1c). We represent the taxonomy by constellation diagrams,
    # dendrograms, layer-of-isolation, and the expression of select marker
    # genes (Fig. 5a–f). The major division among GABAergic types largely
    # corresponds to their developmental origin in the medial ganglionic
    # eminence (Pvalb and Sst subclasses) or caudal ganglionic eminence
    # (Lamp5, Sncg, Serpinf1 and Vip subclasses)."
    #
    # Each set holds the two separate labels "Interneurons" and
    # "Interneurons deep" (not one comma-joined string).
    "Lamp5": {"Interneurons", "Interneurons deep"},
    "Meis2": {"Interneurons", "Interneurons deep"},
    "Pvalb": {"Interneurons", "Interneurons deep"},
    "Serpinf1": {"Interneurons", "Interneurons deep"},
    "Sncg": {"Interneurons", "Interneurons deep"},
    "Sst": {"Interneurons", "Interneurons deep"},
    "Vip": {"Interneurons", "Interneurons deep"},
    "Low Quality": None,
    "No Class": None,
}

# Same kind of mapping, keyed by the finer cell-cluster / cell-type labels.
cell_cluster_cell_type_to_spot_composition = {
    "Doublet VISp L5 NP and L6 CT": {"Excitatory layer 5/6"},
    "Doublet Endo and Peri_1": {"Endothelial", "Choroid plexus"},  # no choroid plexus in other dataset
    "Doublet Astro Aqp4 Ex": {"Astrocytes deep", "Astrocytes superficial"},
    "Doublet SMC and Glutamatergic": {"Endothelial", "Choroid plexus"},
    "Doublet Endo Peri SMC": {"Endothelial", "Choroid plexus"},
    "Oligo Serpinb1a": {"Oligodendrocytes"},
    "Oligo Synpr": {"Oligodendrocytes"},
    "Oligo Rassf10": {"Oligodendrocytes"},
    "OPC Pdgfra Grm": {"OPC"},
    "OPC Pdgfra Ccnb1": {"OPC"},
}

{'Astro': {'Astrocytes deep', 'Astrocytes superficial'},
 'CR': set(),
 'Batch Grouping': {'Excitatory layer 5/6'},
 'L2/3 IT': {'Excitatory layer 3', 'Excitatory layer II'},
 'L4': {'Excitatory layer 4'},
 'L5 PT': {'Excitatory layer 5/6'},
 'L5 IT': {'Excitatory layer 5/6'},
 'L6 CT': {'Excitatory layer 5/6'},
 'L6 IT': {'Excitatory layer 5/6'},
 'L6b': {'Excitatory layer 5/6'},
 'NP': {'Excitatory layer 5/6'},
 'Endo': {'Endothelial'},
 'High Intronic': {'Excitatory layer 5/6'},
 'Doublet VISp L5 NP and L6 CT': {'Excitatory layer 5/6'},
 'Doublet Endo and Peri_1': {'Choroid plexus', 'Endothelial'},
 'Doublet Astro Aqp4 Ex': {'Astrocytes deep', 'Astrocytes superficial'},
 'Doublet SMC and Glutamatergic': {'Choroid plexus'},
 'Doublet Endo Peri SMC': {'Choroid plexus', 'Endothelial'},
 'Oligo Serpinb1a': {'Oligodendrocytes'},
 'Oligo Synpr': {'Oligodendrocytes'},
 'Oligo Rassf10': {'Oligodendrocytes'},
 'OPC Pdgfra Grm': {'OPC'},
 'OPC Pdgfra Ccnb1': {'OPC'},
 'Peri': {'Choroid plexus

In [28]:
# Cell-cluster counts among cells whose subclass is "VLMC" (first 20 rows).
adata.obs["cell_cluster"][adata.obs["cell_subclass"] == "VLMC"].value_counts().head(20)

VLMC Osr1 Mc5r        76
VLMC Spp1 Col15a1     39
VLMC Osr1 Cd74        12
VLMC Spp1 Hs3st6       6
Sncg Vip Itih5         0
Sncg Slc17a8           0
Pvalb Calb1 Sst        0
Sncg Vip Nptx2         0
Pvalb Gabrg1           0
Pvalb Gpr149 Islr      0
Pvalb Reln Itm2a       0
Pvalb Reln Tac1        0
Pvalb Sema3e Kank4     0
Pvalb Th Sst           0
Pvalb Akr1c18 Ntf3     0
Pvalb Vipr2            0
SMC Acta2              0
Serpinf1 Aqp5 Vip      0
Serpinf1 Clrn1         0
Sncg Gpr50             0
Name: cell_cluster, dtype: int64

In [7]:
# Distinct values of the `cell_cluster` column.
adata.obs["cell_cluster"].unique()

[NaN, 'Pvalb Tpbg', 'L4 IT VISp Rspo1', 'Vip Chat Htr1f', 'L2/3 IT VISp Agmat', ..., 'Batch Grouping VISp L5 PT Ctxn3', 'Batch Grouping VISp L5 PT Chrna6', 'Low Quality VISp L5 PT Ctxn3 2', 'L6b ALM Olfr111 Nxph1', 'L6b ALM Olfr111 Spon1']
Length: 152
Categories (151, object): ['Astro Aqp4', 'Batch Grouping VISp L5 PT Chrna6', 'Batch Grouping VISp L5 PT Ctxn3', 'CR Lhx5', ..., 'Vip Ptprt Pkp2', 'Vip Pygm C1ql1', 'Vip Rspo1 Itga4', 'Vip Rspo4 Rxfp1 Chat']

In [8]:
# Show the per-cell metadata table.
adata.obs

Unnamed: 0,title,source_name,organism,donor_id,donor_sex,donor_genotype,injection_type,injection_target,injected_material,dissected_region,...,cell_cluster,molecule,SRA_Run,GEO_Sample,GEO_Sample_Title,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt
F2S4_150422_002_A01,F2S4_150422_002_A01,Primary Visual Cortex (VISp),Mus musculus,184756,M,Rbp4-Cre_KL100/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,VISp,...,,polyA RNA,SRR8322796,GSM3189947,VISp_Rbp4-Cre_KL1/wt;Ai14(RCL-tdT)/wt_No Injec...,12141,12141,1204630.0,0.0,0.0
F2S4_150422_002_B01,F2S4_150422_002_B01,Primary Visual Cortex (VISp),Mus musculus,184756,M,Rbp4-Cre_KL100/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,VISp,...,,polyA RNA,SRR8322851,GSM3189947,VISp_Rbp4-Cre_KL1/wt;Ai14(RCL-tdT)/wt_No Injec...,11760,11760,1259862.0,0.0,0.0
F2S4_150422_002_C01,F2S4_150422_002_C01,Primary Visual Cortex (VISp),Mus musculus,184756,M,Rbp4-Cre_KL100/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,VISp,...,,polyA RNA,SRR8322862,GSM3189947,VISp_Rbp4-Cre_KL1/wt;Ai14(RCL-tdT)/wt_No Injec...,10079,10079,1128827.0,0.0,0.0
F2S4_150422_002_D01,F2S4_150422_002_D01,Primary Visual Cortex (VISp),Mus musculus,184756,M,Rbp4-Cre_KL100/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,VISp,...,,polyA RNA,SRR8322873,GSM3189947,VISp_Rbp4-Cre_KL1/wt;Ai14(RCL-tdT)/wt_No Injec...,10570,10570,879874.0,0.0,0.0
F2S4_150422_002_E01,F2S4_150422_002_E01,Primary Visual Cortex (VISp),Mus musculus,184756,M,Rbp4-Cre_KL100/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,VISp,...,,polyA RNA,SRR8322884,GSM3189947,VISp_Rbp4-Cre_KL1/wt;Ai14(RCL-tdT)/wt_No Injec...,9737,9737,1117094.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F1S4_180124_317_D01,F1S4_180124_317_D01,Anterior Lateral Motor Cortex (ALM),Mus musculus,364992,F,Rasgrf2-T2A-dgFlpO/wt;Ai65F/wt,No Injection,No Injection,No Injection,ALM,...,Lamp5 Lsp1,polyA RNA,SRR7315188,GSM3189867,ALM_Rasgrf2-T2A-dgFlpO/wt;Ai65F/wt_No Injection_1,8601,8601,1022536.0,0.0,0.0
F1S4_180124_317_E01,F1S4_180124_317_E01,Anterior Lateral Motor Cortex (ALM),Mus musculus,364992,F,Rasgrf2-T2A-dgFlpO/wt;Ai65F/wt,No Injection,No Injection,No Injection,ALM,...,Sncg Slc17a8,polyA RNA,SRR7315189,GSM3189867,ALM_Rasgrf2-T2A-dgFlpO/wt;Ai65F/wt_No Injection_1,9822,9822,724678.0,0.0,0.0
F1S4_180124_317_F01,F1S4_180124_317_F01,Anterior Lateral Motor Cortex (ALM),Mus musculus,364992,F,Rasgrf2-T2A-dgFlpO/wt;Ai65F/wt,No Injection,No Injection,No Injection,ALM,...,Sst Hpse Sema3c,polyA RNA,SRR7315190,GSM3189867,ALM_Rasgrf2-T2A-dgFlpO/wt;Ai65F/wt_No Injection_1,8729,8729,980847.0,0.0,0.0
F1S4_180124_317_G01,F1S4_180124_317_G01,Anterior Lateral Motor Cortex (ALM),Mus musculus,364992,F,Rasgrf2-T2A-dgFlpO/wt;Ai65F/wt,No Injection,No Injection,No Injection,ALM,...,Sncg Slc17a8,polyA RNA,SRR7315191,GSM3189867,ALM_Rasgrf2-T2A-dgFlpO/wt;Ai65F/wt_No Injection_1,10632,10632,1029765.0,0.0,0.0


In [9]:
# Load field of view 0 of the spotless mouse-cortex spatial data.
# NOTE: this reuses the name `adata_st`, which another cell binds to the DLPFC data.
adata_st = sc.read_h5ad("data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov0.h5ad")


In [11]:
# Per-spot relative composition table stored in obsm.
adata_st.obsm["relative_spot_composition"]

Unnamed: 0,Astrocytes.deep,Astrocytes.superficial,Choroid.plexus,Endothelial,Ependymal,Excitatory.layer.3,Excitatory.layer.4,Excitatory.layer.5.6,Excitatory.layer.II,Interneurons,Interneurons.deep,Microglia,NSC,Neural.progenitors,Neuroblasts,OPC,Oligodendrocytes
spot_1,0.0,0.0,0.0,0.058824,0.0,0.176471,0.235294,0.0,0.352941,0.117647,0.0,0.058824,0.0,0.0,0.0,0.0,0.0
spot_2,0.076923,0.076923,0.0,0.0,0.0,0.076923,0.153846,0.0,0.461538,0.076923,0.0,0.076923,0.0,0.0,0.0,0.0,0.0
spot_3,0.0,0.333333,0.0,0.166667,0.0,0.0,0.0,0.0,0.166667,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0
spot_4,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.571429,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0
spot_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.636364,0.181818,0.0,0.0,0.0,0.0,0.0,0.181818,0.0
spot_6,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
spot_7,0.0,0.0,0.0,0.0,0.0,0.222222,0.222222,0.0,0.333333,0.111111,0.0,0.111111,0.0,0.0,0.0,0.0,0.0
spot_8,0.0,0.0,0.0,0.083333,0.0,0.0,0.083333,0.0,0.583333,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
spot_9,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [12]:
# Sum of the "Choroid.plexus" column over all spots of this field of view.
adata_st.obsm["relative_spot_composition"]["Choroid.plexus"].sum()

0.0

In [17]:
# Load the spotless mouse-cortex single-cell AnnData and list the categories
# of its `cell_subclass` column.
adata_sc = sc.read_h5ad("data/mouse_cortex/sc_adata/spotless_mouse_cortex.h5ad")
adata_sc.obs["cell_subclass"].cat.categories

Index(['Astrocytes deep', 'Astrocytes superficial', 'Choroid plexus',
       'Endothelial', 'Ependymal', 'Excitatory layer 3', 'Excitatory layer 4',
       'Excitatory layer 5/6', 'Excitatory layer II', 'Interneurons',
       'Interneurons deep', 'Microglia', 'NSC', 'Neural progenitors',
       'Neuroblasts', 'OPC', 'Oligodendrocytes'],
      dtype='object')

In [2]:
# Location of the spotless standards data.
SPOTLESS_DIR = "data/spotless/standards"

# Dataset directories the lookup table below points at.
CORTEX_DIR, OLFACTORY_DIR = "data/mouse_cortex", "data/mouse_olfactory"
# SC_ID_OLFACTORY = "spotless_mouse_olfactory"
# SC_ID_CORTEX = "spotless_mouse_cortex"
# SC_ID_VISUAL = "spotless_mouse_visual"


# %%

# Gold-standard name -> dataset id.
standard_to_id = dict(
    gold_standard_1="spotless_mouse_cortex",
    gold_standard_2="spotless_mouse_olfactory",
    gold_standard_3="spotless_mouse_visual",
    gold_standard_3_12celltypes="spotless_mouse_visual",
)

# Dataset id -> dataset directory (the visual dataset shares the cortex directory).
id_to_dir = dict(
    spotless_mouse_cortex=CORTEX_DIR,
    spotless_mouse_olfactory=OLFACTORY_DIR,
    spotless_mouse_visual=CORTEX_DIR,
)



In [45]:
def cat_to_obsm(cat, adata, drop_cols=None):
    """Move the ``<cat>.<name>`` columns of ``adata.obs`` into ``adata.obsm[cat]``.

    A column belongs to ``cat`` when the text before its first ``"."`` equals
    ``cat``. Those columns are stored under ``adata.obsm[cat]`` with the
    ``"<cat>."`` prefix stripped and are removed from ``adata.obs``.

    Args:
        cat: Column-name prefix identifying the group.
        adata: Object exposing ``obs`` (a DataFrame) and ``obsm`` (a mapping);
            both are modified.
        drop_cols: Un-prefixed names to leave out of ``adata.obsm[cat]``; the
            corresponding ``<cat>.<name>`` columns stay in ``adata.obs``.
    """
    excluded = [] if drop_cols is None else drop_cols
    prefix_len = len(cat + ".")

    column_names = adata.obs.columns.to_series()
    in_group = column_names.map(lambda name: name.split(".")[0] == cat)

    group_frame = adata.obs.loc[:, in_group]
    group_frame = group_frame.rename(columns=lambda name: name[prefix_len:])
    adata.obsm[cat] = group_frame.drop(columns=excluded)

    # Everything outside the group stays in obs, plus the excluded group columns.
    stays_in_obs = ~in_group
    for short_name in excluded:
        stays_in_obs[cat + "." + short_name] = True
    adata.obs = adata.obs.loc[:, stays_in_obs]

# Read every gold_standard_1 h5ad file; a sample id is the file's basename up
# to its first ".".
fpaths = sorted(glob.glob(os.path.join(SPOTLESS_DIR, "gold_standard_1", "*.h5ad")))
sample_ids = [os.path.basename(f).split(".")[0] for f in fpaths]
fovs = [sc.read_h5ad(name) for name in fpaths]

# Union of the obs columns over all files, so every obs table can be reindexed
# to the same column set.
obs_cols = sorted(list(set.union(*[set(fov.obs.columns) for fov in fovs])))
# NOTE(review): `sample_id` is bound by the loop but not used in its body.
for fov, sample_id in zip(fovs, sample_ids):
    fov.obs = fov.obs.reindex(columns=obs_cols)
    # Columns a file lacked are NaN after reindexing; all NaN values become 0.
    fov.obs = fov.obs.fillna(0)
    fov.obs = fov.obs.rename(columns={"coordinates.x": "X", "coordinates.y": "Y"})
    # Store the matrix as float32 CSR and keep it as `raw` too.
    fov.X = csr_matrix(fov.X.astype("float32"))
    fov.raw = fov

    # Drop every obs column whose name contains "spot_no".
    fov.obs = fov.obs.loc[:,~fov.obs.columns.str.contains("spot_no")]

    # Move the prefixed composition columns from obs into obsm.
    cat_to_obsm("relative_spot_composition", fov)
    cat_to_obsm("spot_composition", fov)

In [49]:
# Check the relative composition table of the first field of view.
fovs[0].obsm["relative_spot_composition"]

Unnamed: 0,Astrocytes.deep,Astrocytes.superficial,Choroid.plexus,Endothelial,Ependymal,Excitatory.layer.3,Excitatory.layer.4,Excitatory.layer.5.6,Excitatory.layer.II,Interneurons,Interneurons.deep,Microglia,NSC,Neural.progenitors,Neuroblasts,OPC,Oligodendrocytes
spot_1,0.0,0.0,0.0,0.058824,0.0,0.176471,0.235294,0.0,0.352941,0.117647,0.0,0.058824,0.0,0.0,0.0,0.0,0.0
spot_2,0.076923,0.076923,0.0,0.0,0.0,0.076923,0.153846,0.0,0.461538,0.076923,0.0,0.076923,0.0,0.0,0.0,0.0,0.0
spot_3,0.0,0.333333,0.0,0.166667,0.0,0.0,0.0,0.0,0.166667,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0
spot_4,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.571429,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0
spot_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.636364,0.181818,0.0,0.0,0.0,0.0,0.0,0.181818,0.0
spot_6,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
spot_7,0.0,0.0,0.0,0.0,0.0,0.222222,0.222222,0.0,0.333333,0.111111,0.0,0.111111,0.0,0.0,0.0,0.0,0.0
spot_8,0.0,0.0,0.0,0.083333,0.0,0.0,0.083333,0.0,0.583333,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
spot_9,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [55]:
# Prepare the gold_standard_1 reference AnnData for the project's sc_adata folder.
for ref_path in glob.glob(os.path.join(SPOTLESS_DIR, "reference", "gold_standard_1.h5ad")):
    if "19celltypes" in ref_path:
        continue

    name = os.path.basename(ref_path).split(".")[0]
    # Named `dset_id` rather than `id` so the builtin `id` is not shadowed in
    # the notebook namespace after this cell has run.
    dset_id = standard_to_id[name]

    dset_dir = id_to_dir[dset_id]
    print(f"Processing {name} to {dset_id} in {dset_dir}")

    # exist_ok avoids the separate existence check.
    sc_dir = os.path.join(dset_dir, "sc_adata")
    os.makedirs(sc_dir, exist_ok=True)

    adata_sc = sc.read_h5ad(ref_path)

    qc_sc(adata_sc)

    adata_sc.obs = adata_sc.obs.rename(columns={"celltype": "cell_subclass"})

    # Store the matrix as float32 CSR and keep it as `raw` too.
    adata_sc.X = csr_matrix(adata_sc.X.astype("float32"))
    adata_sc.raw = adata_sc

    # Stop after the first reference; the write below is intentionally disabled.
    break

    # adata_sc.write(os.path.join(sc_dir, f"{dset_id}.h5ad"))


Processing gold_standard_1 to spotless_mouse_cortex in data/mouse_cortex
0 mitochondrial genes


In [59]:
import re



{'Excitatory.layer.3': 'Excitatory layer 3',
 'Excitatory.layer.II': 'Excitatory layer II',
 'Interneurons': 'Interneurons',
 'Excitatory.layer.4': 'Excitatory layer 4',
 'Microglia': 'Microglia',
 'Astrocytes.deep': 'Astrocytes deep',
 'Endothelial': 'Endothelial',
 'Interneurons.deep': 'Interneurons deep',
 'Astrocytes.superficial': 'Astrocytes superficial',
 'OPC': 'OPC',
 'Excitatory.layer.5.6': 'Excitatory layer 5/6',
 'Neuroblasts': 'Neuroblasts',
 'NSC': 'NSC',
 'Oligodendrocytes': 'Oligodendrocytes',
 'Ependymal': 'Ependymal',
 'Neural.progenitors': 'Neural progenitors',
 'Choroid.plexus': 'Choroid plexus'}

In [69]:
# Reload the spotless mouse-cortex single-cell AnnData (rebinds `adata`).
adata = sc.read_h5ad("data/mouse_cortex/sc_adata/spotless_mouse_cortex.h5ad")

In [70]:
# Largest value in the expression matrix.
adata.X.toarray().max()

150.0

In [71]:
# Largest value in the raw matrix, for comparison with adata.X.
adata.raw.X.toarray().max()

150.0

In [5]:
# Give every field of view the same obs column set (missing columns become NaN).
for fov in fovs:
    fov.obs = fov.obs.reindex(columns=obs_cols)

In [6]:
# for column in fovs[0].obs.columns:
#     display(fovs[0].obs[column])
# Sample ids: each file's basename up to its first ".".
[os.path.basename(f).split(".")[0] for f in fpaths]

['Eng2019_cortex_svz_fov0',
 'Eng2019_cortex_svz_fov1',
 'Eng2019_cortex_svz_fov2',
 'Eng2019_cortex_svz_fov3',
 'Eng2019_cortex_svz_fov4',
 'Eng2019_cortex_svz_fov5',
 'Eng2019_cortex_svz_fov6']

In [12]:
# Path of the gold_standard_1 reference file.
gs1_ref_path = "data/spotless/reference/gold_standard_1.h5ad"

In [13]:
# Load the reference AnnData from gs1_ref_path.
gs1_ref = sc.read_h5ad(gs1_ref_path)

In [14]:
# Show the summary of the reference AnnData.
gs1_ref

AnnData object with n_obs × n_vars = 906 × 10000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'louvain', 'celltype', 'Field.of.View', 'X', 'Y', 'celltype_coarse', 'Region'
    var: 'features'

In [6]:
# adata.obs

In [11]:
# result = pyreadr.read_r('data/spotless/gold_standard_1/Eng2019_cortex_svz_fov0.rds')
# NOTE(review): `h5py` is not among the imports in the notebook's visible
# import cell — confirm it is imported before this cell runs.
# Iterate over the top-level entries of the HDF5 file.
with h5py.File("data/spotless/gold_standard_1.h5" , "r") as f:
    for name in f:
        # Decode the stored gene names as UTF-8; the list is not used again in this cell.
        gene_names = [str(gene, "utf-8") for gene in f[name]["geneNames"][()]]
        # Build an AnnData from the counts; the object is not bound to a name.
        ad.AnnData(csr_matrix(f[name]["counts"][()]), dtype=np.int64)

AnnData object with n_obs × n_vars = 9 × 10000
AnnData object with n_obs × n_vars = 9 × 10000
AnnData object with n_obs × n_vars = 9 × 10000
AnnData object with n_obs × n_vars = 9 × 10000
AnnData object with n_obs × n_vars = 9 × 10000
AnnData object with n_obs × n_vars = 9 × 10000
AnnData object with n_obs × n_vars = 9 × 10000


In [1]:
import numpy as np

# Seed the generator so the sample, and everything computed from it, is the
# same after a kernel restart.
rng = np.random.default_rng(0)
X = rng.standard_normal(1000)

In [9]:
def standard(x):
    """Evaluate exp(-x**2 / 2) / sqrt(2 * pi), the standard normal density, at ``x``.

    Works element-wise on arrays as well as on scalars.
    """
    exponent = -np.power(x, 2.) / 2
    normaliser = np.sqrt(2 * np.pi)
    return np.exp(exponent) / normaliser

In [10]:
# Density value of each sample in X.
p = standard(X)

In [11]:
# 2 raised to the negated sum of p * log2(p).
# NOTE(review): p holds density values from standard(X) and is not normalised
# to sum to 1 — confirm this is the intended quantity.
2**(-(p * np.log2(p)).sum())

1.5112405659839928e+144

In [1]:
import os
import glob

In [2]:
# WARNING: destructive. Prints and then deletes every *.pth file directly
# inside the hard-coded model directory (the pattern is not recursive).
for name in glob.iglob("/lustre07/scratch/williamm/AGrEDA/model/*.pth"):
    print(name)
    os.remove(name)
print("done")

done


In [3]:
# Show the kernel's current working directory.
os.getcwd()


'/lustre07/scratch/williamm/AGrEDA'

In [4]:
import tarfile

In [10]:
# Pack every ./TMP/*.dum file into a gzip-compressed tar archive, stored under
# its bare file name (no directory part).
with tarfile.open("./TMP/dum.tar.gz", "w:gz") as tar:
    for name in glob.glob("./TMP/*.dum"):
        tar.add(name, arcname=os.path.basename(name))

In [11]:
# Unpack the archive back into ./TMP.
# NOTE(review): extractall() applies no member filtering here; only use it on
# archives you created yourself, as in this notebook.
with tarfile.open("./TMP/dum.tar.gz", "r:gz") as tar:
    tar.extractall("./TMP")

In [12]:
# List the entries directly inside the hard-coded project directory.
for name in glob.iglob("/lustre07/scratch/williamm/AGrEDA/*"):
    print(name)

/lustre07/scratch/williamm/AGrEDA/autoenc_allgenes.ipynb
/lustre07/scratch/williamm/AGrEDA/wheels
/lustre07/scratch/williamm/AGrEDA/autosubmitter.py
/lustre07/scratch/williamm/AGrEDA/requirements_simple.txt
/lustre07/scratch/williamm/AGrEDA/autoenc_st.ipynb
/lustre07/scratch/williamm/AGrEDA/hello.out
/lustre07/scratch/williamm/AGrEDA/requirements_cc.txt
/lustre07/scratch/williamm/AGrEDA/prep_data.py
/lustre07/scratch/williamm/AGrEDA/adda.py
/lustre07/scratch/williamm/AGrEDA/coral.py
/lustre07/scratch/williamm/AGrEDA/AGrEDA-p_def.code-workspace
/lustre07/scratch/williamm/AGrEDA/eda-pdac-chijimatsu.ipynb
/lustre07/scratch/williamm/AGrEDA/environment.yml
/lustre07/scratch/williamm/AGrEDA/results
/lustre07/scratch/williamm/AGrEDA/run_jupyterlab.sh
/lustre07/scratch/williamm/AGrEDA/model
/lustre07/scratch/williamm/AGrEDA/scratch.ipynb
/lustre07/scratch/williamm/AGrEDA/batch_scripts
/lustre07/scratch/williamm/AGrEDA/reproduce_celldart.py
/lustre07/scratch/williamm/AGrEDA/eval_config.py
/lust