In [1]:
import glob
import os
import torch
from collections import OrderedDict

import anndata as ad
import h5py
import scanpy as sc
from scipy.sparse import csr_matrix
from sklearn import model_selection
import yaml

from src.da_utils.data_processing import qc_sc
from src.da_utils import data_loading
from src.da_utils.data_loading import load_spatial, get_selected_dir, get_dset_dir

import pandas as pd
import numpy as np

import seaborn as sns
import pickle

import matplotlib.pyplot as plt


# dlPFC

## ST

In [2]:
dlpfc_genes = OrderedDict()
dlpfc_samples = OrderedDict()

In [10]:
df = pd.read_pickle("data/dlpfc/spatialLIBD_data/temp.pkl")
df

Unnamed: 0_level_0,Unnamed: 1_level_0,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,STPG1,...,AL162377.3,ZNF883,AC111006.1,AC093827.5,AL691520.1,AC139491.7,AC021097.2,AC003043.2,AL356417.3,AP000646.1
sample_id,spot,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
151507,AAACAACGAATAGTTC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151507,AAACAAGTATCTCCCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151507,AAACAATCTACTAGCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151507,AAACACCAATAACTGC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151507,AAACAGCTTTCAGAAG-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151676,TTGTTGTGTGTCAAGA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151676,TTGTTTCACATCCAGG-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151676,TTGTTTCATTAGTCTA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151676,TTGTTTCCATACAACT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
dlpfc_genes["unprocessed"] = df.columns.to_frame().set_index("gene")
dlpfc_samples["unprocessed"] = df.index.to_frame().set_index("spot")

In [3]:
sids = []
adatas = []
for name in sorted(glob.glob("data/dlpfc/st_adata/spatialLIBD-*.h5ad")):
    sids.append(int(name.split("-")[1].split(".")[0]))
    adatas.append(sc.read_h5ad(name))

adata_st = ad.concat(adatas, label="sample_id", keys=sids)


KeyError: 'split'

In [7]:
dlpfc_genes["preprocessed"] = adata_st.var
dlpfc_samples["preprocessed"] = adata_st.obs


In [2]:
sc.read_h5ad("data/mouse_cortex/preprocessed/GSE115746_spotless_mouse_cortex/20markers/minmax/mat_sp_samp_split_d.h5ad").obs.groupby("sample_id").first()

Unnamed: 0_level_0,split,X,Y,nCount_RNA,nFeature_RNA,orig.ident
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Eng2019_cortex_svz_fov0,train,466.019417,466.019417,82549.0,9871,0
Eng2019_cortex_svz_fov1,train,466.019417,466.019417,25538.0,7664,0
Eng2019_cortex_svz_fov4,train,466.019417,466.019417,21251.0,6889,0
Eng2019_cortex_svz_fov5,train,466.019417,466.019417,3618.0,2186,0
Eng2019_cortex_svz_fov6,train,466.019417,466.019417,28177.0,7715,0
Eng2019_cortex_svz_fov2,val,466.019417,466.019417,51912.0,9290,0
Eng2019_cortex_svz_fov3,test,466.019417,466.019417,20226.0,7264,0


In [9]:
dlpfc_genes["unprocessed"]

TSPAN6
TNMD
DPM1
SCYL3
C1orf112
...
AC139491.7
AC021097.2
AC003043.2
AL356417.3
AP000646.1


In [2]:
adata_sc = sc.read_h5ad("data/mouse_cortex/sc_adata/GSE115746.h5ad")

In [20]:
adata_sc.obs["cell_type"].dropna()

F2S4_151217_005_B01    Pvalb
F2S4_151217_005_C01       L4
F2S4_151217_005_E01       L4
F2S4_151217_005_F01       L4
F2S4_151217_005_G01       L4
                       ...  
F1S4_180124_317_D01    Lamp5
F1S4_180124_317_E01     Sncg
F1S4_180124_317_F01      Sst
F1S4_180124_317_G01     Sncg
F1S4_180124_317_H01     Sncg
Name: cell_type, Length: 22277, dtype: category
Categories (28, object): ['Astro', 'Batch Grouping', 'CR', 'Doublet Astro Aqp4 Ex', ..., 'Sncg', 'Sst', 'VLMC', 'Vip']

In [22]:
adata_sts = []
for name in glob.glob("data/mouse_cortex/st_adata/spotless_mouse_cortex*.h5ad"):
    print(name)
    adata_sts.append(sc.read_h5ad(name))
    

adata_st = ad.concat(adata_sts, join="outer", label="sample")

data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov4.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov2.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov3.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov1.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov5.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov6.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov0.h5ad


  utils.warn_names_duplicates("obs")


In [19]:
adata_sts = []
for name in glob.glob("data/spotless/standards/gold_standard_1/*.h5ad"):
    print(name)
    adata_sts.append(sc.read_h5ad(name))
    

adata_st = ad.concat(adata_sts, join="outer", label="sample")

data/spotless/standards/gold_standard_1/Eng2019_cortex_svz_fov5.h5ad
data/spotless/standards/gold_standard_1/Eng2019_cortex_svz_fov1.h5ad
data/spotless/standards/gold_standard_1/Eng2019_cortex_svz_fov6.h5ad
data/spotless/standards/gold_standard_1/Eng2019_cortex_svz_fov0.h5ad
data/spotless/standards/gold_standard_1/Eng2019_cortex_svz_fov2.h5ad
data/spotless/standards/gold_standard_1/Eng2019_cortex_svz_fov4.h5ad
data/spotless/standards/gold_standard_1/Eng2019_cortex_svz_fov3.h5ad


  utils.warn_names_duplicates("obs")


In [23]:
adata_st

AnnData object with n_obs × n_vars = 63 × 10000
    obs: 'X', 'Y', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'sample'
    obsm: 'relative_spot_composition', 'spot_composition'

In [4]:
adata_st.obs.groupby("sample").size().mean()

3973.4166666666665

In [21]:
adata_sts = []
st_sample_ids = []
for name in glob.glob("data/mouse_cortex/st_adata/*.h5ad"):
    print(name)
    adata_sts.append(sc.read_h5ad(name))
    st_sample_ids.append(os.path.basename(name).split(".")[0].split("-")[1])
    

adata_st = ad.concat(adata_sts, join="outer", label="sample", keys=st_sample_ids)

data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov4.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov2.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov3.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov1.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov5.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov6.h5ad
data/mouse_cortex/st_adata/spotless_mouse_visual-Wang2018_visp_rep0410.h5ad
data/mouse_cortex/st_adata/spotless_mouse_cortex-Eng2019_cortex_svz_fov0.h5ad


  utils.warn_names_duplicates("obs")


In [23]:
adata_st.obs

Unnamed: 0,X,Y,nCount_RNA,nFeature_RNA,orig.ident,sample
spot_1,466.019417,466.019417,21251.0,6889,0,Eng2019_cortex_svz_fov4
spot_2,466.019417,1000.000000,34958.0,8716,0,Eng2019_cortex_svz_fov4
spot_3,466.019417,1533.980583,46103.0,9374,0,Eng2019_cortex_svz_fov4
spot_4,1000.000000,466.019417,70531.0,9761,0,Eng2019_cortex_svz_fov4
spot_5,1000.000000,1000.000000,42681.0,9140,0,Eng2019_cortex_svz_fov4
...,...,...,...,...,...,...
spot_5,1000.000000,1000.000000,61491.0,9653,0,Eng2019_cortex_svz_fov0
spot_6,1000.000000,1533.980583,17915.0,6909,0,Eng2019_cortex_svz_fov0
spot_7,1533.980583,466.019417,35974.0,8725,0,Eng2019_cortex_svz_fov0
spot_8,1533.980583,1000.000000,48911.0,9380,0,Eng2019_cortex_svz_fov0


In [4]:
adata_spotless_st = sc.read_h5ad("data/mouse_cortex/preprocessed/GSE115746_spotless_mouse_cortex/40markers/minmax/mat_sp_samp_split_d.h5ad")

In [5]:
adata_spotless_st.obs["split"].value_counts()

train    54
test      9
Name: split, dtype: int64

In [7]:
adata_dlpfc_st = sc.read_h5ad("data/dlpfc/preprocessed/GSE144136_spatialLIBD/20markers/minmax/mat_sp_samp_split_d.h5ad")
adata_dlpfc_st.obs["split"].value_counts()

train    44042
test      3639
Name: split, dtype: int64

In [11]:
adata_pdac_sc = sc.read_h5ad("data/pdac/sc_adata/CA001063.h5ad")
adata_pdac_sc.obs

Unnamed: 0,CELL,Patient,Type,cell_type,celltype0,celltype1,celltype2,celltype3,subject,nCount_RNA,...,histone_score_1,normal_stroma_score_1,olfactory_score_1,na_score_1,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,sample_id
T1_AAACCTGAGATGTCGG,T1_AAACCTGAGATGTCGG,T1,Tumor,Fibroblast cell,fibroblast,fibroblast,fibroblast,fibroblast,CA001063_T1,11811.0,...,-0.002426,0.049558,-0.036479,-0.008173,3342,3342,4356.809570,0.0,0.0,CA001063_T1
T1_AAACGGGGTCATGCAT,T1_AAACGGGGTCATGCAT,T1,Tumor,Stellate cell,fibroblast,pancreatic stellate cell,pancreatic stellate cell,pancreatic stellate cell,CA001063_T1,1935.0,...,-0.023274,-0.001198,-0.039764,-0.008104,974,974,2335.506836,0.0,0.0,CA001063_T1
T1_AAAGATGCATGTTGAC,T1_AAAGATGCATGTTGAC,T1,Tumor,Macrophage cell,hematopoietic cell,myeloid leukocyte,myeloid dendritic cell,CD1c-positive myeloid dendritic cell,CA001063_T1,3811.0,...,-0.030058,0.003450,-0.029820,-0.015801,1490,1490,2976.401367,0.0,0.0,CA001063_T1
T1_AAAGATGGTCGAGTTT,T1_AAAGATGGTCGAGTTT,T1,Tumor,Macrophage cell,hematopoietic cell,myeloid leukocyte,macrophage,macrophage,CA001063_T1,2069.0,...,-0.028608,-0.031481,-0.049424,0.025834,1101,1101,2521.106445,0.0,0.0,CA001063_T1
T1_AAAGATGGTCTCTCTG,T1_AAAGATGGTCTCTCTG,T1,Tumor,Endothelial cell,endothelial cell,blood vessel endothelial cell,blood vessel endothelial cell,blood vessel endothelial cell,CA001063_T1,8706.0,...,0.102929,-0.022376,-0.060222,-0.005385,3189,3189,4350.042969,0.0,0.0,CA001063_T1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T24_TTTGGTTGTGTTGAGG,T24_TTTGGTTGTGTTGAGG,T24,Tumor,Stellate cell,fibroblast,pancreatic stellate cell,pancreatic stellate cell,pancreatic stellate cell,CA001063_T24,6897.0,...,-0.027299,0.037214,-0.052525,-0.017397,2096,2096,3529.317627,0.0,0.0,CA001063_T24
T24_TTTGGTTTCCAGAGGA,T24_TTTGGTTTCCAGAGGA,T24,Tumor,Stellate cell,fibroblast,pancreatic stellate cell,pancreatic stellate cell,pancreatic stellate cell,CA001063_T24,4630.0,...,-0.039719,0.354603,-0.055698,0.009853,1641,1641,3127.418945,0.0,0.0,CA001063_T24
T24_TTTGGTTTCCTGCTTG,T24_TTTGGTTTCCTGCTTG,T24,Tumor,Ductal cell type 1,epithelial cell,pancreatic ductal cell,pancreatic ductal cell,pancreatic ductal cell,CA001063_T24,3734.0,...,0.001137,-0.013625,0.261652,0.008457,1701,1701,3222.236816,0.0,0.0,CA001063_T24
T24_TTTGTCAAGATGTCGG,T24_TTTGTCAAGATGTCGG,T24,Tumor,Ductal cell type 2,epithelial cell,pancreatic ductal cell,pancreatic ductal cell,pancreatic ductal cell,CA001063_T24,10274.0,...,-0.048517,-0.024899,0.012992,-0.018849,2824,2824,4040.932373,0.0,0.0,CA001063_T24


In [12]:
adata_pdac_sc.obs["cell_type"].value_counts()

Ductal cell type 2    11270
Fibroblast cell        5836
Stellate cell          5292
Endothelial cell       5135
Macrophage cell        4830
T cell                 3615
Ductal cell type 1     2647
B cell                 2381
Acinar cell             515
Endocrine cell          443
Name: cell_type, dtype: int64

In [16]:
adata_pdac_sc.var

Unnamed: 0,features,n_cells,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts
LINC00115,LINC00115,1808,False,1808,0.051140,95.691545,2146.024414
FAM41C,FAM41C,891,False,891,0.024242,97.876752,1017.294189
SAMD11,SAMD11,1559,False,1559,0.047890,96.284911,2009.650757
NOC2L,NOC2L,12248,False,12248,0.378581,70.813078,15886.753906
KLHL17,KLHL17,915,False,915,0.023429,97.819560,983.189331
...,...,...,...,...,...,...,...
IL17F,IL17F,21,False,21,0.000857,99.949957,35.971462
INSL6,INSL6,28,False,28,0.000616,99.933276,25.851713
FOXI2,FOXI2,6,False,6,0.000175,99.985702,7.360909
CCDC60,CCDC60,5,False,5,0.000128,99.988085,5.365456


In [18]:
df_mouse_sc = pd.read_csv("data/mouse_cortex/GSE115746/GSE115746_cells_exon_counts.csv", index_col=0)

: 

: 

In [None]:
df_mouse_sc

In [7]:
df_mouse_sc.cell_subclass.dropna()

8        No Class
9           Pvalb
10             L4
11       No Class
12             L4
           ...   
28454         Sst
28455         Sst
28464    No Class
28465    No Class
28466    No Class
Name: cell_subclass, Length: 25481, dtype: object

In [17]:
df_mouse_sc[df_mouse_sc["organism"] != "Control"]

Unnamed: 0,sample_name,title,source_name,organism,donor_id,donor_sex,donor_genotype,injection_type,injection_target,injected_material,...,sequencing_tube,sequencing_batch,sequencing_qc_pass_fail,cell_class,cell_subclass,cell_cluster,molecule,SRA_Run,GEO_Sample,GEO_Sample_Title
8,F2S4_151217_005_A01,F2S4_151217_005_A01,Primary Visual Cortex (VISp),Mus musculus,222454,M,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,...,LS-14690,R8S4-160203,Pass,No Class,No Class,Low Quality,polyA RNA,SRR7335295,GSM3189964,VISp_Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt_No I...
9,F2S4_151217_005_B01,F2S4_151217_005_B01,Primary Visual Cortex (VISp),Mus musculus,222454,M,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,...,LS-14690,R8S4-160203,Pass,GABAergic,Pvalb,Pvalb Tpbg,polyA RNA,SRR7335406,GSM3189964,VISp_Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt_No I...
10,F2S4_151217_005_C01,F2S4_151217_005_C01,Primary Visual Cortex (VISp),Mus musculus,222454,M,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,...,LS-14690,R8S4-160203,Pass,Glutamatergic,L4,L4 IT VISp Rspo1,polyA RNA,SRR7335517,GSM3189964,VISp_Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt_No I...
11,F2S4_151217_005_D01,F2S4_151217_005_D01,Primary Visual Cortex (VISp),Mus musculus,222454,M,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,...,LS-14690,R8S4-160203,Pass,No Class,No Class,Low Quality,polyA RNA,SRR7335628,GSM3189964,VISp_Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt_No I...
12,F2S4_151217_005_E01,F2S4_151217_005_E01,Primary Visual Cortex (VISp),Mus musculus,222454,M,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,No Injection,No Injection,No Injection,...,LS-14690,R8S4-160203,Pass,Glutamatergic,L4,L4 IT VISp Rspo1,polyA RNA,SRR7335739,GSM3189964,VISp_Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt_No I...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28701,F2S4_160728_010_D01,F2S4_160728_010_D01,Primary Visual Cortex (VISp),Mus musculus,259259,M,Sst-IRES-Cre/wt;Pvalb-T2A-Dre/wt;Ai66(RCRL-tdT...,No Injection,No Injection,No Injection,...,LS-15501,R8S4-160817,Pass,GABAergic,,,polyA RNA,SRR8322942,GSM3189971,VISp_Sst-IRES-Cre/wt;Pvalb-T2A-Dre/wt;Ai66(RCR...
28702,F2S4_160728_010_E01,F2S4_160728_010_E01,Primary Visual Cortex (VISp),Mus musculus,259259,M,Sst-IRES-Cre/wt;Pvalb-T2A-Dre/wt;Ai66(RCRL-tdT...,No Injection,No Injection,No Injection,...,LS-15501,R8S4-160817,Pass,GABAergic,,,polyA RNA,SRR8322943,GSM3189971,VISp_Sst-IRES-Cre/wt;Pvalb-T2A-Dre/wt;Ai66(RCR...
28703,F2S4_160728_010_F01,F2S4_160728_010_F01,Primary Visual Cortex (VISp),Mus musculus,259259,M,Sst-IRES-Cre/wt;Pvalb-T2A-Dre/wt;Ai66(RCRL-tdT...,No Injection,No Injection,No Injection,...,LS-15501,R8S4-160817,Pass,GABAergic,,,polyA RNA,SRR8322944,GSM3189971,VISp_Sst-IRES-Cre/wt;Pvalb-T2A-Dre/wt;Ai66(RCR...
28704,F2S4_160728_010_G01,F2S4_160728_010_G01,Primary Visual Cortex (VISp),Mus musculus,259259,M,Sst-IRES-Cre/wt;Pvalb-T2A-Dre/wt;Ai66(RCRL-tdT...,No Injection,No Injection,No Injection,...,LS-15501,R8S4-160817,Pass,GABAergic,,,polyA RNA,SRR8322945,GSM3189971,VISp_Sst-IRES-Cre/wt;Pvalb-T2A-Dre/wt;Ai66(RCR...


In [16]:
df_mouse_sc.source_name.value_counts()

Primary Visual Cortex (VISp)                        15652
Anterior Lateral Motor Cortex (ALM)                 10068
Control: 10pg Zyagen Mouse Whole Brain Total RNA      923
Control: 1E-8 ERCC MIX1                               748
Control: No Template Control                          742
Control: 10pg Takara Control Total RNA                573
Name: source_name, dtype: int64