In [None]:
import os
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sp
import anndata as ad
import seaborn as sns

In [None]:
# Root directory of the raw public mouse datasets; every path below is built from it.
dir_path = '/mnt/storage/Daniele/atlases/mouse_public_raw/'

In [None]:
# Entries found directly under dir_path (not referenced again in the cells shown here).
studies = os.listdir(dir_path)

## Carstens_2021

In [None]:
count_matrix_carstens = pd.read_csv(f'{dir_path}/Carstens_2021/GSE165534_MATRIXKPCVSKPCST.csv.gz')

In [None]:
var_names = count_matrix_carstens['Unnamed: 0'].tolist()
counts = sp.csr_matrix(count_matrix_carstens.iloc[:,1:].values)
obs_names = count_matrix_carstens.columns[1:].tolist()


In [None]:
counts = counts.astype(np.int32)

In [None]:
len(var_names), len(obs_names)

In [None]:
carstens_adata = ad.AnnData(X = counts.T, var = {"var_names":var_names}, obs = {"obs_names": obs_names})

In [None]:
# Release the dense count table now that its values are held by carstens_adata.
# The table was loaded as `count_matrix_carstens`; deleting the undefined name
# `count_matrix` raised NameError on a fresh kernel.
del count_matrix_carstens

In [None]:
carstens_adata.obs['Sample'] = [cell.split('_')[0] for cell in carstens_adata.obs_names]

In [None]:
carstens_adata.obs['Dataset'] = 'Carstens_2021'

In [None]:
retain = [sample for sample in carstens_adata.obs['Sample'].unique() if 'METS' not in sample]

In [None]:
carstens_adata = carstens_adata[carstens_adata.obs.Sample.isin(retain)].copy()

In [None]:
samples = carstens_adata.obs['Sample'].unique().tolist()

In [None]:
import re
cleaned_mapping = {s: re.match(r"(KPC(?:ST)?\d+)", s).group(1) for s in samples}


In [None]:
carstens_adata.obs['Sample'] = carstens_adata.obs['Sample'].map(cleaned_mapping)


In [None]:
carstens_adata.write_h5ad(f'{dir_path}/Carstens_2021/adata_merged_raw.h5ad')

## Chen 2021

In [None]:
count_matrix_chen_21 = pd.read_csv(f'{dir_path}/Chen_2021/GSE166298_KPPF_tumors_all_matrix.csv.gz')

In [None]:
count_matrix_chen_21_col1smaKO = pd.read_csv(f'{dir_path}/Chen_2021/GSE166298_KPPF_Col1smaKO_combined_matrix.csv.gz')

In [None]:
var_names = count_matrix_chen_21['Unnamed: 0'].tolist()
counts = sp.csr_matrix(count_matrix_chen_21.iloc[:,1:].values)
obs_names = count_matrix_chen_21.columns[1:].tolist()


In [None]:
counts = counts.astype(np.int32)

In [None]:
len(var_names), len(obs_names)

In [None]:
chen_2021_adata = ad.AnnData(X = counts.T, var = {"var_names":var_names}, obs = {"obs_names": obs_names})

In [None]:
chen_2021_adata.obs['Sample'] = [f"{cell.split('_')[0]}_{cell.split('_')[2]}" for cell in chen_2021_adata.obs_names]

In [None]:
chen_2021_adata.obs['Dataset'] = 'Chen_2021'

In [None]:
# Same conversion as for the first table, now for the Col1smaKO table.
# var_names / counts / obs_names are reused notebook-level names and are overwritten here.
var_names = count_matrix_chen_21_col1smaKO['Unnamed: 0'].tolist()
counts = sp.csr_matrix(count_matrix_chen_21_col1smaKO.iloc[:,1:].values)
obs_names = count_matrix_chen_21_col1smaKO.columns[1:].tolist()


In [None]:
# Store the values as 32-bit integers.
counts = counts.astype(np.int32)

In [None]:
# (number of genes, number of cells)
len(var_names), len(obs_names)

In [None]:
# Transposed so that rows are cells and columns are genes.
chen_2021_col1smako_adata = ad.AnnData(X = counts.T, var = {"var_names":var_names}, obs = {"obs_names": obs_names})

In [None]:
# Every cell of this table gets the same fixed sample label.
chen_2021_col1smako_adata.obs['Sample'] = 'KPPFcol1smaKO'

In [None]:
chen_2021_col1smako_adata.obs['Dataset'] = 'Chen_2021'

In [None]:
# Stack the two Chen 2021 objects cell-wise; join='outer' keeps the union of their genes.
# NOTE(review): no index_unique is passed, so cell names are kept as-is — confirm that
# the two tables do not share cell names.
chen_2021_merged = ad.concat([chen_2021_adata, chen_2021_col1smako_adata], join = 'outer')

In [None]:
chen_2021_merged.write_h5ad(f'{dir_path}/Chen_2021/adata_merged_raw.h5ad')

In [None]:
# Display the merged object's summary.
chen_2021_merged

## Chen 2024

In [None]:
samples_map = {
    'GSM8480617_1_4':'sn1_4',
    'GSM8480618_2_4':'sn2_4',
    'GSM8480620_4_4':'sn4_4',
    'GSM8480621_5_4':'sn5_4',
    'GSM8480623_7_1':'sn7_1',
    'GSM8480624_8_2':'sn8_2',
    'GSM8480626_10_1':'sn10_1',
    'GSM8480627_11_1':'sn11_1',
}

In [None]:
adatas = []
for sample in samples_map.keys():
    ad_ = sc.read_10x_mtx(f'{dir_path}/Chen_2024/', prefix = f'{sample}.')
    ad_.obs['Sample'] = samples_map[sample]
    adatas.append(ad_)

In [None]:
chen_2024_adata = ad.concat(adatas, label = 'Sample', keys = samples_map.values(), join = 'outer')


In [None]:
# NOTE: median_abs_deviation is imported here but not used in the cells shown.
from scipy.stats import median_abs_deviation
# Mitochondrial genes: names starting with "mt-".
chen_2024_adata.var["mt"] = chen_2024_adata.var_names.str.startswith("mt-")
# Ribosomal genes: names starting with "Rps" or "Rpl".
chen_2024_adata.var["ribo"] = chen_2024_adata.var_names.str.startswith(("Rps", "Rpl"))
# Hemoglobin genes: names starting with "Hb" whose next character is not "(", "P" or ")".
# NOTE(review): the excluded "P" is upper-case while the other patterns in this cell use
# mouse-style capitalisation — confirm whether a lower-case "p" was intended.
chen_2024_adata.var["hb"] = chen_2024_adata.var_names.str.contains("^Hb[^(P)]")
# Genes whose name contains "Malat".
chen_2024_adata.var["malat"] = chen_2024_adata.var_names.str.contains("Malat")

In [None]:
sc.pp.calculate_qc_metrics(chen_2024_adata, inplace = True, qc_vars=["mt", "ribo", "hb", "malat"], log1p=True)

In [None]:
sc.pl.scatter(chen_2024_adata, x = 'total_counts', y = 'n_genes_by_counts', color = 'pct_counts_malat')

In [None]:
chen_2024_adata = chen_2024_adata[(chen_2024_adata.obs.total_counts > 500) & (chen_2024_adata.obs.n_genes_by_counts > 200)].copy()

In [None]:
chen_2024_adata.obs = chen_2024_adata.obs.iloc[:, :1].copy()
chen_2024_adata.var = chen_2024_adata.var.iloc[:, :0].copy()

In [None]:
chen_2024_adata.write_h5ad(f'{dir_path}/Chen_2024/adata_merged_raw.h5ad')

In [None]:
chen_2024_adata

## Cornell 2021

In [None]:
# Two tab-separated count tables, one per file (KPC and KPCN).
count_matrix_kpc = pd.read_csv(f'{dir_path}/Cornell_2021/GSE156210_KPC_Advanced_Count_Matrix.txt.gz', delimiter = '\t')
count_matrix_kpcn = pd.read_csv(f'{dir_path}/Cornell_2021/GSE156210_KPCN_Advanced_Count_Matrix.txt.gz', delimiter = '\t')

In [None]:
# Gene labels come from the DataFrame index and cell labels from its columns.
# NOTE(review): no index_col is passed to read_csv, so this assumes pandas placed the
# gene names in the index on its own — confirm against the file layout.
# NOTE(review): unlike the CSV-based datasets in this notebook, the values are not cast
# to int32 here — confirm they are integer counts.
var_names = count_matrix_kpc.index.tolist()
counts = sp.csr_matrix(count_matrix_kpc.values)
obs_names = count_matrix_kpc.columns.tolist()
# Transposed so that rows are cells and columns are genes.
cornell_2021_kpc_adata = ad.AnnData(X = counts.T, var = {"var_names":var_names}, obs = {"obs_names": obs_names})

In [None]:
# Same conversion for the KPCN table (var_names / counts / obs_names are overwritten).
var_names = count_matrix_kpcn.index.tolist()
counts = sp.csr_matrix(count_matrix_kpcn.values)
obs_names = count_matrix_kpcn.columns.tolist()
cornell_2021_kpcn_adata = ad.AnnData(X = counts.T, var = {"var_names":var_names}, obs = {"obs_names": obs_names})

In [None]:
cornell_2021_kpcn_adata.obs['Sample'] = [f"{cell.split('_')[0]}" for cell in cornell_2021_kpcn_adata.obs_names]
cornell_2021_kpc_adata.obs['Sample'] = [f"{cell.split('_')[0]}" for cell in cornell_2021_kpc_adata.obs_names]

In [None]:
cornell_2021_kpc_adata.obs.Sample.value_counts()

In [None]:
cornell_2021_adata = ad.concat([cornell_2021_kpc_adata,cornell_2021_kpcn_adata], join = 'outer')

In [None]:
cornell_2021_adata.obs['Dataset'] = 'Cornell_2021'

In [None]:
cornell_2021_adata.write_h5ad(f'{dir_path}/Cornell_2021/adata_merged_raw.h5ad')

In [None]:
cornell_2021_adata

## Donahue 2024

In [None]:
samples = [sample[:sample.find('_filtered_feature_bc_matrix.h5')] for sample in os.listdir(f'{dir_path}/Donahue_2024/') if 'filtered_feature' in sample]

In [None]:
samples_names = [sample[sample.find('_')+1:] for sample in samples]

In [None]:
# Read one AnnData per 10x HDF5 file and tag it with its sample label.
adatas = []
for sample, sample_name in zip(samples,samples_names):
    ad_ = sc.read_10x_h5(f'{dir_path}/Donahue_2024/{sample}_filtered_feature_bc_matrix.h5')
    ad_.obs['Sample'] = sample_name
    adatas.append(ad_)

In [None]:
# Before concatenating: cast cell names to str and de-duplicate gene names in each object.
for ad_ in adatas:
    ad_.obs_names = ad_.obs_names.astype(str)
    ad_.var_names_make_unique()


# Stack all samples; label/keys record the sample label in obs['Sample'] and
# index_unique='-' joins each cell name with its key using '-'.
donahue_adata = ad.concat(adatas, join='outer', label='Sample', keys=samples_names, index_unique='-')
donahue_adata.obs['Dataset'] = 'Donahue_2024'


In [None]:
donahue_adata.write_h5ad(f'{dir_path}/Donahue_2024/adata_merged_raw.h5ad')

In [None]:
# Display the merged object's summary.
donahue_adata

## Elyada 2020

In [None]:
# Count table with one row per gene and one column per cell.
count_matrix_elyada = pd.read_csv(f'{dir_path}/Elyada_2020/GSE129455_All_Viable_expression.csv.gz')

In [None]:
# Display the table.
count_matrix_elyada

In [None]:
# Gene labels (first CSV column), sparse values and cell labels (remaining columns).
var_names = count_matrix_elyada['Unnamed: 0'].tolist()
counts = sp.csr_matrix(count_matrix_elyada.iloc[:,1:].values)
obs_names = count_matrix_elyada.columns[1:].tolist()


In [None]:
# Store the values as 32-bit integers.
# NOTE(review): the file name says "expression" — confirm the values are raw integer
# counts, since this cast would truncate fractional values.
counts = counts.astype(np.int32)

In [None]:
# Transposed so that rows are cells and columns are genes.
elyada_adata = ad.AnnData(X = counts.T, var = {"var_names":var_names}, obs = {"obs_names": obs_names})

In [None]:
# Sample label = second '-'-separated field of each cell name.
elyada_adata.obs['Sample'] = [cell.split('-')[1] for cell in elyada_adata.obs_names]
elyada_adata.obs['Dataset'] = 'Elyada_2020'

In [None]:
elyada_adata.write_h5ad(f'{dir_path}/Elyada_2020/adata_merged_raw.h5ad')

In [None]:
# Display the object's summary.
elyada_adata

## Erdem 2024

In [None]:
samples_map = {
    'GSM6532915_C1':'C1',
    'GSM6532916_C2':'C2',
    'GSM6532917_TAK981_1':'TAK981_1',
    'GSM6532918_TAK981_2':'TAK981_2',
}

In [None]:
adatas = []
for sample in samples_map.keys():
    ad_ = sc.read_10x_mtx(f'{dir_path}/Erden_2024/', prefix = f'{sample}_')
    ad_.obs['Sample'] = samples_map[sample]
    adatas.append(ad_)

In [None]:
erden_2024_adata = ad.concat(adatas, label = 'Sample', keys = samples_map.values(), join = 'outer')


In [None]:
erden_2024_adata.write_h5ad(f'{dir_path}/Erden_2024/adata_merged_raw.h5ad')

In [None]:
erden_2024_adata

## Veghin 2024

In [None]:
samples_map = {
    'GSM7869229_C2':'C2',
    'GSM7869230_T2':'T2',
    'GSM7869231_C7':'C7',
    'GSM7869232_T7':'T7',
}

In [None]:
adatas = []
for sample in samples_map.keys():
    ad_ = sc.read_10x_mtx(f'{dir_path}/Veghin_2024/', prefix = f'{sample}_')
    ad_.obs['Sample'] = samples_map[sample]
    adatas.append(ad_)

In [None]:
veghin_2024_adata = ad.concat(adatas, label = 'Sample', keys = samples_map.values(), join = 'outer')


In [None]:
veghin_2024_adata.obs['Dataset'] = 'Veghin_2024'

In [None]:
veghin_2024_adata.write_h5ad(f'{dir_path}/Veghin_2024/adata_merged_raw.h5ad')

In [None]:
veghin_2024_adata

## Singhai 2024

In [None]:
singhai_2024_adata = sc.read_h5ad(f'{dir_path}/Singhal_2024/GSE271300_Mouse_PDAC_QC.h5ad')

In [None]:
singhai_2024_adata.obs['Sample'] = [f"{cell.split('-')[2]}" for cell in singhai_2024_adata.obs_names]
singhai_2024_adata.obs['Dataset'] = 'Singhai_2024'


In [None]:
singhai_2024_adata.write_h5ad(f'{dir_path}/Singhal_2024/adata_merged_raw.h5ad')

In [None]:
singhai_2024_adata

## Rupert 2025

In [None]:
samples_map = {
    'GSM8084319_Joe376':'Joe376',
    'GSM8084320_Joe221':'Joe221',
    'GSM8084321_Joe1':'Joe1',
    'GSM8084322_Joe2':'Joe2',
}

In [None]:
adatas = []
for sample in samples_map.keys():
    ad_ = sc.read_10x_mtx(f'{dir_path}/Rupert_2025/', prefix = f'{sample}_')
    ad_.obs['Sample'] = samples_map[sample]
    adatas.append(ad_)

In [None]:
rupert_2025_adata = ad.concat(adatas, label = 'Sample', keys = samples_map.values(), join = 'outer')


In [None]:
rupert_2025_adata.obs['Dataset'] = 'Rupert_2025'

In [None]:
rupert_2025_adata.write_h5ad(f'{dir_path}/Rupert_2025/adata_merged_raw.h5ad')

In [None]:
rupert_2025_adata

## Hosein 2022

In [None]:
# Entries of the study folder whose name does not contain 'tar'.
dirs = [dir_ for dir_ in os.listdir(f'{dir_path}/Hosein_2022') if 'tar' not in dir_]
# Sample label = text before the first '_filtered'. split() returns the whole name when
# the marker is absent, whereas slicing with str.find's -1 would silently drop the
# last character of such a name.
samples_names = [dir_.split('_filtered')[0] for dir_ in dirs]

In [None]:
# Read one AnnData per sample directory and tag it with its sample label.
adatas = []
for dir_,sample in zip(dirs, samples_names):
    ad_ = sc.read_10x_mtx(f'{dir_path}/Hosein_2022/{dir_}',)
    ad_.obs['Sample'] = sample
    adatas.append(ad_) 

In [None]:
# Stack all samples; label/keys record each input's label in obs['Sample'].
# NOTE(review): no index_unique is passed, so cell names are kept as-is — confirm that
# names repeated across samples are acceptable downstream.
hosein_2022_adata = ad.concat(adatas, label = 'Sample', keys = samples_names, join = 'outer')


In [None]:
hosein_2022_adata.obs['Dataset'] = 'Hosein_2022'

In [None]:
hosein_2022_adata.write_h5ad(f'{dir_path}/Hosein_2022/adata_merged_raw.h5ad')

In [None]:
# Display the merged object's summary.
hosein_2022_adata

## Hosein 2019

In [None]:
# File-name prefix of each 10x matrix -> sample label used in obs['Sample'].
samples_map = {
    'GSM3577882_normal_panc':'normal_panc',
    'GSM3577883_early_KIC':'early_KIC',
    'GSM3577884_late_KIC':'late_KIC',
    'GSM3577885_late_KPfC':'late_KPfC',
    'GSM3577886_late_KPC':'late_KPC',
}

In [None]:
adatas = []
for sample in samples_map.keys():
    # Read the features table without a header row, so columns are labelled 0, 1, 2, ...
    genes = pd.read_csv(f'{dir_path}/Hosein_2019/{sample}_features.tsv.gz', delimiter = '\t',header=None)
    # Set the column labelled 2 to "Gene Expression" for every row (the column is created
    # if the file does not have it).
    # NOTE(review): presumably needed so that read_10x_mtx accepts the file — confirm.
    genes.loc[:,2] = "Gene Expression"
    # CAUTION: this writes back to the same path that was just read, i.e. the downloaded
    # features file is modified in place.
    genes.to_csv(f'{dir_path}/Hosein_2019/{sample}_features.tsv.gz', sep="\t", index=False, header=False, compression="gzip")
    ad_ = sc.read_10x_mtx(f'{dir_path}/Hosein_2019/', prefix = f'{sample}_')
    ad_.obs['Sample'] = samples_map[sample]
    adatas.append(ad_)

In [None]:
# Stack all samples; label/keys record each input's label in obs['Sample'].
hosein_2019_adata = ad.concat(adatas, label = 'Sample', keys = samples_map.values(), join = 'outer')


In [None]:
hosein_2019_adata.obs['Dataset'] = 'Hosein_2019'

In [None]:
hosein_2019_adata.write_h5ad(f'{dir_path}/Hosein_2019/adata_merged_raw.h5ad')

In [None]:
# Display the merged object's summary.
hosein_2019_adata

## Han 2023

In [None]:
import rpy2
# Load the rpy2 IPython extension; the %%R cells below depend on it.
%load_ext rpy2.ipython


In [None]:
%%R
# R packages attached for the cells below, which call Seurat and sceasy functions.
library(reticulate)
library(sceasy)
library(Seurat)
library(Matrix)


In [None]:
%%R
# getwd() is an R function, so this cell has to run through the %%R magic; as a plain
# Python cell it raised NameError.
getwd()


In [None]:
%%R
# load() restores the objects saved in the .RData file into the R session and returns
# their names, so seurat_object holds a character vector, not the Seurat object itself.
seurat_object <- load('/mnt/storage/Daniele/atlases/mouse_public_raw/Han_2023/GSE200903_seurat_anaylsis_combined_clusters_210827.RData')

In [None]:
%%R
# NOTE(review): combined_cluster is never assigned in the visible cells — presumably it
# is one of the objects restored by load(); confirm.
raw_counts <- Seurat::GetAssayData(combined_cluster, slot = "counts")

In [None]:
%%R
DefaultAssay(combined_cluster) <- "RNA"
# Overwrite the RNA assay's data slot with the contents of its counts slot.
combined_cluster[["RNA"]]@data <- (combined_cluster[["RNA"]]@counts)


In [None]:
%%R
# Sanity checks on raw_counts. They read the @x slot directly.
# NOTE(review): assumes raw_counts is a sparse Matrix-package object whose @x slot holds
# the stored entries — confirm.

# 1. Check if all entries are integers
# Compare each stored value in @x with its integer-coerced version
are_integers <- all(raw_counts@x == as.integer(raw_counts@x))
print(paste("Are raw counts integers?", are_integers))

# 2. Range of the stored values
range_raw_counts <- range(raw_counts@x)
print(paste("Range of raw counts (non-zero):", range_raw_counts[1], "to", range_raw_counts[2]))

# 3. Percentage of zero counts
# Entries that are not stored in @x are counted as zeros
total_entries <- prod(dim(raw_counts))
zero_count <- total_entries - length(raw_counts@x)
percentage_zeros <- (zero_count / total_entries) * 100
print(paste("Percentage of zero counts:", round(percentage_zeros, 2), "%"))

# Separator
cat(rep("-", 50), "\n")

In [None]:
%%R
# Convert the Seurat object to AnnData and write it to outFile. The Python cells below
# read this file back and later overwrite it at the same path.
sceasy::convertFormat(combined_cluster, from="seurat", to="anndata",
                       outFile='/mnt/storage/Daniele/atlases/mouse_public_raw/Han_2023/adata_merged_raw.h5ad')

In [None]:
han_2023_adata = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse_public_raw/Han_2023/adata_merged_raw.h5ad')

In [None]:
han_2023_adata.X.A

In [None]:
# The sample label is copied from the existing 'stim' column.
han_2023_adata.obs['Sample'] = han_2023_adata.obs['stim']
han_2023_adata.obs['Dataset'] = 'Han_2023'

In [None]:
# Keep only the two columns set above; every other obs column is dropped.
han_2023_adata.obs = han_2023_adata.obs[['Sample','Dataset']].copy()

In [None]:
# Overwrites the file produced by the sceasy conversion (same path).
han_2023_adata.write_h5ad('/mnt/storage/Daniele/atlases/mouse_public_raw/Han_2023/adata_merged_raw.h5ad')