# **IMPORT LIBRARIES**

In [3]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
import scrublet as scr
import scipy.io
import matplotlib.pyplot as plt
import os

# **LOAD AND MERGE DATA**

In [None]:
# Input locations: one 10x matrix directory per sample (edit to match your setup).

# Samples for the 'Normal' condition
normal_paths = [
    "/samples/normal/ADJ1",
    "/samples/normal/ADJ2",
]

# Samples for the 'PDAC' (disease) condition
disease_paths = [
    "/samples/pdac/pdac1",
    "/samples/pdac/pdac2",
]

In [None]:
def read_and_label_adata(paths, prefix, condition):
    """Read several 10x matrix directories and merge them into one AnnData.

    Samples are numbered by their 1-based position in ``paths``. The sample
    label ``f"{prefix}{n}"`` is written to ``obs['orig.ident']`` and is also
    prepended to every barcode (``f"{prefix}{n}_{barcode}"``) so observation
    names stay distinct after merging.

    Parameters
    ----------
    paths
        Iterable of 10x matrix directories. A single path given as a plain
        string is treated as a one-element list.
    prefix
        Text placed in front of the sample number to form the sample label.
    condition
        Value stored in ``obs['condition']`` for every cell read here.

    Returns
    -------
    The outer-join concatenation (``ad.concat(..., join="outer")``) of all
    samples.

    Raises
    ------
    ValueError
        If ``paths`` contains no entries.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(paths, str):
        paths = [paths]
    paths = list(paths)

    # Fail early with a clear message instead of an opaque concat error.
    if not paths:
        raise ValueError(
            f"No input paths given for prefix {prefix!r} (condition {condition!r})"
        )

    samples = []
    for sample_number, path in enumerate(paths, start=1):
        sample_id = f"{prefix}{sample_number}"

        sample = sc.read_10x_mtx(path, var_names='gene_symbols', cache=True)
        sample.var_names_make_unique()
        sample.obs_names = [f"{sample_id}_{barcode}" for barcode in sample.obs_names]

        sample.obs['orig.ident'] = sample_id
        sample.obs['condition'] = condition

        samples.append(sample)

    return ad.concat(samples, join="outer")

# Build one merged object per condition, then stack both into the working dataset
normal_merged = read_and_label_adata(normal_paths, prefix='N', condition='Normal')
disease_merged = read_and_label_adata(disease_paths, prefix='PDAC', condition='PDAC')

per_condition = [normal_merged, disease_merged]
adata = ad.concat(per_condition, join="outer")
adata

In [None]:
# Number of observations (cells) contributed by each sample
adata.obs['orig.ident'].value_counts()

In [None]:
# Number of observations (cells) per condition
adata.obs['condition'].value_counts()

In [None]:
# Dimensions of the merged object: (observations, variables)
adata.shape

In [None]:
# Peek at the first five rows of the per-observation metadata
adata.obs[:5]

# **QC METRICS**

In [None]:
# Flag genes whose symbol starts with "MT-" and compute QC metrics,
# including the metrics for that flagged gene set
is_mito = adata.var_names.str.startswith('MT-')
adata.var['mt'] = is_mito
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)

# Distribution of the QC metrics, split by condition
qc_columns = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_columns, groupby='condition', multi_panel=True)

# **FILTER CELLS**

In [None]:
# Keep only cells that pass every QC threshold:
#   200 < detected genes < 6000, total counts > 2000, pct_counts_mt < 10
keep_cell = (
    adata.obs['n_genes_by_counts'].gt(200)
    & adata.obs['n_genes_by_counts'].lt(6000)
    & adata.obs['total_counts'].gt(2000)
    & adata.obs['pct_counts_mt'].lt(10)
)
adata = adata[keep_cell, :].copy()

# Re-plot the same QC metrics to inspect the effect of the filter
qc_columns = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_columns, groupby='condition', multi_panel=True)

In [None]:
# Display a summary of the object after QC filtering
adata

# **Doublet Detection**

In [None]:
# Columns for the Scrublet results. The defaults (NaN score, not a doublet)
# remain in place for any cell that does not receive a call.
adata.obs['doublet_score'] = np.nan
adata.obs['predicted_doublet'] = False

# Run Scrublet separately on each sample
for sample in adata.obs['orig.ident'].unique():
    print(f"Processing sample {sample} for doublets...")

    # Boolean row mask for the current sample; avoids copying a whole AnnData
    in_sample = (adata.obs['orig.ident'] == sample).to_numpy()

    # Scrublet accepts scipy sparse input, so the counts are passed through
    # as stored instead of being densified (which can exhaust memory).
    counts_matrix = adata.X[in_sample]

    # initialize Scrublet and compute scores / calls
    scrub = scr.Scrublet(counts_matrix)
    doublet_scores, predicted_doublets = scrub.scrub_doublets()

    # store scores back into the full object
    adata.obs.loc[in_sample, 'doublet_score'] = doublet_scores

    # Scrublet returns None for the calls when it cannot pick a score
    # threshold automatically. Writing None would break the boolean column,
    # so keep the default (False) and flag the sample for manual review.
    if predicted_doublets is None:
        print(
            f"Warning: no automatic doublet threshold for sample {sample}; "
            "no cells flagged. Inspect 'doublet_score' and set a threshold manually."
        )
        continue

    adata.obs.loc[in_sample, 'predicted_doublet'] = predicted_doublets

In [None]:
# Drop every cell flagged as a doublet and report how many cells are left
is_singlet = ~adata.obs['predicted_doublet']
adata = adata[is_singlet, :].copy()
print(f"Remaining cells after doublet removal: {adata.n_obs}")

In [None]:
# Preserve the current (un-normalized) counts before X is transformed in the
# following steps: once via adata.raw and once as the 'counts' layer.
adata.raw = adata.copy()
adata.layers['counts'] = adata.X.copy()

# Sanity check: dimensions of the stored raw matrix and the layers now present
print(adata.raw.X.shape)
print(adata.layers)

In [None]:
# Write the QC-filtered, doublet-removed object to disk
# (path is relative to the current working directory)
adata.write('filtered_data.h5ad')

# **NORMALIZATION AND HVFs SELECTION**

In [None]:
# Normalize each cell to a total of 1e4 counts, then log1p-transform
counts_per_cell_target = 1e4
sc.pp.normalize_total(adata, target_sum=counts_per_cell_target)
sc.pp.log1p(adata)

# Select the 2000 most variable genes ('seurat' flavor) and plot the result
sc.pp.highly_variable_genes(adata, flavor='seurat', n_top_genes=2000)
sc.pl.highly_variable_genes(adata)

In [None]:
# Keep the log-normalized values in their own layer ('data'),
# since X is overwritten by scaling in a later step
adata.layers['data'] = adata.X.copy()
adata.layers

In [None]:
# Optionally save the normalized data for later use
#adata.write("adata_normalized.h5ad")

# **SCALING**

In [None]:
# Alternative (disabled): subset to the highly variable features before scaling.
# Not used here, so that all genes remain available for cell type annotation.
#adata_scale = adata[:, adata.var.highly_variable].copy()

# Scale all genes in place, with zero-centering enabled
sc.pp.scale(adata, zero_center=True)

In [None]:
# Keep a copy of the scaled matrix in its own layer ('scale.data')
# and display the layers now stored on the object
adata.layers['scale.data'] = adata.X.copy()
adata.layers

# **Dimensionality Reduction**
## **(PCA, NEIGHBORS, CLUSTERING, UMAP AND t-SNE)**

In [None]:
# PCA restricted to the genes flagged in var['highly_variable']
sc.tl.pca(adata, mask_var='highly_variable', svd_solver='arpack')

# PC scatter colored by condition, then the variance ratio of the first
# 50 components (log scale)
sc.pl.pca(adata, color='condition')
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50)

In [None]:
# Number of principal components used for the neighbor graph and for t-SNE
n_pcs_used = 10

# Neighbor graph followed by Leiden clustering
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=n_pcs_used)
sc.tl.leiden(adata, resolution=0.6, flavor="igraph", n_iterations=2, directed=False)

# Compute both 2-D embeddings
sc.tl.umap(adata)
sc.tl.tsne(adata, n_pcs=n_pcs_used)

# Plot each embedding colored by cluster (UMAP first, then t-SNE)
for plot_embedding in (sc.pl.umap, sc.pl.tsne):
    plot_embedding(adata, color=["leiden"])

In [None]:
# Write the scaled, clustered object (including the computed embeddings) to disk
adata.write("adata_scaled_and_clustered.h5ad")

# **Re-assess quality control and cell filtering**

In [None]:
# UMAP panels: clusters next to the Scrublet calls and scores
doublet_panels = ["leiden", "predicted_doublet", "doublet_score"]
sc.pl.umap(adata, color=doublet_panels, wspace=0.2, size=3, frameon=False)

In [None]:
# UMAP panels: clusters next to total counts, mitochondrial percentage and
# detected genes, laid out two per row
qc_panels = ["leiden", "log1p_total_counts", "pct_counts_mt", "log1p_n_genes_by_counts"]
sc.pl.umap(adata, color=qc_panels, wspace=0.2, ncols=2, frameon=False)

### **Now perform cell type annotation using either scANVI or CellTypist by navigating to the respective annotation pipeline**