# **Differential Gene Expression (PseudoBulk DGE)**

In [None]:
# Tag every cell with the pseudobulk sample it belongs to.
# The ID has the form "<condition>_<orig.ident>".
condition_str = adata.obs["condition"].astype(str)
replicate_str = adata.obs["orig.ident"].astype(str)
adata.obs["sample_id"] = condition_str + "_" + replicate_str

# Preview the first rows to confirm the IDs were built as expected
adata.obs[["condition", "orig.ident", "sample_id"]].head()

In [None]:
# Make sure raw counts are in .X of adata: the pseudobulk aggregation below
# sums .X as-is.
# Assign a copy so that .X and layers['counts'] do not share one matrix
# object; otherwise any later in-place change to .X would also overwrite the
# stored raw counts.
adata.X = adata.layers['counts'].copy()

In [None]:
# -----------------------------
# Pseudobulk aggregation function
# -----------------------------
def pseudobulk_for_deseq2(adata, groupby, sample_col="sample_id", replicate_col="orig.ident"):
    """
    Aggregate counts per sample and group for DESeq2 input.

    Sums ``adata.X`` over the cells of every (sample, group) combination that
    occurs in ``adata.obs``. ``adata.X`` is summed as-is, so it must already
    hold the values to aggregate (raw counts for DESeq2).

    Parameters:
    - adata: AnnData object; ``obs``, ``var_names``, ``X`` and row subsetting
      with a boolean mask are used.
    - groupby: name of the ``obs`` column holding the group label to compare.
    - sample_col: name of the ``obs`` column identifying the sample.
    - replicate_col: name of the ``obs`` column identifying the original
      replicate; the value of the first cell of each sample is reported.

    Cells whose sample or group label is missing (NaN/None) are left out:
    a missing label never compares equal to itself, so it would otherwise
    produce an empty selection and an all-zero pseudobulk column.

    Returns:
    - count_df: genes x pseudobulk samples; columns are named
      "<group>_<sample>".
    - meta_df: sample metadata with columns: sample_id, condition, orig_sample.
      ``condition`` holds the ``groupby`` label, whatever column that is.
    """
    obs = adata.obs
    pb_dict = {}
    meta_list = []

    for sample in obs[sample_col].dropna().unique():
        in_sample = obs[sample_col] == sample
        # The replicate is a property of the sample, so look it up once here
        # instead of once per group.
        replicate = obs.loc[in_sample, replicate_col].iloc[0]

        for grp in obs.loc[in_sample, groupby].dropna().unique():
            in_group = in_sample & (obs[groupby] == grp)
            pb_name = f"{grp}_{sample}"  # unique pseudobulk sample name

            # Sum counts across cells; np.asarray + flatten turns the result
            # (which may be 2-D for sparse matrices) into a 1-D vector.
            pb_dict[pb_name] = np.asarray(adata[in_group].X.sum(axis=0)).flatten()

            meta_list.append({
                "sample_id": pb_name,      # matches column in counts matrix
                "condition": grp,          # DESeq2 group variable
                "orig_sample": replicate,  # original replicate
            })

    count_df = pd.DataFrame(pb_dict, index=adata.var_names)
    meta_df = pd.DataFrame(meta_list)
    return count_df, meta_df

# **a. Same cell type across different conditions**

In [None]:
# Scenario (a): keep the cells of a single cell type, then build one
# pseudobulk profile per condition within every sample.
cell_type = "T cells"
is_cell_type = adata.obs["celltypist_label"] == cell_type
adata_sc1 = adata[is_cell_type].copy()

count_df_sc1, meta_df_sc1 = pseudobulk_for_deseq2(adata_sc1, groupby="condition", sample_col="sample_id")

# Write the count matrix and its sample metadata as CSVs for DESeq2
count_df_sc1.to_csv("pseudobulk_counts_T-cells-pdac_vs_normal.csv")
meta_df_sc1.to_csv("pseudobulk_metadata_T-cells-pdac_vs_normal.csv", index=False)

# **b. One cell type vs all other cell types in the same condition**

In [None]:
# Scenario (b): keep the cells of a single condition, then compare one cell
# type against every other cell type within each sample.
condition_of_interest = "PDAC"
in_condition = adata.obs["condition"] == condition_of_interest
adata_sc2 = adata[in_condition].copy()

# Binary label: "T cells" vs "rest"
is_tcell = adata_sc2.obs["celltypist_label"] == "T cells"
adata_sc2.obs["tcell_vs_others"] = np.where(is_tcell, "T cells", "rest")

count_df_sc2, meta_df_sc2 = pseudobulk_for_deseq2(adata_sc2, groupby="tcell_vs_others", sample_col="sample_id")

# Write the count matrix and its sample metadata as CSVs for DESeq2
count_df_sc2.to_csv("pseudobulk_counts_T-cells_vs_all_other_pdac.csv")
meta_df_sc2.to_csv("pseudobulk_metadata_T-cells_vs_all_other_pdac.csv", index=False)

# **c. One cell type vs all others regardless of condition**

In [None]:
# Scenario (c): use every cell, regardless of condition, and compare one cell
# type against all the others. Work on a copy so the extra label column is
# not added to the original object.
adata_sc3 = adata.copy()

# Global binary label: "T cells" vs "rest"
is_tcell_global = adata_sc3.obs["celltypist_label"] == "T cells"
adata_sc3.obs["tcell_global"] = np.where(is_tcell_global, "T cells", "rest")

count_df_sc3, meta_df_sc3 = pseudobulk_for_deseq2(adata_sc3, groupby="tcell_global", sample_col="sample_id")

# Write the count matrix and its sample metadata as CSVs for DESeq2
count_df_sc3.to_csv("pseudobulk_counts_T-cells_vs_all_other_no-condition.csv")
meta_df_sc3.to_csv("pseudobulk_metadata_T-cells_vs_all_other_no-condition.csv", index=False)

### **Now use R script (DESeq2.R) to analyze the Pseudobulk files**