## Exporting data from atlas to generate Castro proportions plot (both DC subsets as a proportion of DC, and proportion of cDC1, cDC2 and classical monocytes of total myeloid APC)

In [1]:
#load packages I need
import os
import tools
import scanpy as sc
import pandas as pd
import dandelion as ddl
from tqdm import tqdm
import matplotlib.pyplot as plt
import scanpy.external as sce
from matplotlib.pyplot import rc_context
import seaborn as sns

In [2]:
#set current directory 
os.chdir('/scratch/user/s4436039/scdata/Myeloid_Objects')
os.getcwd()

'/scratch/user/s4436039/scdata/Myeloid_Objects'

In [4]:
# read in data
data = sc.read_h5ad('NRclean_clustered2_DC.h5ad')
data_all = sc.read_h5ad('NRclean_clustered2.h5ad')

In [5]:
data

AnnData object with n_obs × n_vars = 30241 × 1268
    obs: 'nCount_RNA', 'nFeature_RNA', 'sample_type', 'cancer_type', 'patient_id', 'sample_id', 'percent.mt', 'site', 'sample_type_major', 'cancer_subtype', 'integration_id', 'ident', 'dataset_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'keep_or_remove', 'leiden', 'celltype', 'leiden_DC', 'leiden_DC_v2', 'exclude_annots', 'leiden_M', 'Technology', 'cancer_type_broad', 'clean-leiden_0.8', 'clean-subcluster', 'clean-subcluster-v2', 'YS_sig', 'HSC_sig', 'CD5nDC2_up_vDC3', 'DC3_up_vCD5nDC2', 'CD5pDC2_up_vDC3', 'DC3_up_vCD5pDC2', 'DC3_up_vMono', 'Mono_up_vDC3', 'DC1_sig', 'DC2_sig', 'DC3_sig', 'DC_Axl_sig', 'DC2_up_DC3', 'DC3_up_DC2', 'mregDC_sig', 'cDC1_mregDC', 'cDC2_mregDC', 'NR_annotatio

In [6]:
data_all

AnnData object with n_obs × n_vars = 498023 × 1268
    obs: 'nCount_RNA', 'nFeature_RNA', 'sample_type', 'cancer_type', 'patient_id', 'sample_id', 'percent.mt', 'site', 'sample_type_major', 'cancer_subtype', 'integration_id', 'ident', 'dataset_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'keep_or_remove', 'leiden', 'celltype', 'leiden_DC', 'leiden_DC_v2', 'exclude_annots', 'leiden_M', 'Technology', 'cancer_type_broad', 'clean-leiden_0.8', 'clean-subcluster', 'clean-subcluster-v2', 'YS_sig', 'HSC_sig', 'CD5nDC2_up_vDC3', 'DC3_up_vCD5nDC2', 'CD5pDC2_up_vDC3', 'DC3_up_vCD5pDC2', 'DC3_up_vMono', 'Mono_up_vDC3', 'DC1_sig', 'DC2_sig', 'DC3_sig', 'DC_Axl_sig', 'DC2_up_DC3', 'DC3_up_DC2', 'mregDC_sig', 'cDC1_mregDC', 'cDC2_mregDC', 'NR_annotati

In [7]:
# check 
data.obs['NR_annotations_simple'].value_counts()

NR_annotations_simple
cDC2      17391
cDC1       6730
mregDC     6120
Name: count, dtype: int64

In [17]:
# check 
data_all.obs['NR_annotations_simple'].value_counts()

NR_annotations_simple
Tissue resident macrophage           368421
Classical monocytes                   57086
non-classical monocytes               29255
cDC2                                  17391
KI-67+ Tissue resident macrophage     13020
cDC1                                   6730
mregDC                                 6120
Name: count, dtype: int64

## Export proportions of DC subsets of total DC:

In [None]:
# create objects for sample types 
data_primary = data[data.obs["sample_type_major2"] == "primary tumour"]
data_mets = data[data.obs["sample_type_major2"] == "metastatic tumour"]
data_H = data[data.obs["sample_type_major2"] == "healthy"]

### Primary Tumour: 

In [None]:
# Create a DataFrame containing just sample, subtype and annotations 
df = data_primary.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].copy()

# Create a new column for each DC type, indicating whether each cell belongs to that type (True vs False)
df["is_cDC1"] = df["NR_annotations_simple"] == "cDC1"
df["is_cDC2"] = df["NR_annotations_simple"] == "cDC2"
df["is_mregDC"] = df["NR_annotations_simple"] == "mregDC"

# Calculate proportions of each DC type per sample within each cancer subtype
proportion_df = (
    df.groupby(["integration_id", "cancer_broadest"])[["is_cDC1", "is_cDC2", "is_mregDC"]] #groups dataframe by each unique combo of sample_id and cancer type, keeps the booleen columns 
    .mean() #calculating the mean of a booleen column gives the proportion of true values 
    .reset_index()
    .rename(columns={"is_cDC1": "cDC1_proportion", "is_cDC2": "cDC2_proportion", "is_mregDC": "mregDC_proportion"}) #rename eg. is_cDC1 column to cDC1_proportion
)

#now have a dataframe containing proportion of each DC type for every sample and cancer type
proportion_df.head(15)

In [None]:
# Melt the DataFrame to "long" format for seaborn plotting
proportion_df_long = proportion_df.melt(
    id_vars=["integration_id", "cancer_broadest"], 
    value_vars=["cDC1_proportion", "cDC2_proportion", "mregDC_proportion"],
    var_name="Cell_Type",
    value_name="Proportion"
)

proportion_df_long.head(15)

In [None]:
#export proportion_df_long 
proportion_df_long.to_excel("2024-12-18_DC_prop-of-DC_primary.xlsx", index=False)

### Metastatic Tumour:

In [None]:
# Create a DataFrame containing just sample, subtype and annotations 
df = data_mets.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].copy()

# Create a new column for each DC type, indicating whether each cell belongs to that type (True vs False)
df["is_cDC1"] = df["NR_annotations_simple"] == "cDC1"
df["is_cDC2"] = df["NR_annotations_simple"] == "cDC2"
df["is_mregDC"] = df["NR_annotations_simple"] == "mregDC"

# Calculate proportions of each DC type per sample within each cancer subtype
proportion_df = (
    df.groupby(["integration_id", "cancer_broadest"])[["is_cDC1", "is_cDC2", "is_mregDC"]] #groups dataframe by each unique combo of sample_id and cancer type, keeps the booleen columns 
    .mean() #calculating the mean of a booleen column gives the proportion of true values 
    .reset_index()
    .rename(columns={"is_cDC1": "cDC1_proportion", "is_cDC2": "cDC2_proportion", "is_mregDC": "mregDC_proportion"}) #rename eg. is_cDC1 column to cDC1_proportion
)

In [None]:
# Melt the DataFrame to "long" format for seaborn plotting
proportion_df_long = proportion_df.melt(
    id_vars=["integration_id", "cancer_broadest"], 
    value_vars=["cDC1_proportion", "cDC2_proportion", "mregDC_proportion"],
    var_name="Cell_Type",
    value_name="Proportion"
)

In [None]:
#export proportion_df_long 
proportion_df_long.to_excel("2024-12-18_DC_prop-of-DC_mets.xlsx", index=False)

### Healthy:

In [None]:
# Create a DataFrame containing just sample, subtype and annotations 
df = data_H.obs[["integration_id", "site", "NR_annotations_simple"]].copy()

# Create a new column for each DC type, indicating whether each cell belongs to that type (True vs False)
df["is_cDC1"] = df["NR_annotations_simple"] == "cDC1"
df["is_cDC2"] = df["NR_annotations_simple"] == "cDC2"
df["is_mregDC"] = df["NR_annotations_simple"] == "mregDC"

# Calculate proportions of each DC type per sample within each cancer subtype
proportion_df = (
    df.groupby(["integration_id", "site"])[["is_cDC1", "is_cDC2", "is_mregDC"]] #groups dataframe by each unique combo of sample_id and cancer type, keeps the booleen columns 
    .mean() #calculating the mean of a booleen column gives the proportion of true values 
    .reset_index()
    .rename(columns={"is_cDC1": "cDC1_proportion", "is_cDC2": "cDC2_proportion", "is_mregDC": "mregDC_proportion"}) #rename eg. is_cDC1 column to cDC1_proportion
)

#now have a dataframe containing proportion of each DC type for every sample and cancer type
proportion_df.head(15)

In [None]:
# Melt the DataFrame to "long" format for seaborn plotting
proportion_df_long = proportion_df.melt(
    id_vars=["integration_id", "site"], 
    value_vars=["cDC1_proportion", "cDC2_proportion", "mregDC_proportion"],
    var_name="Cell_Type",
    value_name="Proportion"
)

proportion_df_long.head(15)

In [None]:
#export proportion_df_long 
proportion_df_long.to_excel("2024-12-18_DC_prop-of-DC_healthy.xlsx", index=False)

## Export proportions of cDC1, cDC2, and classical monocytes of total Myeloid APC:

In [18]:
# create objects for sample types 
data_primary = data_all[data_all.obs["sample_type_major2"] == "primary tumour"]
data_mets = data_all[data_all.obs["sample_type_major2"] == "metastatic tumour"]
data_H = data_all[data_all.obs["sample_type_major2"] == "healthy"]

### Primary Tumour: 

In [19]:
# Create a DataFrame containing just sample, subtype and annotations 
df = data_primary.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].copy()

# Create a new column for each DC type, indicating whether each cell belongs to that type (True vs False)
df["is_cDC1"] = df["NR_annotations_simple"] == "cDC1"
df["is_cDC2"] = df["NR_annotations_simple"] == "cDC2"
df["is_mono"] = df["NR_annotations_simple"] == "Classical monocytes"

# Calculate proportions of each DC type per sample within each cancer subtype
proportion_df = (
    df.groupby(["integration_id", "cancer_broadest"])[["is_cDC1", "is_cDC2", "is_mono"]] #groups dataframe by each unique combo of sample_id and cancer type, keeps the booleen columns 
    .mean() #calculating the mean of a booleen column gives the proportion of true values 
    .reset_index()
    .rename(columns={"is_cDC1": "cDC1_proportion", "is_cDC2": "cDC2_proportion", "is_mono": "C-mono_proportion"}) #rename eg. is_cDC1 column to cDC1_proportion
)

#now have a dataframe containing proportion of each DC type for every sample and cancer type
proportion_df.head(15)



Unnamed: 0,integration_id,cancer_broadest,cDC1_proportion,cDC2_proportion,C-mono_proportion
0,GGSE223063_GLIO_1,BC,,,
1,GGSE223063_GLIO_1,CRC,,,
2,GGSE223063_GLIO_1,GAC,,,
3,GGSE223063_GLIO_1,GBM,0.00749,0.009814,0.024793
4,GGSE223063_GLIO_1,HCC,,,
5,GGSE223063_GLIO_1,HGSOC,,,
6,GGSE223063_GLIO_1,HNSCC,,,
7,GGSE223063_GLIO_1,MEL,,,
8,GGSE223063_GLIO_1,NPC,,,
9,GGSE223063_GLIO_1,NSCLC,,,


In [20]:
# Melt the DataFrame to "long" format for seaborn plotting
proportion_df_long = proportion_df.melt(
    id_vars=["integration_id", "cancer_broadest"], 
    value_vars=["cDC1_proportion", "cDC2_proportion", "C-mono_proportion"],
    var_name="Cell_Type",
    value_name="Proportion"
)

proportion_df_long.head(15)

Unnamed: 0,integration_id,cancer_broadest,Cell_Type,Proportion
0,GGSE223063_GLIO_1,BC,cDC1_proportion,
1,GGSE223063_GLIO_1,CRC,cDC1_proportion,
2,GGSE223063_GLIO_1,GAC,cDC1_proportion,
3,GGSE223063_GLIO_1,GBM,cDC1_proportion,0.00749
4,GGSE223063_GLIO_1,HCC,cDC1_proportion,
5,GGSE223063_GLIO_1,HGSOC,cDC1_proportion,
6,GGSE223063_GLIO_1,HNSCC,cDC1_proportion,
7,GGSE223063_GLIO_1,MEL,cDC1_proportion,
8,GGSE223063_GLIO_1,NPC,cDC1_proportion,
9,GGSE223063_GLIO_1,NSCLC,cDC1_proportion,


In [21]:
#export proportion_df_long 
proportion_df_long.to_excel("2025-06-04_DCM_proportions_primary.xlsx", index=False)

### Metastatic Tumour:

In [22]:
# Create a DataFrame containing just sample, subtype and annotations 
df = data_primary.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].copy()

# Create a new column for each DC type, indicating whether each cell belongs to that type (True vs False)
df["is_cDC1"] = df["NR_annotations_simple"] == "cDC1"
df["is_cDC2"] = df["NR_annotations_simple"] == "cDC2"
df["is_mono"] = df["NR_annotations_simple"] == "Classical monocytes"

# Calculate proportions of each DC type per sample within each cancer subtype
proportion_df = (
    df.groupby(["integration_id", "cancer_broadest"])[["is_cDC1", "is_cDC2", "is_mono"]] #groups dataframe by each unique combo of sample_id and cancer type, keeps the booleen columns 
    .mean() #calculating the mean of a booleen column gives the proportion of true values 
    .reset_index()
    .rename(columns={"is_cDC1": "cDC1_proportion", "is_cDC2": "cDC2_proportion", "is_mono": "C-mono_proportion"}) #rename eg. is_cDC1 column to cDC1_proportion
)

#now have a dataframe containing proportion of each DC type for every sample and cancer type
proportion_df.head(15)



Unnamed: 0,integration_id,cancer_broadest,cDC1_proportion,cDC2_proportion,C-mono_proportion
0,GGSE223063_GLIO_1,BC,,,
1,GGSE223063_GLIO_1,CRC,,,
2,GGSE223063_GLIO_1,GAC,,,
3,GGSE223063_GLIO_1,GBM,0.00749,0.009814,0.024793
4,GGSE223063_GLIO_1,HCC,,,
5,GGSE223063_GLIO_1,HGSOC,,,
6,GGSE223063_GLIO_1,HNSCC,,,
7,GGSE223063_GLIO_1,MEL,,,
8,GGSE223063_GLIO_1,NPC,,,
9,GGSE223063_GLIO_1,NSCLC,,,


In [23]:
# Melt the DataFrame to "long" format for seaborn plotting
proportion_df_long = proportion_df.melt(
    id_vars=["integration_id", "cancer_broadest"], 
    value_vars=["cDC1_proportion", "cDC2_proportion", "C-mono_proportion"],
    var_name="Cell_Type",
    value_name="Proportion"
)

In [24]:
#export proportion_df_long 
proportion_df_long.to_excel("2025-06-04_DCM_proportions_mets.xlsx", index=False)

### Healthy:

In [25]:
# Create a DataFrame containing just sample, subtype and annotations 
df = data_H.obs[["integration_id", "site", "NR_annotations_simple"]].copy()

# Create a new column for each DC type, indicating whether each cell belongs to that type (True vs False)
df["is_cDC1"] = df["NR_annotations_simple"] == "cDC1"
df["is_cDC2"] = df["NR_annotations_simple"] == "cDC2"
df["is_mono"] = df["NR_annotations_simple"] == "Classical monocytes"

# Calculate proportions of each DC type per sample within each cancer subtype
proportion_df = (
    df.groupby(["integration_id", "site"])[["is_cDC1", "is_cDC2", "is_mono"]] #groups dataframe by each unique combo of sample_id and cancer type, keeps the booleen columns 
    .mean() #calculating the mean of a booleen column gives the proportion of true values 
    .reset_index()
    .rename(columns={"is_cDC1": "cDC1_proportion", "is_cDC2": "cDC2_proportion", "is_mono": "C-mono_proportion"}) #rename eg. is_cDC1 column to cDC1_proportion
)

#now have a dataframe containing proportion of each DC type for every sample and cancer type
proportion_df.head(15)



Unnamed: 0,integration_id,site,cDC1_proportion,cDC2_proportion,C-mono_proportion
0,GSE131907_Healthy_N0001,breast,,,
1,GSE131907_Healthy_N0001,colon,,,
2,GSE131907_Healthy_N0001,liver,,,
3,GSE131907_Healthy_N0001,lung,0.008306,0.041528,0.169435
4,GSE131907_Healthy_N0001,lymph node,,,
5,GSE131907_Healthy_N0001,ovary,,,
6,GSE131907_Healthy_N0006,breast,,,
7,GSE131907_Healthy_N0006,colon,,,
8,GSE131907_Healthy_N0006,liver,,,
9,GSE131907_Healthy_N0006,lung,0.004866,0.017032,0.048662


In [26]:
# Melt the DataFrame to "long" format for seaborn plotting
proportion_df_long = proportion_df.melt(
    id_vars=["integration_id", "site"], 
    value_vars=["cDC1_proportion", "cDC2_proportion", "C-mono_proportion"],
    var_name="Cell_Type",
    value_name="Proportion"
)

proportion_df_long.head(15)

Unnamed: 0,integration_id,site,Cell_Type,Proportion
0,GSE131907_Healthy_N0001,breast,cDC1_proportion,
1,GSE131907_Healthy_N0001,colon,cDC1_proportion,
2,GSE131907_Healthy_N0001,liver,cDC1_proportion,
3,GSE131907_Healthy_N0001,lung,cDC1_proportion,0.008306
4,GSE131907_Healthy_N0001,lymph node,cDC1_proportion,
5,GSE131907_Healthy_N0001,ovary,cDC1_proportion,
6,GSE131907_Healthy_N0006,breast,cDC1_proportion,
7,GSE131907_Healthy_N0006,colon,cDC1_proportion,
8,GSE131907_Healthy_N0006,liver,cDC1_proportion,
9,GSE131907_Healthy_N0006,lung,cDC1_proportion,0.004866


In [27]:
#export proportion_df_long 
proportion_df_long.to_excel("2025-06-04_DCM_proportions_healthy.xlsx", index=False)