# Preparing dataframes for pseudobulking analysis
## Using the raw DC data (the object prepared for CytoTRACE --> NR_unint_DC_cytotrace.h5ad), two types of dataframes need to be prepared:
* Pseudobulk counts: for each DC subset (3 dataframes in total), sum the counts by sample to make a dataframe that has genes as rows and integration_id (samples) as columns
* Metadata: a dataframe that has two columns --> integration_id and cancer type

In [1]:
#load packages
import scanpy as sc
import os
import pandas as pd

In [4]:
# Work from the pseudobulk directory; the CSVs written later use relative file names
pseudobulk_dir = '/scratch/user/s4436039/scdata/Pseudobulk'
os.chdir(pseudobulk_dir)
# Echo the working directory as the cell output to confirm the switch
os.getcwd()

'/scratch/user/s4436039/scdata/Pseudobulk'

In [5]:
# Load the DC AnnData object that was prepared for CytoTRACE
cytotrace_h5ad_path = '/scratch/user/s4436039/scdata/CytoTRACE_R/NR_unint_DC_cytotrace.h5ad'
data = sc.read_h5ad(cytotrace_h5ad_path)

In [6]:
# Show the AnnData summary: dimensions plus the available obs and var columns
data

AnnData object with n_obs × n_vars = 30241 × 15074
    obs: 'nCount_RNA', 'nFeature_RNA', 'sample_type', 'cancer_type', 'patient_id', 'sample_id', 'percent.mt', 'site', 'sample_type_major', 'cancer_subtype', 'integration_id', 'ident', 'dataset_id', 'NR_annotations_simple', 'sample_type_major2', 'cancer_with_H', 'cancer_broadest'
    var: 'name'

# Make pseudobulk dataframes: 

In [8]:
# Split the object into one subset per DC type, selected by its simple annotation label
dc_labels = data.obs["NR_annotations_simple"]
data_DC1 = data[dc_labels == "cDC1"]
data_DC2 = data[dc_labels == "cDC2"]
data_mregDC = data[dc_labels == "mregDC"]

In [24]:
# Convert each subset's count matrix (.X) to a dense array via .toarray()
data_DC1_matrix, data_DC2_matrix, data_mregDC_matrix = (
    dc_subset.X.toarray() for dc_subset in (data_DC1, data_DC2, data_mregDC)
)

In [25]:
# Keep the gene names (var_names) and the per-cell annotations (obs) of each subset;
# these become the column labels and the row index of the DataFrames built next
genes_DC1, obs_DC1 = data_DC1.var_names, data_DC1.obs
genes_DC2, obs_DC2 = data_DC2.var_names, data_DC2.obs
genes_mregDC, obs_mregDC = data_mregDC.var_names, data_mregDC.obs

In [26]:
# Build one cells-by-genes DataFrame per DC subset:
# rows are indexed by the obs index (cell barcodes), columns are the gene names
data_DC1_df, data_DC2_df, data_mregDC_df = (
    pd.DataFrame(counts, index=cell_obs.index, columns=gene_names)
    for counts, cell_obs, gene_names in (
        (data_DC1_matrix, obs_DC1, genes_DC1),
        (data_DC2_matrix, obs_DC2, genes_DC2),
        (data_mregDC_matrix, obs_mregDC, genes_mregDC),
    )
)

In [27]:
# Attach each cell's sample label (integration_id) as an extra column,
# aligned on the shared cell index, so the counts can be grouped by sample
for cell_df, cell_obs in (
    (data_DC1_df, obs_DC1),
    (data_DC2_df, obs_DC2),
    (data_mregDC_df, obs_mregDC),
):
    cell_df['integration_id'] = cell_obs['integration_id']

In [29]:
# Preview the DC1 cells-by-genes table; integration_id should appear as the last column
data_DC1_df.head()

Unnamed: 0,FAM87B,LINC00115,FAM41C,SAMD11,NOC2L,KLHL17,PLEKHN1,HES4,ISG15,AGRN,...,MT-ATP8,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB,integration_id
GSE215120_AM1_ACCAGTAAGACTGGGT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,13.0,29.0,5.0,0.0,11.0,5.0,1.0,15.0,GSE215120_Acral_MEL_AM1
GSE215120_AM1_ACGTCAACAAGGACTG-1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,27.0,0.0,...,1.0,52.0,162.0,22.0,0.0,64.0,10.0,1.0,74.0,GSE215120_Acral_MEL_AM1
GSE215120_AM1_ATAACGCGTAGCGCTC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,11.0,35.0,8.0,0.0,14.0,5.0,0.0,20.0,GSE215120_Acral_MEL_AM1
GSE215120_AM1_ATAGACCCATTACCTT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,17.0,38.0,12.0,0.0,20.0,4.0,0.0,25.0,GSE215120_Acral_MEL_AM1
GSE215120_AM1_ATTGGTGAGTCTCGGC-1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,...,0.0,23.0,50.0,13.0,1.0,29.0,6.0,1.0,29.0,GSE215120_Acral_MEL_AM1


In [32]:
# Group by integration_id and sum to pseudobulk, then transpose so that
# genes are rows and samples (integration_id) are columns.
#
# `observed` is passed explicitly: the original calls each triggered a warning
# (see the captured cell output), which is consistent with pandas' FutureWarning
# for grouping on a categorical key without `observed` -- TODO confirm that
# integration_id is categorical. observed=True keeps only samples that actually
# have cells in the subset, so no all-zero columns are created for absent samples
# and the result no longer depends on the pandas default.
pseudobulk_DC1_df = data_DC1_df.groupby('integration_id', observed=True).sum().transpose()
pseudobulk_DC2_df = data_DC2_df.groupby('integration_id', observed=True).sum().transpose()
pseudobulk_mregDC_df = data_mregDC_df.groupby('integration_id', observed=True).sum().transpose()

  pseudobulk_DC1_df = data_DC1_df.groupby('integration_id').sum().transpose()
  pseudobulk_DC2_df = data_DC2_df.groupby('integration_id').sum().transpose()
  pseudobulk_mregDC_df = data_mregDC_df.groupby('integration_id').sum().transpose()


In [22]:
# Preview the DC1 pseudobulk table (genes as rows, one column per integration_id)
pseudobulk_DC1_df.head()

integration_id,GGSE223063_GLIO_1,GGSE223063_GLIO_3,GSE112271_HCC_Pt13,GSE112271_HCC_Pt14,GSE131907_Healthy_N0001,GSE131907_Healthy_N0006,GSE131907_Healthy_N0008,GSE131907_Healthy_N0009,GSE131907_Healthy_N0018,GSE131907_Healthy_N0019,...,PRJCA005422_HGSOC9_PT,PRJCA005422_HGSOC10_AS,PRJNA907381_MEL002_iLN,PRJNA907381_MEL002_uLN,PRJNA907381_MEL009_iLN,PRJNA907381_MEL014_iLN,PRJNA907381_MEL014_uLN,PRJNA907381_MEL018_iLN,PRJNA907381_MEL018_uLN,PRJNA907381_MEL022_iLN
FAM87B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LINC00115,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,0.0
FAM41C,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0
SAMD11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOC2L,0.0,0.0,0.0,2.0,4.0,1.0,0.0,4.0,8.0,6.0,...,1.0,16.0,3.0,8.0,18.0,3.0,0.0,11.0,31.0,4.0


In [33]:
# Preview the DC2 pseudobulk table (genes as rows, one column per integration_id)
pseudobulk_DC2_df.head()

integration_id,GGSE223063_GLIO_1,GGSE223063_GLIO_3,GSE112271_HCC_Pt13,GSE112271_HCC_Pt14,GSE131907_Healthy_N0001,GSE131907_Healthy_N0006,GSE131907_Healthy_N0008,GSE131907_Healthy_N0009,GSE131907_Healthy_N0018,GSE131907_Healthy_N0019,...,PRJCA005422_HGSOC10_AS,PRJCA005422_HGSOC10_PT,PRJNA907381_MEL002_iLN,PRJNA907381_MEL002_uLN,PRJNA907381_MEL009_iLN,PRJNA907381_MEL014_iLN,PRJNA907381_MEL014_uLN,PRJNA907381_MEL018_iLN,PRJNA907381_MEL018_uLN,PRJNA907381_MEL022_iLN
FAM87B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
LINC00115,0.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,2.0,...,0.0,0.0,2.0,13.0,3.0,2.0,3.0,2.0,6.0,2.0
FAM41C,1.0,0.0,0.0,1.0,2.0,2.0,0.0,2.0,2.0,2.0,...,0.0,1.0,0.0,12.0,1.0,0.0,0.0,0.0,5.0,0.0
SAMD11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0
NOC2L,4.0,0.0,3.0,6.0,14.0,4.0,1.0,6.0,19.0,11.0,...,8.0,3.0,15.0,119.0,49.0,25.0,10.0,27.0,87.0,15.0


In [34]:
# Preview the mregDC pseudobulk table (genes as rows, one column per integration_id)
pseudobulk_mregDC_df.head()

integration_id,GGSE223063_GLIO_1,GSE112271_HCC_Pt13,GSE112271_HCC_Pt14,GSE131907_Healthy_N0001,GSE131907_Healthy_N0006,GSE131907_Healthy_N0008,GSE131907_Healthy_N0009,GSE131907_Healthy_N0018,GSE131907_Healthy_N0019,GSE131907_Healthy_N0020,...,PRJCA005422_HGSOC10_AS,PRJCA005422_HGSOC10_PT,PRJNA907381_MEL002_iLN,PRJNA907381_MEL002_uLN,PRJNA907381_MEL009_iLN,PRJNA907381_MEL014_iLN,PRJNA907381_MEL014_uLN,PRJNA907381_MEL018_iLN,PRJNA907381_MEL018_uLN,PRJNA907381_MEL022_iLN
FAM87B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LINC00115,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0
FAM41C,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0,0.0
SAMD11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOC2L,2.0,0.0,3.0,5.0,1.0,2.0,0.0,6.0,3.0,2.0,...,0.0,0.0,1.0,27.0,1.0,2.0,1.0,4.0,23.0,0.0


In [35]:
# Write each pseudobulk table to a CSV in the current working directory,
# keeping the index (gene names) as the first column
for csv_name, pseudobulk_df in (
    ("pseudobulk_DC1_data.csv", pseudobulk_DC1_df),
    ("pseudobulk_DC2_data.csv", pseudobulk_DC2_df),
    ("pseudobulk_mregDC_data.csv", pseudobulk_mregDC_df),
):
    pseudobulk_df.to_csv(csv_name, index=True)

# Make metadata dataframe 

In [36]:
# Show the AnnData summary again to see which obs columns are available for the metadata table
data

AnnData object with n_obs × n_vars = 30241 × 15074
    obs: 'nCount_RNA', 'nFeature_RNA', 'sample_type', 'cancer_type', 'patient_id', 'sample_id', 'percent.mt', 'site', 'sample_type_major', 'cancer_subtype', 'integration_id', 'ident', 'dataset_id', 'NR_annotations_simple', 'sample_type_major2', 'cancer_with_H', 'cancer_broadest'
    var: 'name'

In [39]:
# Build the per-sample metadata: one row per integration_id with its cancer_broadest label.
# .first() takes the first non-null cancer_broadest value within each sample; this assumes
# every cell of a sample carries the same label -- TODO confirm.
# `observed` is passed explicitly: the original call triggered a warning (see the captured
# cell output), consistent with pandas' FutureWarning for grouping on a categorical key
# without `observed`. observed=True keeps only samples that actually occur in data.obs.
metadata = (
    data.obs[['integration_id', 'cancer_broadest']]
    .groupby('integration_id', observed=True)
    .first()
)

  metadata = data.obs[['integration_id', 'cancer_broadest']].groupby('integration_id').first()


In [40]:
# Preview the per-sample metadata (index: integration_id, single column: cancer_broadest)
metadata.head()

Unnamed: 0_level_0,cancer_broadest
integration_id,Unnamed: 1_level_1
GGSE223063_GLIO_1,GBM
GGSE223063_GLIO_3,GBM
GSE112271_HCC_Pt13,HCC
GSE112271_HCC_Pt14,HCC
GSE131907_Healthy_N0001,


# TODO: does this need to be repeated with only the primaries?

In [None]:
# Reset the index to make 'integration_id' a regular column, giving the two-column
# table (integration_id, cancer_broadest).
# The original cell read from `grouped_metadata`, which is never defined in this notebook
# (NameError on a fresh kernel); the grouped table built above is named `metadata`.
grouped_metadata = metadata.reset_index()