# Data Pipeline

This notebook consists of three sections. In section [Data Exploration](#explore), we read the data and provide an overview of each of the modalities. In section [Comparisons](#compare), we investigate the features that are shared across different modalities, and finally, in section [Pseudo Bulking](#aggregate), we aggregate the single cell data modalities to obtain pseudobulks. 

Section [Comparisons](#compare) and section [Pseudo Bulking](#aggregate) are independent of each other. However, running the first cell of each subsection of section [Data Exploration](#explore), in which the data is loaded and saved into a variable, is necessary for running different code snippets of this notebook.

In [3]:
import os
import scanpy as sp
import anndata as ad
import pandas as pd 
import numpy as np
import pickle

In [6]:
"""
The data folder's structure is as follows

|data
    |input
        |full
        |raw
        |complementary
    |output
        |proteins
        |shared
        |stats
            |single
            |cross
        |pb_L
        |pb
        |preprocessed
"""

# Path to the inputs
cwd = os.getcwd()
input = os.path.abspath(os.path.join(cwd, "../data/input"))
print("in", input)


# Path to outputs 
output = os.path.abspath(os.path.join(cwd, "../data/output"))
print("out", output)

in /Users/shakiba/Desktop/data/data/input
out /Users/shakiba/Desktop/data/data/output


In [7]:
# Inputs
full = os.path.join(input, "full")
raw = os.path.join(input, "raw")
complementary = os.path.join(input, "complementary")

# Outputs
proteins = os.path.join(output, "proteins")
shared = os.path.join(output, "shared")
stats_single = os.path.join(output, "stats/single")
stats_cross = os.path.join(output, "stats/cross")
pb = os.path.join(output, "pb")   
pb_L = os.path.join(output, "pb_L") # Pseudobulks according to Leonardo 
preprocessed = os.path.join(output, "preprocessed")  

In [8]:
# Read complementary data
all_proteins = pd.read_csv(os.path.join(complementary, "all_proteins_V1.txt"), sep= "	")
COMBAT_CLINVAR_for_processed = pd.read_csv(os.path.join(complementary, "COMBAT_CLINVAR_for_processed.txt"), sep= "	")

# Module data for subsetting bulks 
bulk_genes_membership = pd.read_csv(os.path.join(complementary, "bulk_genes_membership.tsv"), sep= "\t")
membership_all_celltypes = pd.read_csv(os.path.join(complementary, "membership_all_celltypes.tsv"), sep= "\t")

# Label harmonization files
cite_cells = pd.read_csv(os.path.join(complementary, "cite_cells.csv"))
cytof_cells_harmonised = pd.read_csv(os.path.join(complementary, "cytof_cells_harmonised.csv"), sep=';')
cytof_cells = pd.read_csv(os.path.join(complementary, "cytof_cells.csv"))


In [12]:
all_proteins.to_csv("prot.tsv")

## Harmonization

The labels between cytof and citeseq are already harmonized. 
The only thing to notice is that a subset of cell types identified with one assay are not identified with the other one and vice versa.

In [54]:
def get_map_raw(l1, l2):
    dic = {}
    for label in l1:
        if label in l2:
            dic.update([(label, label)])
        else:
            dic.update([(label, None)])

    for label in l2:
        if label not in dic:
            dic.update([(label, None)])
    
    return dic

In [None]:
cell_type_harm_map = get_map_raw(cite_cells.Annotation_cell_type.unique(), cytof_cells_harmonised.harmonized_cell_type.unique())
cell_type_harm_map

In [None]:
major_subset_harm_map = get_map_raw(cite_cells.Annotation_major_subset.unique(), cytof_cells_harmonised.harmonized_major_subset.unique())
major_subset_harm_map

## Data Exploration <a id='explore'></a>

This snippet of the notebook contains code for 1) seperating ADT and scRNA, 2) extracting bulkRNA protein names, 3) generating an overview of the data contained in the modalities

### ADT and SC RNA

In [370]:
if not os.path.exists(os.path.join(preprocessed,"adt_pp.h5ad")):             
    cite = ad.read_h5ad(os.path.join(raw,"COMBAT-CITESeq-DATA.h5ad")) 
    
    # patient names are stored in obs.scRNASeq_sample_ID 
    cite.obs["original_obs_names"] = cite.obs_names
    cite.obs_names = cite.obs.scRNASeq_sample_ID

    adt_names = []
    scRNA_names = []

    # Determine ADT feature names and RNA feature names
    for name in cite.var_names: 
        if name.startswith("AB_"): # ADT data start with AB_ 
            adt_names.append(name)
        else:
            scRNA_names.append(name)

    # Extract ADT and scRNA 
    adt = cite[:, adt_names]
    scRNA = cite[:, scRNA_names]

    # Remove the prefix AB_ from ADT features 
    adt_names = [name.split("AB_")[1] for name in adt_names]
    adt.var_names = adt_names # Rename ADT features that correspond to its columns 

    # Assign domains to each modality
    adt.obs['Domain'] = 'cite'
    adt.obs['Domain_major'] = 'adt'

    scRNA.obs['Domain'] = 'cite'
    scRNA.obs['Domain_major'] = 'rna'

    adt.layers["raw"] = cite.layers["raw"][:, -len(adt.var_names):]
    scRNA.layers["raw"] = cite.layers["raw"][:, : -len(adt.var_names)]

    # Unifying the patient and disease names across all modalities
    adt.obs.rename(columns={'scRNASeq_sample_ID': 'PID', 'Source': 'Diseases'}, inplace=True) 
    scRNA.obs.rename(columns={'scRNASeq_sample_ID': 'PID', 'Source': 'Diseases'}, inplace=True)


    print("ADT: ", adt, "\n\n")
    print("single cell RNA: ", scRNA, "\n\n")

    adt.write_h5ad(os.path.join(preprocessed,"adt_pp.h5ad"))
    scRNA.write_h5ad(os.path.join(preprocessed,"scRNA_pp.h5ad"))

else:
    adt = ad.read_h5ad(os.path.join(preprocessed,"adt_pp.h5ad"))
    print("ADT: ", adt, "\n\n")
    
    scRNA = ad.read_h5ad(os.path.join(preprocessed,"scRNA_pp.h5ad")) 
    print("single cell RNA: ", scRNA, "\n\n")


  utils.warn_names_duplicates("obs")


ADT:  AnnData object with n_obs × n_vars = 836148 × 192
    obs: 'Annotation_cluster_id', 'Annotation_cluster_name', 'Annotation_minor_subset', 'Annotation_major_subset', 'Annotation_cell_type', 'GEX_region', 'QC_ngenes', 'QC_total_UMI', 'QC_pct_mitochondrial', 'QC_scrub_doublet_scores', 'TCR_chain_composition', 'TCR_clone_ID', 'TCR_clone_count', 'TCR_clone_proportion', 'TCR_contains_unproductive', 'TCR_doublet', 'TCR_chain_TRA', 'TCR_v_gene_TRA', 'TCR_d_gene_TRA', 'TCR_j_gene_TRA', 'TCR_c_gene_TRA', 'TCR_productive_TRA', 'TCR_cdr3_TRA', 'TCR_umis_TRA', 'TCR_chain_TRA2', 'TCR_v_gene_TRA2', 'TCR_d_gene_TRA2', 'TCR_j_gene_TRA2', 'TCR_c_gene_TRA2', 'TCR_productive_TRA2', 'TCR_cdr3_TRA2', 'TCR_umis_TRA2', 'TCR_chain_TRB', 'TCR_v_gene_TRB', 'TCR_d_gene_TRB', 'TCR_j_gene_TRB', 'TCR_c_gene_TRB', 'TCR_productive_TRB', 'TCR_chain_TRB2', 'TCR_v_gene_TRB2', 'TCR_d_gene_TRB2', 'TCR_j_gene_TRB2', 'TCR_c_gene_TRB2', 'TCR_productive_TRB2', 'TCR_cdr3_TRB2', 'TCR_umis_TRB2', 'BCR_umis_HC', 'BCR_contig_

  utils.warn_names_duplicates("obs")


#### ADT

In [73]:
summary_ADT = {"feature" : adt.var_names,
               "min" : adt.X.toarray().min(axis = 0),
               "max" : adt.X.toarray().max(axis = 0),
               "mean" : adt.X.toarray().mean(axis = 0),
               "var" : adt.X.toarray().var(axis = 0)}

df_adt= pd.DataFrame(summary_ADT)

print("[", adt.X.toarray().min(), ",", adt.X.toarray().max(), "]") # Overall range of proteins

[ -67.67877 , 460.98032 ]


In [74]:
# Content overview
adt.to_df().head()

Unnamed: 0,CD80,CD86,CD274_B7_H1_PD_L1,CD273_B7_DC_PD_L2,CD275_B7_H2_ICOSL,humanCD11b,CD252_OX40L,CD137L_4_1BBLigand,CD155_PVR,CD112_Nectin_2,...,CD101_BB27,CD360_IL_21R,CD88_C5aR,HLA_F,NLRP2,Podocalyxin,CD224,c_Met,CD258_LIGHT,DR3_TRAMP
AAACCTGAGAAAGTGG-1-gPlexA1,1.98787,1.921781,2.613414,0.456505,1.482558,1.789405,1.206598,2.821688,1.885517,1.059094,...,0.840967,2.21781,1.220086,0.12453,1.719992,1.028112,1.729305,1.463706,1.785078,1.601881
AAACCTGAGCGGATCA-1-gPlexA1,-0.539351,0.442409,2.392834,1.047547,0.131874,1.147668,0.541517,1.990631,2.284331,1.762416,...,3.184458,2.73134,1.006678,-0.142191,1.20244,1.168217,3.29596,0.679976,2.942873,2.06682
AAACCTGAGGACATTA-1-gPlexA1,0.993282,1.441381,0.310766,-0.556409,1.708025,0.19521,0.37591,2.114272,-0.618993,-0.049037,...,0.937668,0.563752,0.735085,0.130218,-0.380042,0.24376,0.863445,0.484824,0.713824,1.31677
AAACCTGAGGCGACAT-1-gPlexA1,0.838407,2.64194,0.344012,0.189955,1.021477,2.683172,1.320957,0.759884,1.35568,3.498369,...,0.947138,-0.744993,1.814052,1.67303,1.307825,1.210711,2.648582,0.606611,1.100374,-0.663722
AAACCTGAGGGAACGG-1-gPlexA1,1.172756,14.549344,-0.884014,1.349209,1.489393,4.734301,1.522198,0.871919,5.137852,4.039271,...,1.49347,2.687017,5.53505,1.429137,0.657836,2.042859,6.36786,1.316507,1.871877,1.279506


In [78]:
with open (os.path.join(proteins,'adt_feature_names.txt'), 'w') as file:  
    for name in adt_names:
        file.write(name + "\n") 
        
df_adt.to_excel(os.path.join(stats_single,"summary_ADT.xlsx")) 

#### scRNA

In [79]:
summary_scRNA = {"feature" : scRNA_names,
           "min" : scRNA.X.min(axis = 0).toarray()[0],
           "max" : scRNA.X.max(axis = 0).toarray()[0]}

df_scRNA= pd.DataFrame(summary_scRNA)

print("[", scRNA.X.min(), ",", scRNA.X.max(), "]")

[ 0.0 , 9.094633 ]


In [80]:
# Data overview
scRNA.to_df().head()

Unnamed: 0,OR4F5,OR4F29,OR4F16,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.3,AC233755.2,AC233755.1,AC240274.1,AC213203.4,AC213203.1
AAACCTGAGAAAGTGG-1-gPlexA1,0.0,0.0,0.0,0.0,1.553033,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGCGGATCA-1-gPlexA1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.426129,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGGACATTA-1-gPlexA1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGGCGACAT-1-gPlexA1,0.0,0.0,0.0,0.0,1.800563,0.0,0.0,0.0,0.0,2.407496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGGGAACGG-1-gPlexA1,0.0,0.0,0.0,0.0,1.445163,0.0,0.0,0.0,0.0,3.293625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
with open (os.path.join(proteins,'scRNA_feature_names.txt'), 'w') as file:  
    for name in scRNA_names:
        file.write(name + "\n")  
        
df_scRNA.to_excel(os.path.join(stats_single,"summary_scRNA.xlsx"))

### bulk RNA

In [372]:
bulkRNA = pd.read_csv(os.path.join(raw, "bulkRna.txt"), sep= "	")  # Note that here featres are rows 
bulkRNA = bulk_genes_membership.join(bulkRNA, on="gene_id", how = "left")  # Join the two data frames based on "gene_id"
bulkRNA.set_index("gene_name", inplace=True)

if not os.path.exists(os.path.join(preprocessed,"bulkRNA_pp.h5ad")):   
    bulkRNA_ad  = ad.AnnData(bulkRNA.iloc[:,4:].T)
    bulkRNA_ad.obs["COMBAT_ID"] = [n.split("-")[0] for n in bulkRNA_ad.obs_names]
    bulkRNA_ad.obs["PID"] = bulkRNA_ad.obs_names
    bulkRNA_ad.var["gene_id"] = bulkRNA["gene_id"]
    bulkRNA_ad.var["gene_name"] = bulkRNA_ad.var_names
    bulkRNA_ad.var["module"] = bulkRNA["module"]

    bulkRNA_ad.write_h5ad(os.path.join(preprocessed,"bulkRNA_pp.h5ad"))

else:
    bulkRNA_ad = ad.read_h5ad(os.path.join(preprocessed,"bulkRNA_pp.h5ad"))

bulkRNA

1


  utils.warn_names_duplicates("var")


Unnamed: 0_level_0,module,gene_id,membership,p.value,S00016-Ja001T-TRGa,S00020-Ja003T-TRGa,S00024-Ja003T-TRGa,S00027-Ja003T-TRGa,S00028-Ja001T-TRGa,S00030-Ja003T-TRGa,...,S00081-Ja001T-TRGa,S00081-Ja005T-TRGa,S00082-Ja001T-TRGa,S00094-Ja005T-TRGa,S00095-Ja005T-TRGa,S00096-Ja005T-TRGa,S00097-Ja003T-TRGa,S00099-Ja005T-TRGa,S00104-Ja003T-TRGa,S00106-Ja003T-TRGa
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CCNE1,greenyellow,ENSG00000105173,0.958784,6.194695e-79,1.143957,1.832209,3.057676,1.043326,2.352262,1.691966,...,1.127736,1.479040,1.600500,1.105211,1.159955,1.111833,1.731594,1.307727,1.725340,0.787063
GINS1,greenyellow,ENSG00000101003,0.954818,3.500631e-76,1.195675,2.613918,3.472932,1.868858,2.506088,2.838360,...,1.411002,2.005985,2.117944,1.203383,1.331941,1.445836,1.834587,3.121070,2.555646,1.149869
FEN1,greenyellow,ENSG00000168496,0.953113,4.493982e-75,3.346927,4.070370,4.880629,3.178997,4.114665,4.348293,...,2.927373,3.096387,3.599349,2.486769,2.918063,2.832423,3.323475,4.223662,4.007329,2.466623
SPATS2,greenyellow,ENSG00000123352,0.949671,5.878363e-73,3.006723,3.313059,4.711806,2.880851,3.494934,3.005567,...,2.887494,2.772339,3.194292,2.736740,2.975801,2.766807,2.763138,2.602190,3.687930,2.589971
H2BC10,greenyellow,ENSG00000278588,0.946806,2.635032e-71,3.702432,5.271994,5.550350,3.659646,5.518242,5.151624,...,3.365058,4.045067,4.560161,2.781927,3.490069,3.963749,4.556235,5.240146,5.021300,2.998987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC007284.1,grey,ENSG00000251996,0.800786,3.415287e-33,0.664271,0.225412,0.179185,0.000000,0.182927,0.047961,...,0.000000,0.000000,0.382617,0.000000,0.238826,0.472037,0.317011,0.000000,0.213735,0.737699
AC010086.1,grey,ENSG00000226918,0.779957,1.710957e-30,0.164769,0.164520,0.397677,0.000000,0.286497,0.094378,...,0.000000,0.000000,0.302897,0.000000,0.330201,0.144782,0.602932,0.000000,0.074819,0.399713
AC253536.6,grey,ENSG00000272787,-0.169222,4.333857e-02,0.000000,0.000000,0.000000,0.000000,0.773761,0.883671,...,0.042124,0.031883,0.000000,0.710390,0.574012,0.000000,0.796798,0.591878,0.969468,0.880997
AC103810.1,grey,ENSG00000264057,0.089963,2.852850e-01,0.358735,0.195287,0.925435,1.445770,0.491140,0.225291,...,0.122854,0.290960,0.189251,0.084874,2.225828,0.189985,0.149547,0.157399,0.050308,0.120441


In [119]:
summary_bulkRNA = {"feature" : bulkRNA.index,
           "min" : bulkRNA_ad.X.min(axis = 0),
           "max" : bulkRNA_ad.X.max(axis = 0),
           "mean" : bulkRNA_ad.X.mean(axis = 0),
           "var" : bulkRNA_ad.X.var(axis = 0)}

df_bulkRNA= pd.DataFrame(summary_bulkRNA)

print("[", bulkRNA_ad.X.min(),",", bulkRNA_ad.X.max(), "]") # Overall range of proteins

[ 0.0 , 15.981462 ]


In [120]:
with open (os.path.join(proteins,'bulkRNA_feature_names.txt'), 'w') as file: # Save protein names
    for name in bulkRNA.index:
        file.write(name + "\n")  

df_bulkRNA.to_excel(os.path.join(stats_single,"summary_bulkRNA.xlsx")) # Save summary

### Luminex

In [373]:
luminex = pd.read_excel(os.path.join(raw,"luminex.xlsx"),"All data" )

if not os.path.exists(os.path.join(preprocessed,"luminex_pp.h5ad")):   

    luminex_ad = ad.AnnData(luminex.iloc[:,6:])
    luminex_ad.obs["PID"] = luminex["Row"].to_list()
    luminex_ad.obs["Disease"] = luminex["severity"].to_list()
    luminex_ad.obs["Sex"] = luminex["sex"].to_list()
    luminex_ad.obs["Age"] = luminex["age"].to_list()
    luminex_ad.obs["BMI"] = luminex["BMI"].to_list()
    luminex_ad.obs["Dexamethasone"] = luminex["dexamethasone"].to_list()

    luminex_ctrl = luminex["Row"][-34:].to_list() # The last 34 observations are control observations 
    luminex_patients = [n.split("-")[0] for n in luminex["Row"][:-34].to_list()] # Remove the date from the patient IDs
    luminex_ad.obs["COMBAT_ID"] = luminex_patients + luminex_ctrl # Patient Combat IDs are observations  

    luminex_ad.write_h5ad(os.path.join(preprocessed,"luminex_pp.h5ad"))

else:
    bulkRNA_ad = ad.read_h5ad(os.path.join(preprocessed,"luminex_pp.h5ad"))

luminex

  luminex_ad = ad.AnnData(luminex.iloc[:,6:])


Unnamed: 0,Row,severity,sex,age,BMI,dexamethasone,CCL18/PARC (BR33) (33) low,Lactoferrin (BR36) (36) high,Lipocalin-2/NGAL (BR21) (21) high,Myeloperoxidase/MPO (BR53) (53) high,...,IFN-alpha (BR63) (63) high,IL-2 (BR43) (43) high,IL-5 (BR53) (53) high,IL-8/CXCL8 (BR48) (48) high,IL-12 p70 (BR56) (56) high,IL-15 (BR52) (52) high,IL-23 (BR76) (76) high,IL-33 (BR14) (14) high,Oncostatin M/OSM (BR30) (30) high,TREM-1 (BR65) (65) high
0,S00029-Ja005E-PMCdb,COVID-critical,F,46,21.4,False,42080.95,15359.95,528930.72,101595.14,...,2.74,0.00,5.71,11.53,15.05,10.50,151.94,1.44,0.0,327.96
1,S00029-Ja001E-PMCdb,COVID-critical,F,46,21.4,False,47991.24,96194.78,467186.02,39638.10,...,0.00,9.42,0.00,14.35,0.00,19.25,0.00,0.00,0.0,728.84
2,S00052-Ja005E-PMCdb,COVID-critical,F,41,30.0,False,76407.98,45607.18,51840.97,88627.44,...,0.00,0.00,0.00,3.74,0.00,0.38,0.00,2.38,0.0,269.00
3,S00109-Ja005E-PMCdb,COVID-critical,F,52,,False,68297.35,64517.42,27631.27,77299.95,...,1.08,0.00,0.00,63.57,96.28,19.37,524.06,0.00,0.0,246.34
4,S00099-Ja005E-PMCdb,COVID-critical,F,52,35.0,False,171822.50,181611.29,245606.35,166649.32,...,0.00,0.00,5.96,8.35,0.00,8.98,0.00,0.00,0.0,955.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,CTRL serum_plate_3,,,0,0.0,False,39474.59,53607.18,52424.81,30781.58,...,0.00,0.00,0.00,3.93,0.23,0.00,0.00,0.00,0.0,64.74
345,CTRL plasma_plate_2,,,0,0.0,False,37872.37,37779.40,47273.01,27368.19,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.0,32.94
346,CTRL serum_plate_2,,,0,0.0,False,43453.01,57980.85,52207.02,32636.10,...,0.00,0.00,0.00,4.98,0.00,0.00,0.00,0.00,0.0,45.23
347,CTRL plasma_plate_1,,,0,0.0,False,37111.29,26677.81,42385.49,24503.65,...,0.00,0.00,3.31,0.99,0.00,4.61,85.78,0.58,0.0,75.50


In [248]:
summary_luminex = {"feature" : luminex.columns[6:],
           "min" : luminex.iloc[:,6:].min(axis = 0),
           "max" : luminex.iloc[:,6:].max(axis = 0),
           "mean" : luminex.iloc[:,6:].mean(axis = 0),
           "var" : luminex.iloc[:,6:].var(axis = 0)}

df_luminex= pd.DataFrame(summary_luminex)

# Overall range of proteins
print("[",  luminex.iloc[:,6:].to_numpy().min(),",", luminex.iloc[:,6:].to_numpy().max(), "]")

[ 0.0 , 12019000.0 ]


In [266]:
with open (os.path.join(proteins,'luminex_feature_names.txt'), 'w') as file:  
    for name in luminex.columns:
        file.write(name + "\n") 

df_luminex.to_excel(os.path.join(stats_single,"summary_Luminex.xlsx")) # Save summary

### CyTOF

In [374]:
if not os.path.exists(os.path.join(preprocessed,"cytof_pp.h5ad")):   

    cytof = ad.read_h5ad(os.path.join(raw,"cytof_full.h5ad"))

    cytof.obs['Domain'] = 'cytof'
    cytof.obs['Domain_major'] = 'cytof'

    #used to map the name present in cytof_cells the same way as harmonized_cytof_cells (and consequently cite_Cells)
    dic_major = {}
    dic_type = {}
    for i in range(len(cytof_cells_harmonised)):
        key = cytof_cells_harmonised.iloc[i]['major_cell_type']
        value_major = cytof_cells_harmonised.iloc[i]['harmonized_major_subset']
        value_type = cytof_cells_harmonised.iloc[i]['harmonized_cell_type']
        if value_major not in dic_major:
            dic_major.update([(key, value_major)])
        if value_type not in dic_type:
            dic_type.update([(key, value_type)])

    cytof.obs['Annotation_major_subset'] = cytof.obs['major_cell_type'].map(dic_major).astype('category')
    cytof.obs['Annotation_cell_type'] = cytof.obs['major_cell_type'].map(dic_type).astype('category')

    cytof.obs['Annotation_major_subset'] = cytof.obs['Annotation_major_subset'].cat.rename_categories({'UNCLASSIFIED': 'nan'})
    cytof.obs['Annotation_cell_type'] = cytof.obs['Annotation_cell_type'].cat.rename_categories({'UNCLASSIFIED': 'nan'})

    cytof.obs["COMBAT_ID"]  = [n.split("-")[0] for n in cytof.obs.patient_id] # Remove the date from the patient IDs
    cytof.obs_names = cytof.obs.patient_id

    cytof.obs.rename(columns={'patient_id': 'PID', 'condition': 'Diseases'}, inplace=True)

    cytof.write_h5ad(os.path.join(preprocessed,"cytof_pp.h5ad"))

else:

    cytof = ad.read_h5ad(os.path.join(preprocessed,"cytof_pp.h5ad"))

print("CyTOF: ", cytof, "\n\n")



CyTOF:  AnnData object with n_obs × n_vars = 7118158 × 48
    obs: 'sample_id', 'Diseases', 'PID', 'batch', 'cellID', 'COMBAT_ID_Time', 'CyTOF_priority', 'major_cell_type', 'fine_cluster_id', 'Domain', 'Domain_major', 'Annotation_major_subset', 'Annotation_cell_type', 'COMBAT_ID'
    var: 'channel_name', 'marker_name', 'marker_class'
    uns: 'SOM_codes', 'X_name', 'cluster_codes', 'cofactor', 'experiment_info'
    obsm: 'TSNE', 'UMAP' 




AnnData expects .obs.index to contain strings, but got values like:
    ['H00067-Ha001E-CYGa', 'H00067-Ha001E-CYGa', 'H00067-Ha001E-CYGa', 'H00067-Ha001E-CYGa', 'H00067-Ha001E-CYGa']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")


In [283]:
summary_cytof = {"feature" : cytof.var_names,
           "min" : cytof.X.min(axis = 0),
           "max" : cytof.X.max(axis = 0),
           "mean" : cytof.X.mean(axis = 0),
           "var" : cytof.X.var(axis = 0)
           }

df_cytof= pd.DataFrame(summary_cytof)

# Overall range of the proteins
print("[", cytof.X.min(),",",cytof.X.max(), "]")

[ -6.436475 , 24.303553 ]


In [284]:
cytof.to_df().head()

Unnamed: 0_level_0,CD16,CD19,CD3,IgG,CD4,HLA_DR,CTLA4,Siglec_8,CD28,Ki_67,...,KLGR1,FOXP3,CD38,CD45,CD123,CD25,CD141,CLA,CX3CR1,Event_length
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H00067-Ha001E-CYGa,0.002863,0.00712,3.169593,0.778884,0.000112,0.877128,0.482226,0.0,0.489538,1.481506,...,2.4122,0.00615,0.005076,5.388708,0.032491,0.922501,0.963545,2.933704,0.333416,2.32709
H00067-Ha001E-CYGa,0.001721,2.329424,0.00177,1.906131,0.001495,4.516001,0.492553,0.0,0.296466,0.024739,...,0.0,1.024434,2.667199,5.177439,0.022996,0.000264,0.001856,0.826805,0.484731,2.363303
H00067-Ha001E-CYGa,0.001721,0.007947,0.23164,0.302123,2.17078,3.362985,0.226497,0.437236,0.078629,0.686172,...,0.265121,0.301607,2.895148,4.42983,0.031306,0.727448,0.631563,3.159231,0.020285,2.191561
H00067-Ha001E-CYGa,0.001721,0.130171,0.00177,1.14377,0.001495,4.477472,1.895761,0.010462,0.078629,0.024739,...,0.016187,0.003332,3.320227,4.470901,0.01359,0.233912,0.001856,3.245132,0.020285,2.14249
H00067-Ha001E-CYGa,1.01013,0.007947,0.00177,1.599139,1.396846,3.425147,0.527159,0.0,0.472719,0.024739,...,0.012915,1.370077,3.129075,4.14228,0.000452,1.384267,0.756194,3.142244,0.667578,2.405398


In [285]:
with open (os.path.join(proteins,'cyTOF_feature_names.txt'), 'w') as file:  
    for name in cytof.var_names:
        file.write(name + "\n") 

df_cytof.to_excel(os.path.join(stats_single,"summary_cytof.xlsx")) # Save summary

### FACS

In [405]:
if not os.path.exists(os.path.join(preprocessed,"facs_pp.h5ad")):  

    facs = ad.read_h5ad(os.path.join(raw,"facs_full.h5ad"))

    facs.obs['Annotation_major_subset'] = 'CD4'
    facs.obs['Annotation_cell_type'] = 'CD4'

    facs.obs['Domain'] = 'facs'
    facs.obs['Domain_major'] = 'facs'

    facs_obs = pd.DataFrame()
    facs_obs["sample_id"] = facs.obs.sample_id
    facs_obs = facs_obs.set_index("sample_id") # Dataframe with all COMBAT IDs (one column)

    facs_experiment_info = facs.uns["experiment_info"][["sample_id","scRNASeq_sample_ID" ]] # Dataframe 
    facs_experiment_info["patients"] = [n.split("-")[0] for n in facs_experiment_info["scRNASeq_sample_ID" ]] # Remove the date from the patient IDs

    facs_joined = facs_obs.join(facs_experiment_info.set_index("sample_id"), on = "sample_id", how = "inner") # Maps sample_id to patients 

    facs.obs_names = facs_joined["scRNASeq_sample_ID" ]
    facs.obs["COMBAT_ID"] = facs_joined.set_index('scRNASeq_sample_ID')["patients"] # Extract patients (COMBAT ids) and use it as observation names
    facs.obs['PID'] = facs.obs_names 

    facs.obs.rename(columns={'condition': 'Diseases'}, inplace=True)

    facs.write_h5ad(os.path.join(preprocessed,"facs_pp.h5ad"))

else:

    facs = ad.read_h5ad(os.path.join(preprocessed,"facs_pp.h5ad"))

print("FACS: ", facs, "\n\n")

FACS:  AnnData object with n_obs × n_vars = 131920 × 12
    obs: 'fcs_file', 'sample_id', 'Diseases', 'patient_id', 'cluster_id', 'Annotation_major_subset', 'Annotation_cell_type', 'Domain', 'Domain_major', 'COMBAT_ID', 'PID'
    var: 'channel_name', 'marker_name', 'marker_class', 'used_for_clustering'
    uns: 'SOM_codes', 'X_name', 'cluster_codes', 'experiment_info'
    obsm: 'TSNE', 'UMAP'
    layers: 'exprs' 




  utils.warn_names_duplicates("obs")


In [295]:
summary_facs = {"feature" : facs.var_names,
           "min" : facs.X.min(axis = 0),
           "max" : facs.X.max(axis = 0),
           "mean" : facs.X.mean(axis = 0),
           "var" : facs.X.var(axis = 0)}

df_facs= pd.DataFrame(summary_facs)

# Overall range of proteins
print("[", facs.X.min(),",",facs.X.max(), "]")

[ -6608.2285 , 178222.64 ]


In [21]:
facs.to_df().head()

Unnamed: 0,CXCR3,CCR4,CD45RA,HLA-DR,CD25,CD38,CD127,PD1,CCR6,ICOS,CD27,CCR7
memory,22.444963,8083.472168,-67.636757,48.388863,128.51062,317.221191,153.553589,-477.221954,2538.507568,1307.797119,462.951263,260.562439
memory,46.32246,6292.529297,262.460266,341.741608,349.631439,638.302551,0.721993,1040.776855,2116.770264,6056.441895,8753.99707,1611.8302
memory,138.343292,1583.56958,535.068176,-177.763977,54.735806,396.080292,1005.266602,-21.077803,-146.690323,818.59491,5212.695312,688.128906
memory,99.121498,390.649139,-14.418692,83.374794,63.772312,260.273926,2112.503662,888.982605,1462.157104,672.073364,2724.397949,687.44043
memory,53.704018,3739.389404,14.945232,13.85416,561.055847,-484.798462,175.297806,-1016.438843,4490.885742,-101.385414,7555.630859,981.396484


In [296]:
with open (os.path.join(proteins,'facs_feature_names.txt'), 'w') as file:  
    for name in facs.var_names:
        file.write(name + "\n") 

df_facs.to_excel(os.path.join(stats_single,"summary_facs.xlsx")) # Save summary

## Comparisons <a id='compare'></a>

The amin of this section is to find shared proteins and genes across modalities

### Shared Protein Names

In [186]:
# compare bulkRNA and ADT
shared_features_adt_bulkRNA = set(adt_names).intersection(set(bulkRNA_joined["gene_name"]))

# compare Luminex, bulkRNA and ADT
shared_features_adt_luminex = set(adt_names).intersection(set(luminex.var_names))
shared_features_bulkRNA_luminex = set(luminex.var_names).intersection(set(bulkRNA_joined["gene_name"]))

# compare cyTOF bulkRNA and ADT
shared_features_adt_cytof = set(cytof.var_names).intersection(set(adt_names)) 
shared_features_bulkRNA_cytof = set(cytof.var_names).intersection(set(bulkRNA_joined["gene_name"]))

# compare FACS, bulkRNA and ADT, cyTOF
shared_features_adt_facs = set(facs.var_names).intersection(set(adt_names)) 
shared_features_bulkRNA_facs = set(facs.var_names).intersection(set(bulkRNA_joined["gene_name"]))
shared_features_cytof_facs = set(facs.var_names).intersection(set(cytof.var_names)) 
shared_features_adt_bulkRNA_facs = shared_features_adt_facs.intersection(set(bulkRNA_joined["gene_name"]))
shared_features_adt_cytof_facs = shared_features_adt_facs.intersection(set(cytof.var_names)) 
shared_features_bulkRNA_cytof_facs = shared_features_bulkRNA_facs.intersection(set(cytof.var_names)) 
shared_features_adt_bulkRNA_cytof_facs = shared_features_adt_bulkRNA.intersection(set(cytof.var_names)) 

# compare FACS, bulkRNA and ADT, cyTOF, csRNA
shared_features_adt_scRNA = set(scRNA_names).intersection(set(adt_names)) 
shared_features_bulkRNA_scRNA = set(scRNA_names).intersection(set(bulkRNA_joined["gene_name"]))
shared_features_cytof_scRNA = set(scRNA_names).intersection(set(cytof.var_names)) 
shared_features_facs_scRNA = set(scRNA_names).intersection(set(facs.var_names))
shared_features_adt_bulkRNA_scRNA = shared_features_adt_scRNA.intersection(set(bulkRNA_joined["gene_name"]))
shared_features_adt_cytof_scRNA = shared_features_adt_scRNA.intersection(set(cytof.var_names)) 
shared_features_adt_facs_scRNA = shared_features_adt_scRNA.intersection(set(facs.var_names)) 
shared_features_bulkRNA_cytof_scRNA = shared_features_bulkRNA_scRNA.intersection(set(cytof.var_names)) 
shared_features_bulkRNA_facs_scRNA = shared_features_bulkRNA_scRNA.intersection(set(facs.var_names))
shared_features_adt_bulkRNA_cytof_scRNA = shared_features_adt_bulkRNA.intersection(set(cytof.var_names)) 
shared_features_adt_bulkRNA_facs_scRNA = shared_features_adt_bulkRNA.intersection(set(facs.var_names)) 
shared_features_adt_cytof_facs_scRNA = shared_features_adt_cytof.intersection(set(facs.var_names)) 
shared_features_bulkRNA_facs_scRNA_cytof = shared_features_bulkRNA_facs_scRNA.intersection(set(cytof.var_names)) 

In [187]:
with open (os.path.join(out_path,'shared/shared_ADT_bulkRNA.txt'), 'w') as file:  
    for name in shared_features_adt_bulkRNA:
        file.write(name + "\n")  

with open (os.path.join(out_path,'shared/shared_ADT_Luminex.txt'), 'w') as file:  
    for name in shared_features_adt_luminex:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_bulkRNA_Luminex.txt'), 'w') as file:  
    for name in shared_features_bulkRNA_luminex:
        file.write(name + "\n")  


with open (os.path.join(out_path,'shared/shared_ADT_cyTOF.txt'), 'w') as file:  
    for name in shared_features_adt_cytof:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_bulkRNA_cyTOF.txt'), 'w') as file:  
    for name in shared_features_bulkRNA_cytof:
        file.write(name + "\n")  


with open (os.path.join(out_path,'shared/shared_ADT_FACS.txt'), 'w') as file:  
    for name in shared_features_adt_facs:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_bulkRNA_FACS.txt'), 'w') as file:  
    for name in shared_features_bulkRNA_facs:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_cyTOF_FACS.txt'), 'w') as file:  
    for name in shared_features_cytof_facs:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_ADT_bulkRNA_FACS.txt'), 'w') as file:  
    for name in shared_features_adt_bulkRNA:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_ADT_cytof_FACS.txt'), 'w') as file:  
    for name in shared_features_adt_cytof:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_bulkRNA_cytof_FACS.txt'), 'w') as file:  
    for name in shared_features_bulkRNA_cytof:
        file.write(name + "\n") 
with open (os.path.join(out_path,'shared/shared_ADT_bulkRNA_cytof_FACS.txt'), 'w') as file:  
    for name in shared_features_adt_bulkRNA_cytof_facs:
        file.write(name + "\n")       


with open (os.path.join(out_path,'shared/shared_ADT_scRNA.txt'), 'w') as file:  
    for name in shared_features_adt_scRNA:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_bulkRNA_scRNA.txt'), 'w') as file:  
    for name in shared_features_bulkRNA_scRNA:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_cyTOF_scRNA.txt'), 'w') as file:  
    for name in shared_features_cytof_scRNA:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_FACS_scRNA.txt'), 'w') as file:  
    for name in shared_features_facs_scRNA:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_ADT_bulkRNA_scRNA.txt'), 'w') as file:  
    for name in shared_features_adt_bulkRNA:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_ADT_cytof_csRNA.txt'), 'w') as file:  
    for name in shared_features_adt_cytof:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_ADT_FACS_csRNA.txt'), 'w') as file:  
    for name in shared_features_adt_facs:
        file.write(name + "\n")  
with open (os.path.join(out_path,'shared/shared_bulkRNA_cytof_scRNA.txt'), 'w') as file:  
    for name in shared_features_bulkRNA_cytof:
        file.write(name + "\n") 
with open (os.path.join(out_path,'shared/shared_bulkRNA_FACS_scRNA.txt'), 'w') as file:  
    for name in shared_features_bulkRNA_facs:
        file.write(name + "\n") 
with open (os.path.join(out_path,'shared/shared_ADT_bulkRNA_cytof_scRNA.txt'), 'w') as file:  
    for name in shared_features_adt_bulkRNA_cytof_scRNA:
        file.write(name + "\n")       
with open (os.path.join(out_path,'shared/shared_ADT_bulkRNA_FACS_scRNA.txt'), 'w') as file:  
    for name in shared_features_adt_bulkRNA_facs:
        file.write(name + "\n")    
with open (os.path.join(out_path,'shared/shared_bulkRNA_cytof_FACS_scRNA.txt'), 'w') as file:  
    for name in shared_features_bulkRNA_facs_scRNA_cytof:
        file.write(name + "\n")     

### Shared Protein Statistics

In [189]:
stats_bulkRNA_scRNA = df_bulkRNA.set_index("feature").join(df_scRNA.set_index("feature"), lsuffix="_bulk", rsuffix="_sc", how="inner")
stats_cytof_scRNA = df_cytof.set_index("feature").join(df_scRNA.set_index("feature"), lsuffix="_cytof", rsuffix="_scRNA", how = "inner")
stats_adt_scRNA = df_adt.set_index("feature").join(df_scRNA.set_index("feature"), lsuffix="_adt", rsuffix="_scRNA", how = "inner") 
stats_facs_scRNA = df_facs.set_index("feature").join(df_scRNA.set_index("feature"), lsuffix="_facs", rsuffix="_scRNA", how = "inner") 

stats_bulkRNA_cytof = df_bulkRNA.set_index("feature").join(df_cytof.set_index("feature"), lsuffix="_bulk", rsuffix="_cytof", how="inner")
stats_adt_cytof = df_adt.set_index("feature").join(df_cytof.set_index("feature"), lsuffix="_adt", rsuffix="_cytof", how = "inner") 
stats_cytof_facs = df_facs.set_index("feature").join(df_cytof.set_index("feature"), lsuffix="_facs", rsuffix="_cytof", how = "inner") 

stats_adt_bulkRNA = df_adt.set_index("feature").join(df_bulkRNA.set_index("feature"), lsuffix="_adt", rsuffix="_bulkRNA", how = "inner") 
stats_bulkRNA_facs = df_facs.set_index("feature").join(df_bulkRNA.set_index("feature"), lsuffix="_facs", rsuffix="_bulkRNA", how = "inner") 

stats_adt_facs = df_facs.set_index("feature").join(df_adt.set_index("feature"), lsuffix="_facs", rsuffix="_adt", how = "inner") 

In [190]:
stats_bulkRNA_scRNA.to_excel(os.path.join(out_path,"stats/cross/stats_bulkRNA_scRNA.xlsx"))
stats_cytof_scRNA.to_excel(os.path.join(out_path,"stats/cross/stats_cytof_scRNA.xlsx"))
stats_adt_scRNA.to_excel(os.path.join(out_path,"stats/cross/stats_adt_scRNA.xlsx")) 
stats_facs_scRNA.to_excel(os.path.join(out_path,"stats/cross/stats_facs_scRNA.xlsx")) 

stats_bulkRNA_cytof.to_excel(os.path.join(out_path,"stats/cross/stats_bulkRNA_cytof.xlsx"))
stats_adt_cytof.to_excel(os.path.join(out_path,"stats/cross/stats_adt_cytof.xlsx")) 
stats_cytof_facs.to_excel(os.path.join(out_path,"stats/cross/stats_cytof_facs.xlsx")) 

stats_adt_bulkRNA.to_excel(os.path.join(out_path,"stats/cross/stats_adt_bulkRNA.xlsx")) 
stats_bulkRNA_facs.to_excel(os.path.join(out_path,"stats/cross/stats_bulkRNA_facs.xlsx")) 

stats_adt_facs.to_excel(os.path.join(out_path,"stats/cross/stats_adt_facs.xlsx")) 

## Pseudo Bulking <a id='aggregate'></a>
This section is used to create bulks and pseudobulks, where observations are patient IDs and columns are gene or protein names.

### Aggregation
Averaging is used as a means of aggregation, while preserving the time stamps of patient samples.

In [384]:
facs_pb = facs.to_df().groupby(facs.obs_names).mean()  # all CD4
cytof_pb = cytof.to_df().groupby(by = [cytof.obs.Annotation_cell_type, cytof.obs_names]).mean().reset_index(level=[1]) 
adt_pb = adt.to_df().groupby(by = [adt.obs.Annotation_cell_type, adt.obs_names]).mean().reset_index(level=[1])
scRNA_pb = scRNA.to_df().groupby(by = [scRNA.obs.Annotation_cell_type, scRNA.obs_names]).mean().reset_index(level=[1])

In [386]:
# Save in Excel files
facs_pb.to_excel(os.path.join(pb, "facs.xlsx"))
cytof_pb.to_excel(os.path.join(pb, "cytof.xlsx"))
adt_pb.to_excel(os.path.join(pb, "adt.xlsx"))
luminex_ad.to_df().to_excel(os.path.join(pb, "luminex.xlsx"))

In [387]:
facs.uns["pseudobulk"] = facs_pb
cytof.uns["pseudobulk"] = cytof_pb
adt.uns["pseudobulk"] = adt_pb
#scRNA.uns["pseudobulk"] = scRNA_pb

### Subsetting RNA Data
Slice each cell type according to gene memberships

In [394]:
memberships_cite = {} # A dictionary that maps modules to a list of genes
for module in set(membership_all_celltypes["gene_name"].to_list()):
    memberships_cite[module] = membership_all_celltypes[membership_all_celltypes.index == module]["gene_name"].to_list()



In [395]:
memberships_bulk = {} # A dictionary that maps modules to a list of genes
for module in set(bulk_genes_membership["gene_name"].to_list()):
    memberships_bulk[module] = bulk_genes_membership[bulk_genes_membership.index == module]["gene_name"].to_list()

In [396]:
for ct in set(scRNA.obs.Annotation_cell_type): # Go through every cell type
    ct_slice = scRNA_pb[scRNA_pb.index == ct] 
    for module in memberships_cite: # Go through every module
        genes = ["scRNASeq_sample_ID"] + memberships_cite[module] # Fetch all genes of that module
        genes_of_the_modules_of_the_slice = ct_slice.columns.intersection(genes) 
        ct_slice[genes_of_the_modules_of_the_slice].to_excel(os.path.join(pb,f"scRNA_{ct}_{module}.xlsx")) # Save to Excel 
    

In [400]:
bulkRNA_df = bulkRNA_ad.to_df()
for module in memberships_bulk: # Go through every module
    genes = memberships_bulk[module] # Fetch all genes of that module
    genes_of_the_modules = bulkRNA_df.columns.intersection(genes) 
    bulkRNA_df[genes_of_the_modules].to_excel(os.path.join(pb,f"bulkRNA_{module}.xlsx")) # Save to Excel 

### Sclicing the Modalities According to Cell Types

In [397]:
# ADT
for ct in set(adt.obs.Annotation_cell_type): # Go through every cell type
    ct_slice = adt_pb[adt_pb.index == ct] 
    ct_slice.to_excel(os.path.join(pb,f"adt_{ct}.xlsx"))

In [398]:
# CyTOF
for ct in set(cytof.obs.Annotation_cell_type): # Go through every cell type
    ct_slice = cytof_pb[cytof_pb.index == ct] 
    ct_slice.to_excel(os.path.join(pb,f"cytof_{ct}.xlsx"))

### Pseudobulk Generation According to Leonardo's Notebook

In [326]:
rna_cite_pseudobulks = {}

for i, patient in enumerate(pd.unique(scRNA.obs_names)): # Go through every patient
        rna_cite_patient = scRNA[scRNA.obs.PID == patient,:] # Filter out all the rows/ measurements that have the patient ID PID
        for cell_type in pd.unique(scRNA.obs.Annotation_major_subset): # Go through every cell type 
            rna_cite_patient_cell_type = rna_cite_patient[rna_cite_patient.obs.Annotation_major_subset == cell_type, :] # From all the patients with PID, filter out those measurements that are related to a specific cell type
            if len(rna_cite_patient_cell_type) > 0:
                for module in pd.unique(membership_all_celltypes.module): # Go through every module 
                    gene_membership_cite_module = membership_all_celltypes.loc[membership_all_celltypes.module == module, 'gene_name'] # slice based on labels, and map module name to gene name 
                    print(i, patient, cell_type, module)
                    
                    genes = list(set(gene_membership_cite_module).intersection(set(rna_cite_patient_cell_type.var_names))) # To make sure the genes are column names ?????????
                    rna_cite_patient_cell_type_module = rna_cite_patient_cell_type[:, genes] # Between the measurements that are for a specific patient and cell type, filter out specific genes that belong to the same module
                   
                    if rna_cite_patient_cell_type_module.shape[0] == 0:
                        raise ValueError("no observations for the current sum")
         
                    if np.isnan(rna_cite_patient_cell_type_module.layers['raw'].data).any():
                        raise ValueError("row contains nan care!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                    
                    row = rna_cite_patient_cell_type_module.layers['raw'].toarray()
                    if row.shape[0] > 1:
                        row = row.sum(axis=0) 
                    else:
                        row = row[0]

                    if np.any(np.isnan(row)):
                        raise ValueError("row contains nan care!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                    key = f'{cell_type}-{module}'
                  
                    if key not in rna_cite_pseudobulks:
                        df = pd.DataFrame(columns=genes)
                        df.loc[patient] = row
                        rna_cite_pseudobulks[key] = df
                    else:
                        rna_cite_pseudobulks[key].loc[patient] = row
                    
        

0 S00109-Ja001E-PBCa NK greenyellow
(795,)
1           RPL18
2           RPL7A
3          RPS15A
4            RPS7
5           RPS23
           ...   
45749    IGLV6-57
45750         PDF
45751        OSBP
45752        BAP1
45753     SLC37A4
Name: gene_name, Length: 808, dtype: object
0 S00109-Ja001E-PBCa NK tan
(880,)
275        LAG3
276        GZMH
277      MCOLN2
278      CLEC2D
279       TPRG1
          ...  
23222     CDC26
23223    CITED4
23224      LBX2
23225     MIEN1
23226    ZNF487
Name: gene_name, Length: 888, dtype: object
0 S00109-Ja001E-PBCa NK red
(897,)
412          RAI14
413         DEPTOR
414           AJM1
415       FAM171A1
416          TTC28
           ...    
1310        ARNTL2
1311          MYOT
1312    AC087632.2
1313       CBFA2T3
1314         FOLR3
Name: gene_name, Length: 903, dtype: object
0 S00109-Ja001E-PBCa NK turquoise
(9691,)
1315       GIMAP6
1316     TNFRSF1A
1317       GIMAP7
1318        CXCR2
1319     MRFAP1L1
           ...   
58387    TMEM191B
5838

In [327]:
adt_cite_pseudobulks = {}
for i, patient in enumerate(pd.unique(adt.obs_names)):
    adt_cite_patient = adt[adt.obs.PID == patient, :]
    for cell_type in pd.unique(adt.obs.Annotation_major_subset):
        adt_cite_patient_cell_type = adt_cite_patient[adt_cite_patient.obs.Annotation_major_subset == cell_type, :]
        if len(adt_cite_patient_cell_type) > 0:
            print(i, patient, cell_type)
            key = f'{cell_type}'
            row = adt_cite_patient_cell_type.layers['raw'].toarray()
            if row.shape[0] > 1:
                row = row.sum(axis=0)
            else:
                row = row[0]
            if key not in adt_cite_pseudobulks:
                df = pd.DataFrame(columns = adt.var_names)
                df.loc[patient] = row
                adt_cite_pseudobulks[key] = df
            else:
                adt_cite_pseudobulks[key].loc[patient] = row

0 S00109-Ja001E-PBCa NK
0 S00109-Ja001E-PBCa CD8
0 S00109-Ja001E-PBCa nan
0 S00109-Ja001E-PBCa ncMono
0 S00109-Ja001E-PBCa cMono
0 S00109-Ja001E-PBCa CD4
0 S00109-Ja001E-PBCa B
0 S00109-Ja001E-PBCa MAIT
0 S00109-Ja001E-PBCa PB
0 S00109-Ja001E-PBCa iNKT
0 S00109-Ja001E-PBCa DN
0 S00109-Ja001E-PBCa DP
0 S00109-Ja001E-PBCa GDT
0 S00109-Ja001E-PBCa HSC
0 S00109-Ja001E-PBCa DC
0 S00109-Ja001E-PBCa PLT
0 S00109-Ja001E-PBCa RET
1 S00112-Ja003E-PBCa NK
1 S00112-Ja003E-PBCa CD8
1 S00112-Ja003E-PBCa nan
1 S00112-Ja003E-PBCa ncMono
1 S00112-Ja003E-PBCa cMono
1 S00112-Ja003E-PBCa CD4
1 S00112-Ja003E-PBCa B
1 S00112-Ja003E-PBCa MAIT
1 S00112-Ja003E-PBCa PB
1 S00112-Ja003E-PBCa iNKT
1 S00112-Ja003E-PBCa DN
1 S00112-Ja003E-PBCa DP
1 S00112-Ja003E-PBCa GDT
1 S00112-Ja003E-PBCa HSC
1 S00112-Ja003E-PBCa DC
1 S00112-Ja003E-PBCa PLT
2 G05153-Ja005E-PBCa NK
2 G05153-Ja005E-PBCa CD8
2 G05153-Ja005E-PBCa nan
2 G05153-Ja005E-PBCa ncMono
2 G05153-Ja005E-PBCa cMono
2 G05153-Ja005E-PBCa CD4
2 G05153-Ja005E-PBCa 

In [328]:
cytof_pseudobulks = {}
for i, patient in enumerate(pd.unique(cytof.obs.PID)):
    cytof_patient = cytof[cytof.obs.PID == patient, :]
    for cell_type in pd.unique(cytof.obs.Annotation_major_subset):
        cytof_patient_cell_type = cytof_patient[cytof_patient.obs.Annotation_major_subset == cell_type, :]
        if len(cytof_patient_cell_type) > 0:
            print(i, patient, cell_type)
            key = f'{cell_type}'
            row = cytof_patient_cell_type.X.toarray()
            if row.shape[0] > 1:
                row = row.mean(axis=0)
            else:
                row = row[0]
            if key not in cytof_pseudobulks:
                df = pd.DataFrame(columns=cytof.var_names)
                df.loc[patient] = row
                cytof_pseudobulks[key] = df
            else:
                cytof_pseudobulks[key].loc[patient] = row

            print(i, patient, cell_type)

0 H00067-Ha001E-CYGa CD8
0 H00067-Ha001E-CYGa CD8
0 H00067-Ha001E-CYGa B
0 H00067-Ha001E-CYGa B
0 H00067-Ha001E-CYGa cMono
0 H00067-Ha001E-CYGa cMono
0 H00067-Ha001E-CYGa CD4
0 H00067-Ha001E-CYGa CD4
0 H00067-Ha001E-CYGa GDT
0 H00067-Ha001E-CYGa GDT
0 H00067-Ha001E-CYGa DC
0 H00067-Ha001E-CYGa DC
0 H00067-Ha001E-CYGa Basophil
0 H00067-Ha001E-CYGa Basophil
0 H00067-Ha001E-CYGa NK
0 H00067-Ha001E-CYGa NK
0 H00067-Ha001E-CYGa ncMono
0 H00067-Ha001E-CYGa ncMono
0 H00067-Ha001E-CYGa MAIT
0 H00067-Ha001E-CYGa MAIT
0 H00067-Ha001E-CYGa DN
0 H00067-Ha001E-CYGa DN
0 H00067-Ha001E-CYGa PB
0 H00067-Ha001E-CYGa PB
0 H00067-Ha001E-CYGa nan
0 H00067-Ha001E-CYGa nan
1 N00023-Ja001E-CYGa CD8
1 N00023-Ja001E-CYGa CD8
1 N00023-Ja001E-CYGa B
1 N00023-Ja001E-CYGa B
1 N00023-Ja001E-CYGa cMono
1 N00023-Ja001E-CYGa cMono
1 N00023-Ja001E-CYGa CD4
1 N00023-Ja001E-CYGa CD4
1 N00023-Ja001E-CYGa GDT
1 N00023-Ja001E-CYGa GDT
1 N00023-Ja001E-CYGa DC
1 N00023-Ja001E-CYGa DC
1 N00023-Ja001E-CYGa Basophil
1 N00023-Ja0

In [331]:
rna_bulk_pseudobulks = {}
for i, patient in enumerate(pd.unique(bulkRNA_ad.obs.PID)):
    rna_bulk_patient = bulkRNA_ad[bulkRNA_ad.obs.PID == patient, :]
    for module in pd.unique(bulk_genes_membership.module):
        print(i, patient, module)
        gene_membership_bulk_module = bulk_genes_membership.loc[bulk_genes_membership.module == module, 'gene_name']
        rna_bulk_patient_module = rna_bulk_patient[:, gene_membership_bulk_module.index]
        key = f'{module}'
        row = rna_bulk_patient_module.X.toarray()
        if row.shape[0] > 1:
            row = row.sum(axis=0)
        else:
            row = row[0]
        if key not in rna_bulk_pseudobulks:
            df = pd.DataFrame(index = pd.unique(bulkRNA_ad.obs.PID), columns = gene_membership_bulk_module)
            df.iloc[i] = row
            rna_bulk_pseudobulks[key] = df
        else:
            rna_bulk_pseudobulks[key].iloc[i] = row

0 S00016-Ja001T-TRGa greenyellow
0 S00016-Ja001T-TRGa green
0 S00016-Ja001T-TRGa magenta
0 S00016-Ja001T-TRGa lightgreen
0 S00016-Ja001T-TRGa black
0 S00016-Ja001T-TRGa turquoise
0 S00016-Ja001T-TRGa lightcyan
0 S00016-Ja001T-TRGa midnightblue
0 S00016-Ja001T-TRGa blue
0 S00016-Ja001T-TRGa grey60
0 S00016-Ja001T-TRGa purple
0 S00016-Ja001T-TRGa cyan
0 S00016-Ja001T-TRGa grey
1 S00020-Ja003T-TRGa greenyellow
1 S00020-Ja003T-TRGa green
1 S00020-Ja003T-TRGa magenta
1 S00020-Ja003T-TRGa lightgreen
1 S00020-Ja003T-TRGa black
1 S00020-Ja003T-TRGa turquoise
1 S00020-Ja003T-TRGa lightcyan
1 S00020-Ja003T-TRGa midnightblue
1 S00020-Ja003T-TRGa blue
1 S00020-Ja003T-TRGa grey60
1 S00020-Ja003T-TRGa purple
1 S00020-Ja003T-TRGa cyan
1 S00020-Ja003T-TRGa grey
2 S00024-Ja003T-TRGa greenyellow
2 S00024-Ja003T-TRGa green
2 S00024-Ja003T-TRGa magenta
2 S00024-Ja003T-TRGa lightgreen
2 S00024-Ja003T-TRGa black
2 S00024-Ja003T-TRGa turquoise
2 S00024-Ja003T-TRGa lightcyan
2 S00024-Ja003T-TRGa midnightblue


In [352]:
luminex_pseudobulks = pd.DataFrame(columns = luminex_ad.var_names)
luminex_matrix = np.empty((0, len(luminex_ad.var_names)))
for i, patient in enumerate(pd.unique(luminex_ad.obs.PID)):
    print(f"{i} {patient}")
    luminex_patient = luminex_ad[luminex_ad.obs.PID==patient, :]
    row = luminex_patient.X.toarray()
    if row.shape[0] > 1:
        row = row.mean(axis=0)
    else:
        row = row[0]
    
    luminex_pseudobulks.loc[patient] = row

0 S00029-Ja005E-PMCdb
1 S00029-Ja001E-PMCdb
2 S00052-Ja005E-PMCdb
3 S00109-Ja005E-PMCdb
4 S00099-Ja005E-PMCdb
5 S00094-Ja005E-PMCdb
6 S00027-Ja005E-PMCdb
7 S00027-Ja003E-PMCdb
8 S00147-Ja001E-PMCdb
9 S00124-Ja005E-PMCdb
10 S00030-Ja003E-PMCdb
11 S00065-Ja003E-PMCdb
12 S00054-Ja001E-PMCdb
13 S00007-Ja003E-PMCdb
14 S00007-Ja005E-PMCdb
15 S00111-Ja001E-PMCdb
16 S00111-Ja003E-PMCeb
17 S00024-Ja003E-PMCdb
18 S00043-Ja005E-PMCdb
19 S00055-Ja005E-PMCdb
20 S00095-Ja005E-PMCdb
21 S00043-Ja001E-PMCdb
22 S00143-Ja005E-PMCdb
23 S00143-Ja001E-PMCdb
24 S00143-Ja003E-PMCdb
25 S00040-Ja005E-PMCdb
26 S00020-Ja003E-PMCdb
27 S00008-Ja003E-PMCdb
28 S00068-Ja005E-PMCdb
29 S00005-Ja005E-PMCdb
30 S00050-Ja003E-PMCdb
31 S00050-Ja001E-PMCdb
32 S00073-Ja003E-PMCdb
33 S00079-Ja001E-PMCdb
34 S00111-Ja005E-PMCdb
35 S00064-Ja005E-PMCdb
36 S00052-Ja003E-PMCdb
37 S00037-Ja003E-PMCdb
38 S00148-Ja003E-PMCdb
39 S00148-Ja005E-PMCdb
40 S00109-Ja001E-PMCdb
41 S00074-Ja003E-PMCdb
42 S00069-Ja005E-PMCdb
43 S00053-Ja003E-PMCd

### Saving Pseudobulks

In [360]:
with open(os.path.join(pb_L,'citeRNA_pseudobulks.pickle'), 'wb') as f:
    pickle.dump(rna_cite_pseudobulks, f)

with open(os.path.join(pb_L,'adt_pseudobulks.pickle'), 'wb') as f:
    pickle.dump(adt_cite_pseudobulks, f)

with open(os.path.join(pb_L, 'cytof_pseudobulks.pickle'), 'wb') as f:
    pickle.dump(cytof_pseudobulks, f)

with open(os.path.join(pb_L, 'bulkRNA_pseudobulks.pickle'), 'wb') as f:
    pickle.dump(rna_bulk_pseudobulks, f)

luminex_pseudobulks.to_csv(os.path.join(pb_L, "luminex_pseudobulks.csv"))
facs_pb.to_csv(os.path.join(pb_L, "facs_pseudobulks.csv"))



    