# Data Processing

In [1]:
import os
import anndata as ad
import math
import scipy as sip
from scipy import sparse
import seaborn as sb
import scanpy as scp
import pickle
import pandas as pd
import numpy as np
import snf
import sklearn as skl
from sklearn import cluster
from sklearn.preprocessing import QuantileTransformer, StandardScaler, MaxAbsScaler, MinMaxScaler, PowerTransformer

In [2]:
data = os.path.abspath(os.path.join(os.getcwd(),"../data"))

input = os.path.join(data, "input")
output = os.path.join(data, "output")

original = os.path.join(input, "original")
complementary = os.path.join(input, "complementary")

shared = os.path.join(output, "shared_info_74")


level1 = os.path.join(output, "level1")
level2 = os.path.join(output, "level2")
level3 = os.path.join(output, "level3")

## Loading Data

In [3]:
quant_N = QuantileTransformer(output_distribution="normal")
quant_U = QuantileTransformer(output_distribution="uniform")
standard = StandardScaler()
standard_sparse = StandardScaler(with_mean=False)
maxabs = MaxAbsScaler()
minmax = MinMaxScaler()
power = PowerTransformer()

In [4]:
# Read Original Data
adt = ad.read_h5ad(os.path.join(original, "adt_pp.h5ad"))
bulkRNA = ad.read_h5ad(os.path.join(original, "bulkRNA_pp.h5ad"))
cytof = ad.read_h5ad (os.path.join(original, "cytof_pp.h5ad"))
facs = ad.read_h5ad(os.path.join(original, "facs_pp.h5ad"))
luminex = ad.read_h5ad(os.path.join(original, "luminex_pp.h5ad"))
scRNA = ad.read_h5ad(os.path.join(original, "scRNA_pp.h5ad"))

# Read complementary data
all_proteins = pd.read_csv(os.path.join(complementary, "all_proteins_V1.txt"), sep= "	")
COMBAT_CLINVAR_for_processed = pd.read_csv(os.path.join(complementary, "COMBAT_CLINVAR_for_processed.txt"), sep= "	")
module_information =  pd.read_excel(os.path.join(complementary, "module_names.xlsx"), sheet_name="meta_analysis_assignments")
with open(os.path.join(complementary, "id_to_name.pickle"), "rb") as f:
    id_to_name = pickle.load(f) 
with open(os.path.join(complementary, "name_to_id.pickle"), "rb") as f:
    name_to_id = pickle.load(f) 

# Module data for subsetting bulks 
bulk_genes_membership = pd.read_csv(os.path.join(complementary, "bulk_genes_membership.tsv"), sep= "\t")
membership_all_celltypes = pd.read_csv(os.path.join(complementary, "membership_all_celltypes.tsv"), sep= "\t")

# Label harmonization files
cite_cells = pd.read_csv(os.path.join(complementary, "cite_cells.csv"))
cytof_cells_harmonised = pd.read_csv(os.path.join(complementary, "cytof_cells_harmonised.csv"), sep=';')
cytof_cells = pd.read_csv(os.path.join(complementary, "cytof_cells.csv"))

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


## Preprocessing
Pseudobulks will aggregate patient information, so we need to transform observations from samples to patient names

In [5]:
print("ADT", adt.X.min(), adt.X.mean(), adt.X.max())
print("bulkRNA", bulkRNA.X.min(), bulkRNA.X.mean(), bulkRNA.X.max())
print("citeRNA", scRNA.X.min(), scRNA.X.mean(), scRNA.X.max())
print("cytof", cytof.X.min(), cytof.X.mean(), cytof.X.max())
print("FACS", facs.X.min(), facs.X.mean(), facs.X.max())
print("Luminex", luminex.X.min(), luminex.X.mean(), luminex.X.max())

ADT -67.67877 2.0366762 460.98032
bulkRNA 0.0 2.8075864 15.981462
citeRNA 0.0 0.102417216 9.094633
cytof -6.436475 1.2397815 24.303553
FACS -6608.2285 1333.8447 178222.64
Luminex 0.0 36537.375 12019000.0


In [6]:
shared_patients = list(set(adt.obs.COMBAT_ID).intersection(set(cytof.obs.COMBAT_ID)).intersection(set(scRNA.obs.COMBAT_ID)).intersection(set(bulkRNA.obs.COMBAT_ID)).intersection(set(facs.obs.COMBAT_ID)).intersection(set(luminex.obs.COMBAT_ID)))
shared_patients.sort()
print(len(shared_patients))

75


In [7]:
# ADT
adt.obs_names = [str(i) for i in range(len(adt.obs_names)) ]
indices = []
for p in shared_patients:
    indices += list(adt.obs_names[adt.obs.COMBAT_ID == p])

adt_shared = adt[indices]

# BulkRNA
bulkRNA.obs_names = [str(i) for i in range(len(bulkRNA.obs_names)) ]
indices = []
for p in shared_patients:
    indices += list(bulkRNA.obs_names[bulkRNA.obs.COMBAT_ID == p])
    
bulkRNA_shared = bulkRNA[indices]

# CiteRNA
scRNA.obs_names = [str(i) for i in range(len(scRNA.obs_names)) ]
indices = []
for p in shared_patients:
    indices += list(scRNA.obs_names[scRNA.obs.COMBAT_ID == p])
    
scRNA_shared = scRNA[indices]

# CyTOF
cytof.obs_names = [str(i) for i in range(len(cytof.obs_names)) ]
indices = []
for p in shared_patients:
    indices += list(cytof.obs_names[cytof.obs.COMBAT_ID == p])
    
cytof_shared = cytof[indices]

# Luminex
luminex.obs_names = [str(i) for i in range(len(luminex.obs_names)) ]
indices = []
for p in shared_patients:
    indices += list(luminex.obs_names[luminex.obs.COMBAT_ID == p])
    
luminex_shared = luminex[indices]

# FACS
facs.obs_names = [str(i) for i in range(len(facs.obs_names)) ]
indices = []
for p in shared_patients:
    indices += list(facs.obs_names[facs.obs.COMBAT_ID == p])
    
facs_shared = facs[indices]

In [8]:
adt_shared.obs_names = adt_shared.obs.COMBAT_ID.copy()
bulkRNA_shared.obs_names = bulkRNA_shared.obs.COMBAT_ID.copy()
cytof_shared.obs_names = cytof_shared.obs.COMBAT_ID.copy()
facs_shared.obs_names = facs_shared.obs.COMBAT_ID.copy()
luminex_shared.obs_names = luminex_shared.obs.COMBAT_ID.copy()
scRNA_shared.obs_names =  scRNA_shared.obs.COMBAT_ID.copy()

AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'G05073', 'G05073', 'G05073', 'G05073']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  utils.warn_names_duplicates("var")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'G05073', 'G05073', 'G05073', 'G05073']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")


In [9]:
module_names = list(module_information.name.unique())

# For every module, extract what graphs they correspond to 
id_to_module_name = {}
module_name_to_id = {n: [] for n in module_names} 

for process_name in module_names:
    data_subset = module_information[module_information.name == process_name]
    for i in data_subset.index:
        data_item = data_subset.loc[i]
        module = data_item.module.split(",")
        if len(module) == 1:
            name = "citeRNA_" + data_item.cell_type + "-" + module[0].lower()
            id = name_to_id[name]
            module_name_to_id[process_name].append(id)
            id_to_module_name[id] = process_name
        else:
            for m in module:
                name = "citeRNA_" + data_item.cell_type + "-" + m.lower()
                id = name_to_id[name]
                module_name_to_id[process_name].append(id)
                id_to_module_name[id] = process_name

module_name_to_id

{'IFN': ['G77', 'G151', 'G54', 'G48', 'G101', 'G112', 'G17'],
 'AP1': ['G39', 'G114', 'G18', 'G92', 'G50', 'G65', 'G79'],
 'cycling': ['G96', 'G38', 'G19', 'G123', 'G80', 'G148'],
 'C19': ['G56', 'G73', 'G12', 'G122', 'G105'],
 'ZNF': ['G55', 'G110', 'G91', 'G72', 'G15', 'G45']}

# Normalize Data

## 75 Shared Patients Across Modalities PB

In [10]:
rna_cite_pseudobulks_shared_p = {}

for i, patient in enumerate(pd.unique(scRNA_shared.obs_names)): # Go through every patient
        rna_cite_patient = scRNA_shared[scRNA_shared.obs.COMBAT_ID == patient,:] # Filter out all the rows/ measurements that have the ID COMBAT_ID
        for cell_type in pd.unique(scRNA_shared.obs.Annotation_major_subset): # Go through every cell type 
            rna_cite_patient_cell_type = rna_cite_patient[rna_cite_patient.obs.Annotation_major_subset == cell_type, :] # From all the patients with PID, filter out those measurements that are related to a specific cell type
            if len(rna_cite_patient_cell_type) > 0:
                for module in pd.unique(membership_all_celltypes.module): # Go through every module 
                    gene_membership_cite_module = membership_all_celltypes.loc[membership_all_celltypes.module == module, 'gene_name'] # slice based on labels, and map module name to gene name 
                    print(i, patient, cell_type, module)
                    
                    genes = list(set(gene_membership_cite_module).intersection(set(rna_cite_patient_cell_type.var_names))) # To make sure the genes are column names 
                    rna_cite_patient_cell_type_module = rna_cite_patient_cell_type[:, genes] # Between the measurements that are for a specific patient and cell type, filter out specific genes that belong to the same module
                   
                    if rna_cite_patient_cell_type_module.shape[0] == 0:
                        raise ValueError("no observations for the current sum")
         
                    if np.isnan(rna_cite_patient_cell_type_module.layers['raw'].data).any():
                        raise ValueError("row contains nan care!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                    
                    row = rna_cite_patient_cell_type_module.X.toarray()             # use normalized counts of the selected gene columns belonging to a specific module
                                                                                    # of the selected patient with PID 
                                                                                    # of the selected cell types
                    if row.shape[0] > 1:
                        row = row.mean(axis=0, dtype = "float64") # If multiple such data exist, average them 
                    else:
                        row = row[0]

                    if np.any(np.isnan(row)):
                        raise ValueError("row contains nan care!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                    key = f'{cell_type}-{module}'
                    if key not in rna_cite_pseudobulks_shared_p:
                        df = pd.DataFrame(columns=genes, dtype = "float64")
                        df.loc[patient] = row
                        rna_cite_pseudobulks_shared_p[key] = df
                    else:
                        rna_cite_pseudobulks_shared_p[key].loc[patient] = row
                    

0 G05073 GDT greenyellow
0 G05073 GDT tan
0 G05073 GDT red
0 G05073 GDT turquoise
0 G05073 GDT green
0 G05073 GDT purple
0 G05073 GDT brown
0 G05073 GDT blue
0 G05073 GDT black
0 G05073 GDT salmon
0 G05073 GDT lightcyan
0 G05073 GDT grey60
0 G05073 GDT grey
0 G05073 GDT lightgreen
0 G05073 GDT pink
0 G05073 GDT yellow
0 G05073 GDT magenta
0 G05073 GDT cyan
0 G05073 GDT midnightblue
0 G05073 GDT lightyellow
0 G05073 CD4 greenyellow
0 G05073 CD4 tan
0 G05073 CD4 red
0 G05073 CD4 turquoise
0 G05073 CD4 green
0 G05073 CD4 purple
0 G05073 CD4 brown
0 G05073 CD4 blue
0 G05073 CD4 black
0 G05073 CD4 salmon
0 G05073 CD4 lightcyan
0 G05073 CD4 grey60
0 G05073 CD4 grey
0 G05073 CD4 lightgreen
0 G05073 CD4 pink
0 G05073 CD4 yellow
0 G05073 CD4 magenta
0 G05073 CD4 cyan
0 G05073 CD4 midnightblue
0 G05073 CD4 lightyellow
0 G05073 nan greenyellow
0 G05073 nan tan
0 G05073 nan red
0 G05073 nan turquoise
0 G05073 nan green
0 G05073 nan purple
0 G05073 nan brown
0 G05073 nan blue
0 G05073 nan black
0 G

In [11]:
adt_cite_pseudobulks_shared_p = {}
for i, patient in enumerate(pd.unique(adt_shared.obs_names)):
    adt_cite_patient = adt_shared[adt_shared.obs.COMBAT_ID == patient, :]
    for cell_type in pd.unique(adt_shared.obs.Annotation_major_subset):
        adt_cite_patient_cell_type = adt_cite_patient[adt_cite_patient.obs.Annotation_major_subset == cell_type, :]
        if len(adt_cite_patient_cell_type) > 0:
            print(i, patient, cell_type)
            key = f'{cell_type}'
            row = adt_cite_patient_cell_type.X.toarray()
            if row.shape[0] > 1:
                row = row.mean(axis=0, dtype = "float64")
            else:
                row = row[0]
            if key not in adt_cite_pseudobulks_shared_p:
                df = pd.DataFrame(columns = adt_shared.var_names, dtype = "float64")
                df.loc[patient] = row
                adt_cite_pseudobulks_shared_p[key] = df
            else:
                adt_cite_pseudobulks_shared_p[key].loc[patient] = row

0 G05073 GDT
0 G05073 CD4
0 G05073 nan
0 G05073 NK
0 G05073 CD8
0 G05073 cMono
0 G05073 B
0 G05073 DC
0 G05073 MAIT
0 G05073 ncMono
0 G05073 DP
0 G05073 DN
0 G05073 PB
0 G05073 iNKT
0 G05073 PLT
0 G05073 HSC
0 G05073 Mast
1 H00053 GDT
1 H00053 CD4
1 H00053 nan
1 H00053 NK
1 H00053 CD8
1 H00053 cMono
1 H00053 B
1 H00053 DC
1 H00053 MAIT
1 H00053 ncMono
1 H00053 DP
1 H00053 DN
1 H00053 PB
1 H00053 iNKT
1 H00053 PLT
1 H00053 HSC
1 H00053 Mast
2 H00054 GDT
2 H00054 CD4
2 H00054 nan
2 H00054 NK
2 H00054 CD8
2 H00054 cMono
2 H00054 B
2 H00054 DC
2 H00054 MAIT
2 H00054 ncMono
2 H00054 DP
2 H00054 DN
2 H00054 PB
2 H00054 iNKT
2 H00054 PLT
2 H00054 HSC
3 H00058 GDT
3 H00058 CD4
3 H00058 nan
3 H00058 NK
3 H00058 CD8
3 H00058 cMono
3 H00058 B
3 H00058 DC
3 H00058 MAIT
3 H00058 ncMono
3 H00058 DP
3 H00058 DN
3 H00058 PB
3 H00058 iNKT
3 H00058 PLT
3 H00058 HSC
4 H00064 GDT
4 H00064 CD4
4 H00064 nan
4 H00064 NK
4 H00064 CD8
4 H00064 cMono
4 H00064 B
4 H00064 DC
4 H00064 MAIT
4 H00064 ncMono
4 H00064

In [12]:
cytof_pseudobulks_shared = {}
for i, patient in enumerate(pd.unique(cytof_shared.obs_names)):
    cytof_patient = cytof_shared[cytof_shared.obs.COMBAT_ID == patient, :]
    for cell_type in pd.unique(cytof_shared.obs.Annotation_major_subset):
        cytof_patient_cell_type = cytof_patient[cytof_patient.obs.Annotation_major_subset == cell_type, :]
        if len(cytof_patient_cell_type) > 0:
            print(i, patient, cell_type)
            key = f'{cell_type}'
            row = cytof_patient_cell_type.X.toarray()
            if row.shape[0] > 1:
                row = row.mean(axis=0, dtype = "float64")
            else:
                row = row[0]
            if key not in cytof_pseudobulks_shared:
                df = pd.DataFrame(columns=cytof_shared.var_names, dtype = "float64")
                df.loc[patient] = row
                cytof_pseudobulks_shared[key] = df
            else:
                cytof_pseudobulks_shared[key].loc[patient] = row

            print(i, patient, cell_type)

0 G05073 CD8
0 G05073 CD8
0 G05073 DN
0 G05073 DN
0 G05073 CD4
0 G05073 CD4
0 G05073 B
0 G05073 B
0 G05073 MAIT
0 G05073 MAIT
0 G05073 cMono
0 G05073 cMono
0 G05073 GDT
0 G05073 GDT
0 G05073 NK
0 G05073 NK
0 G05073 ncMono
0 G05073 ncMono
0 G05073 Basophil
0 G05073 Basophil
0 G05073 PB
0 G05073 PB
0 G05073 DC
0 G05073 DC
0 G05073 nan
0 G05073 nan
1 H00053 CD8
1 H00053 CD8
1 H00053 DN
1 H00053 DN
1 H00053 CD4
1 H00053 CD4
1 H00053 B
1 H00053 B
1 H00053 MAIT
1 H00053 MAIT
1 H00053 cMono
1 H00053 cMono
1 H00053 GDT
1 H00053 GDT
1 H00053 NK
1 H00053 NK
1 H00053 ncMono
1 H00053 ncMono
1 H00053 Basophil
1 H00053 Basophil
1 H00053 PB
1 H00053 PB
1 H00053 DC
1 H00053 DC
1 H00053 nan
1 H00053 nan
2 H00054 CD8
2 H00054 CD8
2 H00054 DN
2 H00054 DN
2 H00054 CD4
2 H00054 CD4
2 H00054 B
2 H00054 B
2 H00054 MAIT
2 H00054 MAIT
2 H00054 cMono
2 H00054 cMono
2 H00054 GDT
2 H00054 GDT
2 H00054 NK
2 H00054 NK
2 H00054 ncMono
2 H00054 ncMono
2 H00054 Basophil
2 H00054 Basophil
2 H00054 PB
2 H00054 PB
2 H000

In [13]:
rna_bulk_pseudobulks_shared = {}
for i, patient in enumerate(pd.unique(bulkRNA_shared.obs_names)):
    rna_bulk_patient = bulkRNA_shared[bulkRNA_shared.obs.COMBAT_ID == patient, :]
    for module in pd.unique(bulk_genes_membership.module):
        print(i, patient, module)
        gene_membership_bulk_module = bulk_genes_membership.loc[bulk_genes_membership.module == module, 'gene_name']
        rna_bulk_patient_module = rna_bulk_patient[:, gene_membership_bulk_module.index]
        key = f'{module}'
        row = rna_bulk_patient_module.X.toarray()
        if row.shape[0] > 1:
            row = row.mean(axis=0, dtype = "float64")
        else:
            row = row[0]
        if key not in rna_bulk_pseudobulks_shared:
            df = pd.DataFrame(index = pd.unique(bulkRNA_shared.obs_names), columns = gene_membership_bulk_module, dtype = "float64")
            df.iloc[i] = row
            rna_bulk_pseudobulks_shared[key] = df
        else:
            rna_bulk_pseudobulks_shared[key].iloc[i] = row

0 G05073 greenyellow
0 G05073 green
0 G05073 magenta
0 G05073 lightgreen
0 G05073 black
0 G05073 turquoise
0 G05073 lightcyan
0 G05073 midnightblue
0 G05073 blue
0 G05073 grey60
0 G05073 purple
0 G05073 cyan
0 G05073 grey
1 H00053 greenyellow
1 H00053 green
1 H00053 magenta
1 H00053 lightgreen
1 H00053 black
1 H00053 turquoise
1 H00053 lightcyan
1 H00053 midnightblue
1 H00053 blue
1 H00053 grey60
1 H00053 purple
1 H00053 cyan
1 H00053 grey
2 H00054 greenyellow
2 H00054 green
2 H00054 magenta
2 H00054 lightgreen
2 H00054 black
2 H00054 turquoise
2 H00054 lightcyan
2 H00054 midnightblue
2 H00054 blue
2 H00054 grey60
2 H00054 purple
2 H00054 cyan
2 H00054 grey
3 H00058 greenyellow
3 H00058 green
3 H00058 magenta
3 H00058 lightgreen
3 H00058 black
3 H00058 turquoise
3 H00058 lightcyan
3 H00058 midnightblue
3 H00058 blue
3 H00058 grey60
3 H00058 purple
3 H00058 cyan
3 H00058 grey
4 H00064 greenyellow
4 H00064 green
4 H00064 magenta
4 H00064 lightgreen
4 H00064 black
4 H00064 turquoise
4 H00

In [14]:
luminex_pseudobulks_shared = pd.DataFrame(columns = luminex_shared.var_names, dtype = "float64")

for i, patient in enumerate(pd.unique(luminex_shared.obs_names)):
    print(f"{i} {patient}")
    luminex_patient = luminex_shared[luminex_shared.obs.COMBAT_ID==patient, :]
    row = luminex_patient.X.toarray()
    if row.shape[0] > 1:
        row = row.mean(axis=0, dtype = "float64")
    else:
        row = row[0]
    
    luminex_pseudobulks_shared.loc[patient] = row

0 G05073
1 H00053
2 H00054
3 H00058
4 H00064
5 H00070
6 H00072
7 H00085
8 N00006
9 N00007
10 N00012
11 N00017
12 N00021
13 N00023
14 N00024
15 N00028
16 N00029
17 N00032
18 N00033
19 N00037
20 N00038
21 N00039
22 N00047
23 N00050
24 S00002
25 S00003
26 S00005
27 S00006
28 S00007
29 S00016
30 S00020
31 S00024
32 S00027
33 S00030
34 S00033
35 S00034
36 S00037
37 S00039
38 S00040
39 S00041
40 S00042
41 S00045
42 S00048
43 S00049
44 S00052
45 S00053
46 S00054
47 S00056
48 S00058
49 S00060
50 S00064
51 S00065
52 S00067
53 S00068
54 S00069
55 S00072
56 S00076
57 S00078
58 S00081
59 S00082
60 S00094
61 S00095
62 S00097
63 S00099
64 S00104
65 S00106
66 S00109
67 S00113
68 S00114
69 S00119
70 S00124
71 S00129
72 S00134
73 S00142
74 S00148


In [15]:
facs_pseudobulks_shared = pd.DataFrame(columns = facs_shared.var_names, dtype = "float64")

for i, patient in enumerate(pd.unique(facs_shared.obs_names)):
    print(f"{i} {patient}")
    facs_patient = facs_shared[facs_shared.obs.COMBAT_ID==patient, :]
    row = facs_patient.X.toarray()
    if row.shape[0] > 1:
        row = row.mean(axis=0, dtype = "float64")
    else:
        row = row[0]
    
    facs_pseudobulks_shared.loc[patient] = row

0 G05073
1 H00053
2 H00054
3 H00058
4 H00064
5 H00070
6 H00072
7 H00085
8 N00006
9 N00007
10 N00012
11 N00017
12 N00021
13 N00023
14 N00024
15 N00028
16 N00029
17 N00032
18 N00033
19 N00037
20 N00038
21 N00039
22 N00047
23 N00050
24 S00002
25 S00003
26 S00005
27 S00006
28 S00007
29 S00016
30 S00020
31 S00024
32 S00027
33 S00030
34 S00033
35 S00034
36 S00037
37 S00039
38 S00040
39 S00041
40 S00042
41 S00045
42 S00048
43 S00049
44 S00052
45 S00053
46 S00054
47 S00056
48 S00058
49 S00060
50 S00064
51 S00065
52 S00067
53 S00068
54 S00069
55 S00072
56 S00076
57 S00078
58 S00081
59 S00082
60 S00094
61 S00095
62 S00097
63 S00099
64 S00104
65 S00106
66 S00109
67 S00113
68 S00114
69 S00119
70 S00124
71 S00129
72 S00134
73 S00142
74 S00148


## Postprocessing

In [16]:
# Seperate noise from non-noise data 
id_to_pb = {}
id_to_pb_noise = {}
noise_id_to_name = {}
noise_name_to_id = {}
noise = 0

id = name_to_id["luminex"]
id_to_pb[id] = luminex_pseudobulks_shared

id = name_to_id["facs"]
id_to_pb[id] = facs_pseudobulks_shared
                
for name, pb in cytof_pseudobulks_shared.items():
    name = "cytof_" + name
    if not (str(name).find("grey") > -1 and str(name).find("grey60") == -1) and not (str(name).find("nan") > -1):
        id = name_to_id[name]
        id_to_pb[id] = pb
    else:
        id = "N" + str(noise)
        noise += 1
        id_to_pb_noise[id] = pb
        noise_name_to_id[name] = id
        noise_id_to_name[id] = name

for name, pb in adt_cite_pseudobulks_shared_p.items():
    name = "adt_" + name
    if not (str(name).find("grey") > -1 and str(name).find("grey60") == -1) and not (str(name).find("nan") > -1):
        id = name_to_id[name]
        id_to_pb[id] = pb
    else:
        id = "N" + str(noise)
        noise += 1
        id_to_pb_noise[id] = pb
        noise_name_to_id[name] = id
        noise_id_to_name[id] = name

for name, pb in rna_cite_pseudobulks_shared_p.items():
    name = "citeRNA_" + name
    if not (str(name).find("grey") > -1 and str(name).find("grey60") == -1) and not (str(name).find("nan") > -1):
        id = name_to_id[name]
        id_to_pb[id] = pb
    else:
        id = "N" + str(noise)
        noise += 1
        id_to_pb_noise[id] = pb
        noise_name_to_id[name] = id
        noise_id_to_name[id] = name

for name, pb in rna_bulk_pseudobulks_shared.items():
    name = "bulkRNA_" + name
    if not (str(name).find("grey") > -1 and str(name).find("grey60") == -1) and not (str(name).find("nan") > -1):
        id = name_to_id[name]
        id_to_pb[id] = pb
    else:
        id = "N" + str(noise)
        noise += 1
        id_to_pb_noise[id] = pb
        noise_name_to_id[name] = id
        noise_id_to_name[id] = name

In [17]:
disease_to_disease_group = {}
disease_group_to_disease = {"Sepsis": [], "Healthy": [], "COVID": [], "Flu": []}

for d in set(COMBAT_CLINVAR_for_processed["Source"]):
    if d.find("Sepsis") > -1:
        dg = "Sepsis"
    elif d.find("COVID_HCW") > -1:
        dg = "Healthy"
    elif d.find("COVID") > -1 :
        dg = "COVID"
    elif d == "HV":
        dg = "Healthy"
    elif d == "Flu":
        dg = "Flu"
    
    disease_to_disease_group[d] = dg
    disease_group_to_disease[dg].append(d)

In [18]:
patient_to_disease_assigned = {}
patient_to_disease_group = {}
disease_group_to_patient = {"Sepsis": [], "Healthy": [], "COVID": []}
disease_assigned_to_patient = {'COVID_CONV': [], 'COVID_CRIT': [], 'COVID_HCW_CONV': [], 'COVID_HCW_MILD':[], 'COVID_MILD':[], 'COVID_SEV':[], 'HV':[],'Sepsis':[],'Sepsis_CONV':[]}
patient_to_disease_all = {}


for p in shared_patients:
    data = COMBAT_CLINVAR_for_processed[COMBAT_CLINVAR_for_processed["COMBAT_ID"] == p]
    Disease_data = list(data["Source"])[0]
    DiseaseGroup_data = disease_to_disease_group[Disease_data]


    patient_to_disease_assigned[p] = Disease_data
    patient_to_disease_group[p] = DiseaseGroup_data
    disease_group_to_patient[DiseaseGroup_data].append(p)
    disease_assigned_to_patient[Disease_data].append(p)
    patient_to_disease_all[p] = list(data["Source"])

In [19]:
id_to_pb_anndata = {}
for id, pb in id_to_pb.items():
    #anndata_obj = ad.AnnData(quant.fit_transform(pb.to_numpy()), dtype="float64")
    anndata_obj = ad.AnnData(pb.to_numpy(), dtype="float64")
    anndata_obj.obs_names = list(pb.index)
    anndata_obj.var_names = list(pb.columns)

    anndata_obj.layers["quant_N"] = quant_N.fit_transform(pb.to_numpy())
    anndata_obj.layers["quant_U"] = quant_U.fit_transform(pb.to_numpy())
    anndata_obj.layers["power"] = power.fit_transform(pb.to_numpy())
    anndata_obj.layers["minmax"] = minmax.fit_transform(pb.to_numpy())
    anndata_obj.layers["maxabs"] = maxabs.fit_transform(pb.to_numpy())

    module = "None"
    modality = id_to_name[id].split("_")[0]
    if id in list(id_to_module_name.keys()):
        module = id_to_module_name[id]

    if modality == "citeRNA":
        ct = id_to_name[id].split("_")[1].split("-")[0]
        geex = id_to_name[id].split("_")[1].split("-")[1]
        anndata_obj.layers["standard"] = standard_sparse.fit_transform(pb.to_numpy())
    elif modality == "facs": 
        ct = "CD4"
        geex = "None"
        anndata_obj.layers["standard"] = standard.fit_transform(pb.to_numpy())
    elif modality == "luminex":
        ct = "bulk"
        geex = "None"
        anndata_obj.layers["standard"] = standard.fit_transform(pb.to_numpy())
    elif modality == "bulkRNA":
        ct = "bulk"
        geex = id_to_name[id].split("_")[1]
        anndata_obj.layers["standard"] = standard.fit_transform(pb.to_numpy())
    elif modality == "adt":
        anndata_obj.layers["standard"] = standard_sparse.fit_transform(pb.to_numpy())
        ct = id_to_name[id].split("_")[1]
        geex = "None"
    else:
        ct = id_to_name[id].split("_")[1]
        geex = "None"
        anndata_obj.layers["standard"] = standard.fit_transform(pb.to_numpy())

    Disease = []
    DiseaseGroup = []
    Age = []
    Sex = []
    BMI = []
    Hospitalstay = []
    Death28 = []
    Institute = []
    PreExistingHeartDisease = []
    PreExistingLungDisease = []
    PreExistingKidneyDisease = []
    PreExistingDiabetes = []
    PreExistingHypertension = []
    PreExistingImmunocompromised = []
    Smoking = []
    Symptomatic = []
    Requiredvasoactive = []
    Respiratorysupport = []
    SARSCoV2PCR = []
    Outcome = []
    TimeSinceOnset = []
    StateChange = []

    for p in list(pb.index):
        data = COMBAT_CLINVAR_for_processed[COMBAT_CLINVAR_for_processed["COMBAT_ID"] == p]
        Disease_data = list(data["Source"])[0]
        DiseaseGroup_data = disease_to_disease_group[Disease_data]
        Age_data = list(data["Age"])[0]
        Sex_data = list(data["Sex"])[0]
        BMI_data = list(data["BMI"])[0]
        Hospitalstay_data = set(data["Hospitalstay"]) # Exclude NaN if len > 0
        Death28_data = np.mean(list(data["Death28"])) 
        Institute_data = list(data["Institute"])[0]
        PreExistingHeartDisease_data = list(data["PreExistingHeartDisease"])[0] 
        PreExistingLungDisease_data = list(data["PreExistingLungDisease"])[0] 
        PreExistingKidneyDisease_data = list(data["PreExistingKidneyDisease"])[0]
        PreExistingDiabetes_data = list(data["PreExistingDiabetes"])[0] 
        PreExistingHypertension_data = list(data["PreExistingHypertension"])[0] 
        PreExistingImmunocompromised_data = list(data["PreExistingImmunocompromised"])[0] 
        Smoking_data = list(data["Smoking"])[0]
        Symptomatic_data = set(data["Symptomatic"]) # Exclude NaN if len > 0
        Requiredvasoactive_data = set(data["Requiredvasoactive"]) # Exclude NaN if len > 0
        Respiratorysupport_data = set(data["Respiratorysupport"]) # Exclude NaN if len > 0
        SARSCoV2PCR_data = np.mean(list(data["SARSCoV2PCR"])) 
        Outcome_data = np.mean(list(data["Outcome"])) 
        TimeSinceOnset_data = set(data["TimeSinceOnset"]) # Exclude NaN if len > 0
        StateChange_data = len(set(data["Source"])) > 1


        #################################################################
        Hospitalstay_data_fixed = 0 # Skip Nan
        count = 0
        for d in Hospitalstay_data:
            if not math.isnan(d):
                Hospitalstay_data_fixed+= d
                count += 1
        if count > 0:
            Hospitalstay_data = Hospitalstay_data_fixed / count
        else: 
            Hospitalstay_data = -1 #-1 means Nan
        #################################################################
        Symptomatic_data_fixed = 0 # Nan means 0, so asymtomatic 
        for d in Symptomatic_data:
            if not math.isnan(d):
                Symptomatic_data_fixed+= d
        Symptomatic_data = Symptomatic_data_fixed / len(Symptomatic_data)
        #################################################################
        Requiredvasoactive_data_fixed = 0 # Skip Nan
        count = 0
        for d in Requiredvasoactive_data:
            if not math.isnan(d):
                Requiredvasoactive_data_fixed += d
                count += 1
        if count > 0:
            Requiredvasoactive_data = Requiredvasoactive_data_fixed / count
        else: 
            Requiredvasoactive_data = -1 #-1 means Nan
        #################################################################
        Respiratorysupport_data_fixed = 0 # Skip Nan
        count = 0
        for d in Respiratorysupport_data:
            if not math.isnan(d):
                Respiratorysupport_data_fixed += d
                count += 1
        if count > 0:
            Respiratorysupport_data = Respiratorysupport_data_fixed / count
        else: 
            Respiratorysupport_data = -1 #-1 means Nan
        #################################################################
        TimeSinceOnset_data_fixed = 0 # Skip Nan
        count = 0
        for d in TimeSinceOnset_data:
            if not math.isnan(d):
                TimeSinceOnset_data_fixed += d
                count += 1
        if count > 0:
            TimeSinceOnset_data = TimeSinceOnset_data_fixed / count
        else: 
            TimeSinceOnset_data = -1 #-1 means Nan


   
        Disease.append(Disease_data)
        DiseaseGroup.append(DiseaseGroup_data)
        Age.append(Age_data)
        BMI.append(BMI_data)
        Hospitalstay.append(Hospitalstay_data)
        Death28.append(Death28_data)
        Institute.append(Institute_data)
        PreExistingHeartDisease.append(PreExistingHeartDisease_data)
        PreExistingLungDisease.append(PreExistingLungDisease_data)
        PreExistingKidneyDisease.append(PreExistingKidneyDisease_data)
        PreExistingDiabetes.append(PreExistingDiabetes_data)
        PreExistingHypertension.append(PreExistingHypertension_data)
        PreExistingImmunocompromised.append(PreExistingImmunocompromised_data)
        Smoking.append(Smoking_data)
        Symptomatic.append(Symptomatic_data)
        Requiredvasoactive.append(Requiredvasoactive_data)
        Respiratorysupport.append(Respiratorysupport_data)
        SARSCoV2PCR.append(SARSCoV2PCR_data)
        Outcome.append(Outcome_data)
        TimeSinceOnset.append(TimeSinceOnset_data)
        StateChange.append(StateChange_data)
        Sex.append(Sex_data)


    

    anndata_obj.obs["Disease"] = Disease 
    anndata_obj.obs["DiseaseGroup"] = DiseaseGroup 
    anndata_obj.obs["Age"] = Age 
    anndata_obj.obs["BMI"] = BMI 
    anndata_obj.obs["Hospitalstay"] = Hospitalstay
    anndata_obj.obs["Death28"] = Death28 
    anndata_obj.obs["Institute"] = Institute
    anndata_obj.obs["PreExistingHeartDisease"] = PreExistingHeartDisease 
    anndata_obj.obs["PreExistingLungDisease"] = PreExistingLungDisease 
    anndata_obj.obs["PreExistingKidneyDisease"] = PreExistingKidneyDisease 
    anndata_obj.obs["PreExistingDiabetes"] = PreExistingDiabetes 
    anndata_obj.obs["PreExistingHypertension"] = PreExistingHypertension 
    anndata_obj.obs["PreExistingImmunocompromised"] = PreExistingImmunocompromised 
    anndata_obj.obs["Smoking"] = Smoking 
    anndata_obj.obs["Symptomatic"] = Symptomatic
    anndata_obj.obs["Requiredvasoactive"] = Requiredvasoactive 
    anndata_obj.obs["Respiratorysupport"] = Respiratorysupport 
    anndata_obj.obs["SARSCoV2PCR"] = SARSCoV2PCR 
    anndata_obj.obs["Outcome"] = Outcome 
    anndata_obj.obs["TimeSinceOnset"] = TimeSinceOnset 
    anndata_obj.obs["StateChange"] = StateChange 
    anndata_obj.obs["Sex"] = Sex 
    anndata_obj.uns["name"] = id_to_name[id]
    anndata_obj.uns["id"] = id
    anndata_obj.uns["modality"] = modality
    anndata_obj.uns["module"] = module
    anndata_obj.uns["gene-exp"] = geex
    anndata_obj.uns["cell-type"] = ct 
    anndata_obj.uns["level"] = "Level 1"
    anndata_obj.layers["original"] = anndata_obj.X.copy()


    anndata_obj.X = anndata_obj.X.copy()
    id_to_pb_anndata[id] = anndata_obj



In [20]:
id_to_pb_noise_anndata = {}
for id, pb in id_to_pb_noise.items():
    #anndata_obj = ad.AnnData(quant.fit_transform(pb.to_numpy()), dtype="float64")
    anndata_obj = ad.AnnData(pb.to_numpy(), dtype="float64")
    anndata_obj.obs_names = list(pb.index)
    anndata_obj.var_names = list(pb.columns)

    anndata_obj.layers["quant_N"] = quant_N.fit_transform(pb.to_numpy())
    anndata_obj.layers["quant_U"] = quant_U.fit_transform(pb.to_numpy())
    anndata_obj.layers["power"] = power.fit_transform(pb.to_numpy())
    anndata_obj.layers["minmax"] = minmax.fit_transform(pb.to_numpy())
    anndata_obj.layers["maxabs"] = maxabs.fit_transform(pb.to_numpy())
    anndata_obj.layers["standard"] = standard_sparse.fit_transform(pb.to_numpy())

    module = "Noise"
    modality = noise_id_to_name[id].split("_")[0]

    Disease = []
    DiseaseGroup = []
    Age = []
    Sex = []
    BMI = []
    Hospitalstay = []
    Death28 = []
    Institute = []
    PreExistingHeartDisease = []
    PreExistingLungDisease = []
    PreExistingKidneyDisease = []
    PreExistingDiabetes = []
    PreExistingHypertension = []
    PreExistingImmunocompromised = []
    Smoking = []
    Symptomatic = []
    Requiredvasoactive = []
    Respiratorysupport = []
    SARSCoV2PCR = []
    Outcome = []
    TimeSinceOnset = []
    StateChange = []

    for p in list(pb.index):
        data = COMBAT_CLINVAR_for_processed[COMBAT_CLINVAR_for_processed["COMBAT_ID"] == p]
        Disease_data = list(data["Source"])[0]
        DiseaseGroup_data = disease_to_disease_group[Disease_data]
        Age_data = list(data["Age"])[0]
        Sex_data = list(data["Sex"])[0]
        BMI_data = list(data["BMI"])[0]
        Hospitalstay_data = set(data["Hospitalstay"]) # Exclude NaN if len > 0
        Death28_data = np.mean(list(data["Death28"])) 
        Institute_data = list(data["Institute"])[0]
        PreExistingHeartDisease_data = list(data["PreExistingHeartDisease"])[0] 
        PreExistingLungDisease_data = list(data["PreExistingLungDisease"])[0] 
        PreExistingKidneyDisease_data = list(data["PreExistingKidneyDisease"])[0]
        PreExistingDiabetes_data = list(data["PreExistingDiabetes"])[0] 
        PreExistingHypertension_data = list(data["PreExistingHypertension"])[0] 
        PreExistingImmunocompromised_data = list(data["PreExistingImmunocompromised"])[0] 
        Smoking_data = list(data["Smoking"])[0]
        Symptomatic_data = set(data["Symptomatic"]) # Exclude NaN if len > 0
        Requiredvasoactive_data = set(data["Requiredvasoactive"]) # Exclude NaN if len > 0
        Respiratorysupport_data = set(data["Respiratorysupport"]) # Exclude NaN if len > 0
        SARSCoV2PCR_data = np.mean(list(data["SARSCoV2PCR"])) 
        Outcome_data = np.mean(list(data["Outcome"])) 
        TimeSinceOnset_data = set(data["TimeSinceOnset"]) # Exclude NaN if len > 0
        StateChange_data = len(set(data["Source"])) > 1


        #################################################################
        Hospitalstay_data_fixed = 0 # Skip Nan
        count = 0
        for d in Hospitalstay_data:
            if not math.isnan(d):
                Hospitalstay_data_fixed+= d
                count += 1
        if count > 0:
            Hospitalstay_data = Hospitalstay_data_fixed / count
        else: 
            Hospitalstay_data = -1 #-1 means Nan
        #################################################################
        Symptomatic_data_fixed = 0 # Nan means 0, so asymtomatic 
        for d in Symptomatic_data:
            if not math.isnan(d):
                Symptomatic_data_fixed+= d
        Symptomatic_data = Symptomatic_data_fixed / len(Symptomatic_data)
        #################################################################
        Requiredvasoactive_data_fixed = 0 # Skip Nan
        count = 0
        for d in Requiredvasoactive_data:
            if not math.isnan(d):
                Requiredvasoactive_data_fixed += d
                count += 1
        if count > 0:
            Requiredvasoactive_data = Requiredvasoactive_data_fixed / count
        else: 
            Requiredvasoactive_data = -1 #-1 means Nan
        #################################################################
        Respiratorysupport_data_fixed = 0 # Skip Nan
        count = 0
        for d in Respiratorysupport_data:
            if not math.isnan(d):
                Respiratorysupport_data_fixed += d
                count += 1
        if count > 0:
            Respiratorysupport_data = Respiratorysupport_data_fixed / count
        else: 
            Respiratorysupport_data = -1 #-1 means Nan
        #################################################################
        TimeSinceOnset_data_fixed = 0 # Skip Nan
        count = 0
        for d in TimeSinceOnset_data:
            if not math.isnan(d):
                TimeSinceOnset_data_fixed += d
                count += 1
        if count > 0:
            TimeSinceOnset_data = TimeSinceOnset_data_fixed / count
        else: 
            TimeSinceOnset_data = -1 #-1 means Nan


   
        Disease.append(Disease_data)
        DiseaseGroup.append(DiseaseGroup_data)
        Age.append(Age_data)
        BMI.append(BMI_data)
        Hospitalstay.append(Hospitalstay_data)
        Death28.append(Death28_data)
        Institute.append(Institute_data)
        PreExistingHeartDisease.append(PreExistingHeartDisease_data)
        PreExistingLungDisease.append(PreExistingLungDisease_data)
        PreExistingKidneyDisease.append(PreExistingKidneyDisease_data)
        PreExistingDiabetes.append(PreExistingDiabetes_data)
        PreExistingHypertension.append(PreExistingHypertension_data)
        PreExistingImmunocompromised.append(PreExistingImmunocompromised_data)
        Smoking.append(Smoking_data)
        Symptomatic.append(Symptomatic_data)
        Requiredvasoactive.append(Requiredvasoactive_data)
        Respiratorysupport.append(Respiratorysupport_data)
        SARSCoV2PCR.append(SARSCoV2PCR_data)
        Outcome.append(Outcome_data)
        TimeSinceOnset.append(TimeSinceOnset_data)
        StateChange.append(StateChange_data)
        Sex.append(Sex_data)


    

    anndata_obj.obs["Disease"] = Disease 
    anndata_obj.obs["DiseaseGroup"] = DiseaseGroup 
    anndata_obj.obs["Age"] = Age 
    anndata_obj.obs["BMI"] = BMI 
    anndata_obj.obs["Hospitalstay"] = Hospitalstay
    anndata_obj.obs["Death28"] = Death28 
    anndata_obj.obs["Institute"] = Institute
    anndata_obj.obs["PreExistingHeartDisease"] = PreExistingHeartDisease 
    anndata_obj.obs["PreExistingLungDisease"] = PreExistingLungDisease 
    anndata_obj.obs["PreExistingKidneyDisease"] = PreExistingKidneyDisease 
    anndata_obj.obs["PreExistingDiabetes"] = PreExistingDiabetes 
    anndata_obj.obs["PreExistingHypertension"] = PreExistingHypertension 
    anndata_obj.obs["PreExistingImmunocompromised"] = PreExistingImmunocompromised 
    anndata_obj.obs["Smoking"] = Smoking 
    anndata_obj.obs["Symptomatic"] = Symptomatic
    anndata_obj.obs["Requiredvasoactive"] = Requiredvasoactive 
    anndata_obj.obs["Respiratorysupport"] = Respiratorysupport 
    anndata_obj.obs["SARSCoV2PCR"] = SARSCoV2PCR 
    anndata_obj.obs["Outcome"] = Outcome 
    anndata_obj.obs["TimeSinceOnset"] = TimeSinceOnset 
    anndata_obj.obs["StateChange"] = StateChange 
    anndata_obj.obs["Sex"] = Sex 
    anndata_obj.uns["name"] = noise_id_to_name[id]
    anndata_obj.uns["id"] = id
    anndata_obj.uns["modality"] = modality
    anndata_obj.uns["module"] = module
    anndata_obj.layers["original"] = anndata_obj.X.copy()

    #anndata_obj.X = standard_scaler.fit_transform(anndata_obj.X.copy())
    anndata_obj.X =anndata_obj.X.copy()

    id_to_pb_noise_anndata[id] = anndata_obj



In [21]:
#################### Non Noise PB Data #################################
with open(os.path.join(shared, 'id_to_pb.pickle'), 'wb') as f: # DataFrame 
    pickle.dump(id_to_pb, f)
with open(os.path.join(shared, 'id_to_pb_ad.pickle'), 'wb') as f: # AnnData
    pickle.dump(id_to_pb_anndata, f)
#################### Noise PB Data #####################################
with open(os.path.join(shared, 'id_to_pb_noise.pickle'), 'wb') as f:
    pickle.dump(id_to_pb_noise, f)
with open(os.path.join(shared, 'id_to_pb_noise_anndata.pickle'), 'wb') as f:
    pickle.dump(id_to_pb_noise_anndata, f)
#################### id to ... ##########################################
with open(os.path.join(shared, 'noise_id_to_name.pickle'), 'wb') as f: # noise
    pickle.dump(noise_id_to_name, f) 
with open(os.path.join(shared, 'noise_name_to_id.pickle'), 'wb') as f:
    pickle.dump(noise_name_to_id, f)

with open(os.path.join(complementary, 'id_to_module_name.pickle'), 'wb') as f: # Cycling etc.
    pickle.dump(id_to_module_name, f)
with open(os.path.join(complementary, 'module_name_to_id.pickle'), 'wb') as f:
    pickle.dump(module_name_to_id, f)
#################### Diseases #####################################
with open(os.path.join(complementary, 'disease_group_to_disease.pickle'), 'wb') as f:
    pickle.dump(disease_group_to_disease, f)
with open(os.path.join(complementary, 'disease_to_disease_group.pickle'), 'wb') as f:
    pickle.dump(disease_to_disease_group, f)

with open(os.path.join(shared, 'patient_to_disease_group.pickle'), 'wb') as f:
    pickle.dump(patient_to_disease_group, f)
with open(os.path.join(shared, 'disease_group_to_patient.pickle'), 'wb') as f:
    pickle.dump(disease_group_to_patient, f)

with open(os.path.join(shared, 'patient_to_disease.pickle'), 'wb') as f:
    pickle.dump(patient_to_disease_assigned, f)
with open(os.path.join(shared, 'disease_to_patient.pickle'), 'wb') as f:
    pickle.dump(disease_assigned_to_patient, f)

with open(os.path.join(shared, 'shared_patient_75_to_all_diseases.pickle'), 'wb') as f:
    pickle.dump(patient_to_disease_all, f)

## 74 Shared Patients Across 246 Networks

In [22]:
patient_inclusion_in_graphs = {}
for id, pb in id_to_pb.items():
    patients_in_the_graph_one_hot = []
    for p in shared_patients:
        if p in list(pb.index):
            patients_in_the_graph_one_hot.append(1)
        else:
            patients_in_the_graph_one_hot.append(0)
    patient_inclusion_in_graphs[id] = patients_in_the_graph_one_hot

patient_inclusion_in_graphs = pd.DataFrame(data=patient_inclusion_in_graphs, index=shared_patients, dtype = "float64").transpose()

In [23]:
large_graphs = {"75":[], "74":[]}
for g in patient_inclusion_in_graphs.index:
    if patient_inclusion_in_graphs.sum(axis = 1)[g] == 75:
        large_graphs["75"].append(g)
    elif patient_inclusion_in_graphs.sum(axis = 1)[g] == 74:
        large_graphs["74"].append(g)

print("number of large graphs is ", len(large_graphs["74"]) + len(large_graphs["75"]))

number of large graphs is  246


In [24]:
len(patient_inclusion_in_graphs[patient_inclusion_in_graphs.sum(axis = 1) < 74]) == 406 - 40 - 246

True

In [25]:
for i, id in enumerate(large_graphs["74"]):
    if i == 0:
        shared_patients_across_networks_74 = set(id_to_pb[id].index)
    else:
        shared_patients_across_networks_74 = shared_patients_across_networks_74.intersection(set(id_to_pb[id].index))

for id in large_graphs["75"]:
    shared_patients_across_networks_74 = shared_patients_across_networks_74.intersection(set(id_to_pb[id].index))

shared_patients_across_networks_74 = list(shared_patients_across_networks_74)
print(len(shared_patients_across_networks_74) == 74)

True


In [26]:
# Noise
patient_inclusion_in_noise_graphs = {}
for name, pb in id_to_pb_noise.items():
    patients_in_the_graph_one_hot = []
    for p in shared_patients:
        if p in list(pb.index):
            patients_in_the_graph_one_hot.append(1)
        else:
            patients_in_the_graph_one_hot.append(0)
    patient_inclusion_in_noise_graphs[name] = patients_in_the_graph_one_hot

patient_inclusion_in_noise_graphs = pd.DataFrame(data=patient_inclusion_in_noise_graphs, index=shared_patients, dtype = "float64").transpose()

large_noise_graphs = {"75":[], "74":[]}
for g in patient_inclusion_in_noise_graphs.index:
    if patient_inclusion_in_noise_graphs.sum(axis = 1)[g] == 75:
        large_noise_graphs["75"].append(g)
    elif patient_inclusion_in_noise_graphs.sum(axis = 1)[g] == 74:
        large_noise_graphs["74"].append(g)

print("number of large graphs is ", len(large_noise_graphs["74"]) + len(large_noise_graphs["75"]))

number of large graphs is  34


In [27]:
len(patient_inclusion_in_noise_graphs[patient_inclusion_in_noise_graphs.sum(axis = 1) < 74]) == 40 - 34

True

In [28]:
for i, name in enumerate(large_noise_graphs["74"]):
    if i == 0:
        shared_patients_across_noise_networks_74 = set(id_to_pb_noise[name].index)
    else:
        shared_patients_across_noise_networks_74 = shared_patients_across_noise_networks_74.intersection(set(id_to_pb_noise[name].index))

for name in large_noise_graphs["75"]:
    shared_patients_across_noise_networks_74 = shared_patients_across_noise_networks_74.intersection(set(id_to_pb_noise[name].index))

shared_patients_across_noise_networks_74 = list(shared_patients_across_noise_networks_74)
print(len(shared_patients_across_noise_networks_74) == 74)

True


In [29]:
shared_patients_across_networks_74.sort()

## PB Level1

In [30]:
graphs_with_74_nodes = {}
graph_frequency = {'luminex': 0, 'cytof':0, 'facs':0, 'adt':0, 'citeRNA':0, 'bulkRNA':0}
for id in id_to_name.keys():
    if id in large_graphs["75"] or id in large_graphs["74"]:
        g = id_to_pb_anndata[id][shared_patients_across_networks_74]
        graph_frequency[id_to_name[id].split("_")[0]] += 1
        graphs_with_74_nodes[id] = g.copy()


graph_frequency

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


{'luminex': 1,
 'cytof': 12,
 'facs': 1,
 'adt': 11,
 'citeRNA': 209,
 'bulkRNA': 12}

In [31]:
noise_graphs_with_74_nodes = {}
noise_frequency = {'luminex': 0, 'cytof':0, 'facs':0, 'adt':0, 'citeRNA':0, 'bulkRNA':0}
for id in noise_id_to_name.keys():
    if id in large_noise_graphs["75"] or id in large_noise_graphs["74"]:
        g = id_to_pb_noise_anndata[id][shared_patients_across_networks_74]
        noise_graphs_with_74_nodes[id] = g 
        noise_frequency[noise_id_to_name[id].split("_")[0]] += 1

noise_frequency

{'luminex': 0, 'cytof': 1, 'facs': 0, 'adt': 1, 'citeRNA': 31, 'bulkRNA': 1}

In [32]:
with open(os.path.join(shared, 'patients_74.pickle'), 'wb') as f:
    pickle.dump(shared_patients_across_networks_74, f)

with open(os.path.join(level1, 'noise_graphs_74.pickle'), 'wb') as f:
    pickle.dump(noise_graphs_with_74_nodes, f)
with open(os.path.join(level1, 'graphs_74.pickle'), 'wb') as f:
    pickle.dump(graphs_with_74_nodes, f)

In [33]:
module_data = {"IFN":{}, "AP1":{}, "cycling":{}, "C19":{}, "ZNF":{}}

# Seperate the module information
IFN_graph_ids = set(large_graphs["74"]).intersection(module_name_to_id["IFN"]).union(set(large_graphs["75"]).intersection(module_name_to_id["IFN"]))
AP1_graph_ids = set(large_graphs["74"]).intersection(module_name_to_id["AP1"]).union(set(large_graphs["75"]).intersection(module_name_to_id["AP1"]))
cycling_graph_ids = set(large_graphs["74"]).intersection(module_name_to_id["cycling"]).union(set(large_graphs["75"]).intersection(module_name_to_id["cycling"]))
C19_graph_ids = set(large_graphs["74"]).intersection(module_name_to_id["C19"]).union(set(large_graphs["75"]).intersection(module_name_to_id["C19"]))
ZNF_graph_ids = set(large_graphs["74"]).intersection(module_name_to_id["ZNF"]).union(set(large_graphs["75"]).intersection(module_name_to_id["ZNF"]))

module_data["IFN"]["graph_ids"] = IFN_graph_ids
module_data["AP1"]["graph_ids"] = AP1_graph_ids
module_data["cycling"]["graph_ids"] = cycling_graph_ids
module_data["C19"]["graph_ids"] = C19_graph_ids
module_data["ZNF"]["graph_ids"] = ZNF_graph_ids

IFN_graphs = []
AP1_graphs = []
cycling_graphs = []
C19_graphs = []
ZNF_graphs = []

for id in IFN_graph_ids:
    g = id_to_pb_anndata[id]
    IFN_graphs.append(g)

for id in AP1_graph_ids:
    g = id_to_pb_anndata[id]
    AP1_graphs.append(g)

for id in cycling_graph_ids:
    g = id_to_pb_anndata[id]
    cycling_graphs.append(g)

for id in C19_graph_ids:
    g = id_to_pb_anndata[id]
    C19_graphs.append(g)

for id in ZNF_graph_ids:
    g = id_to_pb_anndata[id]
    ZNF_graphs.append(g)

module_data["IFN"]["graphs"] = IFN_graphs
module_data["AP1"]["graphs"] = AP1_graphs
module_data["cycling"]["graphs"] = cycling_graphs
module_data["C19"]["graphs"] = C19_graphs
module_data["ZNF"]["graphs"] = ZNF_graphs


In [34]:
with open(os.path.join(level1, 'module_data.pickle'), 'wb') as f:
    pickle.dump(module_data, f)

In [35]:
ground_truth_dg = {}
ground_truth_d = {}
for p in graphs_with_74_nodes["G0"].obs_names:
    dg = graphs_with_74_nodes["G0"][p].obs.DiseaseGroup[0]
    d = graphs_with_74_nodes["G0"][p].obs.Disease[0]

    if dg == "Healthy":
        ground_truth_dg[p] = 0
    elif dg == "Sepsis":
        ground_truth_dg[p] = 1
    elif dg == "COVID":
        ground_truth_dg[p] = 2

    if d == "COVID_HCW_MILD":
        ground_truth_d[p] = 0
    elif d == "HV":
        ground_truth_d[p] = 1
    elif d == "Sepsis":
        ground_truth_d[p] = 2
    elif d == "COVID_MILD":
        ground_truth_d[p] = 3
    elif d == "COVID_SEV":
        ground_truth_d[p] = 4
    elif d == "COVID_CRIT":
        ground_truth_d[p] = 5

with open(os.path.join(shared, 'ground_truth_dg.pickle'), 'wb') as f:
    pickle.dump(ground_truth_dg, f)

with open(os.path.join(shared, 'ground_truth_d.pickle'), 'wb') as f:
    pickle.dump(ground_truth_d, f)

In [36]:
id_to_feature_matrices_level1 = {} 
id_to_feature_matrices_quant_N_level1 = {}
id_to_feature_matrices_quant_U_level1 = {}
id_to_feature_matrices_minmax_level1 = {}
id_to_feature_matrices_maxabs_level1 = {}
id_to_feature_matrices_power_level1={}
id_to_feature_matrices_standard_level1 = {}

for id, g in graphs_with_74_nodes.items():
    id_to_feature_matrices_level1[id] = g.X.copy()
    id_to_feature_matrices_quant_N_level1[id] = g.layers["quant_N"].copy()
    id_to_feature_matrices_quant_U_level1[id] = g.layers["quant_U"].copy()
    id_to_feature_matrices_power_level1[id] = g.layers["power"].copy()
    id_to_feature_matrices_minmax_level1[id] = g.layers["minmax"].copy()
    id_to_feature_matrices_maxabs_level1[id] = g.layers["maxabs"].copy()
    id_to_feature_matrices_standard_level1[id] = g.layers["standard"].copy()

with open(os.path.join(level1, 'feature_matrices.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_level1, f)

with open(os.path.join(level1, 'feature_matrices_quant_N.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_quant_N_level1, f)

with open(os.path.join(level1, 'feature_matrices_quant_U.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_quant_U_level1, f)

with open(os.path.join(level1, 'feature_matrices_power.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_power_level1, f)

with open(os.path.join(level1, 'feature_matrices_minmax.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_minmax_level1, f)

with open(os.path.join(level1, 'feature_matrices_maxabs.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_maxabs_level1, f)

with open(os.path.join(level1, 'feature_matrices_standard.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_standard_level1, f)


In [37]:
mogcn_data_path_level1 = os.path.abspath(os.path.join(os.getcwd(),"../../../MoGCN-master/data-level1"))

pd.DataFrame(ground_truth_d, index = ["labels"]).T.to_csv(os.path.join(mogcn_data_path_level1, "../labels/gt_d"))
pd.DataFrame(ground_truth_dg, index = ["labels"]).T.to_csv(os.path.join(mogcn_data_path_level1, "../labels/gt_dg"))

for data in graphs_with_74_nodes.values():
    name = data.uns["name"]
    index =  data.to_df().index
    columns =  data.to_df().columns

    data.to_df().to_csv(os.path.join(mogcn_data_path_level1 + "/wot", name))
    pd.DataFrame(data.layers["quant_U"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level1 + "/quant_U", name))
    pd.DataFrame(data.layers["quant_N"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level1 + "/quant_N", name))
    pd.DataFrame(data.layers["power"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level1 + "/power", name))
    pd.DataFrame(data.layers["minmax"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level1 + "/minmax", name))
    pd.DataFrame(data.layers["maxabs"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level1 + "/maxabs", name))
    pd.DataFrame(data.layers["standard"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level1 + "/standard", name))


In [38]:
mogcn_data_path_nc = os.path.abspath(os.path.join(os.getcwd(),"../../../MoGCN-master/data-neg-control/"))
for data in noise_graphs_with_74_nodes.values():
    name = data.uns["name"]
    index =  data.to_df().index
    columns =  data.to_df().columns

    data.to_df().to_csv(os.path.join(mogcn_data_path_nc + "/wot",  "noise_" + name))
    pd.DataFrame(data.layers["quant_U"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/quant_U",  "noise_" + name))
    pd.DataFrame(data.layers["quant_N"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/quant_N",  "noise_" + name))
    pd.DataFrame(data.layers["power"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/power",  "noise_" + name))
    pd.DataFrame(data.layers["minmax"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/minmax",  "noise_" + name))
    pd.DataFrame(data.layers["maxabs"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/maxabs",  "noise_" + name))
    pd.DataFrame(data.layers["standard"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/standard",  "noise_" + name))
    
for data in graphs_with_74_nodes.values():
    name = data.uns["name"]
    index =  data.to_df().index
    columns =  data.to_df().columns

    data.to_df().to_csv(os.path.join(mogcn_data_path_nc + "/wot",  "noise_" + name))
    pd.DataFrame(data.layers["quant_U"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/quant_U", name))
    pd.DataFrame(data.layers["quant_N"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/quant_N", name))
    pd.DataFrame(data.layers["power"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/power", name))
    pd.DataFrame(data.layers["minmax"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/minmax", name))
    pd.DataFrame(data.layers["maxabs"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/maxabs", name))
    pd.DataFrame(data.layers["standard"] , index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_nc + "/standard", name))

In [39]:
cell_types_with_74 = {"citeRNA": [], "adt":[], "cytof":[] } # Entails information about the pseudobulks (cell-types) that were included in the analysis.
for id, g in graphs_with_74_nodes.items():
    m = g.uns["modality"]
    name = g.uns["name"]
    ct = g.uns["cell-type"]
    if m == "citeRNA":
        cell_types_with_74["citeRNA"].append(ct)
    elif m == "adt":
        cell_types_with_74["adt"].append(ct)
    elif m == "cytof":
        cell_types_with_74["cytof"].append(ct)

cell_types_with_74["citeRNA"]  = list(set(cell_types_with_74["citeRNA"] ))
cell_types_with_74["adt"]  = list(set(cell_types_with_74["adt"] ))
cell_types_with_74["cytof"]  = list(set(cell_types_with_74["cytof"] ))
        

In [40]:
with open(os.path.join(level1, 'cell_types_included.pickle'), 'wb') as f:
    pickle.dump(cell_types_with_74, f)

In [41]:
with open(os.path.join(level1, 'graphs_74.pickle'), 'rb') as f:
    graphs_with_74_nodes = pickle.load(f)

## PB Level2

In [42]:
adt_copy = adt_shared.copy()
adt_copy.obs_names = [str(i) for i in range(len(adt_copy.obs_names)) ]
indices = []
for p in shared_patients_across_networks_74:
    indices += list(adt_copy.obs_names[adt_copy.obs.COMBAT_ID == p])

adt_shared_copy = adt_copy[indices]

adt_shared_copy.obs_names = adt_shared_copy.obs.COMBAT_ID.copy()
adt_shared_copy = adt_shared_copy[adt_shared_copy.obs["Annotation_major_subset"].isin(cell_types_with_74["adt"])].copy()

adt_shared_copy.X = adt_shared_copy.layers["raw"]

  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'G05073', 'G05073', 'G05073', 'G05073']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  utils.warn_names_duplicates("obs")


In [43]:
adt_raw_L2 = {}
for i, patient in enumerate(pd.unique(adt_shared_copy.obs_names)):
    adt_cite_patient = adt_shared_copy[adt_shared_copy.obs.COMBAT_ID == patient, :]
    for cell_type in pd.unique(adt_shared_copy.obs.Annotation_major_subset):
        adt_cite_patient_cell_type = adt_cite_patient[adt_cite_patient.obs.Annotation_major_subset == cell_type, :]
        if len(adt_cite_patient_cell_type) > 0:
            print(i, patient, cell_type)
            key = f'{cell_type}'
            row = adt_cite_patient_cell_type.X.toarray()
            if row.shape[0] > 1:
                row = row.sum(axis=0, dtype = "float64")
            else:
                row = row[0]
            if key not in adt_raw_L2:
                df = pd.DataFrame(columns = adt_shared_copy.var_names, dtype = "float64")
                df.loc[patient] = row
                adt_raw_L2[key] = df
            else:
                adt_raw_L2[key].loc[patient] = row

0 G05073 GDT
0 G05073 CD4
0 G05073 NK
0 G05073 CD8
0 G05073 cMono
0 G05073 B
0 G05073 DC
0 G05073 ncMono
0 G05073 DP
0 G05073 PB
0 G05073 HSC
1 H00053 GDT
1 H00053 CD4
1 H00053 NK
1 H00053 CD8
1 H00053 cMono
1 H00053 B
1 H00053 DC
1 H00053 ncMono
1 H00053 DP
1 H00053 PB
1 H00053 HSC
2 H00054 GDT
2 H00054 CD4
2 H00054 NK
2 H00054 CD8
2 H00054 cMono
2 H00054 B
2 H00054 DC
2 H00054 ncMono
2 H00054 DP
2 H00054 PB
2 H00054 HSC
3 H00058 GDT
3 H00058 CD4
3 H00058 NK
3 H00058 CD8
3 H00058 cMono
3 H00058 B
3 H00058 DC
3 H00058 ncMono
3 H00058 DP
3 H00058 PB
3 H00058 HSC
4 H00064 GDT
4 H00064 CD4
4 H00064 NK
4 H00064 CD8
4 H00064 cMono
4 H00064 B
4 H00064 DC
4 H00064 ncMono
4 H00064 DP
4 H00064 PB
4 H00064 HSC
5 H00070 GDT
5 H00070 CD4
5 H00070 NK
5 H00070 CD8
5 H00070 cMono
5 H00070 B
5 H00070 DC
5 H00070 ncMono
5 H00070 DP
5 H00070 PB
5 H00070 HSC
6 H00072 GDT
6 H00072 CD4
6 H00072 NK
6 H00072 CD8
6 H00072 cMono
6 H00072 B
6 H00072 DC
6 H00072 ncMono
6 H00072 DP
6 H00072 PB
6 H00072 HSC
7 H000

In [44]:
scRNA_copy = scRNA_shared.copy()
scRNA_copy.obs_names = [str(i) for i in range(len(scRNA_copy.obs_names)) ]
indices = []
for p in shared_patients_across_networks_74:
    indices += list(scRNA_copy.obs_names[scRNA_copy.obs.COMBAT_ID == p])

scRNA_shared_copy = scRNA_copy[indices]

scRNA_shared_copy.obs_names = scRNA_shared_copy.obs.COMBAT_ID.copy()

  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'G05073', 'G05073', 'G05073', 'G05073']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")


In [45]:
scRNA_shared_copy = scRNA_shared_copy[scRNA_shared_copy.obs["Annotation_major_subset"].isin(cell_types_with_74["citeRNA"])].copy()
exclude = list(set(membership_all_celltypes[membership_all_celltypes.module == "grey"]["gene_name"]))
scRNA_shared_copy = scRNA_shared_copy[:, ~scRNA_shared_copy.var_names.isin(exclude)]

  utils.warn_names_duplicates("obs")


In [46]:
rna_cite_pseudobulks_shared_p_L2 = {}

for i, patient in enumerate(pd.unique(scRNA_shared_copy.obs_names)): # Go through every patient
        rna_cite_patient = scRNA_shared_copy[scRNA_shared_copy.obs.COMBAT_ID == patient,:] # Filter out all the rows/ measurements that have the ID COMBAT_ID
        for cell_type in pd.unique(scRNA_shared_copy.obs.Annotation_major_subset): # Go through every cell type 
            rna_cite_patient_cell_type = rna_cite_patient[rna_cite_patient.obs.Annotation_major_subset == cell_type, :] # From all the patients with PID, filter out those measurements that are related to a specific cell type
            if len(rna_cite_patient_cell_type) > 0:
                print(i, patient, cell_type)
                if rna_cite_patient_cell_type.shape[0] == 0:
                    raise ValueError("no observations for the current sum")
         
                if np.isnan(rna_cite_patient_cell_type.layers['raw'].data).any():
                    raise ValueError("row contains nan care!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                    
                row = rna_cite_patient_cell_type.X.toarray()  # use normalized counts of the selected gene columns belonging to a specific module
                                                                  # of the selected patient with PID 
                                                                  # of the selected cell types
                if row.shape[0] > 1:
                    row = row.mean(axis=0, dtype = "float64") # If multiple such data exist, average them 
                else:
                    row = row[0]

                if np.any(np.isnan(row)):
                    raise ValueError("row contains nan care!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                key = f'{cell_type}'
                if key not in rna_cite_pseudobulks_shared_p_L2:
                    df = pd.DataFrame(columns=rna_cite_patient.var_names, dtype = "float64")
                    df.loc[patient] = row
                    rna_cite_pseudobulks_shared_p_L2[key] = df
                else:
                    rna_cite_pseudobulks_shared_p_L2[key].loc[patient] = row
                    

0 G05073 GDT
0 G05073 CD4
0 G05073 NK
0 G05073 CD8
0 G05073 cMono
0 G05073 B
0 G05073 DC
0 G05073 ncMono
0 G05073 DP
0 G05073 PB
0 G05073 HSC
1 H00053 GDT
1 H00053 CD4
1 H00053 NK
1 H00053 CD8
1 H00053 cMono
1 H00053 B
1 H00053 DC
1 H00053 ncMono
1 H00053 DP
1 H00053 PB
1 H00053 HSC
2 H00054 GDT
2 H00054 CD4
2 H00054 NK
2 H00054 CD8
2 H00054 cMono
2 H00054 B
2 H00054 DC
2 H00054 ncMono
2 H00054 DP
2 H00054 PB
2 H00054 HSC
3 H00058 GDT
3 H00058 CD4
3 H00058 NK
3 H00058 CD8
3 H00058 cMono
3 H00058 B
3 H00058 DC
3 H00058 ncMono
3 H00058 DP
3 H00058 PB
3 H00058 HSC
4 H00064 GDT
4 H00064 CD4
4 H00064 NK
4 H00064 CD8
4 H00064 cMono
4 H00064 B
4 H00064 DC
4 H00064 ncMono
4 H00064 DP
4 H00064 PB
4 H00064 HSC
5 H00070 GDT
5 H00070 CD4
5 H00070 NK
5 H00070 CD8
5 H00070 cMono
5 H00070 B
5 H00070 DC
5 H00070 ncMono
5 H00070 DP
5 H00070 PB
5 H00070 HSC
6 H00072 GDT
6 H00072 CD4
6 H00072 NK
6 H00072 CD8
6 H00072 cMono
6 H00072 B
6 H00072 DC
6 H00072 ncMono
6 H00072 DP
6 H00072 PB
6 H00072 HSC
7 H000

In [47]:
rna_cite_pseudobulks_shared_p_L2_raw = {}

scRNA_shared_copy_raw = scRNA_shared_copy.copy()
scRNA_shared_copy_raw.X = scRNA_shared_copy.layers["raw"]

for i, patient in enumerate(pd.unique(scRNA_shared_copy_raw.obs_names)): # Go through every patient
        rna_cite_patient = scRNA_shared_copy_raw[scRNA_shared_copy_raw.obs.COMBAT_ID == patient,:] # Filter out all the rows/ measurements that have the ID COMBAT_ID
        for cell_type in pd.unique(scRNA_shared_copy_raw.obs.Annotation_major_subset): # Go through every cell type 
            rna_cite_patient_cell_type = rna_cite_patient[rna_cite_patient.obs.Annotation_major_subset == cell_type, :] # From all the patients with PID, filter out those measurements that are related to a specific cell type
            if len(rna_cite_patient_cell_type) > 0:
                print(i, patient, cell_type)
                if rna_cite_patient_cell_type.shape[0] == 0:
                    raise ValueError("no observations for the current sum")
         
                if np.isnan(rna_cite_patient_cell_type.layers['raw'].data).any():
                    raise ValueError("row contains nan care!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                    
                row = rna_cite_patient_cell_type.X.toarray()  # use normalized counts of the selected gene columns belonging to a specific module
                                                                  # of the selected patient with PID 
                                                                  # of the selected cell types
                if row.shape[0] > 1:
                    row = row.sum(axis=0, dtype = "float64") # If multiple such data exist, average them 
                else:
                    row = row[0]

                if np.any(np.isnan(row)):
                    raise ValueError("row contains nan care!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                key = f'{cell_type}'
                if key not in rna_cite_pseudobulks_shared_p_L2_raw:
                    df = pd.DataFrame(columns=rna_cite_patient.var_names, dtype = "float64")
                    df.loc[patient] = row
                    rna_cite_pseudobulks_shared_p_L2_raw[key] = df
                else:
                    rna_cite_pseudobulks_shared_p_L2_raw[key].loc[patient] = row
                    

  utils.warn_names_duplicates("obs")


0 G05073 GDT
0 G05073 CD4
0 G05073 NK
0 G05073 CD8
0 G05073 cMono
0 G05073 B
0 G05073 DC
0 G05073 ncMono
0 G05073 DP
0 G05073 PB
0 G05073 HSC
1 H00053 GDT
1 H00053 CD4
1 H00053 NK
1 H00053 CD8
1 H00053 cMono
1 H00053 B
1 H00053 DC
1 H00053 ncMono
1 H00053 DP
1 H00053 PB
1 H00053 HSC
2 H00054 GDT
2 H00054 CD4
2 H00054 NK
2 H00054 CD8
2 H00054 cMono
2 H00054 B
2 H00054 DC
2 H00054 ncMono
2 H00054 DP
2 H00054 PB
2 H00054 HSC
3 H00058 GDT
3 H00058 CD4
3 H00058 NK
3 H00058 CD8
3 H00058 cMono
3 H00058 B
3 H00058 DC
3 H00058 ncMono
3 H00058 DP
3 H00058 PB
3 H00058 HSC
4 H00064 GDT
4 H00064 CD4
4 H00064 NK
4 H00064 CD8
4 H00064 cMono
4 H00064 B
4 H00064 DC
4 H00064 ncMono
4 H00064 DP
4 H00064 PB
4 H00064 HSC
5 H00070 GDT
5 H00070 CD4
5 H00070 NK
5 H00070 CD8
5 H00070 cMono
5 H00070 B
5 H00070 DC
5 H00070 ncMono
5 H00070 DP
5 H00070 PB
5 H00070 HSC
6 H00072 GDT
6 H00072 CD4
6 H00072 NK
6 H00072 CD8
6 H00072 cMono
6 H00072 B
6 H00072 DC
6 H00072 ncMono
6 H00072 DP
6 H00072 PB
6 H00072 HSC
7 H000

In [48]:
bulkRNA_copy = bulkRNA_shared.copy()
bulkRNA_copy.obs_names = [str(i) for i in range(len(bulkRNA_copy.obs_names)) ]
indices = []
for p in shared_patients_across_networks_74:
    indices += list(bulkRNA_copy.obs_names[bulkRNA_copy.obs.COMBAT_ID == p])

bulkRNA_shared_copy = bulkRNA_copy[indices]
bulkRNA_shared_copy.obs_names = bulkRNA_shared_copy.obs.COMBAT_ID.copy()

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  utils.warn_names_duplicates("var")


In [49]:
exclude = list(bulk_genes_membership[bulk_genes_membership.module == "grey"]["gene_name"])
bulkRNA_shared_copy = bulkRNA_shared_copy[:, ~bulkRNA_shared_copy.var_names.isin(exclude)].copy()

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


In [50]:
len(bulkRNA.var_names) - len(bulkRNA_shared_copy.var_names) == len(exclude)

True

In [51]:
rna_bulk_pseudobulks_shared_L2 = pd.DataFrame(columns = bulkRNA_shared_copy.var_names, dtype = "float64")

for i, patient in enumerate(pd.unique(bulkRNA_shared_copy.obs_names)):
    print(f"{i} {patient}")
    bulkRNA_patient = bulkRNA_shared_copy[bulkRNA_shared_copy.obs.COMBAT_ID==patient, :]
    row = bulkRNA_patient.X.toarray()
    if row.shape[0] > 1:
        row = row.mean(axis=0, dtype = "float64")
    else:
        row = row[0]
    
    rna_bulk_pseudobulks_shared_L2.loc[patient] = row

0 G05073
1 H00053
2 H00054
3 H00058
4 H00064
5 H00070
6 H00072
7 H00085
8 N00006
9 N00007
10 N00012
11 N00017
12 N00021
13 N00023
14 N00024
15 N00028
16 N00029
17 N00032
18 N00033
19 N00037
20 N00038
21 N00039
22 N00047
23 N00050
24 S00002
25 S00003
26 S00005
27 S00006
28 S00007
29 S00016
30 S00020
31 S00024
32 S00027
33 S00033
34 S00034
35 S00037
36 S00039
37 S00040
38 S00041
39 S00042
40 S00045
41 S00048
42 S00049
43 S00052
44 S00053
45 S00054
46 S00056
47 S00058
48 S00060
49 S00064
50 S00065
51 S00067
52 S00068
53 S00069
54 S00072
55 S00076
56 S00078
57 S00081
58 S00082
59 S00094
60 S00095
61 S00097
62 S00099
63 S00104
64 S00106
65 S00109
66 S00113
67 S00114
68 S00119
69 S00124
70 S00129
71 S00134
72 S00142
73 S00148


In [52]:
level =  "Level 2"
module = "None"
geexp = "None"

citeRNA_pbs = {}
bulkRNA_pbs = None
rest = []

for id, g in graphs_with_74_nodes.items():
    m = g.uns["modality"]
    name = g.uns["name"]
    if m == "citeRNA":
        ct = name.split("_")[1].split("-")[0]
        if ct not in list(citeRNA_pbs.keys()):
            citeRNA_pbs[ct] = g.to_df()
        else:
            citeRNA_pbs[ct].join(g.to_df(), rsuffix = name.split("_")[1].split("-")[1])
    elif m == "bulkRNA":
        if bulkRNA_pbs is None:
            bulkRNA_pbs = g.to_df()
        else:
            bulkRNA_pbs.join(g.to_df(), rsuffix = name.split("_")[1])
    else:
        rest.append(g.copy())

In [53]:
print(len(rna_cite_pseudobulks_shared_p_L2))

11


In [54]:
level2_graphs = {}
last_index = int(list(graphs_with_74_nodes.keys())[-1].split("G")[1])

########################################## bulkRNA #################################################
last_index+= 1

id = "G" + str(last_index)
name = "bulkRNA"

id_to_name[id] = name + "_L2"
name_to_id[name + "_L2"] = id


bulkRNA_ad = ad.AnnData(rna_bulk_pseudobulks_shared_L2)
bulkRNA_ad.obs_names = rna_bulk_pseudobulks_shared_L2.index
bulkRNA_ad.var_names = rna_bulk_pseudobulks_shared_L2.columns

bulkRNA_ad.uns["name"] = name
bulkRNA_ad.uns["id"] = id
bulkRNA_ad.uns["modality"] = "bulkRNA"
bulkRNA_ad.uns["module"] = module
bulkRNA_ad.uns["cell-type"] = "bulk"
bulkRNA_ad.uns["gene-exp"] = geexp
bulkRNA_ad.uns["level"] = level

bulkRNA_ad.layers["quant_N"] = quant_N.fit_transform(rna_bulk_pseudobulks_shared_L2.to_numpy())
bulkRNA_ad.layers["quant_U"] = quant_U.fit_transform(rna_bulk_pseudobulks_shared_L2.to_numpy())
bulkRNA_ad.layers["power"] = power.fit_transform(rna_bulk_pseudobulks_shared_L2.to_numpy())
bulkRNA_ad.layers["minmax"] = minmax.fit_transform(rna_bulk_pseudobulks_shared_L2.to_numpy())
bulkRNA_ad.layers["maxabs"] = maxabs.fit_transform(rna_bulk_pseudobulks_shared_L2.to_numpy())
bulkRNA_ad.layers["standard"] = standard.fit_transform(rna_bulk_pseudobulks_shared_L2.to_numpy())

bulkRNA_ad = bulkRNA_ad[shared_patients_across_networks_74]
bulkRNA_ad.obs = graphs_with_74_nodes["G0"].obs.copy()

level2_graphs[id] = bulkRNA_ad.copy()
######################################## citeRNA ###################################################
for key, g in rna_cite_pseudobulks_shared_p_L2.items():
    last_index+= 1

    id = "G" + str(last_index)
    name = "citeRNA_" + key 
    
    id_to_name[id] = name + "_L2"
    name_to_id[name + "_L2"] = id 


    citeRNA_ad = ad.AnnData(g)
    citeRNA_ad.obs_names = g.index
    citeRNA_ad.var_names = g.columns

    citeRNA_ad.uns["name"] = name
    citeRNA_ad.uns["id"] = id
    citeRNA_ad.uns["modality"] = "citeRNA"
    citeRNA_ad.uns["module"] = module
    citeRNA_ad.uns["cell-type"] = key
    citeRNA_ad.uns["gene-exp"] = geexp
    citeRNA_ad.uns["level"] = level

    citeRNA_ad.layers["quant_N"] = quant_N.fit_transform(g.to_numpy())
    citeRNA_ad.layers["quant_U"] = quant_N.fit_transform(g.to_numpy())
    citeRNA_ad.layers["power"] = power.fit_transform(g.to_numpy())
    citeRNA_ad.layers["minmax"] = minmax.fit_transform(g.to_numpy())
    citeRNA_ad.layers["maxabs"] = maxabs.fit_transform(g.to_numpy())
    citeRNA_ad.layers["standard"] = standard_sparse.fit_transform(g.to_numpy())

    citeRNA_ad = citeRNA_ad[shared_patients_across_networks_74]
    citeRNA_ad.obs = graphs_with_74_nodes["G0"].obs.copy()

    level2_graphs[id] = citeRNA_ad.copy()

######################################## rest ###################################################
for i, g in enumerate(rest):
    last_index+= 1
    id = "G" + str(last_index)
    name = g.uns["name"]
    
    id_to_name[id] = name + "_L2"
    name_to_id[name + "_L2"] = id 

    g.uns["id"] = id
    g.uns["level"] = level

    level2_graphs[id] = g.copy()

  utils.warn_names_duplicates("var")
AnnData expects .var.index to contain strings, but got values like:
    ['CCNE1', 'GINS1', 'FEN1', 'SPATS2', 'H2BC10']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [55]:
with open(os.path.join(level2, 'graphs_74.pickle'), 'wb') as f:
    pickle.dump(level2_graphs, f)

In [56]:
len(level2_graphs) == 37

True

In [57]:
id_to_feature_matrices_level2 = {} 
id_to_feature_matrices_quant_U_level2 = {}
id_to_feature_matrices_quant_N_level2 = {}
id_to_feature_matrices_power_level2 = {}
id_to_feature_matrices_minmax_level2 = {}
id_to_feature_matrices_maxabs_level2 = {}
id_to_feature_matrices_standard_level2 = {}

for id, g in level2_graphs.items():
    id_to_feature_matrices_level2[id] = g.X.copy()
    id_to_feature_matrices_quant_U_level2[id] = g.layers["quant_U"].copy()
    id_to_feature_matrices_quant_N_level2[id] = g.layers["quant_N"].copy()
    id_to_feature_matrices_power_level2[id] = g.layers["power"].copy()
    id_to_feature_matrices_minmax_level2[id] = g.layers["minmax"].copy()
    id_to_feature_matrices_maxabs_level2[id] = g.layers["maxabs"].copy()
    id_to_feature_matrices_standard_level2[id] = g.layers["standard"].copy()

with open(os.path.join(level2, 'feature_matrices.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_level2, f)

with open(os.path.join(level2, 'feature_matrices_quant_U.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_quant_U_level2, f)

with open(os.path.join(level2, 'feature_matrices_quant_N.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_quant_N_level2, f)

with open(os.path.join(level2, 'feature_matrices_power.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_power_level2, f)

with open(os.path.join(level2, 'feature_matrices_minmax.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_minmax_level2, f)

with open(os.path.join(level2, 'feature_matrices_maxabs.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_maxabs_level2, f)

with open(os.path.join(level2, 'feature_matrices_standard.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_standard_level2, f)

In [58]:
mogcn_data_path_level2 = os.path.abspath(os.path.join(os.getcwd(),"../../../MoGCN-master/data-level2/"))
for data in level2_graphs.values():
    name = data.uns["name"]
    index = data.to_df().index
    columns = data.to_df().columns

    data.to_df().to_csv(os.path.join(mogcn_data_path_level2 + "/wot", name))
    pd.DataFrame(data.layers["quant_U"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level2 + "/quant_U", name))
    pd.DataFrame(data.layers["quant_N"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level2 + "/quant_N", name))
    pd.DataFrame(data.layers["power"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level2 + "/power", name))
    pd.DataFrame(data.layers["minmax"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level2 + "/minmax", name))
    pd.DataFrame(data.layers["maxabs"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level2 + "/maxabs", name))
    pd.DataFrame(data.layers["standard"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level2 + "/standard", name))

## PB Level3

In [59]:
with open(os.path.join(shared, 'patients_74.pickle'), 'rb') as f:
    shared_patients_across_networks_74 = pickle.load(f)

with open(os.path.join(level2, 'graphs_74.pickle'), 'rb') as f:
    level2_graphs = pickle.load(f)

In [60]:
# ADT
adt_shared_copy = adt_shared.copy()
adt_shared_copy.obs_names = [str(i) for i in range(len(adt_shared_copy.obs_names)) ]
indices = []
for p in shared_patients_across_networks_74:
    indices += list(adt_shared_copy.obs_names[adt_shared_copy.obs.COMBAT_ID == p])

adt_shared_copy = adt_shared_copy[indices]
adt_shared_copy.obs_names = adt_shared_copy.obs.COMBAT_ID.copy()

# BulkRNA
bulkRNA_shared_copy = bulkRNA_shared.copy()
bulkRNA_shared_copy.obs_names = [str(i) for i in range(len(bulkRNA_shared_copy.obs_names)) ]
indices = []
for p in shared_patients_across_networks_74:
    indices += list(bulkRNA_shared_copy.obs_names[bulkRNA_shared_copy.obs.COMBAT_ID == p])
    
bulkRNA_shared_copy = bulkRNA_shared_copy[indices]
bulkRNA_shared_copy.obs_names = bulkRNA_shared_copy.obs.COMBAT_ID.copy()

# CiteRNA
scRNA_shared_copy = scRNA_shared.copy()
scRNA_shared_copy.obs_names = [str(i) for i in range(len(scRNA_shared_copy.obs_names)) ]
indices = []
for p in shared_patients_across_networks_74:
    indices += list(scRNA_shared_copy.obs_names[scRNA_shared_copy.obs.COMBAT_ID == p])
    
scRNA_shared_copy = scRNA_shared_copy[indices]
scRNA_shared_copy.obs_names = scRNA_shared_copy.obs.COMBAT_ID.copy()

# CyTOF
cytof_shared_copy = cytof_shared.copy()
cytof_shared_copy.obs_names = [str(i) for i in range(len(cytof_shared_copy.obs_names)) ]
indices = []
for p in shared_patients_across_networks_74:
    indices += list(cytof_shared_copy.obs_names[cytof_shared_copy.obs.COMBAT_ID == p])
    
cytof_shared_copy = cytof_shared_copy[indices]
cytof_shared_copy.obs_names = cytof_shared_copy.obs.COMBAT_ID.copy()

# Luminex
luminex_shared_copy = luminex_shared.copy()
luminex_shared_copy.obs_names = [str(i) for i in range(len(luminex_shared_copy.obs_names)) ]
indices = []
for p in shared_patients_across_networks_74:
    indices += list(luminex_shared_copy.obs_names[luminex_shared_copy.obs.COMBAT_ID == p])
    
luminex_shared_copy = luminex_shared_copy[indices]
luminex_shared_copy.obs_names = luminex_shared_copy.obs.COMBAT_ID.copy()

# FACS
facs_shared_copy = facs_shared.copy()
facs_shared_copy.obs_names = [str(i) for i in range(len(facs_shared_copy.obs_names)) ]
indices = []
for p in shared_patients_across_networks_74:
    indices += list(facs_shared_copy.obs_names[facs_shared_copy.obs.COMBAT_ID == p])
    
facs_shared_copy = facs_shared_copy[indices]
facs_shared_copy.obs_names = facs_shared_copy.obs.COMBAT_ID.copy()

  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'G05073', 'G05073', 'G05073', 'G05073']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'G05073', 'G05073', 'G05073', 'G05073']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'G05073', 'G05073', 'G05073', 'G05073']

    Inferred to be: categorical

  names = self.

In [61]:
scRNA_shared_copy = scRNA_shared_copy[scRNA_shared_copy.obs["Annotation_major_subset"].isin(cell_types_with_74["citeRNA"])].copy()

  utils.warn_names_duplicates("obs")


In [62]:
exclude = list(membership_all_celltypes[membership_all_celltypes.module == "grey"]["gene_name"])
scRNA_shared_copy = scRNA_shared_copy[:, ~scRNA_shared_copy.var_names.isin(exclude)]

In [63]:
level3_graphs = {}
last_index = int(list(level2_graphs.keys())[-1].split("G")[1])
last_index+= 1

x = scRNA_shared_copy.to_df().groupby("COMBAT_ID").mean()

ad_obj = ad.AnnData(x)
id = "G" + str(last_index)
name = "citeRNA"


ad_obj.uns["name"] = name
ad_obj.uns["id"] = id 
ad_obj.uns["modality"] = name

ad_obj.uns["module"] = "None"
ad_obj.uns["cell-type"] = "bulk"
ad_obj.uns["level"] = "Level 3"
ad_obj.uns["gene-exp"] = "None"


ad_obj.obs_names = x.index
ad_obj.var_names = x.columns

ad_obj = ad_obj[shared_patients_across_networks_74]
ad_obj.obs = graphs_with_74_nodes["G0"].obs.copy()

ad_obj.layers["quant_N"] = quant_N.fit_transform(x.to_numpy())
ad_obj.layers["minmax"] = minmax.fit_transform(x.to_numpy())
ad_obj.layers["maxabs"] = maxabs.fit_transform(x.to_numpy())
ad_obj.layers["standard"] = standard_sparse.fit_transform(x.to_numpy())
ad_obj.layers["quant_U"] = quant_U.fit_transform(x.to_numpy())
ad_obj.layers["power"] = power.fit_transform(x.to_numpy())

id_to_name[id] = name + "_L3"
name_to_id[name + "_L3"] = id

level3_graphs[id] = ad_obj.copy()

AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")


In [64]:
scRNA_raw = scRNA_shared_copy.copy()
scRNA_raw.X = scRNA_raw.layers["raw"]
x = scRNA_raw.to_df().groupby("COMBAT_ID").sum()

ad_obj = ad.AnnData(x)
id = "G" + str(last_index)
name = "citeRNA"


ad_obj.uns["name"] = name
ad_obj.uns["id"] = id 
ad_obj.uns["modality"] = name

ad_obj.uns["module"] = "None"
ad_obj.uns["cell-type"] = "bulk"
ad_obj.uns["level"] = "Level 3"
ad_obj.uns["gene-exp"] = "None"


ad_obj.obs_names = x.index
ad_obj.var_names = x.columns

ad_obj = ad_obj[shared_patients_across_networks_74]
ad_obj.obs = graphs_with_74_nodes["G0"].obs.copy()

ad_obj.layers["quant_N"] = quant_N.fit_transform(x.to_numpy())
ad_obj.layers["minmax"] = minmax.fit_transform(x.to_numpy())
ad_obj.layers["maxabs"] = maxabs.fit_transform(x.to_numpy())
ad_obj.layers["standard"] = standard_sparse.fit_transform(x.to_numpy())
ad_obj.layers["quant_U"] = quant_U.fit_transform(x.to_numpy())

scRNA_raw = ad_obj.copy()

  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")


In [65]:
adt_shared_copy = adt_shared_copy[adt_shared_copy.obs["Annotation_major_subset"].isin(cell_types_with_74["adt"])].copy()

  utils.warn_names_duplicates("obs")


In [66]:
last_index+= 1

x = adt_shared_copy.to_df().groupby("COMBAT_ID").mean()
ad_obj = ad.AnnData(x)
id = "G" + str(last_index)
name = "adt"


ad_obj.uns["name"] = name
ad_obj.uns["id"] = id 
ad_obj.uns["modality"] = name

ad_obj.uns["module"] = "None"
ad_obj.uns["cell-type"] = "bulk"
ad_obj.uns["level"] = "Level 3"
ad_obj.uns["gene-exp"] = "None"


ad_obj.obs_names = x.index
ad_obj.var_names = x.columns

ad_obj = ad_obj[shared_patients_across_networks_74]
ad_obj.obs = graphs_with_74_nodes["G0"].obs.copy()

ad_obj.layers["quant_N"] = quant_N.fit_transform(x.to_numpy())
ad_obj.layers["minmax"] = minmax.fit_transform(x.to_numpy())
ad_obj.layers["maxabs"] = maxabs.fit_transform(x.to_numpy())
ad_obj.layers["standard"] = standard_sparse.fit_transform(x.to_numpy())
ad_obj.layers["quant_U"] = quant_U.fit_transform(x.to_numpy())


id_to_name[id] = name + "_L3"
name_to_id[name + "_L3"] = id

level3_graphs[id] = ad_obj.copy()

AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")


In [67]:
adt_raw = adt_shared_copy.copy()
adt_raw.X = adt_raw.layers["raw"]
x = adt_raw.to_df().groupby("COMBAT_ID").sum()
ad_obj = ad.AnnData(x)
id = "G" + str(last_index)
name = "adt"


ad_obj.uns["name"] = name
ad_obj.uns["id"] = id 
ad_obj.uns["modality"] = name

ad_obj.uns["module"] = "None"
ad_obj.uns["cell-type"] = "bulk"
ad_obj.uns["level"] = "Level 3"
ad_obj.uns["gene-exp"] = "None"


ad_obj.obs_names = x.index
ad_obj.var_names = x.columns

ad_obj = ad_obj[shared_patients_across_networks_74]
ad_obj.obs = graphs_with_74_nodes["G0"].obs.copy()

ad_obj.layers["quant_N"] = quant_N.fit_transform(x.to_numpy())
ad_obj.layers["minmax"] = minmax.fit_transform(x.to_numpy())
ad_obj.layers["maxabs"] = maxabs.fit_transform(x.to_numpy())
ad_obj.layers["standard"] = standard_sparse.fit_transform(x.to_numpy())
ad_obj.layers["quant_U"] = quant_U.fit_transform(x.to_numpy())



adt_raw = ad_obj.copy()

  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")


In [68]:
cytof_shared_copy = cytof_shared_copy[cytof_shared_copy.obs["Annotation_major_subset"].isin(cell_types_with_74["cytof"])].copy()

  utils.warn_names_duplicates("obs")


In [69]:
last_index+= 1

x = cytof_shared_copy.to_df().groupby("COMBAT_ID").mean()
ad_obj = ad.AnnData(x)
id = "G" + str(last_index)
name = "cytof"


ad_obj.uns["name"] = name
ad_obj.uns["id"] = id 
ad_obj.uns["modality"] = name

ad_obj.uns["module"] = "None"
ad_obj.uns["cell-type"] = "bulk"
ad_obj.uns["level"] = "Level 3"
ad_obj.uns["gene-exp"] = "None"


ad_obj.obs_names = x.index
ad_obj.var_names = x.columns

ad_obj = ad_obj[shared_patients_across_networks_74]
ad_obj.obs = graphs_with_74_nodes["G0"].obs.copy()

ad_obj.layers["quant_N"] = quant_N.fit_transform(x.to_numpy())
ad_obj.layers["minmax"] = minmax.fit_transform(x.to_numpy())
ad_obj.layers["maxabs"] = maxabs.fit_transform(x.to_numpy())
ad_obj.layers["standard"] = standard.fit_transform(x.to_numpy())
ad_obj.layers["quant_U"] = quant_U.fit_transform(x.to_numpy())



id_to_name[id] = name + "_L3"
name_to_id[name + "_L3"] = id

level3_graphs[id] = ad_obj.copy()

AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")


In [70]:
exclude = list(bulk_genes_membership[bulk_genes_membership.module == "grey"]["gene_name"])
bulkRNA_shared_copy = bulkRNA_shared_copy[:, ~bulkRNA_shared_copy.var_names.isin(exclude)].copy()

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


In [71]:
last_index+= 1

x = bulkRNA_shared_copy.to_df().groupby("COMBAT_ID").mean()
ad_obj = ad.AnnData(x)
id = "G" + str(last_index)
name = "bulkRNA"


ad_obj.uns["name"] = name
ad_obj.uns["id"] = id 
ad_obj.uns["modality"] = name

ad_obj.uns["module"] = "None"
ad_obj.uns["cell-type"] = "bulk"
ad_obj.uns["level"] = "Level 3"
ad_obj.uns["gene-exp"] = "None"

ad_obj.obs_names = x.index
ad_obj.var_names = x.columns

ad_obj = ad_obj[shared_patients_across_networks_74]
ad_obj.obs = graphs_with_74_nodes["G0"].obs.copy()

ad_obj.layers["quant_N"] = quant_N.fit_transform(x.to_numpy())
ad_obj.layers["minmax"] = minmax.fit_transform(x.to_numpy())
ad_obj.layers["maxabs"] = maxabs.fit_transform(x.to_numpy())
ad_obj.layers["standard"] = standard.fit_transform(x.to_numpy())
ad_obj.layers["quant_U"] = quant_U.fit_transform(x.to_numpy())
ad_obj.layers["power"] = power.fit_transform(x.to_numpy())

id_to_name[id] = name + "_L3"
name_to_id[name + "_L3"]  = id

level3_graphs[id] = ad_obj.copy()

  utils.warn_names_duplicates("var")
AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
AnnData expects .var.index to contain strings, but got values like:
    ['CCNE1', 'GINS1', 'FEN1', 'SPATS2', 'H2BC10']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  utils.warn_names_duplicates("var")


In [72]:
last_index+= 1

x = facs_shared_copy.to_df().groupby("COMBAT_ID").mean()

ad_obj = ad.AnnData(x)
id = "G" + str(last_index)
name = "facs"


ad_obj.uns["name"] = name
ad_obj.uns["id"] = id 
ad_obj.uns["modality"] = name

ad_obj.uns["module"] = "None"
ad_obj.uns["cell-type"] = "CD4"
ad_obj.uns["level"] = "Level 3"
ad_obj.uns["gene-exp"] = "None"

ad_obj.obs_names = x.index
ad_obj.var_names = x.columns

ad_obj = ad_obj[shared_patients_across_networks_74]
ad_obj.obs = graphs_with_74_nodes["G0"].obs.copy()

ad_obj.layers["quant_N"] = quant_N.fit_transform(x.to_numpy())
ad_obj.layers["minmax"] = minmax.fit_transform(x.to_numpy())
ad_obj.layers["maxabs"] = maxabs.fit_transform(x.to_numpy())
ad_obj.layers["standard"] = standard.fit_transform(x.to_numpy())
ad_obj.layers["quant_U"] = quant_U.fit_transform(x.to_numpy())
ad_obj.layers["power"] = power.fit_transform(x.to_numpy())

id_to_name[id] = name + "_L3"
name_to_id[name + "_L3"] = id

level3_graphs[id] = ad_obj.copy()

AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  x = um.multiply(x, x, out=x)


In [73]:
last_index+= 1

x = luminex_shared_copy.to_df().groupby("COMBAT_ID").mean()

ad_obj = ad.AnnData(x)
id = "G" + str(last_index)
name = "luminex"


ad_obj.uns["name"] = name
ad_obj.uns["id"] = id 
ad_obj.uns["modality"] = name

ad_obj.uns["module"] = "None"
ad_obj.uns["cell-type"] = "bulk"
ad_obj.uns["level"] = "Level 3"
ad_obj.uns["gene-exp"] = "None"

ad_obj.obs_names = x.index
ad_obj.var_names = x.columns

ad_obj = ad_obj[shared_patients_across_networks_74]
ad_obj.obs = graphs_with_74_nodes["G0"].obs.copy()

ad_obj.layers["quant_N"] = quant_N.fit_transform(x.to_numpy())
ad_obj.layers["minmax"] = minmax.fit_transform(x.to_numpy())
ad_obj.layers["maxabs"] = maxabs.fit_transform(x.to_numpy())
ad_obj.layers["standard"] = standard.fit_transform(x.to_numpy())
ad_obj.layers["quant_U"] = quant_U.fit_transform(x.to_numpy())
ad_obj.layers["power"] = x.to_numpy().copy()

id_to_name[id] = name + "_L3"
name_to_id[name + "_L3"] = id

level3_graphs[id] = ad_obj.copy()

AnnData expects .obs.index to contain strings, but got values like:
    ['G05073', 'H00053', 'H00054', 'H00058', 'H00064']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")


In [74]:
with open(os.path.join(level3, 'graphs_74.pickle'), 'wb') as f:
    pickle.dump(level3_graphs, f)

In [75]:
with open(os.path.join(shared, 'id_to_name.pickle'), 'wb') as f:
    pickle.dump(id_to_name, f)
with open(os.path.join(shared, 'name_to_id.pickle'), 'wb') as f:
    pickle.dump(name_to_id, f)

with open(os.path.join(shared, 'id_to_pb_noise.pickle'), 'wb') as f:
    pickle.dump(id_to_pb_noise, f)
with open(os.path.join(shared, 'id_to_pb.pickle'), 'wb') as f:
    pickle.dump(id_to_pb, f)

with open(os.path.join(shared, 'id_to_pb_anndata.pickle'), 'wb') as f:
    pickle.dump(id_to_pb_anndata, f)
with open(os.path.join(shared, 'id_to_pb_noise_anndata.pickle'), 'wb') as f:
    pickle.dump(id_to_pb_noise_anndata, f)

In [76]:
id_to_feature_matrices_level3 = {} 
id_to_feature_matrices_quant_U_level3 = {}
id_to_feature_matrices_quant_N_level3 = {}
id_to_feature_matrices_power_level3 = {}
id_to_feature_matrices_minmax_level3 = {}
id_to_feature_matrices_maxabs_level3 = {}
id_to_feature_matrices_standard_level3 = {}

for id, g in level3_graphs.items():
    id_to_feature_matrices_level3[id] = g.X.copy()
    id_to_feature_matrices_quant_N_level3[id] = g.layers["quant_N"].copy()
    id_to_feature_matrices_quant_U_level3[id] = g.layers["quant_U"].copy()
    id_to_feature_matrices_minmax_level3[id] = g.layers["minmax"].copy()
    id_to_feature_matrices_maxabs_level3[id] = g.layers["maxabs"].copy()
    id_to_feature_matrices_standard_level3[id] = g.layers["standard"].copy()

with open(os.path.join(level3, 'feature_matrices.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_level3, f)

with open(os.path.join(level3, 'feature_matrices_quant_N.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_quant_N_level3, f)

with open(os.path.join(level3, 'feature_matrices_quant_U.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_quant_U_level3, f)

with open(os.path.join(level3, 'feature_matrices_minmax.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_minmax_level3, f)

with open(os.path.join(level3, 'feature_matrices_maxabs.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_maxabs_level3, f)

with open(os.path.join(level3, 'feature_matrices_standard.pickle'), 'wb') as f:
    pickle.dump(id_to_feature_matrices_standard_level3, f)

In [77]:
mogcn_data_path_level3 = os.path.abspath(os.path.join(os.getcwd(),"../../../MoGCN-master/data-level3/"))
for data in level3_graphs.values():
    name = data.uns["name"]
    index = data.to_df().index
    columns = data.to_df().columns

    data.to_df().to_csv(os.path.join(mogcn_data_path_level3 + "/wot", name))
    pd.DataFrame(data.layers["quant_U"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level3 + "/quant_U", name))
    pd.DataFrame(data.layers["quant_N"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level3 + "/quant_N", name))
    pd.DataFrame(data.layers["minmax"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level3 + "/minmax", name))
    pd.DataFrame(data.layers["maxabs"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level3 + "/maxabs", name))
    pd.DataFrame(data.layers["standard"], index=index, columns=columns).to_csv(os.path.join(mogcn_data_path_level3 + "/standard", name))

## Second Post Processing 

In [78]:
id_to_modality = {id:ad_obj.uns["modality"] for id, ad_obj in  graphs_with_74_nodes.items()}
modality_to_ids = {"facs":[], "luminex":[], "citeRNA":[], "bulkRNA":[], "cytof":[], "adt":[]}

for id, m in id_to_modality.items():
    modality_to_ids[m].append(id)
    
with open(os.path.join(shared, "id_to_modality.pickle"), "wb") as f:
    pickle.dump(id_to_modality, f)

with open(os.path.join(shared, "modality_to_id.pickle"), "wb") as f:
    pickle.dump(modality_to_ids, f)

In [79]:
raw_data = {"adt_L2": adt_raw_L2, "citeRNA_L2": rna_cite_pseudobulks_shared_p_L2_raw, "adt_L3":adt_raw, "citeRNA_L3": scRNA_raw  }
with open(os.path.join(output, "analysis_input/raw/raw_data.pickle"), "wb") as f:
    pickle.dump(raw_data, f)