In [1]:
import os

import anndata as ad
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scanpy as sc
import scglue
import scipy
from scipy.stats import pearsonr

### Note
The full guidance graph and the cite subgraph are each built in three different ways, depending on how weights are assigned to the edges between features of different modalities:
- **without correlation** (no_corr): all edges are set to weight 1 and sign 1
- **rna-adt correlation** (rna_adt_corr): all edges are set to weight 1 and sign 1, except the edges between genes common to the rna and adt assays, for which the Pearson correlation is used instead (weight = absolute value of the correlation, sign = sign of the correlation)
- **all correlations** (all_corr): correlation is used for all common genes across modalities

## Anndata loading

In [2]:
# Read the four modality AnnData files (RNA, ADT, CyTOF, FACS), in that order.
rna, adt, cytof, facs = map(
    ad.read_h5ad,
    (
        "../pp_harm_data/rna-pp-harm-sub.h5ad",
        "../pp_harm_data/adt-pp-harm-sub.h5ad",
        "../pp_harm_data/cytof-pp-harm-sub.h5ad",
        "../pp_harm_data/facs-pp-harm-sub.h5ad",
    ),
)

## Finding common genes

In [3]:
# Feature names shared by each pair of modalities; each shared name later
# becomes a cross-modality edge of the guidance graph.
rna_names, adt_names, cytof_names, facs_names = (
    rna.var_names,
    adt.var_names,
    cytof.var_names,
    facs.var_names,
)

rna_adt_edges = rna_names.intersection(adt_names)
rna_cytof_edges = rna_names.intersection(cytof_names)
rna_facs_edges = rna_names.intersection(facs_names)
adt_cytof_edges = adt_names.intersection(cytof_names)
adt_facs_edges = adt_names.intersection(facs_names)
cytof_facs_edges = cytof_names.intersection(facs_names)

In [4]:
# Show the feature names shared by the RNA and ADT modalities.
rna_adt_edges

Index(['CD34', 'CD207', 'CD28', 'CX3CR1', 'XCR1', 'CD86', 'CD38', 'CD14',
       'CD83', 'CD24', 'CD36', 'CD27', 'CD163', 'CLEC12A', 'CD19', 'CD40',
       'CD70', 'CD22', 'CD33', 'NLRP2'],
      dtype='object')

In [9]:
# Features shared by the ADT and CyTOF modalities.
# NOTE(review): the previous comment also mentioned RNA, but only ADT and CyTOF
# are intersected here. This must run before the '_adt' / '_cytof' suffixes
# are appended to the feature names, otherwise nothing matches.
shared_adt_cytof = adt.var_names.intersection(cytof.var_names)
shared_adt_cytof

Index(['CD3', 'CD8', 'CD19', 'CD33', 'CD11c', 'CD45RA', 'CD123', 'CD4', 'CD14',
       'CD16', 'CD25', 'CD45RO', 'CD20', 'IgM', 'CD103', 'CD69', 'CD161',
       'CD27', 'HLA_DR', 'CD39', 'CX3CR1', 'IgA', 'CD28', 'CD38', 'CD45',
       'CD99'],
      dtype='object')

In [10]:
# Number of features shared by the ADT and CyTOF modalities
# (RNA is not part of this intersection).
n_shared_adt_cytof = len(adt.var_names.intersection(cytof.var_names))
n_shared_adt_cytof

26

In [5]:
"""rna_adt_edges = rna.var.merge(adt.var, "inner", left_index=True, right_index=True).index
rna_cytof_edges = rna.var.merge(cytof.var, "inner", left_index=True, right_index=True).index
rna_facs_edges = rna.var.merge(facs.var, "inner", left_index=True, right_index=True).index
adt_cytof_edges = adt.var.merge(cytof.var, "inner", left_index=True, right_index=True).index
adt_facs_edges = adt.var.merge(facs.var, "inner", left_index=True, right_index=True).index
cytof_facs_edges = cytof.var.merge(facs.var, "inner", left_index=True, right_index=True).index"""

'rna_adt_edges = rna.var.merge(adt.var, "inner", left_index=True, right_index=True).index\nrna_cytof_edges = rna.var.merge(cytof.var, "inner", left_index=True, right_index=True).index\nrna_facs_edges = rna.var.merge(facs.var, "inner", left_index=True, right_index=True).index\nadt_cytof_edges = adt.var.merge(cytof.var, "inner", left_index=True, right_index=True).index\nadt_facs_edges = adt.var.merge(facs.var, "inner", left_index=True, right_index=True).index\ncytof_facs_edges = cytof.var.merge(facs.var, "inner", left_index=True, right_index=True).index'

In [6]:
#rna_adt_edges, len(rna_adt_edges)

In [7]:
#rna_cytof_edges, len(rna_cytof_edges)

In [8]:
#rna_facs_edges, len(rna_facs_edges)

In [9]:
#adt_cytof_edges, len(adt_cytof_edges)

In [10]:
#adt_facs_edges, len(adt_facs_edges)

In [11]:
#cytof_facs_edges, len(cytof_facs_edges)

In [12]:
#len(rna_adt_edges) + len(rna_cytof_edges) + len(rna_facs_edges) + len(adt_cytof_edges) + len(adt_facs_edges) + len(cytof_facs_edges)

## Computing correlation coefficients between common genes/proteins across modalities

In [13]:
def compute_corr(common_genes, adata1, adata2):
    """Compute the Pearson correlation of each shared feature between two datasets.

    Parameters
    ----------
    common_genes : iterable of str
        Feature names that can be used to index the variables of both
        ``adata1`` and ``adata2``.
    adata1, adata2 : AnnData-like
        Objects indexable as ``adata[:, gene]`` whose ``.X`` is either a
        sparse matrix (anything exposing ``toarray``) or a dense array.

    Returns
    -------
    dict
        Maps each feature name to the correlation coefficient returned by
        ``scipy.stats.pearsonr`` for that feature's two value vectors.

    Notes
    -----
    The two vectors are compared element by element, so both objects must
    hold the same number of observations.
    NOTE(review): presumably row i of ``adata1`` and row i of ``adata2`` are
    matched cells - confirm for modality pairs not measured on the same cells.
    """

    def _column(adata, gene):
        # Sparse matrices are densified; dense arrays have no ``toarray``
        # (nor ``todense``) and are used as they are.
        values = adata[:, gene].X
        if hasattr(values, "toarray"):
            values = values.toarray()
        return np.asarray(values).ravel()

    correlations = {}
    for gene in common_genes:
        corr, _ = pearsonr(_column(adata1, gene), _column(adata2, gene))
        correlations[gene] = corr

    return correlations

In [14]:
# Per-feature Pearson correlation between RNA and ADT over their shared features.
rna_adt_corrs = compute_corr(common_genes=rna_adt_edges, adata1=rna, adata2=adt)
# Uncomment to inspect: rna_adt_corrs

In [15]:
# Per-feature Pearson correlation between RNA and CyTOF over their shared features.
rna_cytof_corrs = compute_corr(common_genes=rna_cytof_edges, adata1=rna, adata2=cytof)
# Uncomment to inspect: rna_cytof_corrs

In [16]:
# Per-feature Pearson correlation between RNA and FACS over their shared features.
rna_facs_corrs = compute_corr(common_genes=rna_facs_edges, adata1=rna, adata2=facs)
# Uncomment to inspect: rna_facs_corrs

In [17]:
# Per-feature Pearson correlation between ADT and CyTOF over their shared features.
adt_cytof_corrs = compute_corr(common_genes=adt_cytof_edges, adata1=adt, adata2=cytof)
# Uncomment to inspect: adt_cytof_corrs

In [18]:
# Per-feature Pearson correlation between ADT and FACS over their shared features.
adt_facs_corrs = compute_corr(common_genes=adt_facs_edges, adata1=adt, adata2=facs)
# Uncomment to inspect: adt_facs_corrs

In [19]:
# Per-feature Pearson correlation between CyTOF and FACS over their shared features.
cytof_facs_corrs = compute_corr(common_genes=cytof_facs_edges, adata1=cytof, adata2=facs)
# Uncomment to inspect: cytof_facs_corrs

#   <font color='red'> Building the guidance graph with no correlation</font>

In [20]:
# Tag ADT feature names with '_adt' so they become graph nodes distinct from
# RNA features of the same name. Guarded so that re-running this cell does not
# append the suffix a second time.
if not adt.var.index.str.endswith('_adt').all():
    adt.var.index = adt.var.index + '_adt'

In [21]:
# Tag CyTOF feature names with '_cytof' so they become graph nodes distinct
# from same-named features of other modalities. Guarded so that re-running
# this cell does not append the suffix a second time.
if not cytof.var.index.str.endswith('_cytof').all():
    cytof.var.index = cytof.var.index + '_cytof'

In [22]:
# Tag FACS feature names with '_facs' so they become graph nodes distinct
# from same-named features of other modalities. Guarded so that re-running
# this cell does not append the suffix a second time.
if not facs.var.index.str.endswith('_facs').all():
    facs.var.index = facs.var.index + '_facs'

In [23]:
# Start the no-correlation guidance graph as an empty undirected graph;
# nodes and edges are added in the following cells.
guidance = nx.Graph()
guidance

<networkx.classes.graph.Graph at 0x7f5d7ad6f5b0>

In [24]:
# Register every RNA feature as a node.
guidance.add_nodes_from(rna.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var.index)

In [25]:
# Register every ADT feature as a node.
guidance.add_nodes_from(adt.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var)

In [26]:
# Register every CyTOF feature as a node.
guidance.add_nodes_from(cytof.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var)

In [27]:
# Register every FACS feature as a node.
guidance.add_nodes_from(facs.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var) - len(facs.var)

In [28]:
# Give every node a self-loop with unit weight and positive sign.
guidance.add_edges_from(
    [(feature, feature, {'weight': 1, 'sign': 1}) for feature in guidance.nodes]
)
# Optional check (should be 0):
#len(list(guidance.edges)) - len(list(guidance.nodes))

In [29]:
# Connect each shared RNA/ADT feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature, feature + '_adt', {'weight': 1, 'sign': 1})
    for feature in rna_adt_edges
)
# Optional check (should be 0 if no edges collide):
#len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges)

In [30]:
# Connect each shared RNA/CyTOF feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature, feature + '_cytof', {'weight': 1, 'sign': 1})
    for feature in rna_cytof_edges
)
# Optional check (should be 0 if no edges collide):
#len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges)

In [31]:
# Connect each shared RNA/FACS feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature, feature + '_facs', {'weight': 1, 'sign': 1})
    for feature in rna_facs_edges
)
# Optional check (should be 0 if no edges collide):
#len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges)

In [32]:
# Connect each shared ADT/CyTOF feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature + '_adt', feature + '_cytof', {'weight': 1, 'sign': 1})
    for feature in adt_cytof_edges
)
# Optional check (should be 0 if no edges collide):
#(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
#    len(adt_cytof_edges))

In [33]:
# Connect each shared ADT/FACS feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature + '_adt', feature + '_facs', {'weight': 1, 'sign': 1})
    for feature in adt_facs_edges
)
# Optional check (should be 0 if no edges collide):
#(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
#    len(adt_cytof_edges) - len(adt_facs_edges))

In [34]:
# Connect each shared CyTOF/FACS feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature + '_cytof', feature + '_facs', {'weight': 1, 'sign': 1})
    for feature in cytof_facs_edges
)
# Optional check (should be 0 if no edges collide):
#(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
#    len(adt_cytof_edges) - len(adt_facs_edges) - len(cytof_facs_edges))

In [35]:
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var) -len(facs.var)

In [36]:
# Validate the no-correlation graph against all four modalities with scGLUE's checks.
all_modalities = [rna, adt, cytof, facs]
scglue.graph.check_graph(guidance, all_modalities)

[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] check_graph: All checks passed!


## Building cite subgraph from full graph - no correlation

In [37]:
# Nodes of the cite subgraph: all RNA feature names followed by all ADT feature names.
guidance_sub_nodes = [*rna.var_names.tolist(), *adt.var_names.tolist()]
#len(guidance_sub_nodes)

In [38]:
# Subgraph of the no-correlation graph induced on the RNA and ADT nodes.
guidance_cite = guidance.subgraph(guidance_sub_nodes)

In [39]:
#len(guidance_cite.nodes), len(guidance_cite.edges)

In [40]:
#len(guidance_cite.edges) - len(rna.var_names) - len(adt.var_names) - len(rna_adt_edges)

## Saving graph for full data and subgraph for sub data - no correlation

In [41]:
# Save the full no-correlation guidance graph. The output folder is created
# first so the cell does not fail when the folder does not exist yet.
os.makedirs("guidance_graphs/no_corr", exist_ok=True)
nx.write_graphml(guidance, "guidance_graphs/no_corr/guidance.graphml.gz")

In [42]:
# Save the no-correlation cite subgraph. The output folder is created first
# so the cell does not fail when the folder does not exist yet.
os.makedirs("guidance_graphs/no_corr", exist_ok=True)
nx.write_graphml(guidance_cite, "guidance_graphs/no_corr/guidance_cite.graphml.gz")

# <font color='red'>Building the guidance graph with rna - adt correlation</font>

In [43]:
# Start the rna-adt-correlation guidance graph as a fresh, empty undirected
# graph (this rebinds `guidance`, replacing the previous graph object).
guidance = nx.Graph()
guidance

<networkx.classes.graph.Graph at 0x7f5d9a25cb50>

In [44]:
# Register every RNA feature as a node.
guidance.add_nodes_from(rna.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var.index)

In [45]:
# Register every ADT feature as a node.
guidance.add_nodes_from(adt.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var)

In [46]:
# Register every CyTOF feature as a node.
guidance.add_nodes_from(cytof.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var)

In [47]:
# Register every FACS feature as a node.
guidance.add_nodes_from(facs.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var) - len(facs.var)

In [48]:
# Give every node a self-loop with unit weight and positive sign.
guidance.add_edges_from(
    [(feature, feature, {'weight': 1, 'sign': 1}) for feature in guidance.nodes]
)
# Optional check (should be 0):
#len(list(guidance.edges)) - len(list(guidance.nodes))

In [49]:
# Connect shared RNA/ADT features; weight is the absolute Pearson correlation
# and sign is the sign of that correlation.
# NOTE(review): scGLUE documents edge weights in (0, 1] and signs of +/-1, so a
# zero or NaN correlation would produce an invalid edge - confirm none occur.
guidance.add_edges_from(
    (
        feature,
        feature + '_adt',
        {
            'weight': np.absolute(rna_adt_corrs[feature]),
            'sign': np.sign(rna_adt_corrs[feature]),
        },
    )
    for feature in rna_adt_edges
)
# Optional check (should be 0 if no edges collide):
#len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges)

In [50]:
# Connect each shared RNA/CyTOF feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature, feature + '_cytof', {'weight': 1, 'sign': 1})
    for feature in rna_cytof_edges
)
# Optional check (should be 0 if no edges collide):
#len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges)

In [51]:
# Connect each shared RNA/FACS feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature, feature + '_facs', {'weight': 1, 'sign': 1})
    for feature in rna_facs_edges
)
# Optional check (should be 0 if no edges collide):
#len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges)

In [52]:
# Connect each shared ADT/CyTOF feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature + '_adt', feature + '_cytof', {'weight': 1, 'sign': 1})
    for feature in adt_cytof_edges
)
# Optional check (should be 0 if no edges collide):
#(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
#    len(adt_cytof_edges))

In [53]:
# Connect each shared ADT/FACS feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature + '_adt', feature + '_facs', {'weight': 1, 'sign': 1})
    for feature in adt_facs_edges
)
# Optional check (should be 0 if no edges collide):
#(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
#    len(adt_cytof_edges) - len(adt_facs_edges))

In [54]:
# Connect each shared CyTOF/FACS feature pair with a unit-weight, positive edge.
guidance.add_edges_from(
    (feature + '_cytof', feature + '_facs', {'weight': 1, 'sign': 1})
    for feature in cytof_facs_edges
)
# Optional check (should be 0 if no edges collide):
#(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
#    len(adt_cytof_edges) - len(adt_facs_edges) - len(cytof_facs_edges))

In [55]:
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var) -len(facs.var)

In [56]:
# Validate the rna-adt-correlation graph against all four modalities with scGLUE's checks.
all_modalities = [rna, adt, cytof, facs]
scglue.graph.check_graph(guidance, all_modalities)

[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] check_graph: All checks passed!


## Building cite subgraph from full graph - rna_adt correlation

In [57]:
# Nodes of the cite subgraph: all RNA feature names followed by all ADT feature names.
guidance_sub_nodes = [*rna.var_names.tolist(), *adt.var_names.tolist()]
len(guidance_sub_nodes)

4192

In [59]:
# Subgraph of the rna-adt-correlation graph induced on the RNA and ADT nodes.
guidance_cite = guidance.subgraph(guidance_sub_nodes)

In [60]:
# Node and edge counts of the cite subgraph, shown as a (nodes, edges) pair.
n_cite_nodes = len(guidance_cite.nodes)
n_cite_edges = len(guidance_cite.edges)
n_cite_nodes, n_cite_edges

(4192, 4212)

In [61]:
# Edge-count check: one self-loop per RNA and ADT feature plus one edge per
# shared RNA/ADT feature; the difference is 0 when the subgraph has exactly those.
expected_cite_edges = len(rna.var_names) + len(adt.var_names) + len(rna_adt_edges)
len(guidance_cite.edges) - expected_cite_edges

0

## Saving graph for full data and subgraph for sub data - rna_adt correlation

In [62]:
# Save the full rna-adt-correlation guidance graph. The output folder is
# created first so the cell does not fail when the folder does not exist yet.
os.makedirs("guidance_graphs/rna_adt_corr", exist_ok=True)
nx.write_graphml(guidance, "guidance_graphs/rna_adt_corr/guidance.graphml.gz")

In [63]:
# Save the rna-adt-correlation cite subgraph. The output folder is created
# first so the cell does not fail when the folder does not exist yet.
os.makedirs("guidance_graphs/rna_adt_corr", exist_ok=True)
nx.write_graphml(guidance_cite, "guidance_graphs/rna_adt_corr/guidance_cite.graphml.gz")

#  <font color='red'> Building the guidance graph - all correlations </font>

In [64]:
# Start the all-correlations guidance graph as a fresh, empty undirected
# graph (this rebinds `guidance`, replacing the previous graph object).
guidance = nx.Graph()
guidance

<networkx.classes.graph.Graph at 0x7f5d7ad7cc70>

In [65]:
# Register every RNA feature as a node.
guidance.add_nodes_from(rna.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var.index)

In [66]:
# Register every ADT feature as a node.
guidance.add_nodes_from(adt.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var)

In [67]:
# Register every CyTOF feature as a node.
guidance.add_nodes_from(cytof.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var)

In [68]:
# Register every FACS feature as a node.
guidance.add_nodes_from(facs.var_names)
# Optional check (should be 0 if no names collide):
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var) - len(facs.var)

In [69]:
# Give every node a self-loop with unit weight and positive sign.
guidance.add_edges_from(
    [(feature, feature, {'weight': 1, 'sign': 1}) for feature in guidance.nodes]
)
# Optional check (should be 0):
#len(list(guidance.edges)) - len(list(guidance.nodes))

In [70]:
# Connect shared RNA/ADT features; weight is the absolute Pearson correlation
# and sign is the sign of that correlation.
# NOTE(review): scGLUE documents edge weights in (0, 1] and signs of +/-1, so a
# zero or NaN correlation would produce an invalid edge - confirm none occur.
guidance.add_edges_from(
    (
        feature,
        feature + '_adt',
        {
            'weight': np.absolute(rna_adt_corrs[feature]),
            'sign': np.sign(rna_adt_corrs[feature]),
        },
    )
    for feature in rna_adt_edges
)
# Optional check (should be 0 if no edges collide):
#len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges)

In [71]:
# Connect shared RNA/CyTOF features; weight is the absolute Pearson correlation
# and sign is the sign of that correlation.
# NOTE(review): scGLUE documents edge weights in (0, 1] and signs of +/-1, so a
# zero or NaN correlation would produce an invalid edge - confirm none occur.
guidance.add_edges_from(
    (
        feature,
        feature + '_cytof',
        {
            'weight': np.absolute(rna_cytof_corrs[feature]),
            'sign': np.sign(rna_cytof_corrs[feature]),
        },
    )
    for feature in rna_cytof_edges
)
# Optional check (should be 0 if no edges collide):
#len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges)

In [72]:
# Connect shared RNA/FACS features; weight is the absolute Pearson correlation
# and sign is the sign of that correlation.
# NOTE(review): scGLUE documents edge weights in (0, 1] and signs of +/-1, so a
# zero or NaN correlation would produce an invalid edge - confirm none occur.
guidance.add_edges_from(
    (
        feature,
        feature + '_facs',
        {
            'weight': np.absolute(rna_facs_corrs[feature]),
            'sign': np.sign(rna_facs_corrs[feature]),
        },
    )
    for feature in rna_facs_edges
)
# Optional check (should be 0 if no edges collide):
#len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges)

In [73]:
# Connect shared ADT/CyTOF features; weight is the absolute Pearson correlation
# and sign is the sign of that correlation.
# NOTE(review): scGLUE documents edge weights in (0, 1] and signs of +/-1, so a
# zero or NaN correlation would produce an invalid edge - confirm none occur.
guidance.add_edges_from(
    (
        feature + '_adt',
        feature + '_cytof',
        {
            'weight': np.absolute(adt_cytof_corrs[feature]),
            'sign': np.sign(adt_cytof_corrs[feature]),
        },
    )
    for feature in adt_cytof_edges
)
# Optional check (should be 0 if no edges collide):
#(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
#    len(adt_cytof_edges))

In [74]:
# Connect shared ADT/FACS features; weight is the absolute Pearson correlation
# and sign is the sign of that correlation.
# NOTE(review): scGLUE documents edge weights in (0, 1] and signs of +/-1, so a
# zero or NaN correlation would produce an invalid edge - confirm none occur.
guidance.add_edges_from(
    (
        feature + '_adt',
        feature + '_facs',
        {
            'weight': np.absolute(adt_facs_corrs[feature]),
            'sign': np.sign(adt_facs_corrs[feature]),
        },
    )
    for feature in adt_facs_edges
)
# Optional check (should be 0 if no edges collide):
#(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
#    len(adt_cytof_edges) - len(adt_facs_edges))

In [75]:
# Connect shared CyTOF/FACS features; weight is the absolute Pearson correlation
# and sign is the sign of that correlation.
# NOTE(review): scGLUE documents edge weights in (0, 1] and signs of +/-1, so a
# zero or NaN correlation would produce an invalid edge - confirm none occur.
guidance.add_edges_from(
    (
        feature + '_cytof',
        feature + '_facs',
        {
            'weight': np.absolute(cytof_facs_corrs[feature]),
            'sign': np.sign(cytof_facs_corrs[feature]),
        },
    )
    for feature in cytof_facs_edges
)
# Optional check (should be 0 if no edges collide):
#(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
#    len(adt_cytof_edges) - len(adt_facs_edges) - len(cytof_facs_edges))

In [76]:
#len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var) -len(facs.var)

In [77]:
# Validate the all-correlations graph against all four modalities with scGLUE's checks.
all_modalities = [rna, adt, cytof, facs]
scglue.graph.check_graph(guidance, all_modalities)

[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] check_graph: All checks passed!


## Building cite subgraph from full graph - all correlations

In [78]:
# Nodes of the cite subgraph: all RNA feature names followed by all ADT feature names.
guidance_sub_nodes = [*rna.var_names.tolist(), *adt.var_names.tolist()]
len(guidance_sub_nodes)

4192

In [79]:
# Subgraph of the all-correlations graph induced on the RNA and ADT nodes.
guidance_cite = guidance.subgraph(guidance_sub_nodes)

In [80]:
# Node and edge counts of the cite subgraph, shown as a (nodes, edges) pair.
n_cite_nodes = len(guidance_cite.nodes)
n_cite_edges = len(guidance_cite.edges)
n_cite_nodes, n_cite_edges

(4192, 4212)

In [81]:
# Edge-count check: one self-loop per RNA and ADT feature plus one edge per
# shared RNA/ADT feature; the difference is 0 when the subgraph has exactly those.
expected_cite_edges = len(rna.var_names) + len(adt.var_names) + len(rna_adt_edges)
len(guidance_cite.edges) - expected_cite_edges

0

## Saving graph for full data and subgraph for sub data - all correlations

In [82]:
# Save the full all-correlations guidance graph. The output folder is created
# first so the cell does not fail when the folder does not exist yet.
os.makedirs("guidance_graphs/all_corr", exist_ok=True)
nx.write_graphml(guidance, "guidance_graphs/all_corr/guidance.graphml.gz")

In [83]:
# Save the all-correlations cite subgraph. The output folder is created first
# so the cell does not fail when the folder does not exist yet.
os.makedirs("guidance_graphs/all_corr", exist_ok=True)
nx.write_graphml(guidance_cite, "guidance_graphs/all_corr/guidance_cite.graphml.gz")