In [1]:
import anndata as ad
import networkx as nx
import scanpy as sc
import scglue
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import scipy
import os

In [2]:
def load_data():
    print("loading data..\n")
    
    rna = ad.read_h5ad("../pp_harm_data/all_samples/rna-pp-harm-sub.h5ad")
    adt = ad.read_h5ad("../pp_harm_data/all_samples/adt-pp-harm-sub.h5ad")
    cytof = ad.read_h5ad("../pp_harm_data/all_samples/cytof-pp-harm-sub.h5ad")
    facs = ad.read_h5ad("../pp_harm_data/all_samples/facs-pp-harm-sub.h5ad")

    print("rna.shape: {}".format(rna.shape))
    print("adt.shape: {}".format(adt.shape))
    print("cytof.shape: {}".format(cytof.shape))
    print("facs.shape: {}\n".format(facs.shape))
    
    return rna, adt, cytof, facs

In [3]:
def intersecting_genes(rna, adt, cytof, facs):
    
    print("finding intersecting genes:..\n\n")
    
    rna_adt_edges = rna.var_names.intersection(adt.var_names)
    rna_cytof_edges = rna.var_names.intersection(cytof.var_names)
    rna_facs_edges = rna.var_names.intersection(facs.var_names)
    adt_cytof_edges = adt.var_names.intersection(cytof.var_names)
    adt_facs_edges = adt.var_names.intersection(facs.var_names)
    cytof_facs_edges = cytof.var_names.intersection(facs.var_names)
    
    print("edges between rna and adt: {}\n\n{}\n".format(len(rna_adt_edges), rna_adt_edges))
    print("edges between rna and cytof: {}\n\n{}\n".format(len(rna_cytof_edges), rna_cytof_edges))
    print("edges between rna and facs: {}\n\n{}\n\n".format(len(adt_cytof_edges), rna_facs_edges))
    print("edges between adt and cytof: {}\n\n{}\n\n".format(len(adt_cytof_edges), adt_cytof_edges))
    print("edges between adt and facs: {}\n\n{}\n\n".format(len(adt_cytof_edges), adt_facs_edges))
    print("edges between cytof and facs: {}\n\n{}\n\n".format(len(adt_cytof_edges), cytof_facs_edges))

    return rna_adt_edges, rna_cytof_edges, rna_facs_edges, adt_cytof_edges, adt_facs_edges, cytof_facs_edges

In [4]:
def convert_obs_names(adt, cytof, facs):
    adt.var.index = adt.var.index + '_adt'
    cytof.var.index = cytof.var.index + '_cytof'
    facs.var.index = facs.var.index + '_facs'

In [5]:
def build_full_graph(rna, adt, cytof, facs):
    
    (rna_adt_edges, rna_cytof_edges, rna_facs_edges, 
         adt_cytof_edges, adt_facs_edges, cytof_facs_edges) = intersecting_genes(rna, adt, cytof, facs)
    
    convert_obs_names(adt, cytof, facs) #converts obs names so that the graph distringuishes features from different modalities
    
    print("building full guidance graph..\n\n")
    
    guidance = nx.Graph()

    guidance.add_nodes_from(rna.var.index)
    #print(len(list(guidance.nodes)) - len(rna.var.index))

    guidance.add_nodes_from(adt.var.index)
    #print(len(list(guidance.nodes)) - len(rna.var) - len(adt.var))

    guidance.add_nodes_from(cytof.var.index)
    #print(len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var))

    guidance.add_nodes_from(facs.var.index)
    #print(len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var) - len(facs.var))

    guidance.add_edges_from([(node, node, {'weight':1, 'sign':1}) for node in list(guidance.nodes)])
    #print(len(list(guidance.edges)) - len(list(guidance.nodes)))

    guidance.add_edges_from([(node, node + '_adt', {'weight':0.9, 'sign':1}) for node in rna_adt_edges])
    #print(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges))

    guidance.add_edges_from([(node, node + '_cytof', {'weight':0.9, 'sign':1}) for node in rna_cytof_edges])
    #print(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges))

    guidance.add_edges_from([(node, node + '_facs', {'weight':0.9, 'sign':1}) for node in rna_facs_edges])
    #print(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges))

    guidance.add_edges_from([(node + '_adt', node + '_cytof', {'weight':0.9, 'sign':1}) for node in adt_cytof_edges])
    #print(len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
        #len(adt_cytof_edges))

    guidance.add_edges_from([(node + '_adt', node + '_facs', {'weight':0.9, 'sign':1}) for node in adt_facs_edges])
    #print((len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
        #len(adt_cytof_edges) - len(adt_facs_edges)))

    guidance.add_edges_from([(node + '_cytof', node + '_facs', {'weight':0.9, 'sign':1}) for node in cytof_facs_edges])
    #print((len(list(guidance.edges)) - len(list(guidance.nodes)) - len(rna_adt_edges) - len(rna_cytof_edges) - len(rna_facs_edges) -
        #len(adt_cytof_edges) - len(adt_facs_edges) - len(cytof_facs_edges)))

    #print("checking full graph number of nodes")
    #print(len(list(guidance.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var) -len(facs.var))
    
    print("full graph number of nodes: {}\nfull graph number of edges: {}\n\n".format(len(list(guidance.nodes)),
                                                                                  len(list(guidance.edges))))
    
    return guidance

In [6]:
def cite_subgraph(full_graph, rna, adt):
    
    print("building cite_subgraph.. \n\n")
    
    node_list = rna.var_names.tolist() + adt.var_names.tolist()
    cite_graph = full_graph.subgraph(node_list)
    
    print("cite graph number of nodes: {}\ncite graph number of edges: {}\n\n".format(len(list(cite_graph.nodes)),
                                                                                  len(list(cite_graph.edges))))
    
    #print("checking cite number of nodes")
    #print(len(list(cite_graph.nodes)) - len(rna.var) - len(adt.var))
        
    return cite_graph

In [7]:
def trimodal_subgraph(full_graph, rna, adt, cytof):
    
    print("building trimodal subgraph.. \n\n")
    
    node_list = rna.var_names.tolist() + adt.var_names.tolist() + cytof.var_names.tolist()
    trimodal_graph = full_graph.subgraph(node_list)
    
    print("trimodal graph number of nodes: {}\ntrimodal graph number of edges: {}\n\n".format(len(list(trimodal_graph.nodes)),
                                                                                          len(list(trimodal_graph.edges))))
    
    #print("checking trimodal number of nodes")
    #print(len(list(trimodal_graph.nodes)) - len(rna.var) - len(adt.var) - len(cytof.var))
    
    return trimodal_graph

In [8]:
def write_graphs(full_graph, cite_graph, trimodal_graph):
    
    graph_path = "./guidance_graphs/point_nine_corr/all_samples/"
    
    os.makedirs(graph_path, exist_ok=True)
    
    print("writing graphs in path: {}".format(graph_path))
    
    nx.write_graphml(full_graph, graph_path + "full_graph.graphml.gz")
    nx.write_graphml(cite_graph, graph_path + "cite_graph.graphml.gz")
    nx.write_graphml(trimodal_graph, graph_path + "trimodal_graph.graphml.gz")
    
    print("\n\nwriting complete")

In [9]:
def main():
    rna, adt, cytof, facs = load_data() # load data
    full_graph = build_full_graph(rna, adt, cytof, facs) #build full graph
    cite_graph = cite_subgraph(full_graph, rna, adt) #build cite graph
    trimodal_graph = trimodal_subgraph(full_graph, rna, adt, cytof) #build trimodal graph
    write_graphs(full_graph, cite_graph, trimodal_graph) #write graphs

In [10]:
main()

loading data..

rna.shape: (836148, 4000)
adt.shape: (836148, 192)
cytof.shape: (1000000, 48)
facs.shape: (131920, 12)

finding intersecting genes:..


edges between rna and adt: 16

Index(['CD34', 'CD207', 'CX3CR1', 'CD38', 'CD14', 'CD83', 'CD24', 'CD36',
       'CD163', 'CD69', 'CLEC12A', 'CD19', 'CD7', 'CD40', 'CD70', 'CD22'],
      dtype='object')

edges between rna and cytof: 8

Index(['CTLA4', 'CX3CR1', 'CD38', 'CD14', 'FOXP3', 'CD69', 'CD19', 'CCR7'], dtype='object')

edges between rna and facs: 26

Index(['CCR4', 'CD38', 'CCR6', 'CXCR3', 'CCR7'], dtype='object')


edges between adt and cytof: 26

Index(['CD3', 'CD8', 'CD19', 'CD33', 'CD11c', 'CD45RA', 'CD123', 'CD4', 'CD14',
       'CD16', 'CD25', 'CD45RO', 'CD20', 'IgM', 'CD103', 'CD69', 'CD161',
       'CD27', 'HLA_DR', 'CD39', 'CX3CR1', 'IgA', 'CD28', 'CD38', 'CD45',
       'CD99'],
      dtype='object')


edges between adt and facs: 26

Index(['CD45RA', 'CD25', 'CD27', 'CD38'], dtype='object')


edges between cytof and facs