In [1]:
# Show the kernel's working directory; the `cd` cells further down are relative to it.
# The explicit %-prefix keeps this working regardless of the automagic setting.
%pwd

'/home/yang1641/PINNACLE'

In [5]:
# Move into evaluate/ - presumably so that the local visualize_representations
# module imported below can be found (TODO confirm).
# Written as an explicit line magic so it does not depend on automagic and
# cannot be shadowed by a variable named `cd`.
%cd evaluate

/home/yang1641/PINNACLE/evaluate


In [3]:
import glob
from collections import Counter
import os
import numpy as np
import pandas as pd
import json
import networkx as nx
import umap
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from visualize_representations import read_ppi, read_embed, plot_PINNACLE_embeddings, plot_emb, fit_umap, plot_protein_umap, plot_metagraph_umap

In [11]:
# Hand-written replacement for the command-line arguments of the training script.
# References:
#   https://github.com/mims-harvard/PINNACLE/blob/main/pinnacle/run_pinnacle.sh
#   https://github.com/mims-harvard/PINNACLE/blob/main/pinnacle/parse_args.py
args_manual = dict(
    # Values that get_hparams() reports back as hyperparameters
    pc_att_channels=16,
    feat_mat=1024,
    output=16,
    hidden=64,
    lr=0.01,
    wd=5e-4,
    dropout=0.6,
    gradclip=1.0,
    n_heads=8,
    lmbda=0.1,
    theta=0.3,
    lr_cent=0.1,
    loss_type="BCE",
    plot=False,
    # Input network files
    G_f='/scratch/gilbreth/yang1641/exome/data/PINNACLE/networks/global_ppi_edgelist.txt',
    ppi_dir='/scratch/gilbreth/yang1641/exome/data/PINNACLE/networks/ppi_edgelists/',
    mg_f='/scratch/gilbreth/yang1641/exome/data/PINNACLE/networks/mg_edgelist.txt',
    # Run options
    epochs=10,
    resume_run='',
    loader='graphsaint',
    batch_size=8,
    norm=None,
    # Prefix of the saved run; the figure directory is derived from it further down
    save_prefix='/scratch/gilbreth/yang1641/exome/results/pinnacle/reproduce/epoch250/try1',
    # Plotting switches
    do_sweep=False,
    do_sweep_plot=False,
    do_plot_metagraph=False,
)

In [9]:
import argparse

In [10]:
# Create an empty argument parser.
# NOTE(review): `parser` is not referenced by any other cell visible here - confirm it is still needed.
parser = argparse.ArgumentParser()

In [12]:
# Sub-directory name (with trailing slash) appended to the run directory for figures.
output_dir = "figures/"
# Despite the name, this is a directory prefix: file names are concatenated onto it below.
input_f = "/scratch/gilbreth/yang1641/exome/results/pinnacle/reproduce/epoch250/"

In [13]:
# Load the saved protein embeddings, metagraph embeddings and label table of run "try1".
print("Read in data...")
ppi_x, mg_x, labels_dict = read_embed(
    f"{input_f}try1_protein_embed.pth",
    f"{input_f}try1_mg_embed.pth",
    f"{input_f}try1_labels_dict.txt",
)

Read in data...
PPI embeddings torch.Size([394760, 128])
Meta graph embeddings torch.Size([218, 128])


In [14]:
# Read the per-cell-type PPI networks from the directory configured in args_manual
# (same value as the previously hard-coded path, now kept in one place).
# NOTE: ppi_layers is rebound later by the read_data(...) call.
ppi_layers = read_ppi(args_manual['ppi_dir'])

In [15]:
# Inspect the container type returned by read_ppi.
type(ppi_layers)

dict

In [16]:
# Display the mapping from cell-type name to networkx Graph.
# The keys produced by read_ppi use underscores in place of spaces.
ppi_layers

{'duct_epithelial_cell': <networkx.classes.graph.Graph at 0x1476b30b5eb0>,
 'myoepithelial_cell': <networkx.classes.graph.Graph at 0x1476b30d1760>,
 'mucus_secreting_cell': <networkx.classes.graph.Graph at 0x1476b3083490>,
 'intestinal_crypt_stem_cell': <networkx.classes.graph.Graph at 0x14777ea34700>,
 'cd4-positive,_alpha-beta_memory_t_cell': <networkx.classes.graph.Graph at 0x1476b2e6fee0>,
 'plasmacytoid_dendritic_cell': <networkx.classes.graph.Graph at 0x1476b30fc160>,
 'ciliated_cell': <networkx.classes.graph.Graph at 0x1476b30ef790>,
 'mature_enterocyte': <networkx.classes.graph.Graph at 0x1476b30efac0>,
 'naive_regulatory_t_cell': <networkx.classes.graph.Graph at 0x14777cd2c700>,
 'pulmonary_ionocyte': <networkx.classes.graph.Graph at 0x14777cd2c640>,
 'erythroid_progenitor': <networkx.classes.graph.Graph at 0x1476b3155a00>,
 'pancreatic_alpha_cell': <networkx.classes.graph.Graph at 0x1476b3155280>,
 'sperm': <networkx.classes.graph.Graph at 0x1476b3155040>,
 'tongue_muscle_cel

In [17]:
# Go back up to the repository root - presumably so that the `pinnacle`
# package imported in the next cell can be found (TODO confirm).
# Explicit line magic: does not depend on automagic being enabled.
%cd ..

/home/yang1641/PINNACLE


In [24]:
from pinnacle.generate_input import read_data
from pinnacle.parse_args import get_hparams

import argparse

# Wrap the hand-written settings in a Namespace so they can be accessed
# attribute-style, like parsed command-line arguments.
args = argparse.Namespace(**args_manual)
print(args)

# Generate the hyperparameter dict from the namespace.
hparams_raw = get_hparams(args)

Namespace(G_f='/scratch/gilbreth/yang1641/exome/data/PINNACLE/networks/global_ppi_edgelist.txt', batch_size=8, do_plot_metagraph=False, do_sweep=False, do_sweep_plot=False, dropout=0.6, epochs=10, feat_mat=1024, gradclip=1.0, hidden=64, lmbda=0.1, loader='graphsaint', loss_type='BCE', lr=0.01, lr_cent=0.1, mg_f='/scratch/gilbreth/yang1641/exome/data/PINNACLE/networks/mg_edgelist.txt', n_heads=8, norm=None, output=16, pc_att_channels=16, plot=False, ppi_dir='/scratch/gilbreth/yang1641/exome/data/PINNACLE/networks/ppi_edgelists/', resume_run='', save_prefix='/scratch/gilbreth/yang1641/exome/results/pinnacle/reproduce/epoch250/try1', theta=0.3, wd=0.0005)
Hyperparameters: {'pc_att_channels': 16, 'feat_mat': 1024, 'output': 16, 'hidden': 64, 'lr': 0.01, 'wd': 0.0005, 'dropout': 0.6, 'gradclip': 1.0, 'n_heads': 8, 'lambda': 0.1, 'theta': 0.3, 'lr_cent': 0.1, 'loss_type': 'BCE', 'plot': False}


In [26]:
# Build the model inputs from the global PPI edge list, the per-cell-type PPI
# directory and the metagraph edge list. The feature-matrix size is taken from
# `args` rather than from the hparams dict.
# NOTE: this rebinds ppi_layers, which was previously set from read_ppi.
(
    ppi_data,
    mg_data,
    edge_attr_dict,
    celltype_map,
    tissue_neighbors,
    ppi_layers,
    metagraph,
) = read_data(args.G_f, args.ppi_dir, args.mg_f, args.feat_mat)

Number of PPI layers: 156 156 156 156
Number of nodes: 218 Number of edges: 4018
{'duct epithelial cell': <networkx.classes.graph.Graph object at 0x1476b31a1070>, 'myoepithelial cell': <networkx.classes.graph.Graph object at 0x14760fabff70>, 'mucus secreting cell': <networkx.classes.graph.Graph object at 0x14760facf430>, 'intestinal crypt stem cell': <networkx.classes.graph.Graph object at 0x14760facfe50>, 'cd4-positive, alpha-beta memory t cell': <networkx.classes.graph.Graph object at 0x14760facf250>, 'plasmacytoid dendritic cell': <networkx.classes.graph.Graph object at 0x14760faae2b0>, 'ciliated cell': <networkx.classes.graph.Graph object at 0x14760faae610>, 'mature enterocyte': <networkx.classes.graph.Graph object at 0x14760faaec70>, 'naive regulatory t cell': <networkx.classes.graph.Graph object at 0x14760faaeee0>, 'pulmonary ionocyte': <networkx.classes.graph.Graph object at 0x14760faf1610>, 'erythroid progenitor': <networkx.classes.graph.Graph object at 0x14760faf12b0>, 'pancre

All relabeled nodes: [0, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 4, 10, 11, 25, 39, 40, 76, 78, 84, 132, 196, 1, 2, 3, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 41, 42, 43, 44, 45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 128, 130, 131, 134, 135, 137, 139, 140, 141, 142, 195, 189, 198, 201, 191, 46, 112, 202, 51, 133, 136, 138, 203, 184, 101, 182, 164, 199, 187, 123, 36, 190, 204, 129, 165, 216, 175, 207, 192, 186, 183, 178, 166, 171, 209, 194, 200, 211, 167, 206, 181, 208, 177, 172, 170, 159, 169, 188, 174, 215, 162, 214, 161, 156, 157, 210, 205, 180, 197, 213, 212, 163, 179, 173, 168, 217, 193, 176, 185, 160, 158

In [27]:
# Display ppi_layers as returned by read_data.
# Unlike the read_ppi result shown earlier, these keys contain spaces rather than underscores.
ppi_layers

{'duct epithelial cell': <networkx.classes.graph.Graph at 0x1476b2ef1430>,
 'myoepithelial cell': <networkx.classes.graph.Graph at 0x1476b311a160>,
 'mucus secreting cell': <networkx.classes.graph.Graph at 0x14760facfc40>,
 'intestinal crypt stem cell': <networkx.classes.graph.Graph at 0x14760facfb20>,
 'cd4-positive, alpha-beta memory t cell': <networkx.classes.graph.Graph at 0x14760facfd00>,
 'plasmacytoid dendritic cell': <networkx.classes.graph.Graph at 0x14760facf220>,
 'ciliated cell': <networkx.classes.graph.Graph at 0x14760faaefd0>,
 'mature enterocyte': <networkx.classes.graph.Graph at 0x14760faaeca0>,
 'naive regulatory t cell': <networkx.classes.graph.Graph at 0x14760faae2e0>,
 'pulmonary ionocyte': <networkx.classes.graph.Graph at 0x14760faae0d0>,
 'erythroid progenitor': <networkx.classes.graph.Graph at 0x14760faf1490>,
 'pancreatic alpha cell': <networkx.classes.graph.Graph at 0x14760faf17f0>,
 'sperm': <networkx.classes.graph.Graph at 0x14760faf11f0>,
 'tongue muscle cel

In [None]:
# TODO: consider using the read_ppi defined by yujue - ppi_layers may need keys that do not include underscores

In [28]:
# Re-read the metagraph from its tab-separated edge list.
# NOTE: this replaces the `metagraph` object returned by read_data.
metagraph = nx.read_edgelist(args.mg_f, delimiter="\t")

In [29]:
# Drop the sanity-check entries from every label column.
# NOTE: despite its name, sanity_idx holds the positions that are KEPT,
# i.e. those whose cell-type label does not contain "Sanity".
sanity_idx = [
    position
    for position, cell_type in enumerate(labels_dict["Cell Type"])
    if "Sanity" not in cell_type
]
# Apply the same positional selection to each column of the label table.
new_labels_dict = {
    column: np.array(values)[sanity_idx]
    for column, values in labels_dict.items()
}

In [30]:
# Display the filtered label table (one numpy array per column).
new_labels_dict

{'Cell Type': array(['CCI_acinar cell of salivary gland', 'CCI_adipocyte',
        'CCI_adventitial cell', ..., 'type i nk t cell',
        'type i nk t cell', 'type i nk t cell'], dtype='<U72'),
 'Name': array(['CCI_acinar cell of salivary gland', 'CCI_adipocyte',
        'CCI_adventitial cell', ..., 'SP110', 'ZNF101', 'TMC8'],
       dtype='<U72'),
 'Degree': array([100, 100, 100, ...,   2,   1,   1]),
 'Relative Degree': array([1.     , 1.     , 1.     , ..., 0.02778, 0.01389, 0.01389]),
 'Overlap': array([ 0,  0,  0, ..., 34,  7, 13])}

In [52]:
# Peek at the first few node names only; converting the whole array to a list
# would flood the notebook with one output line per node.
new_labels_dict['Name'][:30].tolist()

['CCI_acinar cell of salivary gland',
 'CCI_adipocyte',
 'CCI_adventitial cell',
 'CCI_alveolar fibroblast',
 'CCI_artery endothelial cell',
 'CCI_b cell',
 'CCI_basal cell',
 'CCI_basal cell of prostate epithelium',
 'CCI_bladder urothelial cell',
 'CCI_bronchial smooth muscle cell',
 'CCI_bronchial vessel endothelial cell',
 'CCI_capillary endothelial cell',
 'CCI_cardiac endothelial cell',
 'CCI_cardiac muscle cell',
 'CCI_cd141-positive myeloid dendritic cell',
 'CCI_cd1c-positive myeloid dendritic cell',
 'CCI_cd24 neutrophil',
 'CCI_cd4-positive helper t cell',
 'CCI_cd4-positive, alpha-beta memory t cell',
 'CCI_cd8-positive, alpha-beta cytokine secreting effector t cell',
 'CCI_cd8-positive, alpha-beta cytotoxic t cell',
 'CCI_cell of skeletal muscle',
 'CCI_ciliary body',
 'CCI_ciliated cell',
 'CCI_ciliated epithelial cell',
 'CCI_classical monocyte',
 'CCI_club cell',
 'CCI_club cell of prostate epithelium',
 'CCI_conjunctival epithelial cell',
 'CCI_connective tissue cell',

In [31]:
print("Plot embeddings...")
# With do_sweep set, each UMAP parameter is a list of candidate values;
# otherwise a single n_neighbors / min_dist pair is used.
umap_param = (
    {"n_neighbors": [10, 20, 30, 40, 50, 100], "min_dist": [0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 0.8, 0.9]}
    if args.do_sweep
    else {"n_neighbors": 10, "min_dist": 0.9}
)

Plot embeddings...


In [34]:
# Confirm the working directory before building output paths.
# The explicit %-prefix keeps this working regardless of the automagic setting.
%pwd

'/home/yang1641/PINNACLE'

In [45]:
# Preview the figure directory: the folder containing save_prefix, plus output_dir.
# os.path is used instead of manual '/' splitting so that a save_prefix without
# any directory component gives a relative "figures/" rather than "/figures/".
os.path.join(os.path.dirname(args.save_prefix), output_dir)

'/scratch/gilbreth/yang1641/exome/results/pinnacle/reproduce/epoch250/figures/'

In [46]:
# Create the figure directory if it does not exist yet (equivalent of `mkdir -p`).
# The path is derived from the configuration instead of being typed out again.
os.makedirs(os.path.join(os.path.dirname(args.save_prefix), output_dir), exist_ok=True)

# NOTE: the figures folder contains files that are quoted - but they can still be viewed

In [47]:
# Plot the embeddings. The last argument is the output directory for figures:
# the folder containing save_prefix, plus output_dir. os.path is used instead of
# manual '/' splitting so that a save_prefix without any directory component
# gives a relative "figures/" rather than "/figures/".
plot_emb(
    ppi_x, mg_x, new_labels_dict, ppi_layers, metagraph, umap_param,
    args.do_plot_metagraph, args.do_sweep, args.do_sweep_plot,
    os.path.join(os.path.dirname(args.save_prefix), output_dir),
)

Fit UMAP...
UMAP reduced: (394760, 2)
x 394760
y 394760
Cell Type 394760
Node Type 394760
Node Name 394760
medullary thymic epithelial cell
bronchial vessel endothelial cell
lung microvascular endothelial cell
retinal blood vessel endothelial cell
kidney epithelial cell
tongue muscle cell
cell of skeletal muscle
mesenchymal stem cell
fibroblast of breast
fibroblast of cardiac tissue
b cell
cd4-positive helper t cell


In [53]:
# Show the UMAP settings that were passed to plot_emb.
umap_param

{'n_neighbors': 10, 'min_dist': 0.9}