# Linear NCEM Example

In [1]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to the local `utils` helpers
# used in this example take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pytorch_lightning as pl
import torch
import anndata as ad
from geome import ann2data, transforms
from utils.datasets import DatasetHartmann  # utils only for this example
from utils.models.linear_ncem import LinearNCEM  # utils only for this example
from utils import datamodule
import warnings

In [3]:
# Which AnnData entries feed each attribute of the generated graph objects:
# `x` is assembled from two obs columns plus the design matrix stored in obsm,
# `y` is taken from the expression matrix `X`.
fields = dict(
    x=["obs/Cluster_preprocessed", "obs/donor", "obsm/design_matrix"],
    y=["X"],
)

# Applied through `a2d` as its `preprocess` step.
# NOTE(review): presumably converts the listed obs columns to categoricals - confirm in geome.
preprocess = transforms.Categorize(
    keys=["donor", "Cluster_preprocessed", "point"],
    axis="obs",
)

# Step 1: spatial neighbourhood graph (10 neighbours requested via func_args).
add_edge_index = transforms.AddEdgeIndex(
    edge_index_key="edge_index",
    spatial_key="spatial",
    key_added="spatial",
    func_args={"n_neighs": 10},
)

# Step 2: design matrix, written under the "design_matrix" key that `fields["x"]` reads.
add_design_matrix = transforms.AddDesignMatrix(
    "obs/Cluster_preprocessed",
    "obs/donor",
    "obsp/spatial_distances",
    "design_matrix",
)

# The steps run in list order: the edge index first, then the design matrix.
transform = transforms.Compose([add_edge_index, add_design_matrix])

# obs column handed to Ann2DataByCategory as the category to iterate over.
category_to_iterate = "point"

a2d = ann2data.Ann2DataByCategory(
    fields=fields,
    category=category_to_iterate,
    preprocess=preprocess,
    transform=transform,
)

# Mibitof
# Suppress the warnings emitted by the legacy dataset-loading code.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    dataset = DatasetHartmann(data_path="./example_data/hartmann/")
    adatas = [*dataset.img_celldata.values()]

# Merge the per-image AnnData objects into a single AnnData.
# NOTE(review): string-to-category conversion is not done here; it is presumably
# handled by `preprocess` when `a2d` is applied - confirm.
adata = ad.concat(adatas)

Loading data from raw files
registering celldata
collecting image-wise celldata
adding graph-level covariates
Loaded 58 images with complete data from 4 patients over 63747 cells with 36 cell features and 8 distinct celltypes.


In [4]:
# Run the AnnData -> graph conversion and materialize every resulting object
# in a list, then display the list as the cell output.
datas = [*a2d(adata)]
datas

[Data(x=[1338, 88], y=[1338, 36]),
 Data(x=[311, 88], y=[311, 36]),
 Data(x=[768, 88], y=[768, 36]),
 Data(x=[1020, 88], y=[1020, 36]),
 Data(x=[2100, 88], y=[2100, 36]),
 Data(x=[1325, 88], y=[1325, 36]),
 Data(x=[1091, 88], y=[1091, 36]),
 Data(x=[1046, 88], y=[1046, 36]),
 Data(x=[618, 88], y=[618, 36]),
 Data(x=[61, 88], y=[61, 36]),
 Data(x=[1316, 88], y=[1316, 36]),
 Data(x=[1540, 88], y=[1540, 36]),
 Data(x=[1822, 88], y=[1822, 36]),
 Data(x=[863, 88], y=[863, 36]),
 Data(x=[564, 88], y=[564, 36]),
 Data(x=[1023, 88], y=[1023, 36]),
 Data(x=[324, 88], y=[324, 36]),
 Data(x=[287, 88], y=[287, 36]),
 Data(x=[636, 88], y=[636, 36]),
 Data(x=[890, 88], y=[890, 36]),
 Data(x=[1235, 88], y=[1235, 36]),
 Data(x=[1020, 88], y=[1020, 36]),
 Data(x=[1241, 88], y=[1241, 36]),
 Data(x=[1438, 88], y=[1438, 36]),
 Data(x=[1021, 88], y=[1021, 36]),
 Data(x=[1632, 88], y=[1632, 36]),
 Data(x=[780, 88], y=[780, 36]),
 Data(x=[524, 88], y=[524, 36]),
 Data(x=[669, 88], y=[669, 36]),
 Data(x=[241,

In [5]:
# Read the model dimensions off the first graph: the second axis of `x` gives
# the input feature count, the second axis of `y` the number of outputs.
num_features, out_channels = datas[0].x.shape[1], datas[0].y.shape[1]
(num_features, out_channels)

(88, 36)

In [6]:
# Data module wrapping the converted graphs, configured for node-level learning.
dm = datamodule.GraphAnnDataModule(
    datas=datas,
    num_workers=12,
    batch_size=12,
    learning_type="node",
)

# Linear NCEM sized from the data; optimizer hyperparameters are passed through.
model = LinearNCEM(
    in_channels=num_features,
    out_channels=out_channels,
    lr=1e-4,
    weight_decay=1e-6,
)

In [7]:
# Train on a CUDA GPU when one is available, otherwise fall back to the CPU.
# Uses the public `torch.cuda.is_available()` entry point; the previous
# `torch.torch.cuda...` spelling only worked through an accidental, undocumented
# self-reference on the torch module.
trainer: pl.Trainer = pl.Trainer(
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    max_epochs=100,
)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/selman.ozleyen/mambaforge/envs/geome/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/selman.ozleyen/mambaforge/envs/geome/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [8]:
# Fit the model using the dataloaders supplied by the data module `dm`.
trainer.fit(model, datamodule=dm)


  | Name        | Type            | Params
------------------------------------------------
0 | model_sigma | Linear          | 3.2 K 
1 | model_mu    | Linear          | 3.2 K 
2 | loss_module | GaussianNLLLoss | 0     
------------------------------------------------
6.4 K     Trainable params
0         Non-trainable params
6.4 K     Total params
0.026     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/selman.ozleyen/mambaforge/envs/geome/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

/Users/selman.ozleyen/mambaforge/envs/geome/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 0:  11%|█         | 485/4516 [00:08<01:07, 60.00it/s, v_num=17]

/Users/selman.ozleyen/mambaforge/envs/geome/lib/python3.11/site-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [9]:
# Evaluate the model on the data module's test dataloader and return the logged metrics.
# NOTE(review): the output recorded in this notebook comes from a run whose
# `trainer.fit` was stopped by a KeyboardInterrupt during epoch 0, so the
# reported metrics likely reflect a barely trained model - re-run to completion.
trainer.test(model, datamodule=dm)

/Users/selman.ozleyen/mambaforge/envs/geome/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing DataLoader 0: 100%|██████████| 266/266 [00:00<00:00, 421.45it/s]


[{'test_r2_score': -24.66176986694336, 'test_loss': -0.08943767845630646}]