# Params description

In [1]:
import pandas as pd

In [2]:
# Full CAMEX configuration, grouped by pipeline stage:
# 'preprocess' (input data), 'train' (model/training) and 'postprocess'.
PARAMS = {
    'preprocess': {
        # --- Normally only `path` and `dataset_file` need to be adapted. ---
        'path': './dataset/',  # folder holding the .h5ad and .csv files

        # One row per dataset pair. Column meaning, in order:
        #   source            - species-1 dataset file
        #   source label      - True if the species-1 dataset has manual
        #                       annotations, False if it does not
        #   relationship      - many-to-many homologous genes file
        #   destination       - species-2 dataset file
        #   destination label - True if the species-2 dataset has manual
        #                       annotations, False if it does not
        'dataset_file': pd.DataFrame(
            data=[
                # human -> monkey
                [
                    'raw-liver-human-Martin.h5ad', True,
                    'gene_matches_human2monkey.csv',
                    'raw-liver-monkey-Martin.h5ad', False,
                ],
                # human -> mouse
                [
                    'raw-liver-human-Martin.h5ad', True,
                    'gene_matches_human2mouse.csv',
                    'raw-liver-mouse-Martin.h5ad', False,
                ],
                # human -> zebrafish
                [
                    'raw-liver-human-Martin.h5ad', True,
                    'gene_matches_human2zebrafish.csv',
                    'raw-liver-zebrafish-ggj5.h5ad', False,
                ],
            ],
            columns=[
                'source',
                'source label',
                'relationship',
                'destination',
                'destination label',
            ],
        ),

        # --- The options below do not need to be changed. ---
        'graph_mode': 'undirected',  # 'undirected' (default) or 'directed'
        'feature_gene': 'HIG',       # feature type
        'sample_ratio': 1,           # default 1; use a ratio in (0, 1] to down-sample the dataset
        # Set ref and query with the same cell type.
        # NOTE(review): this is the string 'False', not the boolean False --
        # confirm how the consumer interprets it.
        'get_balance': 'False',
    },

    'train': {
        # --- Recommended to tune only these 5 values (or keep the defaults). ---
        'device': 'cuda:0',          # 'cpu' or a cuda device
        'train_mode': 'mini_batch',  # mini_batch or full batch
        'epoch_integration': 10,     # number of integration epochs
        'epoch_annotation': 10,      # number of annotation epochs
        'batch_size': 1024,          # batch size

        # --- The options below do not need to be changed. ---
        'dim_hidden': 128,     # dimension of the cell / gene embeddings
        'gnn_layer_num': 2,    # number of GNN layers
        'encoder': 'GCN',      # type of GNN encoder
        'classifier': 'GAT',   # type of classifier encoder
        'res': True,           # use residual connections or not
        'share': True,         # share the parameters or not
        'cluster': False,      # presumably toggles clustering during training -- TODO confirm
        'epoch_cluster': 10,   # number of epochs for clustering in the training step
        'cluster_num': 5,      # number of clusters in the training step
        'domain': False,       # use domain adaptation or not
        'reconstruct': True,   # reconstruct the node features or not
    },

    'postprocess': {},
}

# For simplicity, you only need to change the input data to run CAMEX on your own data

## We put the gene expression ".h5ad" files and many-to-many homologous genes ".csv" files in the dataset folder.

## We recommend naming the gene expression files using the pattern "raw-organ-species-dataset name.h5ad", because we will calculate the relationships between the datasets in CAMEX.

In [3]:
%%html
<img src="./files.png" width="800" height="400">

## adata.obs must have a column named cell_ontology_class

In [4]:
import scanpy as sc

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Read the human liver dataset, then preview the first five rows of its
# per-cell metadata table (adata.obs).
adata = sc.read_h5ad('./dataset/raw-liver-human-Martin.h5ad')
adata.obs.head(n=5)

  utils.warn_names_duplicates("var")


Unnamed: 0_level_0,UMAP_1,UMAP_2,cluster,annot,sample,patient,cell,digest,typeSample,diet,cell_ontology_class
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAAGTCCCAGGACAGT-40,-0.055124,4.632419,13,Macrophages,CS170,H38,AAAGTCCCAGGACAGT-40,nuclei,nucSeq,Lean,Macrophages
AACAAGAGTTTACTGG-40,0.180024,4.72365,13,Macrophages,CS170,H38,AACAAGAGTTTACTGG-40,nuclei,nucSeq,Lean,Macrophages
AAGCATCCAACCGCCA-40,0.172339,4.693343,13,Macrophages,CS170,H38,AAGCATCCAACCGCCA-40,nuclei,nucSeq,Lean,Macrophages
AAGGAATAGGCTGTAG-40,0.422765,4.473393,13,Macrophages,CS170,H38,AAGGAATAGGCTGTAG-40,nuclei,nucSeq,Lean,Macrophages
AAGTTCGAGTAAACTG-40,0.038288,4.285389,13,Macrophages,CS170,H38,AAGTTCGAGTAAACTG-40,nuclei,nucSeq,Lean,Macrophages


## It is worth noting that the column names in a many-to-many homology relationship file are the names of the datasets of the corresponding species.

## Here, we use humans as a reference, mapping the genes of other species into the space of human genes.

In [6]:
%%html
<img src="./many-to-many.png" width="800" height="400">

## In addition, True indicates that the species' dataset has manual annotations, and False indicates that it does not.

## For integration, you can set any dataset to True; for annotation, at least one dataset must be set to True.

In [7]:
# Example row: [source dataset, source has manual annotations?, homologous genes file, destination dataset, destination has manual annotations?]
['raw-liver-human-Martin.h5ad', True, 'gene_matches_human2monkey.csv', 'raw-liver-monkey-Martin.h5ad', False]

['raw-liver-human-Martin.h5ad',
 True,
 'gene_matches_human2monkey.csv',
 'raw-liver-monkey-Martin.h5ad',
 False]