# Terminal commands for data access

```
autossh -AtX vschuma@ssh1.mdc-berlin.de ssh vschuma@murphy.mdc-berlin.net
# open the ssh screen and/or type
ssh -NR 6667:localhost:22 vfs@79.197.179.128 (current network ID)

# mount
sshfs -p 6667 vschuma@localhost:/data/local/rajewsky/home/vschuma /mnt/murphy_workspace/
or
sshfs vschuma@murphy:/data/local/rajewsky/home/vschuma/ ~/mounts/murphy_workspace/

# if working with the mount is too slow, copy the data to a ramdisk
sudo mount -t tmpfs -o size=6g tmpfs /mnt/ramdisk/

# then copy the file to the ramdisk
mkdir /mnt/ramdisk/data && rsync -av /mnt/murphy_workspace/NSTT/data/GarciaAlsono_uterus/hot_data/* /mnt/ramdisk/data
or
mkdir /dev/shm/data && rsync -av /home/vschuma/mounts/murphy_workspace/NSTT/data/GarciaAlsono_uterus/hot_data/* /dev/shm/data/
# symlink inside the pycharm project to the data dir
(example command)
ln -s /mnt/ramdisk/data data
e.g.: ln -s /mnt/murphy_workspace/NSTT/data/ data
or
ln -s /dev/shm/data/ data
```

In [1]:
%matplotlib inline
import copy
import os
import pickle

import anndata
import matplotlib.pyplot as plt
import novosparc
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scanpy as sc
import seaborn as sns
import squidpy as sq
from matplotlib.pyplot import figure

# Data

The data directory used below is symlinked to `/data/local/rajewsky/home/vschuma/NSTT/outputs/LabelTransfer/TNBC`.
## TNBC in-house atlas

In [3]:
# --- Atlas: target-space coordinates -------------------------------------
# Switch atlas_dir to 'output' to read from the output folder instead.
atlas_dir = 'data'
target_space_path = os.path.join(atlas_dir, 'TNBC_6w_fc51_4_coordinates.csv')
coordinates_table = pd.read_csv(target_space_path, sep=',')
# One row per spatial location in the coordinates file.
num_locations = coordinates_table.shape[0]
# Keep only the x/y columns as a plain (num_locations, 2) array.
locations_apriori = coordinates_table[['xcoord', 'ycoord']].values
locations = locations_apriori

# --- Atlas: expression table ---------------------------------------------
atlas_path = os.path.join(atlas_dir, 'TNBC_reconstruction_atlas.csv')
# Read the CSV in chunks of one million rows and stitch them back together.
df = pd.concat(pd.read_csv(atlas_path, index_col=0, chunksize=1000000))
atlas = sc.AnnData(df)
# Attach the coordinates so spatial plotting functions can find them.
atlas.obsm['spatial'] = locations
atlas_genes = atlas.var.index.tolist()

# Report: number of atlas genes, atlas shape, coordinate array shape.
print(len(atlas_genes), atlas.shape, locations.shape, sep='\n')

600
(55478, 600)
(55478, 2)


  atlas = sc.AnnData(df)


## TNBC single-nuclei data

For preprocessing details and further information, see `metadata_transfer_TNBC_sndata.ipynb`.

In [5]:
# Read the single-nuclei expression data into a scanpy AnnData (cells x genes).
# The table returned by sc.read is transposed to obtain that orientation.
# data_dir = 'output'
data_dir = 'data'
data_path = os.path.join(data_dir, 'TNBC_sn_integrated_dge.csv')
dataset = sc.read(data_path).T
gene_names = dataset.var.index.tolist()
num_cells, num_genes = dataset.shape
print('number of cells total: %d' % num_cells)
print('number of genes: %d' % num_genes)

# Use only a subset of the cells for testing.
# Set num_cells_subset = None to keep every cell.
# Previously num_cells was overwritten without touching `dataset`, so the
# printed "cells used" did not match the data that was actually used.
num_cells_subset = 100
if num_cells_subset is not None and num_cells_subset < num_cells:
    # In-place random subsample; fixed seed so Restart & Run All is reproducible.
    sc.pp.subsample(dataset, n_obs=num_cells_subset, random_state=0)
num_cells = dataset.n_obs
print('number of cells used: %d' % num_cells)

number of cells: 26978
number of genes: 3000


In [None]:
# Plot the atlas expression of a few selected genes.
pl_genes = ['Rbp1', 'Hsd17b2']
novosparc.pl.embedding(
    atlas,
    pl_genes,
    pt_size=0.1,
    size_x=7,
    size_y=4.5,
)
# TODO (open questions about novosparc.pl.embedding):
#   - it should raise an error when pl_genes has the wrong input format
#   - why are the dots no longer filled?
#   - the unit of the size arguments should be documented
#   - why is the overall figure size not adjusted automatically when
#     size_x / size_y change?

# Alternative: plot all atlas genes.
# pl_genes = atlas.var.index.to_frame()
# novosparc.pl.embedding(atlas, pl_genes.values.flatten().tolist())


# Reconstruction
## create tissue object

In [None]:
# Build the novosparc Tissue object and reconstruct with both the smooth
# (neighbourhood) and the linear (atlas marker) cost.

# Parameters for the smooth cost.
# Author's note: only needed for the part that does not use the atlas.
num_neighbors_s = num_neighbors_t = 5

# Parameters for the linear cost: genes present in both atlas and dataset.
# Atlas order is kept so the marker order (and thus the column order of
# atlas_matrix) is identical across kernel restarts; iterating a set of
# strings directly is not.
shared_genes = set(atlas_genes).intersection(gene_names)
markers = [gene for gene in dict.fromkeys(atlas_genes) if gene in shared_genes]
if not markers:
    raise ValueError(
        'no genes are shared between the atlas and the single-nuclei dataset'
    )
atlas_matrix = atlas.to_df()[markers].values
# Positional index of every dataset gene, looked up by gene name.
markers_idx = pd.DataFrame({'markers_idx': np.arange(num_genes)}, index=gene_names)
# Column positions of the marker genes inside the dataset, in marker order.
markers_to_use = markers_idx.loc[markers, 'markers_idx'].to_numpy()

# Construct the tissue object from the expression data and target locations.
tissue = novosparc.cm.Tissue(dataset=dataset, locations=locations_apriori)

# Alternative 1: set up both assumptions (smooth + linear cost) at once.
tissue.setup_reconstruction(atlas_matrix=atlas_matrix,
                            markers_to_use=markers_to_use,
                            num_neighbors_s=num_neighbors_s,
                            num_neighbors_t=num_neighbors_t)

# Compute the optimal transport of cells to locations.
alpha_linear = 0.1
epsilon = 5e-3
tissue.reconstruct(alpha_linear=alpha_linear, epsilon=epsilon)

## save tissue object

In [None]:
save_dir = 'data'
# The tissue object is not an AnnData, so it cannot be written as .h5ad.
# Serialise the whole Python object with pickle instead.
# Reload with pickle.load, and only from files you trust.
tissue_path = os.path.join(save_dir, 'TNBC_tissueObj_basicRecon.pkl')
with open(tissue_path, 'wb') as tissue_file:
    pickle.dump(tissue, tissue_file)

In [None]:
# Wrap the reconstructed spatial expression (sDGE) in an AnnData and save it.
# NOTE(review): novosparc appears to store tissue.sdge as genes x locations
# (its tutorial transposes it before building an AnnData) - confirm.
# AnnData needs observations (locations) on the rows, otherwise the
# per-location coordinates cannot be attached to .obsm.
sdge_by_location = np.asarray(tissue.sdge).T
if sdge_by_location.shape[0] != locations.shape[0]:
    raise ValueError(
        'sdge has %d rows after transposing but there are %d locations'
        % (sdge_by_location.shape[0], locations.shape[0])
    )

# create
dataset_tissue = sc.AnnData(sdge_by_location, dtype=float)
# Label the columns with the dataset's gene names so the file is usable
# on its own (assumes the tissue was built from all genes of `dataset`).
dataset_tissue.var_names = gene_names
dataset_tissue.obsm['spatial'] = locations

# save
dataset_tissue.write(os.path.join(save_dir, 'TNBC_tissue_sdge_basicRecon.h5ad'))