# Terminal commands for data access

```
autossh -AtX vschuma@ssh1.mdc-berlin.de ssh vschuma@murphy.mdc-berlin.net
# open the ssh screen and/or type
ssh -NR 6667:localhost:22 vfs@79.197.179.128 (current network ID)

# mount
sshfs -p 6667 vschuma@localhost:/data/local/rajewsky/home/vschuma /mnt/murphy_workspace/
or
sshfs vschuma@murphy:/data/local/rajewsky/home/vschuma/ ~/mounts/murphy_workspace/

# if working with the mount is too slow, copy the data to a ramdisk
sudo mount -t tmpfs -o size=6g tmpfs /mnt/ramdisk/

# then copy the file to the ramdisk
mkdir /mnt/ramdisk/data && rsync -av /mnt/murphy_workspace/NSTT/data/GarciaAlsono_uterus/hot_data/* /mnt/ramdisk/data
or
mkdir /dev/shm/data && rsync -av /home/vschuma/mounts/murphy_workspace/NSTT/data/GarciaAlsono_uterus/hot_data/* /dev/shm/data/
# symlink inside the pycharm project to the data dir
(example command)
ln -s /mnt/ramdisk/data data
e.g.: ln -s /mnt/murphy_workspace/NSTT/data/ data
or
ln -s /dev/shm/data/ data
```

In [1]:
%matplotlib inline
import anndata
import novosparc
import os
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import copy


import plotly.express as px
import plotly.graph_objects as go

# Reading data
I got the data from Anna & Tancredi. The spatial file is from a 6w tumor slide, generated with the in-house spatial method and processed with spacemake. The other data is annotated single-nuclei data, exported by Anna. Further method details are not known to me.

The data is located in Anna's home directory on the lab server:
`/data/local/rajewsky/home/aantona/novaseq/H5AD_snRNA_ST/`
I created a symlink within my own directory:
`/data/local/rajewsky/home/vschuma/NSTT/data/TNBC`

In [2]:
# Base directory (relative to the notebook's working directory) that the cells below
# join file names onto. It is expected to be a symlink into the lab-server workspace:
# /data/local/rajewsky/home/vschuma/NSTT/output/LabelTransfer/...
# NOTE(review): the markdown above names .../NSTT/data/TNBC as the symlink location -
# confirm which target is current.
data_dir = 'data'

## Single-nuclei data

The `snRNA_TNBC_integreted.h5ad` file is the processed file from Anna: she imported the data into Seurat, did the processing and all annotations there, and exported the result back from Seurat to h5ad.

### get from h5ad file

In [4]:
%%time

# Load the single-nuclei AnnData object from the data directory.
# `sn_file` points at a reduced sample file (per the original note, any reduced sample file).
sn_file = os.path.join(data_dir,"TNBC_sn_reduced.h5ad")
# Reads the whole object into memory; the original author notes roughly 10 GB of RAM.
adata_sn = sc.read_h5ad(sn_file)
# Print the AnnData summary (dimensions and the available obs/var/obsm/... keys).
print(adata_sn)

AnnData object with n_obs × n_vars = 26978 × 3000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT', 'nFeature_SCT', 'percent_MT', 'SCT_snn_res.0.8', 'Seurat_clusters_individual', 'S.Score', 'G2M.Score', 'Phase', 'doublet_score', 'doublet_assignment', 'annotated_idents', 'condition', 'annotated_Cellcompartments', 'annotated_Celltypes', 'annotated_Cellcompartments2', 'seurat_clusters', 'annotated_CelltypesState.res.2', 'annotated_compartment.res.2', 'annotated_compartmentCellType.res.2', 'celltype_res2.condition', 'celltype_res2', 'annotated_compartmentCellType2.res.2', 'Reannotation_detailed', 'Reannotation_global', 'celltype.condition', 'ConditionGlobal'
    var: 'features', 'SCT_features'
    uns: 'neighbors'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'SCT'
    obsp: 'distances'
CPU times: user 242 ms, sys: 1.38 s, total: 1.62 s
Wall time: 28.2 s


### save h5ad as pickle

In [None]:
import pickle

KeyboardInterrupt: 

In [20]:
# Directory the pickled AnnData object is written to / read from.
save_dir = 'data'
# Build the path of the pickle file (nothing is written here yet).
# NOTE(review): the cell that writes this path uses plain pickle.dump, so despite the
# .zip suffix the file is a raw pickle, not a zip archive.
snRNA_file = os.path.join(save_dir, "TNBC_snRNA_integrated.zip")

In [22]:
%%time
picklefile = open(snRNA_file, 'wb')
#pickle the dictionary and write it to file
pickle.dump(adata_sn, picklefile)
#close the file
picklefile.close()

KeyboardInterrupt: 

### get from pickle

In [23]:
%%time
picklefile = open(snRNA_file, 'rb')
#unpickle the dataframe
adata_sn = pickle.load(picklefile)
print(type(adata_sn))

EOFError: Ran out of input

In [26]:
# outfile = 'output/TNBC_sn_reduced.h5ad'
# # there seems to be some naming issue coming from seurat, found this hack at: https://github.com/theislab/scvelo/issues/255
# adata_reduced.__dict__['_raw'].__dict__['_var'] = adata_reduced.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
# adata_reduced.write(outfile)

# Extract single parts for reconstruction
## DGE

In [16]:
# Expression matrix as a DataFrame: one column per gene (named after adata_sn.var_names).
# The AnnData summary printed above reports 26978 cells x 3000 genes.
# No `index` is passed, so the rows are NOT labelled with the cell names from adata_sn.obs;
# they can only be matched to the metadata by position.
# NOTE(review): assumes adata_sn.X is a dense array - confirm (a sparse matrix would need
# converting first).
sn_dge = pd.DataFrame(adata_sn.X, columns=adata_sn.var_names)

In [17]:
# Target path for the pickled expression DataFrame inside the data directory.
savepath = os.path.join(data_dir,'TNBC_sn_integrated_dge.zip')
# pandas infers the compression from the file suffix by default, so the .zip ending
# yields a zip-compressed pickle (read it back with pd.read_pickle).
sn_dge.to_pickle(savepath)

## Metadata
* 'Phase'
* 'annotated_idents'
* 'condition'
* 'annotated_Cellcompartments'
* 'annotated_Celltypes'

In [None]:
# Columns of adata_sn.obs that are selected as metadata of interest.
meta_o_interest = [
    'Phase',
    'annotated_idents',
    'celltype.condition',
]

# Also of interest, but these columns only contain numbers and are therefore not usable as labels.
meta_notUsable = [
    'annotated_Cellcompartments',
    'annotated_Celltypes',
    'Reannotation_global',
    'Reannotation_detailed',
]

In [None]:
# Write every annotation listed in `meta_o_interest` to its own single-column CSV file.
outdir = "output"
# Create the target directory up front so the export also works on a fresh checkout.
os.makedirs(outdir, exist_ok=True)

for annotation in meta_o_interest:
    # Build a file-name friendly suffix: dots become underscores, everything lower-case.
    # str.replace leaves names without a dot unchanged, so no separate branch is needed.
    suffix = annotation.replace(".", "_").lower()
    filename = os.path.join(outdir, f"TNBC_meta_{suffix}.csv")
    # index=False writes only the column header and the annotation values; the obs index
    # is dropped, so rows in the CSV are identified by their position only.
    adata_sn.obs[annotation].to_csv(filename, index=False)

AAACCCAAGCTTCATG-1_1    20
AAACCCAAGCTTGTGT-1_1     4
AAACCCAAGGCTTTCA-1_1    22
AAACCCACAACAACAA-1_1    35
AAACCCACAATACGCT-1_1    23
                        ..
TTTGTTGAGCAGGCTA-1_3     9
TTTGTTGAGGTCGACA-1_3    20
TTTGTTGAGTGTAGTA-1_3     1
TTTGTTGCAGAACTAA-1_3    13
TTTGTTGGTCTCGACG-1_3     3
Name: seurat_clusters, Length: 26978, dtype: int32