# Terminal commands for data access

```
autossh -AtX vschuma@ssh1.mdc-berlin.de ssh vschuma@murphy.mdc-berlin.net
# open the ssh screen and/or type
ssh -NR 6667:localhost:22 vfs@79.197.179.128 (current network ID)

# mount
sshfs -p 6667 vschuma@localhost:/data/local/rajewsky/home/vschuma /mnt/murphy_workspace/
or
sshfs vschuma@murphy:/data/local/rajewsky/home/vschuma/ ~/mounts/murphy_workspace/

# if working with the mount is too slow, copy the data to a ramdisk
sudo mount -t tmpfs -o size=6g tmpfs /mnt/ramdisk/

# then copy the file to the ramdisk
mkdir /mnt/ramdisk/data && rsync -av /mnt/murphy_workspace/NSTT/data/GarciaAlsono_uterus/hot_data/* /mnt/ramdisk/data
or
mkdir /dev/shm/data && rsync -av /home/vschuma/mounts/murphy_workspace/NSTT/data/GarciaAlsono_uterus/hot_data/* /dev/shm/data/
# symlink inside the pycharm project to the data dir
(example command)
ln -s /mnt/ramdisk/data data
e.g.: ln -s /mnt/murphy_workspace/NSTT/data/ data
or
ln -s /dev/shm/data/ data
```

In [1]:
%matplotlib inline
import anndata
import novosparc
import os
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import copy


import plotly.express as px
import plotly.graph_objects as go

# Reading data
I got the data from Anna & Tancredi. The spatial file is a 6w tumor slide, generated with the in-house spatial method and processed with spacemake. The other data is annotated single-nuclei data, exported by Anna. I do not know the details of the methods.

The data is located in Anna's home directory on the lab server:
`/data/local/rajewsky/home/aantona/novaseq/H5AD_snRNA_ST/`
I created a symlink within my own directory:
`/data/local/rajewsky/home/vschuma/NSTT/data/TNBC`

In [2]:
# Directory (relative to the working directory) that the cells below read their
# input files from and write derived files to.
# NOTE(review): the markdown above names .../NSTT/data/TNBC as the symlink location,
# while the comment here points to .../NSTT/output/LabelTransfer/... — confirm which is current.
data_dir = 'data' # symlinked to /data/local/rajewsky/home/vschuma/NSTT/output/LabelTransfer/...

# Single-nuclei data

The `snRNA_TNBC_integreted.h5ad` file is the processed file from Anna: she imported the data into Seurat, processed and annotated it there, and then exported it back from Seurat to h5ad.

### get from h5ad file

In [4]:
%%time

# Path of the reduced single-nuclei h5ad file inside data_dir.
sn_file = os.path.join(data_dir,"TNBC_sn_reduced.h5ad") # whatever reduced sample file
# Read the file into an AnnData object with scanpy.
adata_sn = sc.read_h5ad(sn_file) # this is ~10GB RAM
# Print the AnnData summary (dimensions and the names of the stored annotations).
print(adata_sn)

AnnData object with n_obs × n_vars = 26978 × 3000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT', 'nFeature_SCT', 'percent_MT', 'SCT_snn_res.0.8', 'Seurat_clusters_individual', 'S.Score', 'G2M.Score', 'Phase', 'doublet_score', 'doublet_assignment', 'annotated_idents', 'condition', 'annotated_Cellcompartments', 'annotated_Celltypes', 'annotated_Cellcompartments2', 'seurat_clusters', 'annotated_CelltypesState.res.2', 'annotated_compartment.res.2', 'annotated_compartmentCellType.res.2', 'celltype_res2.condition', 'celltype_res2', 'annotated_compartmentCellType2.res.2', 'Reannotation_detailed', 'Reannotation_global', 'celltype.condition', 'ConditionGlobal'
    var: 'features', 'SCT_features'
    uns: 'neighbors'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'SCT'
    obsp: 'distances'
CPU times: user 242 ms, sys: 1.38 s, total: 1.62 s
Wall time: 28.2 s


### save h5ad as pickle

In [None]:
import pickle

KeyboardInterrupt: 

In [20]:
# Directory the pickle file is written to.
save_dir = 'data'
# Path of the pickle file to create.
# NOTE(review): the following cell writes this file with pickle.dump, so despite the
# ".zip" extension it is presumably a plain pickle rather than a zip archive —
# consider a ".pkl" name to avoid confusion.
snRNA_file = os.path.join(save_dir, "TNBC_snRNA_integrated.zip")

In [22]:
%%time
# Serialise the AnnData object to snRNA_file.
# The context manager closes the file handle even when the dump fails or is
# interrupted, so no open handle is left behind in the kernel.
# NOTE: an interrupted dump still leaves a truncated file on disk; loading such a
# file raises "EOFError: Ran out of input", so re-run this cell to completion first.
with open(snRNA_file, 'wb') as picklefile:
    pickle.dump(adata_sn, picklefile)

KeyboardInterrupt: 

### get from pickle

In [23]:
%%time
# Restore the AnnData object from the pickle at snRNA_file.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
# An "EOFError: Ran out of input" here means the file is empty or truncated
# (e.g. the dump was interrupted before it finished).
# The context manager makes sure the read handle is closed again, also on error.
with open(snRNA_file, 'rb') as picklefile:
    adata_sn = pickle.load(picklefile)
print(type(adata_sn))

EOFError: Ran out of input

In [26]:
# outfile = 'output/TNBC_sn_reduced.h5ad'
# # there seems to be some naming issue coming from seurat, found this hack at: https://github.com/theislab/scvelo/issues/255
# adata_reduced.__dict__['_raw'].__dict__['_var'] = adata_reduced.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
# adata_reduced.write(outfile)

# Extract single parts for reconstruction
## DGE

In [16]:
# Expression matrix as a DataFrame with one column per entry of adata_sn.var_names.
# NOTE(review): no index is passed, so the rows are not labelled with
# adata_sn.obs_names — confirm that downstream code does not need the cell barcodes.
# NOTE(review): assumes adata_sn.X is a dense array — TODO confirm.
sn_dge = pd.DataFrame(adata_sn.X, columns=adata_sn.var_names) # 3k genes, 26978 cells

In [17]:
# Target path for the pickled expression DataFrame inside data_dir.
# NOTE(review): DataFrame.to_pickle infers compression from the file suffix by default,
# so the ".zip" name presumably yields a zip-compressed pickle — confirm against the pandas docs.
savepath = os.path.join(data_dir,'TNBC_sn_integrated_dge.zip')
# Write the DataFrame to disk; read it back with pd.read_pickle.
sn_dge.to_pickle(savepath)

# Metadata
* 'Phase'
* 'annotated_idents'
* 'condition'
* 'annotated_Cellcompartments'
* 'annotated_Celltypes'

In [13]:
def create_metadata_files(dataset, metadata_keys, filename_identifier, outputdir=''):
    """
    Write each selected metadata column of ``dataset`` to its own CSV file.

    One file is written per entry of ``metadata_keys``. It is named
    ``TNBC_meta_{filename_identifier}_{suffix}.csv``, where ``suffix`` is the
    lower-cased column name with every "." replaced by "_". Each file contains
    the column header followed by the column values, without the index.

    Parameters
    ----------
    dataset : pd.DataFrame
        Table that holds the metadata columns.
    metadata_keys : iterable of str
        Names of the columns to export.
    filename_identifier : str
        Tag inserted into every file name (e.g. 'snRNA' or 'spatial').
    outputdir : str, optional
        Directory the files are written to; it is created if it does not exist.
        The default '' writes into the current working directory.

    Raises
    ------
    KeyError
        If a requested column is not present in ``dataset``. The check runs
        before anything is written, so a failing call leaves no partial output.
    """
    metadata_keys = list(metadata_keys)

    # Validate up front instead of failing halfway through the export.
    missing = [key for key in metadata_keys if key not in dataset.columns]
    if missing:
        raise KeyError(f"columns not found in dataset: {missing}")

    # An empty outputdir means "current directory"; os.makedirs('') would raise.
    if outputdir:
        os.makedirs(outputdir, exist_ok=True)

    for annotation in metadata_keys:
        # create acceptable filename: dots would read like extra file extensions
        suffix = annotation.replace(".", "_").lower()
        filename = os.path.join(outputdir, f"TNBC_meta_{filename_identifier}_{suffix}.csv")
        # save as csv
        dataset[annotation].to_csv(filename, index=False)

## snRNA metadata

In [7]:
%%time
# Path of metadata_snRNA.csv inside data_dir.
snRNA_meta_file = os.path.join(data_dir,'metadata_snRNA.csv')
# Alternative pickle-based load, currently disabled.
# snRNA_meta = pd.read_pickle('data/meta_snRNA.zip')
# Read the CSV into a DataFrame using pandas' default parsing options.
snRNA_meta = pd.read_csv(snRNA_meta_file)

<class 'pandas.core.frame.DataFrame'>
CPU times: user 71.2 ms, sys: 28.2 ms, total: 99.4 ms
Wall time: 95.8 ms


In [14]:
# Columns of snRNA_meta to export, one CSV file per column.
# NOTE(review): the AnnData summary printed earlier lists 'Reannotation_global' and
# 'Reannotation_detailed' (with underscore) — confirm the CSV really uses the names below.
meta_o_interest = ['ReannotationGlobal','ReannotationDetailed','ConditionGlobal','Phase']
# Write into data_dir rather than a second hard-coded 'data' literal, so inputs and
# outputs stay together if the data directory is changed.
create_metadata_files(snRNA_meta,meta_o_interest,'snRNA',data_dir)

## spatial metadata

In [15]:
%%time
# Path of metadata_fc_51_4.csv inside data_dir.
spatial_meta_file = os.path.join(data_dir,'metadata_fc_51_4.csv')
# NOTE(review): the displayed table contains 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# presumably row indices written into the CSV by earlier exports — consider
# index_col / dropping them.
spatial_meta = pd.read_csv(spatial_meta_file)
# Bare last expression: renders the DataFrame as the cell output.
spatial_meta

CPU times: user 111 ms, sys: 58.5 ms, total: 169 ms
Wall time: 167 ms


Unnamed: 0.1,Unnamed: 0,orig.ident,nCount_Spatial,nFeature_Spatial,V1,y_pos,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,...,tile,nCount_SCT,nFeature_SCT,SCT_snn_res.0.8,seurat_clusters,SCT_snn_res.1.5,SCT_snn_res.1,Annotation_1,x_pos,Annotation1
0,4840,SeuratProject,123,110,4840,35944.280433,110,123,5,4.065041,...,2456,166,101,11,11,13,11,Endothelium,28429.981116,Endothelium
1,4841,SeuratProject,300,258,4841,36503.387933,258,300,3,1.000000,...,2456,276,238,15,19,23,19,Vascular stroma,28429.981116,Vascularstroma
2,4905,SeuratProject,712,556,4905,35944.280433,556,712,36,5.056179,...,2456,537,478,15,19,23,19,Vascular stroma,28751.214988,Vascularstroma
3,4906,SeuratProject,354,299,4906,36503.387933,299,354,10,2.824859,...,2456,325,275,15,19,23,19,Vascular stroma,28751.214988,Vascularstroma
4,4907,SeuratProject,159,137,4907,37062.495433,137,159,1,0.628931,...,2456,228,128,15,19,23,19,Vascular stroma,28751.214988,Vascularstroma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55473,141910,SeuratProject,157,141,141910,112694.529966,141,157,2,1.273885,...,2659,225,133,0,0,0,0,low_UMI,97357.408058,lowUMI
55474,141913,SeuratProject,104,94,141913,114371.379966,94,104,5,4.807692,...,2659,159,94,3,2,5,2,Adipocytes,97357.408058,Adipocytes
55475,141914,SeuratProject,260,225,141914,114930.329966,225,260,4,1.538462,...,2659,268,211,6,1,26,1,low_UMI,97357.408058,lowUMI
55476,141915,SeuratProject,124,119,141915,115489.279966,119,124,2,1.612903,...,2659,179,116,1,1,1,1,low_UMI,97357.408058,lowUMI


In [17]:
# Column of spatial_meta to export; the result is TNBC_meta_spatial_annotation_1.csv.
meta_o_interest = ['Annotation_1']
# Write into data_dir rather than a second hard-coded 'data' literal, so inputs and
# outputs stay together if the data directory is changed.
create_metadata_files(spatial_meta,meta_o_interest,'spatial',data_dir)