### Prepare data for cellular signal analysis 

In [1]:
import glob
import os
import shutil
import sys

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import io

def MovePlots(plotpattern, subplotdir):
    """Move saved figures whose name contains ``plotpattern`` into a sub-folder.

    Parameters
    ----------
    plotpattern : str
        Pattern looked for in the names of the entries located directly in
        ``sc.settings.figdir``; it is matched as the glob ``*<plotpattern>*``.
    subplotdir : str
        Name of the sub-directory of ``sc.settings.figdir`` that receives the
        matching entries. It is created when it does not exist yet.

    Notes
    -----
    The work is done with ``os``/``glob``/``shutil`` instead of shell strings,
    so directory names and patterns containing spaces or shell metacharacters
    are handled correctly and failures raise instead of being ignored.
    """
    figdir = str(sc.settings.figdir)
    destination = os.path.join(figdir, subplotdir)
    os.makedirs(destination, exist_ok=True)

    # Escape the directory part so only the pattern itself is treated as a glob.
    matches = glob.glob(os.path.join(glob.escape(figdir), '*' + plotpattern + '*'))
    for source in matches:
        # The destination folder can match the pattern too; never move it into itself.
        if os.path.abspath(source) == os.path.abspath(destination):
            continue
        # Give the full target path so an existing file of the same name is replaced.
        shutil.move(source, os.path.join(destination, os.path.basename(source)))

# Scanpy session configuration for this notebook.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Directory used for figures; MovePlots reads this same setting.
sc.settings.figdir = './figures/'
# Print the versions of the loaded packages (captured in the cell output).
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Show which Python interpreter this kernel is running on.
sys.executable

The `sinfo` package has changed name and is now called `session_info` to become more discoverable and self-explanatory. The `sinfo` PyPI package will be kept around to avoid breaking old installs and you can downgrade to 0.3.2 if you want to use it without seeing this message. For the latest features and bug fixes, please install `session_info` instead. The usage and defaults also changed slightly, so please review the latest README at https://gitlab.com/joelostblom/session_info.
-----
anndata     0.7.6
scanpy      1.8.0
sinfo       0.3.4
-----
PIL                 8.3.0
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
cffi                1.14.5
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.1
debugpy             1.3.0
decorator           4.4.2
h5py                3.6.0
igraph              0.9.5
ipykernel           6.0.1
ipython_genutils    0.2.0
jedi                0.18.0
joblib              1.0.1
kiwisolver          1.3.1
leidenalg    

'/home/jovyan/my-conda-envs/atac_env/bin/python'

#### Load TCGA metadata

In [37]:
# Directory holding the TCGA bulk metadata table (TCGA_BulkMetadata.tsv is read from it).
path_to_metadata = '/lustre/scratch117/cellgen/cellgeni/team274/bulkData/'
# Directory prefix for the per-sample bulk files; sample paths are built as
# path_to_data + <UniqueSampleID> + '.tsv' further down.
path_to_data = '/lustre/scratch117/cellgen/cellgeni/team274/bulkData/TCGA_fragment_files/lustre/scratch117/casm/team274/my4/oldScratch/ProjectsExtras/CellDeconvolution/Results/bulkRNAseq/TCGA/frag/'

In [38]:
# Read the tab-separated TCGA metadata table, using its first column as the index.
metadata = pd.read_csv(path_to_metadata + 'TCGA_BulkMetadata.tsv', sep = '\t', index_col = 0)
# Report (rows, columns), then preview the first rows.
print(metadata.shape)
metadata.head()


(11284, 7)


Unnamed: 0,Tissue,Disease,SampleID,Project,Source,UniqueSampleID,vitalStatus
3DFF72D2-F292-497E-ACE3-6FAA9C884205,Liver,Hepatocellular Carcinoma,3DFF72D2-F292-497E-ACE3-6FAA9C884205,TCGA,TCGA,TCGA_3DFF72D2-F292-497E-ACE3-6FAA9C884205,Alive
B1E54366-42B9-463C-8615-B34D52BD14DC,Prostate,Prostate Adenocarcinoma Acinar Type,B1E54366-42B9-463C-8615-B34D52BD14DC,TCGA,TCGA,TCGA_B1E54366-42B9-463C-8615-B34D52BD14DC,Alive
473713F7-EB41-4F20-A37F-ACD209E3CB75,Rectum,Rectal Adenocarcinoma,473713F7-EB41-4F20-A37F-ACD209E3CB75,TCGA,TCGA,TCGA_473713F7-EB41-4F20-A37F-ACD209E3CB75,Alive
11F18F54-9B33-4C33-BDF9-0F093F4F3336,Liver,Hepatocellular Carcinoma,11F18F54-9B33-4C33-BDF9-0F093F4F3336,TCGA,TCGA,TCGA_11F18F54-9B33-4C33-BDF9-0F093F4F3336,Alive
136B7576-1108-4FA3-8254-6069F0CA879A,Bladder,Muscle invasive urothelial carcinoma (pT2 or a...,136B7576-1108-4FA3-8254-6069F0CA879A,TCGA,TCGA,TCGA_136B7576-1108-4FA3-8254-6069F0CA879A,Dead


In [39]:
# Number of samples per value of the 'Tissue' column.
metadata['Tissue'].value_counts()

Breast                                                                                      1246
Lung                                                                                        1156
Kidney                                                                                      1030
Endometrial                                                                                  576
Thyroid                                                                                      572
Prostate                                                                                     558
Head and Neck                                                                                548
Colon                                                                                        543
Central nervous system                                                                       531
Stomach                                                                                      453
Bladder                       

In [40]:
# Raise the pandas display limit to 500 rows (a global option: it also affects later cells).
pd.set_option('display.max_rows', 500)
# Number of samples per value of the 'Disease' column.
metadata['Disease'].value_counts()

Infiltrating Ductal Carcinoma                                                        896
Kidney Clear Cell Renal Carcinoma                                                    616
Prostate Adenocarcinoma Acinar Type                                                  542
Head & Neck Squamous Cell Carcinoma                                                  537
Lung Squamous Cell Carcinoma- Not Otherwise Specified (NOS)                          530
Colon Adenocarcinoma                                                                 466
Endometrioid endometrial adenocarcinoma                                              430
Serous Cystadenocarcinoma                                                            430
Muscle invasive urothelial carcinoma (pT2 or above)                                  428
Thyroid Papillary Carcinoma - Classical/usual                                        415
Hepatocellular Carcinoma                                                             413
Lung Adenocarcinoma- 

#### Filter ovarian cancers

In [41]:
# Filter for ovarian cancer: keep only the rows whose 'Tissue' is "Ovary".
# NOTE: this re-binds `metadata`, so the full TCGA table is no longer available
# afterwards without re-running the loading cell.
metadata = metadata[metadata['Tissue'] == "Ovary"]
metadata.shape

(426, 7)

In [42]:
# Diseases present among the retained samples (missing values are counted too).
metadata['Disease'].value_counts(dropna = False)

Serous Cystadenocarcinoma    426
Name: Disease, dtype: int64

In [43]:
# Vital status of the retained samples (missing values are counted too).
metadata['vitalStatus'].value_counts(dropna = False)

Dead     234
Alive    192
Name: vitalStatus, dtype: int64

In [31]:
# Keep only the serous cystadenocarcinoma samples. This is the only disease
# recorded for the ovarian samples, so the filter documents the cohort and keeps
# the notebook reproducible on "Restart & Run All". The previous value
# ("Serous endometrial adenocarcinoma") matches no ovarian sample and would have
# left `metadata` empty.
metadata = metadata[metadata['Disease'] == "Serous Cystadenocarcinoma"]

Get the list of values in the *UniqueSampleID* column

In [44]:
# Collect the UniqueSampleID of every retained sample; the per-sample file paths
# are built from these identifiers.
ids = list(metadata['UniqueSampleID'])
len(ids)

426

#### Format of bulk tumor samples (one file per sample)

Every bulk tumor sample is a tsv file with 3 columns: 

 * **geneName** : ENSEMBL gene ID 
 * **geneLengths** : length of the gene 
 * **cnts** : counts coming from the gene

#### Generate a txt file with one sample file per line (full paths are used so that the list works from any working directory)

In [45]:
# One full path per sample: <path_to_data><UniqueSampleID>.tsv
ovarian_cancer_files = [path_to_data + sample_id + '.tsv' for sample_id in ids]

In [46]:
# Sanity check: number of file paths built (one per sample ID).
len(ovarian_cancer_files)

426

In [47]:
# Directory prefix for every file written by this notebook (sample list, matrix,
# column names and row names).
outdir = '/nfs/team292/vl6/CancerDeconvolution/'

In [48]:
# Write list to txt file, one element per line
# Write the list of bulk sample files to a txt file, one path per line
bulk_list_path = outdir + "bulk_samples_ovarian_cancer.txt"
with open(bulk_list_path, 'w') as handle:
    handle.write("\n".join(str(sample_file) for sample_file in ovarian_cancer_files))

#### Load single cell data and format it 

In [2]:
# Load the single-cell AnnData object from disk and display its summary.
adata = sc.read('/lustre/scratch117/cellgen/team292/lh20/revision/integrated4.h5ad')
adata

AnnData object with n_obs × n_vars = 120810 × 28614
    obs: 'sample_names', 'log2p1_count', 'percent_mito', 'n_genes', 'batch', 'StudyName', 'SampleID', 'DonorID', 'BiopsyType', 'Location', 'Stage', 'Treatment', 'Batch', 'scrublet_pred', 'scrublet_local_pred', 'filtered_cells', 'Wang_celltype', 'leiden_scvi_batch_cc', 'leiden_scvi_sampl_cc', 'Studywise_celltypes', 'leiden_seurat_batch_cc', 'leiden_seurat_sampl_cc', 'broad_celltype', 'leiden_scvi_batch_epithetial', 'leiden_scvi_batch_stromal', 'scrublet_score', 'scrublet_cluster_score', 'Women age', '10x kit', 'leiden_res2_scvi_batchv2_Immune', 'leiden_res2_scvi_batchv2_Epithelial', 'leiden_res2_scvi_batchv2_Stromal', 'StromalSample', 'EpithelialSample', 'leiden_scvi_stromal_subsample', 'leiden_scvi_epithelial_subsample', 'leiden_scvi_stromal_subsample_donorID', 'leiden_scvi_epithelial_subsample_DonorID', 'palantir_pseudotime', 'palantir_pseudotime_proj', 'EpithelialCiliaSample', 'daconcat', 'leiden_scvi_sampl_cc_renamed', 'Day', 'leid

In [8]:
# Number of cells contributed by each study (missing values are counted too).
adata.obs['StudyName'].value_counts(dropna = False)

Wang    65081
Luz     55729
Name: StudyName, dtype: int64

In [5]:
# Read the tab-separated per-cell annotation table, indexed by its first column
# (the cell barcode, as the preview shows); it provides a 'cell_type' column.
# NOTE(review): this re-binds `metadata`, which earlier held the TCGA sample
# table — from here on `metadata` means the cell annotations. Consider a
# distinct name so that cells can be re-run in any order.
metadata = pd.read_csv('/nfs/team292/lg18/with_valentina/endometrium_annotations.cvs', sep = '\t', index_col = 0)
metadata.head()

Unnamed: 0_level_0,cell_type
Cell,Unnamed: 1_level_1
4861STDY7387181_AAACCTGAGGCATGGT,Fibroblast eS
4861STDY7387181_AAACCTGAGGGCACTA,Fibroblast eS
4861STDY7387181_AAACCTGAGTGAAGTT,PV STEAP4
4861STDY7387181_AAACCTGAGTGGGCTA,Fibroblast dS
4861STDY7387181_AAACCTGCACCGAAAG,Fibroblast eS


In [6]:
# Build a {cell barcode -> cell type} lookup from the annotation table.
mapping = metadata['cell_type'].to_dict()
# Annotate every cell by its name; cells absent from the lookup get NaN.
adata.obs['cell_type'] = adata.obs_names.map(mapping)
# Cells per cell type, including the unannotated (NaN) ones.
adata.obs['cell_type'].value_counts(dropna = False)

NaN                        46884
Fibroblast dS              24436
Fibroblast eS              12675
uSMC                        7746
Endothelial ACKR1           7188
PV MYH11                    3414
PV STEAP4                   3345
epi_SOX9                    2806
Lymphoid                    2311
epi_Glandular_secretory     2147
epi_Lumenal 1               1376
epi_SOX9_LGR5               1150
epi_Glandular               1101
Fibroblast C7                932
epi_SOX9_prolif              883
epi_Ciliated                 762
Myeloid                      761
Endothelial SEMA3G           389
epi_Lumenal 2                273
epi_Ciliated LRG5            182
epi_Pre-ciliated              49
Name: cell_type, dtype: int64

In [7]:
# Cast the labels to plain strings, which turns missing annotations into 'nan'
adata.obs['cell_type'] = adata.obs['cell_type'].astype(str)
# Drop unannotated cells and the 'epi_SOX9_prolif' population
excluded_labels = ['nan', 'epi_SOX9_prolif']
keep_cell = ~adata.obs['cell_type'].isin(excluded_labels).values
adata = adata[keep_cell]
adata.shape

(73043, 28614)

In [8]:
# Cell-type labels that remain after the filtering step.
np.unique(adata.obs['cell_type'])

array(['Endothelial ACKR1', 'Endothelial SEMA3G', 'Fibroblast C7',
       'Fibroblast dS', 'Fibroblast eS', 'Lymphoid', 'Myeloid',
       'PV MYH11', 'PV STEAP4', 'epi_Ciliated', 'epi_Ciliated LRG5',
       'epi_Glandular', 'epi_Glandular_secretory', 'epi_Lumenal 1',
       'epi_Lumenal 2', 'epi_Pre-ciliated', 'epi_SOX9', 'epi_SOX9_LGR5',
       'uSMC'], dtype=object)

In [9]:
# Number of remaining cells per value of the 'phase' column.
adata.obs['phase'].value_counts()

G1     68997
S       2536
G2M     1510
Name: phase, dtype: int64

#### Select cells that are in G1 phase of the cell cycle 

In [10]:
# Keep only the cells whose 'phase' is G1
in_g1 = (adata.obs['phase'] == 'G1').values
adata = adata[in_g1]
adata.shape

(68997, 28614)

In [11]:
# Store a copy of the current object in .raw before normalisation; adata.raw.X is
# what gets exported as the count matrix later on.
# NOTE(review): assumes adata.X still holds raw counts at this point — TODO confirm.
adata.raw = adata.copy()

In [12]:
# Normalize per cell
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

# Log-transform the data 
sc.pp.log1p(adata)

normalizing by total count per cell
    finished (0:00:03): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [24]:
# Inspect the matrix stored in .raw (the one exported below).
adata.raw.X

<68997x28614 sparse matrix of type '<class 'numpy.float64'>'
	with 175483466 stored elements in Compressed Sparse Row format>

#### Save files needed to run Cellular Signal Analysis

 1. Matrix of raw counts (mtx file)

In [25]:
# Save raw counts as matrix but remember to TRANSPOSE it first 
tadata = adata.raw.X.transpose()
tadata.shape

(28614, 68997)

In [28]:
# Write the transposed raw-count matrix in Matrix Market format.
# (`io` is scipy.io, imported with the other dependencies in the first cell.)
io.mmwrite(outdir + "Endometrium.mtx", tadata)

 2. Formatted cell barcodes (tsv file)

In [29]:
# Label every cell as "<cell_type>:<cell name>" (adata.obs_names hold the barcodes).
adata.obs['formatted_barcodes'] = adata.obs['cell_type'].astype(str) + ":" + adata.obs_names 

In [30]:
# Formatted barcodes as a plain list, in the order of adata.obs.
cell_barcodes = list(adata.obs['formatted_barcodes'])

In [31]:
# Preview the first ten formatted barcodes.
cell_barcodes[0:10]

['Fibroblast eS:4861STDY7387181_AAACCTGAGGGCACTA',
 'PV STEAP4:4861STDY7387181_AAACCTGAGTGAAGTT',
 'Fibroblast dS:4861STDY7387181_AAACCTGAGTGGGCTA',
 'Fibroblast eS:4861STDY7387181_AAACCTGCACCGAAAG',
 'Fibroblast eS:4861STDY7387181_AAACCTGCACGAGAGT',
 'Fibroblast eS:4861STDY7387181_AAACCTGCATGGTCAT',
 'Fibroblast dS:4861STDY7387181_AAACCTGGTCATGCCG',
 'epi_SOX9:4861STDY7387181_AAACCTGTCAATAAGG',
 'Fibroblast eS:4861STDY7387181_AAACCTGTCAGGCCCA',
 'Fibroblast eS:4861STDY7387181_AAACCTGTCTAAGCCA']

In [32]:
# Save tsv file for column names (one cell barcode per line)
with open(outdir + "Endometrium_columnNames.tsv", 'w') as f:
    f.write("\n".join(map(str, cell_barcodes)))

 3. ENSEMBL genes (tsv file)

In [33]:
# Check the ENSEMBL gene IDs for duplicates (missing values are counted too).
adata.var['gene_ids'].value_counts(dropna = False)

ENSG00000243485    1
ENSG00000118939    1
ENSG00000274898    1
ENSG00000005812    1
ENSG00000102805    1
                  ..
ENSG00000146250    1
ENSG00000065833    1
ENSG00000013392    1
ENSG00000013375    1
ENSG00000271254    1
Name: gene_ids, Length: 28614, dtype: int64

In [34]:
# Write the row names of the matrix: one ENSEMBL gene ID per line, in adata.var order
ensembl_ids = adata.var['gene_ids'].tolist()
row_names_path = outdir + "Endometrium_rowNames.tsv"
with open(row_names_path, 'w') as handle:
    handle.write("\n".join(str(gene_id) for gene_id in ensembl_ids))

#### Run CellularSignalAnalysis from terminal

python -i cellSignalAnalysisV2_cellSignalAnalysis.py -b bulk_samples_ovarian_cancer.txt -s Endometrium -w cellSignalAnalysisV2_geneWeights.tsv output_ovarianCancer_END/

#### End of notebook 