## A. CellphoneDB pre-processing

In [20]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import os
import sys

def MovePlots(plotpattern, subplotdir):
    """Move saved figures whose name contains ``plotpattern`` into a sub-folder.

    Parameters
    ----------
    plotpattern : str
        Text looked for in the names of the entries located directly inside
        ``sc.settings.figdir`` (glob ``*<plotpattern>*``).
    subplotdir : str
        Folder, relative to ``sc.settings.figdir``, that receives the matching
        entries. It is created (including parents) when it does not exist.
    """
    # Function-scope imports: nothing else in the notebook needs these modules.
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    target_dir = os.path.abspath(figdir + '/' + subplotdir)
    os.makedirs(target_dir, exist_ok=True)

    # Pure-Python replacement for `mkdir -p` / `mv`: no shell is involved, so
    # paths containing spaces or shell metacharacters are handled correctly.
    for src in glob.glob(os.path.join(figdir, '*' + plotpattern + '*')):
        src_abs = os.path.abspath(src)
        # The destination (or one of its parents) may itself match the pattern;
        # a folder can never be moved into itself, so skip it.
        if target_dir == src_abs or target_dir.startswith(src_abs + os.sep):
            continue
        # Giving the full destination path lets an existing file be overwritten, like `mv` does.
        shutil.move(src, os.path.join(target_dir, os.path.basename(src)))

# Scanpy session configuration.
sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Root folder for scanpy figures; the CellphoneDB input files written later in the notebook use it too.
sc.settings.figdir = './cellphoneDB/'
sc.logging.print_versions()  # prints the package versions into the cell output
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# NOTE(review): this is not the last expression of the cell, so its value is presumably never displayed.
sys.executable


def grouped_obs_percent(adata, group_key, layer=None, gene_symbols=None):
    """Fraction of cells, per group, in which each gene is detected.

    A gene counts as detected in a cell when its value is strictly greater
    than 0.01.

    Parameters
    ----------
    adata
        AnnData-like object providing ``obs``, ``var``, ``var_names``,
        ``shape``, ``X``/``layers`` and row indexing with integer positions.
    group_key : str
        Column of ``adata.obs`` that defines the groups.
    layer : str, optional
        Key of ``adata.layers`` to read instead of ``adata.X``.
    gene_symbols : str, optional
        Column of ``adata.var`` used to label the rows of the result instead
        of ``adata.var_names``.

    Returns
    -------
    pandas.DataFrame
        One row per gene and one column per group; values are fractions in
        [0, 1] rounded to two decimals.
    """
    # Row labels of the result: an alternative annotation column or the default gene names.
    if gene_symbols is not None:
        gene_index = pd.Index(adata.var[gene_symbols].to_numpy())
    else:
        gene_index = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=gene_index
    )

    for group, idx in grouped.indices.items():
        subset = adata[idx]
        X = subset.layers[layer] if layer is not None else subset.X
        # The comparison builds a new boolean matrix for sparse and dense input
        # alike, so the expression values themselves are never overwritten.
        detected = X > 0.01
        n_detected = np.asarray(detected.sum(axis=0)).reshape(-1)
        perc = n_detected / X.shape[0]
        out[group] = [round(i, 2) for i in perc]
    return out


def grouped_obs_mean(adata, group_key, layer=None, gene_symbols=None):
    """Mean value of each gene within each group of cells.

    Parameters
    ----------
    adata
        AnnData-like object providing ``obs``, ``var``, ``var_names``,
        ``shape``, ``X``/``layers`` and row indexing with integer positions.
    group_key : str
        Column of ``adata.obs`` that defines the groups.
    layer : str, optional
        Key of ``adata.layers`` to read instead of ``adata.X``.
    gene_symbols : str, optional
        Column of ``adata.var`` used to label the rows of the result instead
        of ``adata.var_names``.

    Returns
    -------
    pandas.DataFrame
        One row per gene and one column per group, holding float64 means.
    """
    # Row labels of the result: an alternative annotation column or the default gene names.
    if gene_symbols is not None:
        gene_index = pd.Index(adata.var[gene_symbols].to_numpy())
    else:
        gene_index = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=gene_index
    )

    for group, idx in grouped.indices.items():
        subset = adata[idx]
        X = subset.layers[layer] if layer is not None else subset.X
        # np.ravel flattens the 1 x n_genes result that sparse matrices return.
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out



-----
anndata     0.7.5
scanpy      1.7.0rc1
sinfo       0.3.1
-----
PIL                 8.1.2
anndata             0.7.5
anyio               NA
attr                20.3.0
babel               2.9.0
backcall            0.2.0
brotli              NA
cairo               1.20.0
certifi             2020.12.05
cffi                1.14.5
chardet             4.0.0
cloudpickle         1.6.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dask                2021.03.0
dateutil            2.8.1
decorator           4.4.2
get_version         2.1
google              NA
h5py                2.10.0
idna                2.10
igraph              0.9.1
ipykernel           5.5.0
ipython_genutils    0.2.0
jedi                0.18.0
jinja2              2.11.3
joblib              1.0.1
json5               NA
jsonschema          3.2.0
jupyter_server      1.4.1
jupyterlab_server   2.3.0
kiwisolver          1.3.1
legacy_api_wrap     0.0.0
leidenalg           0.

# Prepare INPUT

## Load AnnData

In [21]:
# Load the annotated dataset.
# NOTE(review): absolute path - it only resolves on the filesystem where the notebook was originally run.
adata = sc.read('/nfs/team292/lg18/with_valentina/FCA-M5-annotatedCluster4Seurat.h5ad')
adata.X.shape  # displayed as the cell output

(319081, 28230)

### Load cell clusters annotation

In [22]:
# Attach cluster names and doublet calls from the clustering metadata table
import collections

clu_annot = pd.read_csv('figures_manual_annotation/clustering_metadata.csv', header=0, index_col=0)

# Sanity check: both tables must list the same barcodes (order does not matter)
same_barcodes = collections.Counter(adata.obs.index) == collections.Counter(clu_annot.index)
print("The lists are identical" if same_barcodes else "The lists are not identical")

# Columns copied over; pandas aligns them on the barcode index
vars2import = ['clusters_manual','louvain', 'is_doublet', 'scrublet_cluster_score', 'scrublet_score']
for var in vars2import:
    adata.obs[var] = clu_annot[var]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


The lists are identical


In [23]:
# Somatic (supporting) cell annotation, indexed by the unnamed barcode column
meta_som = (
    pd.read_csv('/nfs/team292/lg18/with_valentina/supporting_nocycling_annotation.csv')
    .set_index('Unnamed: 0')
)
meta_som.head()

# Germ cell annotation, indexed the same way
meta_germ = (
    pd.read_csv('/nfs/team292/lg18/with_valentina/germcells_annotation.csv')
    .set_index('Unnamed: 0')
)
meta_germ.head()


Unnamed: 0_level_0,5v1.1,TP,batch,batch.collection,cryopreserved,individual,location,nCount_RNA,nFeatures_RNA,percent.mito,...,clusters_previous,S.Score,G2M.Score,Phase,old.ident,RNA_snn_res.0.3,seurat_clusters,clusters_old,clusters_sex,annotated_clusters
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FCA_GND8047885_AAGACCTCAGTATAAG,2,U,0,0,0,F81,0,1469,3446,0.013602,...,PGC_mitotic,-0.061951,0.658402,G2M,FCA-M5-annotatedCluster4Seurat,4,4,PGC_mitotic,new,PGC_mitotic
FCA_GND8047885_AAGGTTCAGTTAAGTG,2,U,0,0,0,F81,0,1763,3821,0.017654,...,PGC_mitotic,-0.126121,0.615875,G2M,FCA-M5-annotatedCluster4Seurat,4,4,PGC_mitotic,new,PGC_mitotic
FCA_GND8047885_ATCATCTTCGTTGACA,2,U,0,0,0,F81,0,850,2979,0.065334,...,PGC,-0.051857,-0.098312,G1,FCA-M5-annotatedCluster4Seurat,2,2,PGC,new,PGC
FCA_GND8047885_CCTCTGAAGTGACATA,2,U,0,0,0,F81,0,1446,4123,0.014408,...,PGC,-0.145298,-0.111084,G1,FCA-M5-annotatedCluster4Seurat,2,2,PGC,new,PGC
FCA_GND8047885_CGAACATAGCCGGTAA,2,U,0,0,0,F81,0,2458,4494,0.013342,...,PGC,-0.23333,-0.206078,G1,FCA-M5-annotatedCluster4Seurat,2,2,PGC,new,PGC


# Filter clusters of interest

Keep only the barcodes listed in the metadata files for the germ and somatic cells of the gonad

In [24]:
# Barcodes of every annotated somatic or germ cell
barcodes = meta_som.index.tolist() + meta_germ.index.tolist()
# `.copy()` turns the subset into a standalone object. A bare slice is only a
# view of the full dataset, and adding a column to `.obs` of a view forces an
# implicit copy together with the warning
# "Trying to set attribute `.obs` of view, copying."
adata = adata[adata.obs.index.isin(barcodes), :].copy()

Then add the `clusters` label

In [25]:
# barcode -> annotated cluster, one dictionary per metadata table
som_cell_annot = meta_som['annotated_clusters'].to_dict()
germ_cell_annot = meta_germ['annotated_clusters'].to_dict()

# Merge both; the germ-cell label wins if a barcode is present in both tables
cell_annot = dict(som_cell_annot)
cell_annot.update(germ_cell_annot)

adata.obs['clusters'] = [cell_annot[barcode] for barcode in adata.obs_names]

Trying to set attribute `.obs` of view, copying.


In [26]:
# Distinct cluster labels after filtering
set(adata.obs['clusters'].unique())

{'FetalLeydig-like',
 'GC_mitotic',
 'PGC',
 'PGC_mitotic',
 'Sertoli',
 'Sertoli_WFDC2',
 'Sertoli_low',
 'coelEpi',
 'oocyte',
 'oogonia_STRA8',
 'oogonia_meiosis',
 'ovarianSurf',
 'pre-spermatogonia',
 'preGC_II',
 'preGC_III',
 'preGC_III_Notch',
 'preGC_II_hypoxia',
 'preGC_I_OSR1',
 'sKITLG',
 'sLGR5',
 'sPAX8b',
 'sPAX8m'}

Add sex information to the bipotent clusters, to match the microenvironments file

In [27]:
# Cells of the bipotent cluster, whose label gets the sex appended
s = adata.obs['clusters'].isin(['sLGR5'])
idx = s[s].index.values

# Start from a plain-object copy of the labels so that new values such as
# 'sLGR5_female' can be assigned even if `clusters` is stored as a categorical.
adata.obs['clusters_sex'] = adata.obs['clusters'].astype(object)

# Vectorised '<cluster>_<sex>' labelling; replaces the per-cell Python loop
# and its chained indexing (`adata.obs['clusters'][i]`).
adata.obs.loc[idx, 'clusters_sex'] = (
    adata.obs.loc[idx, 'clusters'].astype(str) + '_' + adata.obs.loc[idx, 'sex'].astype(str)
)
set(adata.obs['clusters_sex'])

{'FetalLeydig-like',
 'GC_mitotic',
 'PGC',
 'PGC_mitotic',
 'Sertoli',
 'Sertoli_WFDC2',
 'Sertoli_low',
 'coelEpi',
 'oocyte',
 'oogonia_STRA8',
 'oogonia_meiosis',
 'ovarianSurf',
 'pre-spermatogonia',
 'preGC_II',
 'preGC_III',
 'preGC_III_Notch',
 'preGC_II_hypoxia',
 'preGC_I_OSR1',
 'sKITLG',
 'sLGR5_female',
 'sLGR5_male',
 'sPAX8b',
 'sPAX8m'}

## Filter cells in the microenvironments of interest

In [28]:
# Table assigning each cell type (`celltype`) to a `microenviroment` (column spelling as in the file)
micro = pd.read_csv('cellphoneDB/input/microenviroments/microenviroments.csv')
# The unnamed first CSV column is kept as a regular column rather than used as index:
# micro = micro.set_index('Unnamed: 0')
micro.head()

Unnamed: 0,celltype,microenviroment
0,coelEpi,cortex
1,ovarianSurf,cortex
2,PGC,cortex
3,PGC_mitotic,cortex
4,preGC_II_hypoxia,cortex


In [29]:
# Cell types present both in the data and in the microenvironments table
cells_common = list(set(adata.obs['clusters_sex']) & set(micro.celltype))
# Vectorised membership test instead of a per-cell Python loop over a list.
# `.copy()` makes the subset a standalone object, so later steps that modify
# `.obs` (e.g. writing to disk) do not each trigger an implicit copy of a view.
adata = adata[adata.obs['clusters_sex'].isin(cells_common).values].copy()

In [30]:
# Distinct labels left after restricting to the microenvironments table
set(adata.obs['clusters_sex'].unique())

{'PGC',
 'PGC_mitotic',
 'Sertoli',
 'Sertoli_WFDC2',
 'coelEpi',
 'oocyte',
 'oogonia_STRA8',
 'oogonia_meiosis',
 'ovarianSurf',
 'pre-spermatogonia',
 'preGC_II',
 'preGC_III',
 'preGC_III_Notch',
 'preGC_II_hypoxia',
 'preGC_I_OSR1',
 'sKITLG',
 'sLGR5_female',
 'sLGR5_male',
 'sPAX8b',
 'sPAX8m'}

In [31]:
# pd.DataFrame(adata.obs['clusters_sex']).to_csv(str(sc.settings.figdir)+'/input/metadata.csv')

## Compute average and percentage expression per cluster

In [32]:
# Normalized average
# Normalise a copy, so that `adata` itself is left untouched by this cell.
adata_norm = adata.copy()
# Scale each cell to 1e4 total counts, then apply log1p.
# NOTE(review): scanpy documents `normalize_per_cell` as deprecated in favour of
# `normalize_total`; the two may not treat zero-count cells identically - confirm before switching.
sc.pp.normalize_per_cell(adata_norm, counts_per_cell_after=1e4)
sc.pp.log1p(adata_norm)
# Per-gene mean of the log-normalised values within each `clusters_sex` group
means = grouped_obs_mean(adata_norm, 'clusters_sex')

In [33]:
# Percentage expression: per `clusters_sex` group, the fraction of cells in which
# each gene has a value above 0.01 (computed on `adata`, not on the normalised copy)
percent = grouped_obs_percent(adata, 'clusters_sex')

In [34]:
# # un-byte
# means.index = [ i.decode("utf-8") for i in means.index]
# percent.index = [ i.decode("utf-8") for i in percent.index]

In [35]:
# NOTE(review): a cell renders only its last expression, so `means.head()` produces
# no output here; the table displayed below this cell is `percent.head()`.
means.head()
percent.head()

Unnamed: 0_level_0,PGC,PGC_mitotic,Sertoli,Sertoli_WFDC2,coelEpi,oocyte,oogonia_STRA8,oogonia_meiosis,ovarianSurf,pre-spermatogonia,preGC_II,preGC_III,preGC_III_Notch,preGC_II_hypoxia,preGC_I_OSR1,sKITLG,sLGR5_female,sLGR5_male,sPAX8b,sPAX8m
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
MIR1302-2HG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.07,0.01,0.02,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL669831.5,0.11,0.14,0.01,0.01,0.05,0.07,0.09,0.06,0.06,0.03,0.04,0.03,0.04,0.07,0.04,0.04,0.05,0.03,0.04,0.02
LINC00115,0.03,0.04,0.03,0.05,0.04,0.02,0.06,0.04,0.03,0.04,0.03,0.05,0.05,0.03,0.04,0.03,0.05,0.04,0.04,0.04
FAM41C,0.02,0.03,0.0,0.0,0.01,0.08,0.04,0.04,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Export both per-cluster tables as CSV under the scanpy figure directory
for table, rel_path in ((means, '/input/average_log.csv'), (percent, '/input/percent.csv')):
    table.to_csv(str(sc.settings.figdir) + rel_path)

## Save gene expression 

Raw counts

In [37]:
# Write the filtered object to disk; the normalisation above ran on a copy (`adata_norm`),
# so `adata` is saved as it was loaded and filtered.
adata.write(str(sc.settings.figdir)+'/input/counts.h5ad')

Trying to set attribute `.obs` of view, copying.
... storing 'clusters' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'clusters_manual' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'clusters_sex' as categorical


## Save meta

In [38]:
# Two-column meta table: barcode ('Cell', used as index) and prefixed cell-type label
df_meta = pd.DataFrame(
    {'cell_type': ['celltype_' + str(label) for label in adata.obs['clusters_sex']]},
    index=pd.Index(list(adata.obs.index), name='Cell'),
)
df_meta.to_csv('cellphoneDB/input/meta.tsv', sep='\t')

# Run cellphoneDB

```
# Activate the virtual environment in ~/cpdb-venv (presumably the one with CellphoneDB installed)
source ~/cpdb-venv/bin/activate
```

## Create database
```
# Build a custom CellphoneDB database in ./database from user-supplied gene, complex and interaction tables.
# NOTE(review): these inputs live under ~/gonads/scRNAseq_analysis_FCA_2sex/, while the analysis
# command further down reads from ~/gonads/main/ - confirm both refer to the same project.
cellphonedb database generate \
    --result-path database \
    --user-gene ~/gonads/scRNAseq_analysis_FCA_2sex/cellphoneDB/database/gene_input_all.csv \
    --user-complex ~/gonads/scRNAseq_analysis_FCA_2sex/cellphoneDB/database/complex_curated_032021.tsv \
    --user-interactions ~/gonads/scRNAseq_analysis_FCA_2sex/cellphoneDB/database/interaction_curated_032021.tsv
```

## Run CellphoneDB without statistical analysis - just get expressed interactions
```
# Positional arguments: the meta file, then the counts file (meta.tsv and counts.h5ad are written by the notebook).
# NOTE(review): the notebook writes them under ./cellphoneDB/input/ - confirm ~/gonads/main/ is its working directory.
# NOTE(review): `--threshold 0` presumably disables the minimum-expressing-cells filter - confirm in the CellphoneDB CLI docs.
cellphonedb method analysis \
    ~/gonads/main/cellphoneDB/input/meta.tsv \
    ~/gonads/main/cellphoneDB/input/counts.h5ad \
    --database ~/gonads/main/cellphoneDB/database/cellphonedb_user_2021-03-11-15_06.db \
    --counts-data hgnc_symbol \
    --output-path ~/gonads/main/cellphoneDB/out/ \
    --threshold 0
```


# Merge DEGs files

```
# Start the merged table from the germ-cell DEGs (this keeps their header line)
cp germcells_DEGs.csv merged_DEGs.csv
# Append the supporting-cell DEGs, dropping every line that contains "gene" (presumably just the header).
# The two commands must stay on separate lines: a trailing backslash would fuse them into one invalid `cp` call.
grep -v gene supporting_DEGs.csv >> merged_DEGs.csv
```