# Generate cellphoneDB input files

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import os
import sys

def MovePlots(plotpattern, subplotdir):
    """Move saved figures whose file name contains ``plotpattern`` into a subfolder.

    Parameters
    ----------
    plotpattern : str
        Text that must appear in the figure file name; glob wildcards are
        honoured, as they were by the shell in the previous implementation.
    subplotdir : str
        Subfolder of ``sc.settings.figdir`` that receives the files. It is
        created (including parents) when it does not exist.

    Notes
    -----
    Uses the standard library instead of ``os.system('mkdir ...')`` /
    ``os.system('mv ...')`` so that paths containing spaces or shell
    metacharacters are handled correctly.
    """
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    target = os.path.join(figdir, subplotdir)
    os.makedirs(target, exist_ok=True)

    # Only the pattern part may contain wildcards; the figure folder is literal.
    matches = glob.glob(os.path.join(glob.escape(figdir), '*' + plotpattern + '*'))
    target_abs = os.path.abspath(target)
    for path in matches:
        # The destination folder itself can match the pattern; never move it into itself.
        if os.path.abspath(path) == target_abs:
            continue
        # Giving the full destination file name lets an existing file be replaced, as `mv` does.
        shutil.move(path, os.path.join(target, os.path.basename(path)))

# Scanpy session settings used by the rest of the notebook
sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.figdir = './figures/data_utils/'  # figure folder; MovePlots moves files inside it
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Bare expression: as the last statement of a notebook cell it displays the interpreter path
sys.executable


def grouped_obs_percent(adata, group_key, layer=None, gene_symbols=None):
    """Return, for each group of cells, the fraction of cells expressing each gene.

    A gene counts as expressed in a cell when its value is greater than 0.01.

    Parameters
    ----------
    adata : AnnData-like
        Object exposing ``obs``, ``var``, ``var_names``, ``shape``, ``X`` /
        ``layers`` and row subsetting through ``adata[idx]``.
    group_key : str
        Column of ``adata.obs`` used to group the cells.
    layer : str, optional
        Key of ``adata.layers`` to read values from; ``adata.X`` is used when
        omitted.
    gene_symbols : str, optional
        Column of ``adata.var`` whose values label the rows of the result;
        ``adata.var_names`` is used when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per gene and one column per group, holding the fraction of
        expressing cells rounded to two decimals.
    """
    def _values(view):
        return view.X if layer is None else view.layers[layer]

    # Row labels of the result: the requested symbol column, or the default gene names.
    if gene_symbols is not None:
        gene_index = pd.Index(adata.var[gene_symbols])
    else:
        gene_index = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=gene_index
    )

    for group, idx in grouped.indices.items():
        X = _values(adata[idx])
        # A comparison builds a new boolean matrix, so it works for dense and
        # sparse input alike and leaves the stored values untouched.
        expressed = X > 0.01
        n_expressing = np.asarray(expressed.sum(axis=0)).reshape(-1)
        out[group] = np.round(n_expressing / X.shape[0], 2)
    return out


def grouped_obs_mean(adata, group_key, layer=None, gene_symbols=None):
    """Return, for each group of cells, the mean value of each gene.

    Parameters
    ----------
    adata : AnnData-like
        Object exposing ``obs``, ``var``, ``var_names``, ``shape``, ``X`` /
        ``layers`` and row subsetting through ``adata[idx]``.
    group_key : str
        Column of ``adata.obs`` used to group the cells.
    layer : str, optional
        Key of ``adata.layers`` to read values from; ``adata.X`` is used when
        omitted.
    gene_symbols : str, optional
        Column of ``adata.var`` whose values label the rows of the result;
        ``adata.var_names`` is used when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per gene and one column per group, holding float64 means.
    """
    def _values(view):
        return view.X if layer is None else view.layers[layer]

    # Row labels of the result: the requested symbol column, or the default gene names.
    if gene_symbols is not None:
        gene_index = pd.Index(adata.var[gene_symbols])
    else:
        gene_index = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=gene_index
    )

    for group, idx in grouped.indices.items():
        X = _values(adata[idx])
        # The column-wise mean may come back 2-D (e.g. from a sparse matrix); flatten to one value per gene.
        out[group] = np.asarray(X.mean(axis=0, dtype=np.float64)).ravel()
    return out

# Prepare INPUT

## Load anndata

In [2]:
# Root folder of the gonads revision data (ends with a path separator)
path_to_gonads = '/nfs/team292/lg18/with_valentina/gonadsV2_revision/'
# Read the raw-count object for the whole dataset
adata = sc.read(os.path.join(path_to_gonads, 'FCA-gonads_rawcounts.h5ad'))

### Load cell cluster annotations

In [3]:
# Per-sex annotation tables, indexed by the CSV's unnamed first column
metaF = pd.read_csv(path_to_gonads+'/FCA-gonads_female_annotated.csv').set_index('Unnamed: 0')
metaM = pd.read_csv(path_to_gonads+'/FCA-gonads_male_annotated.csv').set_index('Unnamed: 0')

# index -> lineage lookups; on duplicate keys the female table overrides the male one
cell_annotM = metaM['lineage'].to_dict()
cell_annotF = metaF['lineage'].to_dict()
cell_annot = dict(cell_annotM)
cell_annot.update(cell_annotF)

# Attach the lineage label to every cell; cells missing from both tables get NaN
adata.obs['lineage'] = adata.obs_names.map(cell_annot)
adata.obs['lineage'].value_counts()

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Mesenchymal_GATA2    84229
Mesenchymal_LHX9     62747
preGranulosa         38881
Sertoli              34969
lowQC                34677
Supporting           31454
CoelEpi              24297
Epithelial           20222
GermCells            14516
Endothelial          14484
PV                   12679
Doublet               5821
Immune                5533
Neural                3783
SMCs                  3044
FetalLeydig           2426
cont                  1432
Erythroid             1371
Name: lineage, dtype: int64

In [4]:
# adata.obs['lineage'] = adata.obs['clusters_manual'].tolist()
# adata.obs['lineage'] = adata.obs['lineage'].replace(regex=r'Supporting_female', value='Supporting')
# set(adata.obs['lineage'] )


In [5]:
# del adata.obs['study']
# del adata.obs['stage']
# del adata.obs['location']
# del adata.obs['5v1.1']
# del adata.obs['batch_collection']
# del adata.obs['TP']
# del adata.obs['cryopreserved']
# del adata.obs['clusters_manual']
# del adata.obs['batch']
# del adata.obs['clusters']
# del adata.obs['sub_clusters']

# Import sub-analysis

In [6]:
# Somatic sub-analysis: prefix its cell-type labels with 'Somatic.'
meta = pd.read_csv(path_to_gonads+'FCA-gonads_somatic_annotated.csv')
meta = meta.set_index('Unnamed: 0')
meta['celltype'] = ['Somatic.'+i for i in meta['celltype'] ]

# sPAX8 sub-analysis: keep only the 'Epithelial' lineage and prefix its labels with 'Epi.'
metaF = pd.read_csv(path_to_gonads+'/FCA-gonads_sPAX8_annotated.csv')
metaF = metaF.set_index('Unnamed: 0')
# .copy() so the column assignment below writes to an independent frame
# rather than to a slice of the unfiltered table
metaF = metaF[ metaF.lineage == 'Epithelial' ].copy()
metaF['celltype'] = ['Epi.'+i for i in metaF['celltype'] ]

# Endothelial sub-analysis: prefix its labels with 'Endo.'
metaEnd = pd.read_csv('/nfs/team292/vl6/immune_fetal_gonads/endothelial.csv')
metaEnd = metaEnd.set_index('Unnamed: 0')
metaEnd['celltype'] = ['Endo.'+i for i in metaEnd['celltype'] ]

cell_annotM = meta['celltype'].to_dict()
cell_annotF = metaF['celltype'].to_dict()
cell_annotEnd = metaEnd['celltype'].to_dict()
# On duplicate keys later dictionaries win: endothelial, then epithelial, then somatic
cell_annot = {**cell_annotM , **cell_annotF, **cell_annotEnd}
adata.obs['celltype'] = adata.obs_names.map(cell_annot)
# Cells without a sub-analysis label fall back to their lineage label
adata.obs['celltype'] = adata.obs['celltype'].fillna(adata.obs['lineage'])
adata.obs['celltype'].value_counts()

  interactivity=interactivity, compiler=compiler, result=result)


Mesenchymal_GATA2               84229
Somatic.Sertoli                 36917
Somatic.Ti                      34808
lowQC                           34516
Somatic.Oi                      25730
Somatic.preGC_IIb               23825
Somatic.preGC_I                 15599
GermCells                       14516
Somatic.preGC_IIa               13772
PV                              12679
Epithelial                       9268
Somatic.OSE                      8222
Somatic.early_supporting         5894
Doublet                          5821
Somatic.CoelEpi_LHX9             5547
Immune                           5533
Endo.Fenestrated_EC              5385
Epi.Epi_wolffian                 5230
Somatic.Gi                       4749
Somatic.CoelEpi_GATA2            4050
Neural                           3783
Somatic.early_sPAX8              3648
Endo.Venous_EC                   3450
SMCs                             3044
Somatic.FetalLeydig              2874
Endothelial                      2788
Epi.Epi_glom

In [7]:
# Remove unwanted cells in two steps.
# Step 1: drop cells whose label is exactly one of these names.
excluded_labels = ['cont', 'Doublet', 'lowQC', 'Supporting' ,'CoelEpi',  'GermCells', 'Mesenchymal_LHX9', 'preGranulosa']
adata = adata[[ label not in excluded_labels for label in adata.obs.celltype ]]

# Step 2: drop cells whose label merely contains one of these tags, one tag at a time.
for tag in ('cont', 'lowQC', 'Doublet', 'cycling'):
    adata = adata[[ tag not in label for label in adata.obs.celltype ]]

adata.obs['celltype'].value_counts(dropna = False)

  res = method(*args, **kwargs)


Mesenchymal_GATA2           84229
Somatic.Sertoli             36917
Somatic.Ti                  34808
Somatic.Oi                  25730
Somatic.preGC_IIb           23825
Somatic.preGC_I             15599
Somatic.preGC_IIa           13772
PV                          12679
Epithelial                   9268
Somatic.OSE                  8222
Somatic.early_supporting     5894
Somatic.CoelEpi_LHX9         5547
Immune                       5533
Endo.Fenestrated_EC          5385
Epi.Epi_wolffian             5230
Somatic.Gi                   4749
Somatic.CoelEpi_GATA2        4050
Neural                       3783
Somatic.early_sPAX8          3648
Endo.Venous_EC               3450
SMCs                         3044
Somatic.FetalLeydig          2874
Endothelial                  2788
Epi.Epi_glomerular           2732
Epi.Epi_Mullerian            2170
Somatic.late_sPAX8           2118
Endo.Arterial_EC             1519
Somatic.early_somatic        1414
Erythroid                    1371
Endo.Lymphatic

# Subset dataset

In [8]:
# Cell types kept for the cellphoneDB analysis
in_gonad = ['Endothelial','Epi.Epi_mesonephros', 'Epi.Epi_Mullerian','Epi.Epi_wolffian', 
            'Endo.Lymphatic_EC', 'Endo.Glomerular_EC', 'Endo.Venous_EC', 'Endo.Fenestrated_EC', 'Endo.Arterial_EC']
somatic = ['Somatic.CoelEpi_LHX9', 'Somatic.early_supporting', 'Somatic.Sertoli','Somatic.preGC_I', 
           'Somatic.Gi','Somatic.Ti', 'Somatic.Oi', 'Somatic.FetalLeydig', 
           'Somatic.early_sPAX8', 'Somatic.late_sPAX8']
# Build the lookup once instead of concatenating the two lists for every cell
keep_celltypes = set(somatic + in_gonad)
# .copy() gives an independent object rather than a view of `adata`,
# because the preprocessing that follows modifies it in place
cellphoneDB_adata =  adata[[ i in keep_celltypes for i in adata.obs.celltype ]].copy()
# cellphoneDB_adata = cellphoneDB_adata[[ 'female' in i for i in cellphoneDB_adata.obs.sex ]]

In [9]:
# Keep only cells with at least 1500 detected genes (applied to cellphoneDB_adata itself)
sc.pp.filter_cells(cellphoneDB_adata, min_genes=1500)

Trying to set attribute `.obs` of view, copying.


In [10]:
# Normalise counts per cell with scanpy's default settings, then log1p-transform
# NOTE(review): normalize_per_cell looks like a legacy scanpy function — confirm whether normalize_total should replace it
sc.pp.normalize_per_cell(cellphoneDB_adata)
sc.pp.log1p(cellphoneDB_adata)
# sc.pp.scale(adata, max_value=10)

In [11]:
# cellphoneDB_adata.write('cellphoneDB/in_sPAX8/sPAX8_normloqTransformed.h5ad')

In [12]:
# cellphoneDB meta table: one row per cell, labelled with its cell type
# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs('cellphoneDB/in_sPAX8', exist_ok=True)
df_meta = pd.DataFrame(data={'Cell':list(cellphoneDB_adata.obs.index),
                             'cell_type':[ str(i) for i in cellphoneDB_adata.obs['celltype']] })
df_meta.set_index('Cell', inplace=True)
df_meta.to_csv('cellphoneDB/in_sPAX8/sPAX8_meta.tsv', sep = '\t')

# Merge DEGs file

In [13]:
# Somatic DEG table (tab-separated, first row is the header)
somaDEG = pd.read_csv(
    path_to_gonads+'/FCA-gonad_somatic_DEGs.tsv',
    sep='\t',
    header=0,
    index_col=None,
)
# Add a constant lineage column and keep the unprefixed cluster name as `celltype`
somaDEG = somaDEG.assign(lineage='Somatic', celltype=somaDEG['cluster'])
# Cluster names get the same 'Somatic.' prefix as the cell-type labels
somaDEG['cluster'] = ['Somatic.'+name for name in somaDEG['cluster']]

In [14]:
# Only the somatic table is used here (the alternative merge is kept for reference):
# DEG = in_gonadDEG.append(somaDEG)
DEG = somaDEG
# Write every test result, before any significance filtering
DEG.to_csv('cellphoneDB/in_sPAX8/somatic_DEG_tests.tsv', sep='\t', index=False)

In [15]:
# Keep rows passing all three thresholds: adjusted p-value, log fold change, and pct.1
DEG = DEG[
    (DEG.p_val_adj < 0.01)
    & (DEG.avg_logFC > 0.05)
    & (DEG['pct.1'] > 0.09)
]

In [16]:
# Keep only DEG rows whose cluster is one of the cell types present in the meta table
cl2include = set(cellphoneDB_adata.obs.celltype.tolist())
idx = np.array(list(map(cl2include.__contains__, DEG.cluster)))
DEG = DEG[ idx ]

In [17]:
# Put `cluster` and `gene` first, keep the remaining columns in their current order
leading = ['cluster', 'gene']
cncol = [name for name in DEG.columns.tolist() if name not in leading]
DEG = DEG[ leading + cncol ]
DEG.to_csv('cellphoneDB/in_sPAX8/DEGs.tsv', sep='\t', index=False)

## Compute percent-expressed and average-expression files, for plotting later

In [18]:
# Save the filtered, normalised and log-transformed subset to disk
cellphoneDB_adata.write('cellphoneDB/in_sPAX8/sPAX8_normloqTransformed.h5ad')

... storing 'lineage' as categorical
... storing 'celltype' as categorical


In [19]:
# Fraction of expressing cells per gene and cell type
percent = grouped_obs_percent(cellphoneDB_adata, group_key='celltype')
percent.to_csv('cellphoneDB/in_sPAX8/sPAX8_percent.csv')

# Mean of the log-transformed values per gene and cell type
means = grouped_obs_mean(cellphoneDB_adata, group_key='celltype')
means.to_csv('cellphoneDB/in_sPAX8/sPAX8_average_log.csv')

  res = method(*args, **kwargs)
