In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import os
import sys
import scipy
import anndata


# Use a 4x4 figure size as the scanpy default for the plots drawn in this notebook.
sc.set_figure_params(figsize=(4, 4))

In [2]:
def Barplot(which_var, adata, var='clusters', height=3, color = False):
    """Plot, for every group in ``var``, the percentage split over ``which_var``.

    A row-normalised cross-tabulation of ``adata.obs[var]`` against
    ``adata.obs[which_var]`` is drawn as a horizontal stacked bar chart, one
    bar per ``var`` value, each bar summing to 100 %.

    Parameters
    ----------
    which_var : str
        Column of ``adata.obs`` whose values become the stacked segments
        (and the legend entries).
    adata : object with an ``obs`` DataFrame
        Only ``adata.obs`` is read.
    var : str, default 'clusters'
        Column of ``adata.obs`` whose values become the individual bars.
    height : number, default 3
        Figure height; the width is fixed at 6.
    color : optional
        Forwarded to ``DataFrame.plot.barh`` as ``color`` when truthy;
        otherwise the pandas default colours are used.

    Returns
    -------
    None
    """
    plotdata = pd.crosstab(adata.obs[var], adata.obs[which_var], normalize='index') * 100

    if isinstance(plotdata.index, pd.CategoricalIndex):
        # ``reorder_categories`` returns a new index and never moves rows, so
        # the previous bare call had no effect. Reorder the rows themselves:
        # barh places the first row at the bottom, hence reversing the
        # category order puts the first category at the top of the chart.
        present = set(plotdata.index)
        reversed_order = [cat for cat in adata.obs[var].cat.categories[::-1] if cat in present]
        plotdata = plotdata.loc[reversed_order]

    plot_kwargs = dict(stacked=True, edgecolor='none', zorder=3,
                       figsize=(6, height), fontsize=14, grid=False)
    if color:
        # Only override the default palette when the caller supplied one.
        plot_kwargs['color'] = color

    ax1 = plotdata.plot.barh(**plot_kwargs)
    ax1.set_title(which_var+' %')
    ax1.set_ylabel(var)
    # Anchor the legend at the top-right corner of the axes box.
    ax1.legend(bbox_to_anchor=(1, 1.))

# Somatic cells

In [4]:
# Project folder holding the annotated input object; the trailing slash is
# kept so plain string concatenation with a file name keeps working.
path_to_gonads = '/nfs/team292/lg18/with_valentina/gonadsV2_revision/'
# Read the annotated somatic-cell object from that folder.
adata = sc.read(os.path.join(path_to_gonads, 'FCA-gonads_somatic_annotated.h5ad'))

In [5]:
# Tabulate counts and frequencies per cell type before any filtering.
adata.obs['celltype'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
CoelEpi_GATA2,4935,0.025425
CoelEpi_LHX9_cycling,1858,0.009572
CoelEpi_LHX9,5547,0.028577
early_sPAX8,3648,0.018794
late_sPAX8,2118,0.010912
early_somatic,1414,0.007285
early_supporting,5894,0.030365
preGC_I,15599,0.080364
Sertoli,36917,0.190192
Gi,4749,0.024466


In [6]:
# Keep only cells that carry a cell-type label and are not flagged as
# contaminant / low quality / doublet. One vectorised mask replaces the two
# per-cell Python loops and selects exactly the same cells.
keep_cell = adata.obs['celltype'].notna() & ~adata.obs['celltype'].isin(['cont', 'lowQC', 'Doublet'])
adata = adata[keep_cell.values]
# Re-tabulate counts and frequencies after filtering.
adata.obs['celltype'].values.describe()

  res = method(*args, **kwargs)


Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
CoelEpi_GATA2,4935,0.025666
CoelEpi_LHX9_cycling,1858,0.009663
CoelEpi_LHX9,5547,0.028849
early_sPAX8,3648,0.018973
late_sPAX8,2118,0.011015
early_somatic,1414,0.007354
early_supporting,5894,0.030654
preGC_I,15599,0.081128
Sertoli,36917,0.192
Gi,4749,0.024699


## TF-IDF

In [7]:
import anndata
# Restrict to cells whose `phase` is G1 (vectorised comparison instead of a
# per-cell membership test against a one-element list).
XX = adata[(adata.obs['phase'] == 'G1').values]
# Draw a reproducible 25 % random subsample as an independent copy.
adataDown = sc.pp.subsample(XX, fraction=0.25, random_state=0, copy=True)
# Drop genes detected in fewer than 3 of the subsampled cells.
sc.pp.filter_genes(adataDown, min_cells=3)

  res = method(*args, **kwargs)


In [8]:
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages by raising the rpy2 callback logger to ERROR.
# Note: this can be commented out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
# NOTE(review): presumably enables the automatic AnnData <-> R object
# conversion that the %%R cells below rely on — confirm against the anndata2ri docs.
anndata2ri.activate()
%load_ext rpy2.ipython


In [9]:
# Densify the expression matrix. Guarded so that re-running this cell after X
# has already been converted does not fail on a missing `.toarray`.
if hasattr(adataDown.X, 'toarray'):
    adataDown.X = adataDown.X.toarray()

# Per-cell metadata columns that are removed before the object is handed to R.
idx = ['n_genes', 'sample', 'donor', 'location', 'stage', 'sex', 'study', 'batch_collection', 'enrichment', 'cryopreserved', 'TP', '10xKit', 'dig_protocol', 'percent_mito', 'n_counts', 'batch', 'low_ncounts', 'high_mito', 'low_ncounts_high_mito', 'S_score', 'G2M_score', 'phase', 'scrublet_score', 'scrublet_cluster_score', 'zscore', 'bh_pval', 'bonf_pval', 'is_doublet', 'lineages_v1', 'celltype_v1', 'lineage_v2', 'somatic_celltype_v2', 'PCW', 'sample_source', 'trimester', 'lineage', 'celltype_somaticEarly', 'celltype_v2', 'leiden', 'leiden_R']
# Drop only the columns that are still present, so the cell stays re-runnable
# (the previous `del` loop raised KeyError on a second execution).
adataDown.obs = adataDown.obs.drop(columns=[col for col in idx if col in adataDown.obs.columns])

# Store the cell-type labels as plain strings rather than a categorical.
adataDown.obs['celltype'] = adataDown.obs['celltype'].astype(str)

In [10]:
%%R -i adataDown
# `-i` passes the Python variable adataDown into the R session; echoing it
# checks that the transfer worked.
# NOTE(review): the saved output of this cell is a TypeError, and the later
# cells show `In [None]`, i.e. they were not executed in the saved run.
adataDown

TypeError: Indices must be integers or slices, not <class 'rpy2.robjects.vectors.StrVector'>

In [None]:
%%R -o df_tfIDF -o topgenes_tfIDF
# TF-IDF marker genes per cell type; `-o` returns df_tfIDF and topgenes_tfIDF
# to Python. Relies on adataDown already being present in the R session.

library(Seurat)
library(SoupX)
library(dplyr)
# Build a Seurat object from adataDown; the same "X" matrix is used for both
# the counts and the data slot.
seurat_andata = as.Seurat(adataDown, counts = "X", data = "X")
# Use the cell-type annotation as the active identity (cluster label).
Idents(seurat_andata) = seurat_andata$celltype


# Marker table from the counts slot, grouped by cell type.
# NOTE(review): N = 500 presumably caps the markers returned per cluster — confirm in the SoupX docs.
df_tfIDF = quickMarkers(toc = seurat_andata@assays$RNA@counts, clusters = Idents(seurat_andata), N = 500)

# Give `cluster` the same level order as the Seurat identities.
df_tfIDF$cluster = factor(df_tfIDF$cluster, levels = levels(Idents(seurat_andata)) )
# Keep rows with qval <= 0.05, then take the first 30 genes of each cluster
# and flatten them into a single vector.
# NOTE(review): "first 30" equals "top 30" only if quickMarkers returns rows already ranked — confirm.
topgenes_tfIDF = subset(df_tfIDF, qval <= 0.05) %>%
  group_by(cluster) %>%
  group_map(~ head(.x, 30L)$gene) %>%
  unlist(.)

In [None]:
# Write the TF-IDF marker table as a tab-separated file into the project
# folder, reusing `path_to_gonads` instead of repeating the absolute path.
df_tfIDF.to_csv(os.path.join(path_to_gonads, 'FCA_somatic_TFIDF.tsv'), index=False, sep='\t')

In [None]:
# Dot plot of the selected TF-IDF genes grouped by cell type; the figure is
# written out through `save`. standard_scale='var' rescales per gene (see the scanpy docs).
sc.pl.dotplot(adataDown, list(topgenes_tfIDF), groupby='celltype', standard_scale = 'var', save='_somatic_TFIDF.pdf')

In [None]:
%%R -o DEGs
# Differentially expressed genes per cell type; `-o` returns DEGs to Python.
# Relies on adataDown already being present in the R session.

library(Seurat)
# Build a Seurat object from adataDown; the same "X" matrix is used for both
# the counts and the data slot.
so <- as.Seurat(adataDown, counts = "X", data = "X")
# Use the cell-type annotation as the active identity.
Idents(so) <- so$celltype

# Extract DEGs for each cell_type.
# TRUE/FALSE are spelled out because the shorthands T/F are ordinary
# variables in R and can be reassigned.
DEGs <- FindAllMarkers(so,
                       test.use = 'LR',
                       verbose = FALSE,
                       only.pos = TRUE,
                       random.seed = 1,
                       logfc.threshold = 0.2,
                       min.pct = 0.1,
                       return.thresh = 0.05)


In [None]:
# Write the DEG table as a tab-separated file into the project folder,
# reusing `path_to_gonads` instead of repeating the absolute path.
DEGs.to_csv(os.path.join(path_to_gonads, 'FCA_somatic_DEGs.tsv'), index=False, sep='\t')