### Subset scRNAseq data from matched individuals for label transfer to scATACseq

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import os

import sys

def MovePlots(plotpattern, subplotdir):
    """Move figures whose file name contains ``plotpattern`` into a sub-folder.

    Parameters
    ----------
    plotpattern : str
        Text that must occur in the file name; shell-style wildcards are honoured.
    subplotdir : str
        Folder inside ``sc.settings.figdir`` that receives the files. It is
        created (including parents) when it does not exist yet.

    Notes
    -----
    Works on paths directly instead of building ``mkdir``/``mv`` command
    strings, so names containing spaces or shell metacharacters are handled
    correctly and a failing move raises instead of being ignored.
    """
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    target_dir = os.path.join(figdir, subplotdir)
    os.makedirs(target_dir, exist_ok=True)

    # Same match as the former shell glob "<figdir>/*<plotpattern>**"
    matches = glob.glob(os.path.join(glob.escape(figdir), '*' + plotpattern + '*'))
    for src in matches:
        # The destination folder itself may match the pattern; never move it into itself
        if os.path.abspath(src) == os.path.abspath(target_dir):
            continue
        # Full destination path so an existing file is overwritten, as `mv` does
        shutil.move(src, os.path.join(target_dir, os.path.basename(src)))

# Scanpy session settings
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Figure directory; MovePlots() looks for files in here and creates its sub-folders here
sc.settings.figdir = '/home/jovyan/MULTIOME_july2021/figures_germ/'
# Print the versions of the loaded packages into the cell output
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Display the path of the Python interpreter the kernel is running on
sys.executable



-----
anndata     0.7.5
scanpy      1.7.0
sinfo       0.3.1
-----
PIL                 8.1.0
anndata             0.7.5
backcall            0.2.0
cairo               1.20.0
cffi                1.14.4
constants           NA
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.1
decorator           4.4.2
get_version         2.1
h5py                3.1.0
highs_wrapper       NA
igraph              0.8.3
ipykernel           5.4.3
ipython_genutils    0.2.0
jedi                0.18.0
joblib              1.0.0
kiwisolver          1.3.1
legacy_api_wrap     1.2
leidenalg           0.8.3
llvmlite            0.35.0
matplotlib          3.3.4
mpl_toolkits        NA
natsort             7.1.1
numba               0.52.0
numexpr             2.7.2
numpy               1.21.2
packaging           20.9
pandas              1.2.1
parso               0.8.1
pexpect             4.8.0
pickleshare         0.7.5
pkg_resources       NA
prompt_toolkit      3.0.14
ptyprocess          0.7.0
pygments  

'/opt/conda/envs/atac_env/bin/python'

#### Load scRNAseq data

In [2]:
# AnnData object of the gonad data set (raw counts, going by the file name)
adata = sc.read('/nfs/users/nfs_l/lg18/team292/lg18/with_valentina/gonadsV2_revision/FCA-gonads_rawcounts.h5ad')
# Germ cell annotation table; the first CSV column becomes the index
metadata = pd.read_csv('/nfs/users/nfs_l/lg18/team292/lg18/with_valentina/gonadsV2_revision/FCA-gonads_germcellsClean_annotated.csv', index_col = 0)


In [3]:
# Densify a 9 x 9 block of X (rows and columns 1-9) for a quick look at the values
# NOTE(review): presumably a check that X holds raw counts - confirm
adata.X[1:10, 1:10].toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [4]:
# Preview the first rows of the germ cell annotation table
metadata.head()

Unnamed: 0,n_genes,sample,donor,location,stage,sex,study,batch_collection,enrichment,cryopreserved,...,lineage_v2,somatic_celltype_v2,PCW,sample_source,trimester,lineage,celltype_germcellDirty,leiden,leiden_R,celltype
FCA_GND8047885_AAGACCTCAGTATAAG,3446,FCA_GND8047885,F81,Gonad_and_extragonadal,8.8,female,FCA,A,CD45-,No,...,GermCells,,8.8,N,1st,GermCells,PGC_mitotic,0,0,PGC
FCA_GND8047885_AAGGTTCAGTTAAGTG,3821,FCA_GND8047885,F81,Gonad_and_extragonadal,8.8,female,FCA,A,CD45-,No,...,GermCells,,8.8,N,1st,GermCells,PGC_mitotic,0,0,PGC
FCA_GND8047885_CCTCTGAAGTGACATA,4123,FCA_GND8047885,F81,Gonad_and_extragonadal,8.8,female,FCA,A,CD45-,No,...,GermCells,,8.8,N,1st,GermCells,PGC,0,0,PGC
FCA_GND8047885_CGAACATAGCCGGTAA,4494,FCA_GND8047885,F81,Gonad_and_extragonadal,8.8,female,FCA,A,CD45-,No,...,GermCells,,8.8,N,1st,GermCells,PGC,0,0,PGC
FCA_GND8047885_CGTGAGCTCATCATTC,3152,FCA_GND8047885,F81,Gonad_and_extragonadal,8.8,female,FCA,A,CD45-,No,...,GermCells,,8.8,N,1st,GermCells,PGC,0,0,PGC


In [5]:
# Number of cells per annotated cell type; dropna=False would also list missing labels
metadata['celltype'].value_counts(dropna = False)

PGC                  3764
oogonia_STRA8        2203
oogonia_meiotic      1875
pre_oocyte            812
oocyte                723
pre_spermatogonia     621
GC_mitotic            510
GC                    485
Name: celltype, dtype: int64

In [6]:
# Look up every cell barcode in the annotation table; barcodes that are not
# in the table get NaN
celltype_by_barcode = metadata['celltype'].to_dict()
adata.obs['germ_celltype'] = adata.obs_names.map(celltype_by_barcode)

In [7]:
# Label counts after the mapping; cells without an annotation are counted as NaN
adata.obs['germ_celltype'].value_counts(dropna = False)

NaN                  385572
PGC                    3764
oogonia_STRA8          2203
oogonia_meiotic        1875
pre_oocyte              812
oocyte                  723
pre_spermatogonia       621
GC_mitotic              510
GC                      485
Name: germ_celltype, dtype: int64

In [8]:
# Cast the labels to str: missing labels become the literal string 'nan',
# which is what the filter in the next cell matches on
adata.obs['germ_celltype'] = adata.obs['germ_celltype'].astype(str)

In [9]:
# Keep only cells carrying a germ cell label: drop unannotated cells ('nan'
# after the str cast) and anything labelled 'Doublet'.
# The mask is computed vectorised, and .copy() turns the subset into a
# standalone AnnData so that later writes to .obs do not trigger an implicit
# copy of a view.
is_germ_cell = ~adata.obs['germ_celltype'].isin(['nan', 'Doublet']).to_numpy()
adata = adata[is_germ_cell].copy()
adata.shape

  res = method(*args, **kwargs)


(10993, 28820)

In [10]:
# Fold the 'GC_mitotic' label into 'GC', then show the resulting counts per label
merged_celltypes = adata.obs['germ_celltype'].replace('GC_mitotic', 'GC')
adata.obs['germ_celltype'] = merged_celltypes
adata.obs['germ_celltype'].value_counts()

Trying to set attribute `.obs` of view, copying.


PGC                  3764
oogonia_STRA8        2203
oogonia_meiotic      1875
GC                    995
pre_oocyte            812
oocyte                723
pre_spermatogonia     621
Name: germ_celltype, dtype: int64

In [11]:
# Sorted unique donor IDs among the cells kept so far
np.unique(adata.obs['donor'])

array(['F100', 'F122', 'F123', 'F126', 'F133', 'F81', 'F83', 'F84', 'F87',
       'F88', 'F89', 'F91', 'F96', 'F99', 'Hrv10', 'Hrv11', 'Hrv12',
       'Hrv13', 'Hrv17', 'Hrv21', 'Hrv25', 'Hrv27', 'Hrv3', 'Hrv30',
       'Hrv31', 'Hrv32', 'Hrv33', 'Hrv34', 'Hrv37', 'Hrv39', 'Hrv40',
       'Hrv41', 'Hrv49', 'Hrv54', 'Hrv55', 'Hrv57', 'Hrv58', 'Hrv59',
       'Hrv85', 'Hrv86', 'Hrv91', 'Hrv92', 'Hrv93', 'Hrv99'], dtype=object)

In [12]:
# Number of cells per germ cell type, largest first (counted once, then unpacked)
celltype_counts = adata.obs['germ_celltype'].value_counts()
myindex = celltype_counts.index
myvalues = celltype_counts.values
clusters = pd.Series(myvalues, index=myindex)
clusters.values

array([3764, 2203, 1875,  995,  812,  723,  621])

In [13]:
# Cap every germ cell type at `n` cells.
n = 600     # maximum number of cells kept per cell type
seed = 0    # fixed seed: the same barcodes are drawn on every Restart & Run All
rng = np.random.default_rng(seed)

celltypes = adata.obs['germ_celltype']

# Cell types with more than n cells are downsampled; all others are kept whole
cl2downsample = clusters.index[clusters.values > n]

# Barcodes of the cell types that are small enough to be kept as they are
samples = list(adata.obs_names[~celltypes.isin(cl2downsample).to_numpy()])

# Draw n barcodes without replacement from every large cell type.
# Sampling from the ordered barcode array (not from a set) keeps the draw
# deterministic for a given seed.
for cl in cl2downsample:
    print(cl)
    cl_barcodes = adata.obs_names[(celltypes == cl).to_numpy()].to_numpy()
    samples.extend(rng.choice(cl_barcodes, size=n, replace=False))

# Vectorised membership test instead of scanning the `samples` list once per cell;
# .copy() makes the result a standalone AnnData rather than a view
adata = adata[adata.obs_names.isin(samples)].copy()
assert adata.obs['germ_celltype'].value_counts().max() <= n, "a cell type exceeds the cap"
adata.X.shape

PGC
oogonia_STRA8
oogonia_meiotic
GC
pre_oocyte
oocyte
pre_spermatogonia


(4200, 28820)

#### Do not subset to the matched donors: too few germ cells would remain

In [14]:
#adata = adata[[i in ['F81', 'Hrv10', 'Hrv13', 'Hrv15', 'Hrv17', 'Hrv18', 'Hrv21',
#       'Hrv3', 'Hrv39', 'Hrv41', 'Hrv49', 'Hrv50', 'Hrv54', 'Hrv55',
#       'Hrv58', 'Hrv59', 'Hrv65', 'Hrv91', 'Hrv92'] for i in adata.obs['donor']]]
#adata.shape

#### Save the AnnData object for integration with the scATAC-seq data in Seurat

In [15]:
# Discard the unstructured annotations (.uns) before the object is saved
del adata.uns 

In [16]:
# Strip .obs down to the two annotations that are kept: donor and germ cell type
obs_to_drop = [col for col in adata.obs.columns if col not in ('germ_celltype', 'donor')]
adata.obs = adata.obs.drop(columns=obs_to_drop)

In [17]:
# Check which columns are left in .obs
adata.obs

Unnamed: 0,donor,germ_celltype
FCA_GND8047885_CCTCTGAAGTGACATA,F81,PGC
FCA_GND8047885_CGAACATAGCCGGTAA,F81,PGC
FCA_GND8047885_TCCACACAGACTAAGT,F81,PGC
FCA_GND8125925_GGCGACTCATGTAAGA,F87,GC
FCA_GND8125925_GGGTTGCTCTGTGCAA,F87,GC
...,...,...
FCA_GND10375780_TTGCGTCTCGACGGAA,Hrv86,pre_oocyte
FCA_GND10375780_TTTGCGCCATGCAATC,Hrv86,oogonia_meiotic
FCA_GND10375780_TTTGCGCTCCCATTTA,Hrv86,oogonia_meiotic
HCA_F_GON10941968_CTCTACGGTTGATTCG,Hrv99,PGC


In [18]:
# Strip .var down to the 'gene_ids-0' column
var_to_drop = [col for col in adata.var.columns if col != 'gene_ids-0']
adata.var = adata.var.drop(columns=var_to_drop)

In [19]:
# Check which columns are left in .var
adata.var

Unnamed: 0,gene_ids-0
A1BG,ENSG00000121410
A1BG-AS1,ENSG00000268895
A1CF,
A2M,ENSG00000175899
A2M-AS1,ENSG00000245105
...,...
ZXDC,ENSG00000070476
ZYG11A,ENSG00000203995
ZYG11B,ENSG00000162378
ZYX,ENSG00000159840


In [20]:
# Summary of the slimmed-down object
adata

AnnData object with n_obs × n_vars = 4200 × 28820
    obs: 'donor', 'germ_celltype'
    var: 'gene_ids-0'

In [21]:
# Output directory and file-name prefix for the exported object
outdir = "/nfs/team292/vl6/my_MULTIOME_dir/germcells_july2021/"
# The prefix already ends with '_'; combined with the "_RNAseq.h5ad" suffix used
# when writing, the file name contains a double underscore
experiment_prefix = 'germcells_'

In [22]:
# Write the subset to <outdir>/<experiment_prefix>_RNAseq.h5ad
rna_h5ad_path = os.path.join(outdir, experiment_prefix + "_RNAseq.h5ad")
adata.write(rna_h5ad_path)

... storing 'germ_celltype' as categorical


#### End of notebook 