In [None]:
"""
Use testes data to find the rational QC thresholds.
"""

In [None]:
import logging, matplotlib, os, sys
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from anndata import AnnData
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from rpy2.robjects.packages import importr

import sc_utils

In [None]:
# Global scanpy / matplotlib configuration for the whole notebook.
# scanpy's figure defaults are applied FIRST: sc.set_figure_params() resets several
# matplotlib rcParams (figure.figsize among them), so any custom override has to be
# set afterwards or it is silently lost.
sc.settings.verbosity = 3  # NOTE(review): 3 should correspond to scanpy's "hint" level — confirm
sc.set_figure_params(dpi=200, dpi_save=200)
sc.settings.autoshow = False

# Custom overrides, applied after scanpy's defaults so that they actually stick.
plt.rcParams['figure.figsize'] = (8, 8)
matplotlib.rcParams['pdf.fonttype'] = 42  # NOTE(review): 42 = TrueType fonts in PDF output — confirm
matplotlib.rcParams['font.size'] = 10

In [None]:
# Load every testes library (4 ages x 2 replicates) through sc_utils.sparsify.
# The file name carries the lower-case age ('7yo') and the replicate number ('rp1');
# the labels handed to obs_add use the upper-case age ('7YO') and '<age>#<replicate>'.
# NOTE(review): obs_add presumably attaches these labels to each cell's .obs — confirm in sc_utils.
samples = [
    sc_utils.sparsify(
        "../testes_res/scte_data/ss.Hs_testes_{}.rp{}.csv.gz".format(age.lower(), rep),
        obs_add={'replicate': "{}#{}".format(age, rep), 'age': age},
        csv=True,
    )
    for age in ('7YO', '11YO', '13YO', '14YO')  # loading order: youngest to oldest
    for rep in (1, 2)                           # replicate 1, then replicate 2
]
print('Loaded Samples...')

In [None]:
### Inspect the list of loaded objects (one entry per input file)
samples

In [None]:
### Flag mitochondrial genes
## Human mitochondrial gene symbols are upper-case ('MT-ND1', 'MT-CO1', ...); a lower-case
## 'mt-' prefix is the mouse convention and cannot match upper-case human symbols.
## Upper-casing the names before the prefix test makes the match case-insensitive.
## NOTE(review): the scTE matrix may also contain TE family names — confirm none of them
## start with 'MT-', otherwise they would be flagged as mitochondrial too.
for sam in samples:
    sam.var['mt'] = sam.var_names.str.upper().str.startswith('MT-')

In [None]:
## Inspect the 'mt' flag of the first sample; in the original run every entry looked False
samples[0].var['mt']

In [None]:
### QC metrics: pass 'mt' through qc_vars (its default is `()`) so that the
### mitochondrial fraction is computed alongside the per-cell gene and count totals.
# A plain loop is used because the call works in place (inplace=True); a list
# comprehension would only build and display a list of None values.
for sam in samples:
    sc.pp.calculate_qc_metrics(sam, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
# Observation from the original run: indeed, no mito genes were left...
# (this depends on the prefix used to build the 'mt' flag; human symbols are upper-case 'MT-')

In [None]:
## Preview the first rows of every sample's .obs table after calculate_qc_metrics
[sam.obs.head() for sam in samples]

In [None]:
## Violin plots of the per-cell QC metrics, one multi-panel figure per sample
qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
for sam in samples:
    sc.pl.violin(sam, qc_keys, multi_panel=True)
# Observation from the original run: indeed, there are no mito genes at all...

In [None]:
## Per-sample range of detected genes and total counts (one value per sample,
## in the same order as `samples`)
gene_mins = [sam.obs.n_genes_by_counts.min() for sam in samples]
gene_maxs = [sam.obs.n_genes_by_counts.max() for sam in samples]
gene_ave = [sam.obs.n_genes_by_counts.mean() for sam in samples]

count_mins = [sam.obs.total_counts.min() for sam in samples]
count_maxs = [sam.obs.total_counts.max() for sam in samples]
count_ave = [sam.obs.total_counts.mean() for sam in samples]

# Tabulate the numbers so the cell actually displays them (rows follow `samples` order).
qc_summary = pd.DataFrame({
    'gene_min': gene_mins,
    'gene_max': gene_maxs,
    'gene_mean': gene_ave,
    'count_min': count_mins,
    'count_max': count_maxs,
    'count_mean': count_ave,
})
qc_summary

### Manually filtering:

```python
adata = adata[adata.obs['n_genes_by_counts'] >= 1000]
# ...
```

is equivalent to:

```python
# Quick pre-filtering, these should be low, otherwise it can mess up downstream analysis, but also can get rid of trivial uninteresting things
[sc.pp.filter_cells(sam, min_genes=1000) for sam in samples]
[sc.pp.filter_cells(sam, max_counts=200000) for sam in samples]
[sc.pp.filter_cells(sam, min_counts=5000) for sam in samples]
# Do not filter genes here: concatenate defaults to an inner join (the intersection of genes), so a gene removed from a single sample would also be dropped from all other samples.

```

Note that both forms filter cells only; genes are left untouched.

In [None]:
### Check whether FIGLA is listed among the genes of each sample
for sam in samples:
    figla_mask = sam.var_names == "FIGLA"
    print(sam.var_names[figla_mask])
    print(sam.n_vars)

    # alternative check: index the gene directly and look at the matrix shape
    print(sam[:, "FIGLA"].X.shape)

# Observation from the original run: var_names is the full gene list

In [None]:
### Pre-filter cells in every sample (thresholds follow Andrew's script), then merge
# Each threshold is applied to all samples before moving on to the next threshold.
for threshold in ({'min_genes': 1000}, {'max_counts': 200000}, {'min_counts': 5000}):
    for sam in samples:
        sc.pp.filter_cells(sam, **threshold)

# Merge all samples into one object, using the first sample as the anchor.
adata = samples[0].concatenate(samples[1:])

# Store the expression matrix as float32.
adata.X = adata.X.astype('float32')

In [None]:
### Before filtering further (as in 2.filter.py): start with violin plots
## of genes per cell and counts per cell, grouped by replicate
sc.pl.violin(
    adata,
    ['n_genes', 'n_counts'],
    groupby='replicate',
    size=0,
    log=False,
    cut=0,
    show=False,
)

In [None]:
## Still filtering cells (not genes), now on the merged object.
# The thresholds are applied one after another, in this order.
for threshold in (
    {'min_genes': 1500},
    {'max_genes': 8000},
    {'min_counts': 3000},
    {'max_counts': 100000},
):
    sc.pp.filter_cells(adata, **threshold)

In [None]:
# FIGLA can still be indexed after the cell filtering, i.e. it is still among the genes —
# and remember, these are testes samples!
adata[:, "FIGLA"].X.shape

In [None]:
## Effect of the filter_genes threshold on FIGLA
# Work on copies so the original `adata` keeps every gene.
adata_tmp = adata.copy()
sc.pp.filter_genes(adata_tmp, min_cells=100)

# In the original run FIGLA did not survive min_cells=100 (the gene itself is removed,
# not the cells), so indexing `adata_tmp[:, "FIGLA"]` raised an error. Test membership
# instead, so the cell runs to the end whatever the outcome.
print('FIGLA kept with min_cells=100:', "FIGLA" in adata_tmp.var_names)

# Same check with a lower threshold.
adata_tmp2 = adata.copy()
sc.pp.filter_genes(adata_tmp2, min_cells=10)
print('FIGLA kept with min_cells=10:', "FIGLA" in adata_tmp2.var_names)

# In the original run FIGLA was still present at this threshold; show the shape of its
# slice when the gene survived, None otherwise.
adata_tmp2[:, "FIGLA"].X.shape if "FIGLA" in adata_tmp2.var_names else None

### Conclusions
- Find reasonable thresholds for filtering cells; they should be low (permissive).
- Mitochondrial genes can be ignored here.
- Use a lower threshold for `filter_genes`.