# scRNAseq Calprotectin organoids
## 1. Load adata

In [None]:
import os
import gzip
import anndata
import scanpy as sc
import scipy as sp
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Set scanpy's default figure size to 5x5
sc.settings.set_figure_params(figsize=(5,5))

In [None]:
# Load the sample sheet; the FASTQ path columns are not needed here
meta = pd.read_csv('../../tables/samplesheet.csv').drop(columns=['fastq_1', 'fastq_2'])

# Sort rows by the numeric part of the sample ID (everything after the first
# two characters) and use that number as the row index
numeric_ids = [int(name[2:]) for name in meta['sample'].values]
meta = (
    meta.assign(sample_idx=numeric_ids)
    .sort_values('sample_idx')
    .set_index('sample_idx')
)

In [None]:
# Read each sample's filtered Cell Ranger matrix and concatenate all samples
adatas = dict()
key_save_l = []
for sample in meta.to_dict(orient="records"):
    tmp_adata = sc.read_10x_h5(
        f"/data/projects/2022/Adolph-scRNA-organoids/01_nfcore_scrnaseq/cellranger/sample-{sample['sample']}/outs/filtered_feature_bc_matrix.h5"
    )
    # Save the gene conversion key (.var as read, still indexed by the
    # original var names) before switching the index to Ensembl IDs
    key_save_l.append(tmp_adata.var.copy())
    tmp_adata.var.index = tmp_adata.var.gene_ids
    tmp_adata.var = tmp_adata.var.drop(columns=['feature_types', 'genome'])
    tmp_adata.var_names_make_unique()
    assert tmp_adata.obs_names.is_unique
    # Attach this sample's sample-sheet fields to every cell
    tmp_adata.obs = tmp_adata.obs.assign(**sample)
    adatas[sample['sample']] = tmp_adata

# anndata.concat's default merge strategy (merge=None) keeps no .var columns,
# so the gene symbols are re-attached from the saved key below.
# index_unique="_" appends the sample ID (the dict key) to each barcode.
adata = anndata.concat(adatas, index_unique="_")

# A single conversion key is only valid if every sample has the same gene table
assert all(k.equals(key_save_l[0]) for k in key_save_l[1:]), \
    "gene tables differ between samples"
key = key_save_l[-1]
key = key.reset_index()
key.index = key.gene_ids

# Use conversion key to re-assign symbols to Ensembl IDs
adata.var['gene_symbols'] = key.loc[adata.var.index]['index']

In [None]:
# Display the summary of the concatenated AnnData object
adata

In [None]:
# Dimensions of the concatenated object, before any filtering
adata.shape

In [None]:
# Print every .var index entry that contains "-". var_names_make_unique
# appends -1, -2, etc. to duplicated names, so a hit points to a name that
# had to be de-duplicated.
hyphenated = [name for name in adata.var.index if '-' in name]
for name in hyphenated:
    print(name)

### Load features.tsv.gz from single sample

In [None]:
# Build a gene_symbol -> gene_id lookup table from one sample's features.tsv.gz.
# This should be equivalent for every sample
features_path = "/data/projects/2022/Adolph-scRNA-organoids/01_nfcore_scrnaseq/cellranger/sample-AJ10/outs/filtered_feature_bc_matrix/features.tsv.gz"
# Open through a context manager so the gzip handle is closed after parsing
with gzip.open(features_path, mode="rt") as features_file:
    genes = pd.read_csv(features_file, delimiter="\t", header=None)
# Only the first two columns (gene ID and gene symbol) are needed
genes.drop(genes.columns[[2]], axis=1, inplace=True)
genes.rename(columns={0:'gene_id', 1: 'gene_symbol'}, inplace=True)
genes.set_index('gene_symbol', inplace=True)

In [None]:
# Number of distinct gene symbols (a missing value counts as one entry)
adata.var['gene_symbols'].nunique(dropna=False)

In [None]:
# Gene symbols are not unique: this is how many genes share a symbol with
# an earlier gene (total genes minus distinct symbols)
adata.n_vars - adata.var['gene_symbols'].nunique(dropna=False)

### Summary stats raw adata

In [None]:
# Cell counts per level of each metadata column, separated by blank lines
for position, column in enumerate(('sample', 'patient', 'group', 'batch')):
    if position:
        print('')
    print(adata.obs[column].value_counts())

## 2. Define filter thresholds

In [None]:
# Basic filters: cells need at least this many detected genes, and genes
# must be detected in at least this many cells
min_genes_per_cell = 200
min_cells_per_gene = 3
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)

In [None]:
# Dimensions after the min_genes / min_cells filters
adata.shape

In [None]:
# Annotate the group of mitochondrial genes as 'mt'.
# var_names hold Ensembl IDs (the index was switched to gene_ids on load), so
# the 'MT-' prefix has to be matched against the gene symbols; matching on
# var_names would flag no gene and leave pct_counts_mt at 0 for every cell.
adata.var['mt'] = adata.var['gene_symbols'].astype(str).str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Violin plots of the per-cell QC metrics, one panel per metric
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_metrics, jitter=0.4, multi_panel=True)