# QC Data and Filter Samples for DE testing

In [None]:
import pandas as pd
import numpy as np
import os
from pydeseq2.dds import DeseqDataSet
import matplotlib.pyplot as plt
import scanpy as sc
import seaborn as sns
from IPython.display import HTML
import html


In [None]:
# Notebook configuration: CPU budget, input/output locations and filter thresholds.

NUM_CPUS = 8

# Root directory of the dataset to process. Keep exactly one DATA_PATH active.
# DATA_PATH = '/data/expression_atlas/v1/GSE139358/'
# DATA_PATH = '/data/expression_atlas/v1/GSE112087/'
# DATA_PATH = '/data/expression_atlas/v1/GSE122459/'
# DATA_PATH = '/data/expression_atlas/v1/GSE102371'
# DATA_PATH = '/data/expression_atlas/v1/GSE110914/'
DATA_PATH = '/data/expression_atlas/v1/GSE162828/'
# DATA_PATH = '/data/expression_atlas/v1/GSE102371/'

MULTIQC_PATH = os.path.join(DATA_PATH, 'rnaseq_output/multiqc/star_salmon/multiqc_report.html')

#DATA_PATH = os.getcwd()
COUNT_PATH = os.path.join(DATA_PATH, 'rnaseq_output/star_salmon')

# Dataset identifier: the last component of DATA_PATH (e.g. 'GSE162828'), with or
# without a trailing separator. Computed once and reused for every file name below.
DATASET_ID = os.path.basename(os.path.normpath(DATA_PATH))

# The metadata file is looked up relative to the current working directory.
METADATA_FH = f'{DATASET_ID}_metadata.csv'

# exist_ok avoids the check-then-create race and makes re-running the cell a no-op.
RESULTS_DIR = 'results'
os.makedirs(RESULTS_DIR, exist_ok=True)
DDS_TRANSCRIPT_FH = f'{RESULTS_DIR}/{DATASET_ID}_dds_transcript.h5_ad'
DDS_GENE_FH = f'{RESULTS_DIR}/{DATASET_ID}_dds_gene.h5_ad'

# Low-count filter thresholds and pseudocounts for transcript- and gene-level tables.
TRANSCRIPT_SUM_FILTER = 10
TRANSCRIPT_PSEUDOCOUNT = 0
GENE_SUM_FILTER = 10
GENE_PSEUDOCOUNT = 0

In [None]:
# Embed the MultiQC report into the notebook. Note: srcdoc was required to get the html
# rendered inside an iframe without breaking the styling of the notebook. Buttons on the
# side don't work, but everything else seems to work fine.

# Decode explicitly as UTF-8 instead of relying on the platform's locale default.
# NOTE(review): assumes the MultiQC report is UTF-8 encoded - confirm if decoding fails.
with open(MULTIQC_PATH, 'r', encoding='utf-8') as f_in:
    # Escape the document (quotes included) so it can sit inside the srcdoc attribute.
    html_raw = html.escape(f_in.read())

HTML('<iframe srcdoc="%s" width="1200px" height="1000px"></iframe>' % html_raw)

In [None]:
# Read sample metadata into a dataframe, using the first CSV column as the index.

metadata = pd.read_csv(METADATA_FH, index_col=0)

# Number of samples in the smallest group formed by the 'condition*' columns.
# value_counts() is indexed by the condition labels, so an integer key such as [-1]
# is a deprecated positional lookup; min() returns the same value (counts are sorted
# in descending order) without depending on the index type.
condition_columns = [c for c in metadata.columns if c.startswith('condition')]
smallest_condition_size = metadata[condition_columns].value_counts().min()

metadata, smallest_condition_size

In [None]:
# # Merge dataframes from individual runs into one expression dataframe

# # Build the expression dataframe off of the first sample in the metadata dataframe
# expression = pd.read_csv(os.path.join(COUNT_PATH, metadata.index[0], 'quant.sf'), delimiter= '\t', index_col=0)
# expression.drop(['Length','EffectiveLength','TPM'], inplace=True, axis=1)
# expression.rename({'NumReads':metadata.index[0]}, inplace=True, axis=1)

# expression_gene = pd.read_csv(os.path.join(COUNT_PATH, metadata.index[0], 'quant.genes.sf'), delimiter= '\t', index_col=0)
# expression_gene.drop(['Length','EffectiveLength','TPM'], inplace=True, axis=1)
# expression_gene.rename({'NumReads':metadata.index[0]}, inplace=True, axis=1)

# # Populate expression dataframe with remaining samples
# for srx in tqdm(metadata.index[1:]):
#     df = pd.read_csv(os.path.join(COUNT_PATH, srx, 'quant.sf'), delimiter='\t', index_col=0)
#     df.drop(['Length','EffectiveLength','TPM'], inplace=True, axis=1)
#     df.rename({'NumReads':srx}, inplace=True, axis=1)
#     expression = expression.merge(df, on='Name')

#     df_gene = pd.read_csv(os.path.join(COUNT_PATH, srx, 'quant.genes.sf'), delimiter='\t', index_col=0)
#     df_gene.drop(['Length','EffectiveLength','TPM'], inplace=True, axis=1)
#     df_gene.rename({'NumReads':srx}, inplace=True, axis=1)
#     expression_gene = expression_gene.merge(df_gene, on='Name')

# expression.shape, df.shape, expression_gene.shape, df_gene.shape


# Load the merged salmon tables: transcript counts, transcript TPM and gene counts.

transcript_counts_fh = os.path.join(COUNT_PATH, 'salmon.merged.transcript_counts.tsv')
expression = pd.read_csv(transcript_counts_fh, sep='\t', index_col=0)

# Keep the index -> gene_id assignment as its own table, then remove the column so
# that only per-sample columns remain in the count table.
gene_transcript_mapping = expression[['gene_id']].copy().reset_index()
expression.drop(columns='gene_id', inplace=True)

transcript_tpm_fh = os.path.join(COUNT_PATH, 'salmon.merged.transcript_tpm.tsv')
tpm = pd.read_csv(transcript_tpm_fh, sep='\t', index_col=0)

# See: https://nf-co.re/rnaseq/3.12.0/docs/output#salmon on output choice below
# salmon.merged.gene_counts_length_scaled.tsv is the gene-level output of nf-core rnaseq
# that is bias-corrected and is already scaled by potential transcript length
gene_counts_fh = os.path.join(COUNT_PATH, 'salmon.merged.gene_counts_length_scaled.tsv')
expression_gene = pd.read_csv(gene_counts_fh, sep='\t', index_col=0)
expression_gene.drop(columns='gene_name', inplace=True)

expression.shape, expression_gene.shape


In [None]:
# Filter expression dataframes and prepare for QC/EDA.


def _filter_low_counts(counts, min_total, min_count, min_samples):
    """Drop low-count features from a samples x features count table.

    A feature (column) is kept when its counts summed over all samples reach
    ``min_total`` AND at least ``min_samples`` samples carry ``min_count`` or
    more counts for it.

    Returns a new dataframe; *counts* itself is not modified.
    """
    counts = counts.loc[:, counts.sum(axis=0) >= min_total]
    n_samples_with_counts = (counts >= min_count).sum(axis=0)
    # Copy so that later in-place edits act on an independent frame.
    return counts.loc[:, n_samples_with_counts >= min_samples].copy()


# Tag to take another look, filter later.
# The per-sample rule uses ">=" against the smallest condition size. The previous
# strict ">" removed every feature detected only in the samples of the smallest
# condition. The *_SUM_FILTER value doubles as the per-sample minimum count, as before.
filtered_expression_transcript = _filter_low_counts(
    expression.T,
    min_total=TRANSCRIPT_SUM_FILTER,
    min_count=TRANSCRIPT_SUM_FILTER,
    min_samples=smallest_condition_size,
)

filtered_expression_gene = _filter_low_counts(
    expression_gene.T,
    min_total=GENE_SUM_FILTER,
    min_count=GENE_SUM_FILTER,
    min_samples=smallest_condition_size,
)

expression.shape, filtered_expression_transcript.shape, expression_gene.shape, filtered_expression_gene.shape

In [None]:
# Remove individual samples from the analysis. List the accession names (row labels)
# of the samples to exclude; an empty list leaves all tables untouched.

samples_to_drop = []
# samples_to_drop = ['SRX3729696']
# samples_to_drop = ['SRX9647190']

# The same rows are removed in place from both count tables and from the metadata.
for table in (filtered_expression_transcript, filtered_expression_gene, metadata):
    table.drop(index=samples_to_drop, inplace=True)

filtered_expression_transcript

In [None]:
# Drop specific conditions/groups (columns) from the metadata dataframe.

# Kept under its own name so the list of dropped samples is not overwritten by a
# list of column names.
columns_to_drop = []
metadata.drop(columns=columns_to_drop, inplace=True)
metadata

In [None]:
# Create the DeseqDataSet (AnnData) objects for transcript- and gene-level counts.

# DeseqDataSet expects integers in the counts matrix. The fractional counts are rounded
# to the nearest integer before the cast (as DESeq2 does when importing tximport
# output); a bare astype(int) would truncate every value towards zero.

# Design: all 'group*' columns first, followed by all 'condition*' columns.
design_factors = (
    [c for c in metadata.columns if c.startswith('group')]
    + [c for c in metadata.columns if c.startswith('condition')]
)

dds = DeseqDataSet(
    counts=filtered_expression_transcript.round().astype(int),
    metadata=metadata,
    design_factors=list(design_factors),
    refit_cooks=True,
    n_cpus=NUM_CPUS,
)

dds_gene = DeseqDataSet(
    counts=filtered_expression_gene.round().astype(int),
    metadata=metadata,
    design_factors=list(design_factors),
    refit_cooks=True,
    n_cpus=NUM_CPUS,
)

In [None]:
# Attach the gene-transcript mapping table to both objects (under uns) so that
# gene- and transcript-level quantifications can be compared later on.

for dataset in (dds, dds_gene):
    dataset.uns['gene_transcript_mapping'] = gene_transcript_mapping

In [None]:
# Fit size factors and record them, together with library sizes, in obs.

for dataset in (dds, dds_gene):
    dataset.fit_size_factors()
    # Mirror the fitted size factors from obsm into obs.
    dataset.obs['size_factors'] = dataset.obsm['size_factors']
    # Library size = per-sample sum over all features of X.
    dataset.obs['lib_sizes'] = dataset.X.sum(axis=1)

dds.obs

In [None]:
# Apply the variance-stabilizing transformation to both datasets and show the
# resulting 'vst_counts' layers.

for dataset in (dds, dds_gene):
    dataset.vst()

dds.layers['vst_counts'], dds_gene.layers['vst_counts']

In [None]:
# Keep a copy of the current X matrix in the 'counts' layer so it can be restored
# after X has been overwritten.

for dataset in (dds, dds_gene):
    dataset.layers['counts'] = dataset.X.copy()

In [None]:
# Express every count as a fraction of its sample's total to get a quick idea of any
# weird skews in library composition.

for dataset in (dds, dds_gene):
    raw_counts = dataset.layers['counts']
    # Column vector of per-sample totals so the division broadcasts across features.
    per_sample_total = np.reshape(raw_counts.sum(axis=1), (-1, 1))
    dataset.layers['fraction_counts'] = raw_counts / per_sample_total

dds.layers['fraction_counts'], dds_gene.layers['fraction_counts']


In [None]:
# Plot CDF of fractional composition of libraries (one curve per sample).

# Name the columns after the samples so seaborn builds the legend entries itself.
# Relabelling the drawn curves by position afterwards depends on the order in which
# seaborn adds them to the axes and can attach sample names to the wrong curves.
log2_fractions = pd.DataFrame(
    np.log2(dds.layers['fraction_counts'].T),
    columns=list(dds.obs.index),
)

ax = sns.ecdfplot(log2_fractions)
ax.set_xlabel('log2 fraction counts')

# Place the legend outside the axes; one legend column per ten samples, at least one.
sns.move_legend(
    ax,
    'upper left',
    bbox_to_anchor=(1., 1.),
    ncols=max(1, len(dds.obs.index) // 10),
    frameon=False,
)

In [None]:
# Swap X for the variance-transformed counts, following the DESeq2 recommendation
# for preprocessing count data before QC visualization.

for dataset in (dds, dds_gene):
    # Work on a copy so the 'vst_counts' layer keeps its original values.
    dataset.X = dataset.layers['vst_counts'].copy()
    # Replace NaNs in X in place.
    np.nan_to_num(dataset.X, copy=False)

dds.layers['counts'], dds.X, dds.layers['vst_counts']

In [None]:
# Scale the transformed variables with scanpy, then replace any NaNs left in X in place.

for dataset in (dds, dds_gene):
    sc.pp.scale(dataset)
    np.nan_to_num(dataset.X, copy=False)

# Per-feature mean and standard deviation of both scaled matrices, for inspection.
dds.X.mean(axis=0), dds.X.std(axis=0), dds_gene.X.mean(axis=0), dds_gene.X.std(axis=0)

In [None]:
# Preliminary PCA on transcript- and gene-level data.

# Number of trailing characters of each sample name used as the point label.
suffix_size = 4


def _pca_with_sample_labels(adata, n_label_chars):
    """Run PCA on *adata*, plot it and annotate every sample point.

    One panel is drawn per 'group*' column followed by one per 'condition*'
    column of ``adata.obs``. Each point is labelled with the last
    ``n_label_chars`` characters of its sample name, in every panel.

    Returns whatever ``sc.pl.pca`` returned (a single axes or a list of axes).
    """
    color_columns = (
        [c for c in adata.obs.columns if c.startswith('group')]
        + [c for c in adata.obs.columns if c.startswith('condition')]
    )
    sc.pp.pca(adata)
    axes = sc.pl.pca(adata, color=color_columns, size=128, show=False)

    # Normalise the return value to a list so both cases share one labelling loop.
    panels = axes if isinstance(axes, list) else [axes]
    for sample_name, coords in zip(adata.obs.index, adata.obsm['X_pca']):
        for panel in panels:
            panel.text(coords[0], coords[1], sample_name[-n_label_chars:])
    return axes


ax_transcript_pca = _pca_with_sample_labels(dds, suffix_size)
ax_gene_pca = _pca_with_sample_labels(dds_gene, suffix_size)


In [None]:
# Plot the explained variance ratio per principal component, transcript-level on the
# left and gene-level on the right.

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

panels = zip(ax, (dds, dds_gene), ('Transcript', 'Gene'))
for axis, dataset, title in panels:
    axis.plot(dataset.uns['pca']['variance_ratio'])
    axis.set_xlabel('PC')
    axis.set_title(title)

# Only the left panel carries the y-axis label.
ax[0].set_ylabel('fraction explained variance')


In [None]:
# Plot the loadings of the first 3 PCs, transcript-level first, then gene-level.

for dataset in (dds, dds_gene):
    sc.pl.pca_loadings(dataset, components='1,2,3')

In [None]:
# Sample-sample pearson correlation of the transcript-level variance-stabilized counts.

# copy=False: NaNs are replaced directly in the stored 'vst_counts' layer.
vst_counts = np.nan_to_num(dds.layers['vst_counts'], copy=False)
# Rows of the layer are samples, so this yields a samples x samples matrix.
sample_corr = np.corrcoef(vst_counts)

# Label rows by condition. 'condition-1' is used when present; otherwise fall back to
# the first 'condition*' column instead of failing on a hard-coded name.
condition_columns = [c for c in dds.obs.columns if c.startswith('condition')]
row_label_column = 'condition-1' if 'condition-1' in dds.obs.columns else condition_columns[0]

sns.heatmap(
    sample_corr,
    xticklabels=dds.obs.index,
    yticklabels=dds.obs[row_label_column],
    cbar_kws={'label': 'pearson r'},
)
plt.yticks(rotation=0)

In [None]:
# Put the counts saved in the 'counts' layer back into X (as a copy) for both datasets.

for dataset in (dds, dds_gene):
    dataset.X = dataset.layers['counts'].copy()


In [None]:
# Run the DESeq2 pipeline (dispersions, logFCs, cooks) on the transcript-level
# dataset first, then on the gene-level dataset.

for dataset in (dds, dds_gene):
    dataset.deseq2()

In [None]:
# Plot fitted dispersions.


def _plot_dispersions(axis, adata, title):
    """Scatter the log dispersion estimates of *adata* against its log normalized means.

    Three series are drawn on *axis* from ``adata.varm``: 'genewise_dispersions'
    (labelled 'raw'), 'dispersions' ('squeezed') and 'fitted_dispersions'
    ('trended', in red). The legend handles are made opaque because the points
    themselves are drawn almost transparent.
    """
    log_means = np.log(adata.varm['_normed_means'])
    series = (
        ('genewise_dispersions', 'raw', {}),
        ('dispersions', 'squeezed', {}),
        ('fitted_dispersions', 'trended', {'c': 'r'}),
    )
    for varm_key, label, extra_kwargs in series:
        axis.scatter(
            log_means,
            np.log(adata.varm[varm_key]),
            s=1,
            alpha=0.01,
            label=label,
            **extra_kwargs,
        )
    axis.set_xlabel('log normalized mean')
    axis.set_title(title)

    # With alpha=0.01 the legend markers would be invisible; reset them to opaque.
    legend = axis.legend(frameon=False)
    for handle in legend.legend_handles:
        handle.set_alpha(1)


fig, ax = plt.subplots(1, 2, figsize=(10, 5))

_plot_dispersions(ax[0], dds, 'transcript-level')
_plot_dispersions(ax[1], dds_gene, 'gene-level')

# Only the left panel carries the y-axis label.
ax[0].set_ylabel('log dispersions')

In [None]:
# Persist both dds objects for the downstream DE and LogFC calculations.

# Pydeseq2 supports trend_coeffs/replaced as either np.array or pd.series; np.array is
# required for saving h5-formatted AnnData objects, so convert both before writing.
for dataset in (dds, dds_gene):
    dataset.uns['trend_coeffs'] = np.array(dataset.uns['trend_coeffs'])
    dataset.varm['replaced'] = np.array(dataset.varm['replaced'])

# DeseqDataSet doesn't have native support for writing h5, save as AnnData objects and
# restore from AnnData objects.
outputs = ((dds, DDS_TRANSCRIPT_FH), (dds_gene, DDS_GENE_FH))
for dataset, out_fh in outputs:
    dataset.write(out_fh)