# Differential Expression and LogFC Calculation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import lines
import anndata as ad

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

import scanpy as sc

In [None]:
# Run-wide settings: worker count, significance cut-offs and file locations.
NUM_CPUS = 8
LOG2_FC_THRESH = np.log2(2.0)  # a two-fold change on the log2 scale
NLOG10_PADJ_THRESH = -np.log10(0.05)

# Select the dataset by leaving exactly one DATA_PATH line uncommented.
# DATA_PATH = '/data/expression_atlas/v1/GSE122459'
# DATA_PATH = '/data/expression_atlas/v1/GSE110914/'
DATA_PATH = '/data/expression_atlas/v1/GSE162828/'
# DATA_PATH = '/data/expression_atlas/v1/GSE102371/'

# Final component of DATA_PATH (any trailing slash ignored); the results
# prefix and both dds file names are built from it.
_DATASET_NAME = DATA_PATH.rstrip('/').rsplit('/', 1)[-1]

RESULTS_PATH = f'results/{_DATASET_NAME}'

DDS_TRANSCRIPT_FH = f'results/{_DATASET_NAME}_dds_transcript.h5_ad'
DDS_GENE_FH = f'results/{_DATASET_NAME}_dds_gene.h5_ad'

In [None]:
# Load the previously saved transcript- and gene-level AnnData files and wrap
# each one in a DeseqDataSet. Every obs column whose name starts with
# 'condition' is passed as a design factor.
# NOTE(review): the saved objects are expected to be filtered already and to
# carry the correct metadata in obs -- confirm against the notebook that
# wrote them.

ad_dds = ad.read_h5ad(DDS_TRANSCRIPT_FH)
ad_dds_gene = ad.read_h5ad(DDS_GENE_FH)

dds = DeseqDataSet(
    adata=ad_dds,
    design_factors=[
        col for col in ad_dds.obs.columns if col.startswith('condition')
    ],
)
dds_gene = DeseqDataSet(
    adata=ad_dds_gene,
    design_factors=[
        col for col in ad_dds_gene.obs.columns if col.startswith('condition')
    ],
)

In [None]:
# Display the X matrices of the transcript- and gene-level datasets.
(dds.X, dds_gene.X)

In [None]:
# Display the per-sample metadata (obs) of both datasets.
(dds.obs, dds_gene.obs)

In [None]:
# Manually define contrasts given conditions in metadata dataframe.

# Print the distinct levels of every 'condition*' column so the contrast
# levels below can be checked against what is actually in obs.
for column in dds.obs.columns:
    if column.startswith('condition'):
        print(dds.obs[column].unique())

# Each pydeseq2 contrast is a list of
# [condition column name, treatment level, reference level].
contrasts = dict(
    SLE_v_control=['condition-1', 'TREAT-1', 'CONTROL'],
)

# contrasts = {
#     'T1D_v_control': ['condition-1','TREAT-1','CONTROL'],
#     'preT1D_v_control': ['condition-1','TREAT-2','CONTROL']
#     }

# contrasts = {
#     'SLE_v_control': ['condition-1','TREAT-1','CONTROL']
#     }

# contrasts = {
#     'T1D_v_control': ['condition-1','TREAT-1','CONTROL'],
#     }

In [None]:
# Display the design matrices stored in obsm for both datasets.
(dds.obsm['design_matrix'], dds_gene.obsm['design_matrix'])

In [None]:
# Build a DeseqStats object per contrast and per quantification level, then
# call summary() followed by lfc_shrink() on each.

# Maps '<contrast>_transcript' / '<contrast>_gene' to its DeseqStats object.
stat_results = {}

for contrast_name, contrast in contrasts.items():

    # Insertion order (transcript, then gene) fixes the order of every call
    # below and of the keys added to stat_results.
    per_level = {
        'transcript': DeseqStats(dds, contrast=contrast, n_cpus=NUM_CPUS),
        'gene': DeseqStats(dds_gene, contrast=contrast, n_cpus=NUM_CPUS),
    }

    for stats in per_level.values():
        stats.summary()

    for stats in per_level.values():
        stats.lfc_shrink()

    for level, stats in per_level.items():
        stat_results['%s_%s' % (contrast_name, level)] = stats


In [None]:
# Create volcano plots of DE-transcripts and -genes.
#
# One row per contrast: transcript-level results in the left panel, gene-level
# results in the right panel. Marker area is scale_marker * log2(baseMean);
# colour marks features that pass both the |log2FC| and -log10(padj)
# thresholds.
# Side effect: adds a '-log10_padj' column to every results_df; the later
# plotting and marker cells read that column.

fig, ax = plt.subplots(len(contrasts),2,figsize=(10,5*len(contrasts)))

# For a single contrast plt.subplots returns a 1-D array; force (rows, 2).
ax = ax.reshape((-1,2,))

scale_marker=2

for i, k in enumerate(contrasts.keys()):

    kg = '%s_gene' % k
    kt = '%s_transcript' % k

    # (results key, target panel, point alpha) for the two levels.
    for key, panel, alpha in ((kt, ax[i,0], 0.1), (kg, ax[i,1], 0.05)):

        res = stat_results[key].results_df
        res['-log10_padj'] = -1. * np.log10(res['padj'])

        # Vectorised colour rule. NaN log2FC / padj compare False, so such
        # rows get the "not significant" colour.
        is_de = (
            (res['log2FoldChange'].abs() > LOG2_FC_THRESH)
            & (res['-log10_padj'] > NLOG10_PADJ_THRESH)
        )

        panel.scatter(
                res['log2FoldChange'],
                res['-log10_padj'],
                alpha=alpha,
                s=scale_marker*np.log2(res['baseMean']),
                c=np.where(is_de, '#1f77b4', '#ff7f0e').tolist(),
            )

        panel.set_xlabel('log2 FC')

    ax[i,0].set_ylabel('-log10 padj')

    ax[i,0].set_title('%s Transcript' % k)
    ax[i,1].set_title('%s Gene' % k)

    # Four size-legend entries from 1 up to the largest gene-level
    # log2(baseMean), rounded to the nearest multiple of 5.
    max_log2_mean = np.log2(stat_results[kg].results_df['baseMean']).max()
    element_range = np.rint(np.linspace(
                        1,
                        5*round(max_log2_mean/5),
                        4,
                    ))

    # scatter's `s` is an area (points**2) while Line2D's markersize is a
    # length (points), hence the square root.
    legend_elements = [lines.Line2D(
                            [0],
                            [0],
                            lw=0,
                            marker="o",
                            linestyle='None',
                            markersize=(scale_marker*s)**0.5,
                        ) for s in element_range]

    legend = ax[i,1].legend(
                    legend_elements,
                    element_range,
                    frameon=False,
                    loc='upper left',
                    bbox_to_anchor=(1.,1.),
                    title='log2 mean expression'
                )
    # Re-attach the size legend: the next legend() call would replace it.
    ax[i,1].add_artist(legend)

    # Labels state the rule the colours actually encode: absolute fold change
    # and both thresholds together for blue, everything else for orange.
    color_legend = ax[i,1].legend(
                [
                    lines.Line2D(
                        [0], [0], lw=0, marker='o', linestyle='None',
                        markerfacecolor=color, markeredgecolor=color,
                    ) for color in ('#1f77b4', '#ff7f0e')
                    ],
                [
                    f'|log2FC| > {LOG2_FC_THRESH} and -log10_padj > {NLOG10_PADJ_THRESH:.2f}',
                    f'|log2FC| <= {LOG2_FC_THRESH} or -log10_padj <= {NLOG10_PADJ_THRESH:.2f}',
                    ],
                frameon=False,
                loc='upper left',
                bbox_to_anchor=(1.,0.5,),
                )


In [None]:
# Create MA plots of DE-transcripts and -genes.
#
# One row per contrast: transcript-level results in the left panel, gene-level
# results in the right panel. x is log2(baseMean), y is log2FoldChange, marker
# area is -log10(padj), and colour marks features passing both thresholds.
# Relies on the '-log10_padj' column that the volcano-plot cell adds to each
# results_df.

fig, ax = plt.subplots(len(contrasts),2,figsize=(10,5*len(contrasts)))

# For a single contrast plt.subplots returns a 1-D array; force (rows, 2).
ax = ax.reshape((-1,2,))

for i,k in enumerate(contrasts.keys()):

    kg = '%s_gene' % k
    kt = '%s_transcript' % k

    # (results key, target panel) for the two quantification levels.
    for key, panel in ((kt, ax[i,0]), (kg, ax[i,1])):

        res = stat_results[key].results_df

        # Vectorised colour rule. NaN log2FC / padj compare False, so such
        # rows get the "not significant" colour.
        is_de = (
            (res['log2FoldChange'].abs() > LOG2_FC_THRESH)
            & (res['-log10_padj'] > NLOG10_PADJ_THRESH)
        )

        panel.scatter(
                np.log2(res['baseMean']),
                res['log2FoldChange'],
                alpha=0.1,
                s=res['-log10_padj'],
                c=np.where(is_de, '#1f77b4', '#ff7f0e').tolist(),
            )

        panel.set_xlabel('log2 mean expression')

    ax[i,0].set_ylabel('log2 FC expression')

    ax[i,0].set_title('%s Transcript' % k)
    ax[i,1].set_title('%s Gene' % k)

    # Upper end of the size legend: largest finite gene-level -log10(padj),
    # rounded to the nearest multiple of 5. padj can be NaN or 0 (giving
    # NaN / inf here), which would make round() raise, so only finite values
    # are considered; fall back to 5 when there are none.
    nlp_gene = stat_results[kg].results_df['-log10_padj']
    nlp_max = nlp_gene[np.isfinite(nlp_gene)].max()
    legend_upper = 5*round(nlp_max/5) if np.isfinite(nlp_max) else 5

    element_range = np.rint(np.linspace(
                    1,
                    legend_upper,
                    4,
                ))

    # scatter's `s` is an area (points**2) while Line2D's markersize is a
    # length (points), hence the square root.
    legend_elements = [lines.Line2D(
                            [0],
                            [0],
                            lw=0,
                            marker="o",
                            linestyle='None',
                            markersize=s**0.5,
                        ) for s in element_range]

    legend = ax[i,1].legend(
                    legend_elements,
                    element_range,
                    frameon=False,
                    loc='upper left',
                    bbox_to_anchor=(1.,1.),
                    title='-log10_padj'
                )
    # Re-attach the size legend: the next legend() call would replace it.
    ax[i,1].add_artist(legend)

    # Labels state the rule the colours actually encode: absolute fold change
    # and both thresholds together for blue, everything else for orange.
    color_legend = ax[i,1].legend(
                [
                    lines.Line2D(
                        [0], [0], lw=0, marker='o', linestyle='None',
                        markerfacecolor=color, markeredgecolor=color,
                    ) for color in ('#1f77b4', '#ff7f0e')
                    ],
                [
                    f'|log2FC| > {LOG2_FC_THRESH} and -log10_padj > {NLOG10_PADJ_THRESH:.2f}',
                    f'|log2FC| <= {LOG2_FC_THRESH} or -log10_padj <= {NLOG10_PADJ_THRESH:.2f}',
                    ],
                frameon=False,
                loc='upper left',
                bbox_to_anchor=(1.,0.5,),
                )

In [None]:
# For every contrast, print the number of features passing both thresholds
# and the full table sorted by ascending log2 fold change: transcript-level
# first, then gene-level. Reads the '-log10_padj' column added by the
# volcano-plot cell.
for k in contrasts:

    kt = '%s_transcript' % k
    kg = '%s_gene' % k

    for label in (kt, kg):
        table = stat_results[label].results_df

        passes_lfc = table['log2FoldChange'].abs() > LOG2_FC_THRESH
        passes_padj = table['-log10_padj'] > NLOG10_PADJ_THRESH

        markers = table[passes_lfc & passes_padj]
        print('%s: %s' % (label, len(markers)))
        print(markers.sort_values('log2FoldChange', axis=0).to_string())


In [None]:
# Attach each transcript's gene id to the transcript-level results, then plot
# transcript-level against gene-level log2 fold change, one panel per contrast.
# Marker area is size_scale * transcript-level -log10(padj).
# Relies on the '-log10_padj' column that the volcano-plot cell adds to each
# results_df. Side effect: adds a 'gene_id' column to the transcript-level
# results_df.

gene_transcript_mapping = dict(zip(dds.uns['gene_transcript_mapping']['tx'],dds.uns['gene_transcript_mapping']['gene_id']))

# squeeze=False always yields a 2-D (n_contrasts, 1) array of Axes, so the
# single-contrast case needs no special handling.
fig, ax = plt.subplots(len(contrasts),1,figsize=(5,5*len(contrasts)),squeeze=False)

# Scatter area per unit of -log10(padj); shared by the plot and its legend so
# the two cannot drift apart.
size_scale = 5

for i,k in enumerate(contrasts.keys()):

    kg = '%s_gene' % k
    kt = '%s_transcript' % k

    # Raises KeyError for a transcript that is missing from the mapping.
    stat_results[kt].results_df['gene_id'] = stat_results[kt].results_df.index.map(lambda x: gene_transcript_mapping[x])

    # Columns present in both tables get the suffixes _x (transcript) and
    # _y (gene).
    # NOTE(review): the right-hand key assumes the gene-level results_df
    # exposes 'gene_id' as a column or as its index name -- confirm.
    df = stat_results[kt].results_df.merge(stat_results[kg].results_df, left_on='gene_id', right_on='gene_id')
    ax[i,0].scatter(
            df['log2FoldChange_x'],
            df['log2FoldChange_y'],
            alpha=0.05,
            s=size_scale*df['-log10_padj_x']
        )

    ax[i,0].set_xlabel('log2 FC transcript')
    ax[i,0].set_ylabel('log2 FC gene')

    ax[i,0].set_title('%s Transcript v Gene logFC' % k)

    # Upper end of the size legend: largest finite transcript-level
    # -log10(padj) (the quantity that sets the marker sizes), rounded to the
    # nearest multiple of 5. padj can be NaN or 0 (giving NaN / inf), which
    # would make round() raise, so only finite values are considered; fall
    # back to 5 when there are none.
    nlp_tx = stat_results[kt].results_df['-log10_padj']
    nlp_max = nlp_tx[np.isfinite(nlp_tx)].max()
    legend_upper = 5*round(nlp_max/5) if np.isfinite(nlp_max) else 5

    element_range = np.rint(np.linspace(
                        1,
                        legend_upper,
                        4,
                    ))

    # scatter's `s` is an area (points**2) while Line2D's markersize is a
    # length (points): apply the same size_scale, then take the square root.
    legend_elements = [lines.Line2D(
                            [0],
                            [0],
                            lw=0,
                            marker="o",
                            linestyle='None',
                            markersize=(size_scale*s)**0.5,
                        ) for s in element_range]

    legend = ax[i,0].legend(
                    legend_elements,
                    element_range,
                    frameon=False,
                    loc='upper left',
                    bbox_to_anchor=(1.,1.),
                    title='-log10_padj transcript'
                )


In [None]:
# Write every contrast's results table to CSV, transcript-level first and then
# gene-level, as '<RESULTS_PATH>_<level>_<contrast>.csv'.

for k in contrasts:
    for level in ('transcript', 'gene'):
        out_csv = f'{RESULTS_PATH}_{level}_{k}.csv'
        stat_results[f'{k}_{level}'].results_df.to_csv(out_csv)