# Create dataframes for use in the R Shiny app
* Two outputs:
    * The first output is a metadata dataframe.
    * The second output is a long-form dataframe with log2 fold-change values, p-values, etc.

Example header and first row of the second dataframe:
"Reference","Gene.Symbol","Annotation","log2FC_INAFM2-HB","p.val_INAFM2-HB","q.val_INAFM2-HB","sig_INAFM2-HB","log2FC_TNF-HB","p.val_TNF-HB","q.val_TNF-HB","sig_TNF-HB","log2FC_INAFM2-TNF","p.val_INAFM2-TNF","q.val_INAFM2-TNF","sig_INAFM2-TNF","p.val_anova","q.val_anova","sig_anova","HB","INAFM2","TNF","HB_1","HB_2","HB_3","TNF_1","TNF_2","TNF_3","INAFM2_1","INAFM2_2","INAFM2_3"
"sp|A6NHG4|A6NHG4_sep_P30046_INDEX_31","DDTL","D-dopachrome decarboxylase-like protein (EC 4.1.1.-) (D-dopachrome tautomerase-like protein)",0.237493160183558,0.000783866415292806,0.0959129879152952,"n.s.",0.115973676432326,0.0314165085496348,0.125876569252111,"n.s.",0.121519483751232,0.0259869864656247,0.153705217160899,"n.s.",0.00255102742689988,0.037099554132613,"sig",19.654729084776,19.8922222449596,19.7707027612083,19.723710933439,19.6818183220977,19.5586579987914,19.751790837475,19.8177300438078,19.7425874023422,19.883882750514,19.8941368403913,19.8986471439734

Example header and rows of the first dataframe:
Channel,Condition,Replicate,priority,Mixture
TMT1,HB,1,1,Mixture1
TMT2,HB,2,1,Mixture1
TMT3,HB,3,1,Mixture1
TMT4,TNF,1,2,Mixture1
TMT5,TNF,2,2,Mixture1
TMT6,TNF,3,2,Mixture1
TMT7,INAFM2,1,3,Mixture1
TMT8,INAFM2,2,3,Mixture1
TMT9,INAFM2,3,3,Mixture1
TMT10,Empty,Empty,Empty,Mixture1
TMT11,Empty,Empty,Empty,Mixture1
TMT12,Empty,Empty,Empty,Mixture1
TMT13,Empty,Empty,Empty,Mixture1
TMT14,Empty,Empty,Empty,Mixture1
TMT15,Empty,Empty,Empty,Mixture1
TMT16,Empty,Empty,Empty,Mixture1
TMT17,Empty,Empty,Empty,Mixture1
TMT18,Empty,Empty,Empty,Mixture1



In [None]:
import pandas as pd
import numpy as np
from pybiomart import Dataset
import anndata as ad
import os

In [None]:
# Input location. Alternative values are kept commented out for reference.
#DATA_PATH = '/data/expression_atlas/v1/GSE80183/'
# DATA_PATH = os.getcwd()
DATA_PATH = '/VPR_20230926_X202SC23091601-Z01-F001'

# Common prefix of every file read or written below: 'de_results/' followed by
# the last component of DATA_PATH (a trailing '/' is ignored).
_dataset_name = DATA_PATH.rstrip('/').rsplit('/', 1)[-1]
RESULTS_PATH = 'de_results/%s' % _dataset_name

# AnnData file that is loaded in the next cell.
DDS_GENE_FH = ''.join([RESULTS_PATH, '_dds_gene.h5_ad'])

# Thresholds for the significance calls: adjusted p-value below 0.05 and an
# absolute log2 fold change above log2(1.5).
PADJ_CUTOFF = 0.05
LOGFC_CUTOFF = np.log2(1.5)

In [None]:
# Read the AnnData object stored at DDS_GENE_FH. Later cells use its `uns`
# entries (contrasts and DE result tables), `obs`, `var` and the
# 'normed_tpm' layer.

dds_gene = ad.read_h5ad(DDS_GENE_FH)

# Show the matrix dimensions together with the stored contrasts.
(dds_gene.X.shape, dds_gene.uns['contrasts'])

In [None]:
# Work on copies of the DE result tables so that the tables stored under
# 'stat_results' / 'stat_results_lrtnull' are not modified by later cells.

copied_tables = {}
for name, table in dds_gene.uns['stat_results'].items():
    copied_tables[name] = table.copy()
dds_gene.uns['stat_results_out'] = copied_tables

lrt_source = dds_gene.uns['stat_results_lrtnull']
dds_gene.uns['stat_results_lrtnull_out'] = lrt_source.copy()

In [None]:
# Filter dataframes by gene_id: keep only rows whose index starts with the
# Ensembl gene-id prefix.

gene_prefix = 'ENSG'

# Per-contrast tables. Re-assigning an existing key while iterating is safe
# because the dict does not change size. The explicit .copy() makes each
# subset an independent frame, so columns can be added to it later without
# pandas warning about writing to a slice.
for k, table in dds_gene.uns['stat_results_out'].items():
    dds_gene.uns['stat_results_out'][k] = table[table.index.str.startswith(gene_prefix)].copy()

# The LRT table does not depend on the individual contrasts: filter it once,
# outside the loop, so it is filtered even when there are no contrast tables.
lrt_table = dds_gene.uns['stat_results_lrtnull_out']
dds_gene.uns['stat_results_lrtnull_out'] = lrt_table[lrt_table.index.str.startswith(gene_prefix)].copy()

In [None]:
# Query Ensembl BioMart ('hsapiens_gene_ensembl') for one table holding, per
# gene: Ensembl gene id, external gene name, description and
# UniProtKB/Swiss-Prot accession.

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

biomart_attributes = [
    'ensembl_gene_id',
    'external_gene_name',
    'description',
    'uniprotswissprot',
]
external_gene_mapping = dataset.query(attributes=biomart_attributes)

# Give the two columns used as keys below short names; the other columns keep
# the display names returned by the query.
external_gene_mapping = external_gene_mapping.rename(
    columns={'Gene stable ID': 'gene_id', 'Gene name': 'gene_name'},
)

external_gene_mapping

In [None]:
# Genes without a Swiss-Prot accession (lncRNAs / other misc genes) get their
# gene name in that column instead; afterwards only the first row per
# gene name is kept.

uniprot_col = 'UniProtKB/Swiss-Prot ID'
missing_uniprot = external_gene_mapping[uniprot_col].isnull()

external_gene_mapping.loc[missing_uniprot, uniprot_col] = (
    external_gene_mapping.loc[missing_uniprot, 'gene_name']
)

external_gene_mapping = external_gene_mapping.drop_duplicates(subset=['gene_name'], keep='first')
external_gene_mapping

In [None]:
# Start the output statistics dataframe from a copy of the gene-mapping table,
# renaming the name/description columns to the headers used in the output.

output_headers = {
    'gene_name': 'Gene.Symbol',
    'Gene description': 'Annotation',
}
out_df = external_gene_mapping.copy()
out_df = out_df.rename(columns=output_headers)

# 'Reference' embeds the Swiss-Prot column value as 'sp|<value>|empty'.
out_df['Reference'] = out_df['UniProtKB/Swiss-Prot ID'].map(
    lambda accession: 'sp|%s|empty' % accession
)
out_df

In [None]:
# Add an 'ens_gene' column to every DE table: the index value up to the first
# '.', used as the merge key against 'gene_id' in the output dataframe.

for contrast_table in dds_gene.uns['stat_results_out'].values():
    contrast_table['ens_gene'] = contrast_table.index.str.split('.').str.get(0)

lrt_out = dds_gene.uns['stat_results_lrtnull_out']
lrt_out['ens_gene'] = lrt_out.index.str.split('.').str.get(0)

dds_gene.uns['stat_results_out']

In [None]:
# Append, for each contrast in uns['contrasts'], its log2 fold change, p-value,
# q-value and a significance call ('up' / 'down' / 'n.s.') to the output
# dataframe.

out_df = out_df[['Reference', 'Gene.Symbol', 'Annotation', 'gene_id', ]]

for k, v in dds_gene.uns['contrasts'].items():

    # v[1] and v[2] are used as the two sides of the contrast in the column
    # suffix; '-' is stripped from each so that the only '-' left in the
    # suffix is the separator between them.
    v_1 = v[1].replace('-','')
    v_2 = v[2].replace('-','')

    log2fc_col = 'log2FC_%s-%s' % (v_1, v_2)
    pval_col = 'p.val_%s-%s' % (v_1, v_2)
    qval_col = 'q.val_%s-%s' % (v_1, v_2)
    sig_col = 'sig_%s-%s' % (v_1, v_2)

    # Default (inner) merge: genes missing from this contrast's table drop out.
    out_df = out_df.merge(dds_gene.uns['stat_results_out'][k], left_on='gene_id', right_on='ens_gene')

    # Remove the statistics that are not exported, plus the merge key column(s).
    out_df = out_df.drop(
        ['-log10_padj', 'baseMean', 'lfcSE', 'stat']
        + [c for c in out_df.columns if c.startswith('ens_gene')],
        axis=1,
    )

    # Rename the generic column names so the next contrast can be merged
    # without colliding with this one.
    out_df = out_df.rename(
        columns={
            'log2FoldChange': log2fc_col,
            'pvalue': pval_col,
            'padj': qval_col,
        },
    )

    # Significance call: q-value below PADJ_CUTOFF and |log2FC| above
    # LOGFC_CUTOFF. Rows with a missing q-value compare False and stay 'n.s.'.
    out_df[sig_col] = 'n.s.'

    filter_up = (out_df[qval_col] < PADJ_CUTOFF) & (out_df[log2fc_col] > LOGFC_CUTOFF)
    out_df.loc[filter_up, sig_col] = 'up'

    filter_down = (out_df[qval_col] < PADJ_CUTOFF) & (out_df[log2fc_col] < -1. * LOGFC_CUTOFF)
    out_df.loc[filter_down, sig_col] = 'down'

    # Missing p-/q-values become 1.0. The column is named explicitly in .loc:
    # a row-only selector would overwrite every column of those rows,
    # including the gene identifiers needed by the later merges.
    out_df.loc[out_df[pval_col].isnull(), pval_col] = 1.0
    out_df.loc[out_df[qval_col].isnull(), qval_col] = 1.0



In [None]:
# Display the output dataframe after the per-contrast columns have been added.
out_df

In [None]:
# Merge the anova (LRT) results into the output dataframe and add a
# significance call based on the adjusted p-value alone.

# Default (inner) merge on the Ensembl gene id.
out_df = out_df.merge(dds_gene.uns['stat_results_lrtnull_out'], left_on='gene_id', right_on='ens_gene')

out_df = out_df.drop(['ens_gene'], axis=1)

out_df = out_df.rename(
    columns={
        'pvalue': 'p.val_anova',
        'padj': 'q.val_anova',
    },
)

# Rows with a missing q-value compare False and therefore stay 'n.s.'.
out_df['sig_anova'] = 'n.s.'

filter_anova = out_df['q.val_anova'] < PADJ_CUTOFF

out_df.loc[filter_anova, 'sig_anova'] = 'sig'

# Missing p-/q-values become 1.0. The column is named explicitly in .loc:
# a row-only selector would overwrite every column of those rows,
# including 'gene_id', which the later TPM merges still need.
out_df.loc[out_df['p.val_anova'].isnull(), 'p.val_anova'] = 1.0
out_df.loc[out_df['q.val_anova'].isnull(), 'q.val_anova'] = 1.0

In [None]:
# Build a samples x genes dataframe of log2(normed_tpm + 1) values that is
# later merged onto the output dataframe.

# Row labels: the obs index with every '_' removed except the last one, so the
# label keeps a single '_' in front of its final field.
indices = []
for sample_name in dds_gene.obs.index:
    *leading_fields, last_field = sample_name.split('_')
    indices.append(''.join(leading_fields) + '_' + last_field)

tpm_df = np.log2(
    pd.DataFrame(dds_gene.layers['normed_tpm'], index=indices, columns=dds_gene.var.index) + 1.
)

# Condition label per sample ('-' removed); used as the grouping key for the
# group means.
tpm_df['condition-1'] = [label.replace('-','') for label in dds_gene.obs['condition-1']]

tpm_df

In [None]:
# Average the log2 TPM values per condition and transpose, giving one row per
# gene and one column per condition.

condition_means = tpm_df.groupby('condition-1').mean()
tpm_df_mean = condition_means.transpose()

# Merge key: the index value up to the first '.'.
tpm_df_mean['ens_gene'] = tpm_df_mean.index.str.split('.').str.get(0)

tpm_df_mean


In [None]:
# Merge the per-condition means and the per-sample log2 TPM values into the
# output dataframe (one column per condition, then one column per sample).

# Drop the string 'condition-1' label column before transposing. Otherwise
# the transposed frame gets a bogus 'condition-1' row and every sample column
# becomes object dtype instead of float.
tpm_df = tpm_df.drop(columns='condition-1').T
# Merge key: the index value up to the first '.'.
tpm_df['ens_gene'] = tpm_df.index.str.split('.').str[0]

# Both merges are inner joins on the Ensembl gene id; the key column is
# removed again after each merge.
out_df = out_df.merge(tpm_df_mean, left_on='gene_id', right_on='ens_gene')
out_df = out_df.drop('ens_gene', axis=1)

out_df = out_df.merge(tpm_df, left_on='gene_id', right_on='ens_gene')
out_df = out_df.drop('ens_gene', axis=1)

# 'gene_id' was only needed as a merge key and is not part of the output.
out_df = out_df.drop('gene_id', axis=1)

out_df

In [None]:
# Keep only rows that have a value in the last column of the output dataframe.

last_column = out_df.columns[-1]
has_data = out_df[last_column].notnull()
out_df = out_df[has_data]

out_df

In [None]:
# Set cluster in output dataframe, this gets reassigned by Ian during
# gene-level clustering. Every row starts in cluster 1.
#
# assign() returns a new dataframe, so the column is not written into a
# boolean-filtered subset (which makes pandas emit SettingWithCopyWarning).

out_df = out_df.assign(cluster=1)

out_df

In [None]:
# Write the output dataframe to '<RESULTS_PATH>_deseq2_out.csv' without the
# row index.

out_csv_path = RESULTS_PATH + '_deseq2_out.csv'
out_df.to_csv(out_csv_path, index=False)

In [None]:
# Start the metadata dataframe from a copy of the AnnData obs table, turning
# its index into a regular column.

meta_df = dds_gene.obs.copy().reset_index()

meta_df

In [None]:
# Create priorities dict: count how often each condition appears as the third
# element of a contrast, then rank conditions by that count (highest count
# gets priority 1; ties keep their first-seen order).

# Every condition named in any contrast starts at zero ('-' removed).
count_contrasts = {}
for contrast in dds_gene.uns['contrasts'].values():
    for level in contrast[1:]:
        count_contrasts[level.replace('-','')] = 0

for contrast in dds_gene.uns['contrasts'].values():
    count_contrasts[contrast[2].replace('-','')] += 1

ranked_conditions = sorted(count_contrasts.items(), key=lambda item: item[1], reverse=True)

priorities = {}
for rank, (condition, _count) in enumerate(ranked_conditions, start=1):
    priorities[condition] = rank
priorities

In [None]:
# Derive the columns parsed by the shiny app and drop the ones it does not use.

accession_fields = meta_df['accession'].str.split('_')

# Condition: every '_'-separated field of the accession except the last,
# concatenated without separator. Replicate: the last field.
meta_df['Condition'] = accession_fields.str[:-1].str.join('')
meta_df['condition-1'] = meta_df['condition-1'].str.replace('-','')
meta_df['Replicate'] = accession_fields.str[-1]

# Priority is looked up by the '-'-stripped condition-1 label; a label missing
# from `priorities` raises KeyError.
meta_df['priority'] = meta_df['condition-1'].map(lambda condition: priorities[condition])
meta_df['cluster'] = 1

unused_columns = ['accession', 'condition-1', 'size_factors', 'lib_sizes', 'sample_sums']
meta_df = meta_df.drop(columns=unused_columns)
meta_df = meta_df.fillna('Empty')

In [None]:
# Pad the metadata frame with all-'Empty' rows up to 18 rows in total; no rows
# are added when it already has 18 or more.

n_padding_rows = 18 - meta_df.shape[0]
padding_rows = [['Empty'] * meta_df.shape[1] for _ in range(n_padding_rows)]
padding_df = pd.DataFrame(padding_rows, columns=meta_df.columns)

meta_df = pd.concat([meta_df, padding_df], axis=0, ignore_index=True)

In [None]:
# Add the columns present in the mass-spec inputs: a 1-based 'Channel' label
# per row and a constant 'Mixture', then move 'Channel' to the front.

meta_df['Channel'] = ['TMT %s.00' % (row + 1) for row in range(len(meta_df))]
meta_df['Mixture'] = 'Mixture1'

remaining_columns = [col for col in meta_df.columns if col != 'Channel']
meta_df = meta_df[['Channel'] + remaining_columns]
meta_df

In [None]:
# Write the metadata dataframe to '<RESULTS_PATH>_deseq2_meta.csv' without the
# row index.

meta_csv_path = RESULTS_PATH + '_deseq2_meta.csv'
meta_df.to_csv(meta_csv_path, index=False)