# Introduction
Use this notebook to create the pickled version of the DESeq2 data for later loading in other notebooks.

# Setup

In [1]:
import scanpy as sc
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from scipy import sparse
import warnings
import itertools as it
import json
import seaborn as sns
import pickle as pkl
from functools import reduce
import gc
import timeit
import os

from nero import Nero as nr

In [2]:
# Global scanpy configuration for this notebook.
sc.settings.verbosity = 4  # highest logging level (scanpy documents 4 as "debug")
sc.settings.set_figure_params(dpi=80)  # resolution of rendered figures
sc.settings.n_jobs = 30  # default number of parallel jobs scanpy may use

In [3]:
# Root of the data mount; every prefix below keeps its trailing slash so that
# later cells can build paths by plain string concatenation.
mountpoint = '/data/clue/'

# One production directory per modality, plus the combined one.
prefix_adts = f'{mountpoint}prod/adts/'
prefix_mrna = f'{mountpoint}prod/mrna/'
prefix_comb = f'{mountpoint}prod/comb/'

# Load in Data

In [4]:
# Walk each modality's differential-expression tree and print every directory
# path found (one per line), mRNA first and then ADTs.
print('\n'.join(dirpath for dirpath, _subdirs, _files in os.walk(f'{prefix_mrna}vals/de/')))
print('\n'.join(dirpath for dirpath, _subdirs, _files in os.walk(f'{prefix_adts}vals/de/')))

/data/clue/prod/mrna/vals/de/
/data/clue/prod/mrna/vals/de/IFNs
/data/clue/prod/mrna/vals/de/IFNs/ct2
/data/clue/prod/mrna/vals/de/IFNs/ct2/input
/data/clue/prod/mrna/vals/de/IFNs/ct2/res
/data/clue/prod/mrna/vals/de/IFNs/ct3
/data/clue/prod/mrna/vals/de/IFNs/ct3/input
/data/clue/prod/mrna/vals/de/IFNs/ct3/res
/data/clue/prod/mrna/vals/de/all
/data/clue/prod/mrna/vals/de/all/ct2
/data/clue/prod/mrna/vals/de/all/ct2/input
/data/clue/prod/mrna/vals/de/all/ct2/res
/data/clue/prod/mrna/vals/de/all/ct3
/data/clue/prod/mrna/vals/de/all/ct3/input
/data/clue/prod/mrna/vals/de/all/ct3/res
/data/clue/prod/adts/vals/de/
/data/clue/prod/adts/vals/de/IFNs
/data/clue/prod/adts/vals/de/IFNs/ct2
/data/clue/prod/adts/vals/de/IFNs/ct2/input
/data/clue/prod/adts/vals/de/IFNs/ct2/res
/data/clue/prod/adts/vals/de/IFNs/ct3
/data/clue/prod/adts/vals/de/IFNs/ct3/input
/data/clue/prod/adts/vals/de/IFNs/ct3/res
/data/clue/prod/adts/vals/de/all
/data/clue/prod/adts/vals/de/all/ct2
/data/clue/prod/adts/vals/de/al

# Export

In [5]:
# For each modality, read every DESeq2 result CSV under
# <prefix>vals/de/<analysis>/<ct_type>/res/, add derived and bookkeeping
# columns, stack everything into one long table and pickle it to
# <prefix>pkls/de.pkl.
analyses = ['IFNs', 'all']
ct_types = ['ct2', 'ct3']

for (prefix, modality) in zip([prefix_mrna, prefix_adts], ['mrna', 'adts']):
    de = list()
    # The progress-bar total is derived from the lists so it stays correct if they change.
    for analysis, ct_type in tqdm(it.product(analyses, ct_types), total=len(analyses) * len(ct_types)):
        prefix_de = prefix + 'vals/de/' + analysis + '/' + ct_type + '/res/'
        # os.listdir returns entries in arbitrary order; sort so the row order
        # of the pickled table is reproducible across runs and machines.
        for fname in sorted(os.listdir(prefix_de)):
            # The part of the file name before the first '.' is split on its
            # first underscore: left side is the condition, the rest is the cell type.
            cond, ct = fname.split('.')[0].split('_', 1)
            # Rows with any missing value are dropped.
            de_df = pd.read_csv(prefix_de + fname, index_col=0).dropna()
            de_df = de_df.rename(columns={'log2FoldChange': 'l2fc'})  # name too long
            de_df['abs(l2fc)'] = de_df['l2fc'].abs()

            # To avoid taking log10 of zero, flag the rows whose padj is exactly 0 and
            # then replace those zeros with 10**-k, where k is -log10 of the smallest
            # non-zero padj rounded up, i.e. a power of ten at or below that value.
            # NOTE(review): if a table has no non-zero padj, min() is NaN and the
            # zeros become NaN -- confirm whether that case can occur.
            de_df['padj0'] = (de_df['padj'] == 0.0)
            replace_zero = np.ceil(-np.log10(de_df['padj'][de_df['padj'] != 0.0].min()))
            # Assign the result back instead of calling replace(..., inplace=True) on
            # de_df['padj']: under pandas copy-on-write that in-place call only
            # modifies a temporary and leaves the zeros in the frame.
            de_df['padj'] = de_df['padj'].replace(0.0, 10**-replace_zero)

            de_df['-log10p'] = -np.log10(de_df['padj'])
            # Bookkeeping columns identifying where each row came from.
            de_df['mod'], de_df['anlys'], de_df['cttype'], de_df['cond'], de_df['ct'] = modality, analysis, ct_type, cond, ct
            de.append(de_df)
    de = pd.concat(de, axis=0)
    de.to_pickle(prefix + 'pkls/de.pkl')

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Read back the mRNA table from the pickle path the export cell writes to.
# NOTE(review): presumably a check that the pickle loads cleanly -- confirm intent.
de = pd.read_pickle(prefix_mrna + 'pkls/de.pkl')

In [7]:
# Read back the ADT table from its pickle.
# NOTE(review): this rebinds `de`, so the mRNA table loaded in the previous
# cell is no longer reachable under that name.
de = pd.read_pickle(prefix_adts + 'pkls/de.pkl')