# Preprocess the Cano-Gamez dataset

In [1]:
import scanpy as sc
import pandas as pd
import itertools
import scipy.io as io
import numpy as np

In [2]:
# Root directory of the dataset; every path read or written in the visible
# cells (raw_data/, bulk/, pseudobulks/, single_cell/) is built from it.
data_path = '/data_volume/memento/method_comparison/canogamez/'

In [3]:
# Prefix shared by the raw input file names under raw_data/.
# NOTE(review): not referenced in the visible cells, which spell the prefix
# out in each path instead.
file_prefix = 'NCOMMS-19-7936188'

### Define datasets

In [5]:
# Bulk RNA-seq sample sheet, restricted to the samples whose
# stimulation_time is "5d".
bulk_metadata = (
    pd.read_csv(
        data_path + 'raw_data/NCOMMS-19-7936188_bulk_RNAseq_metadata.txt',
        sep='\t',
    )
    .query('stimulation_time == "5d"')
)


In [6]:
# Cell types and cytokine conditions that are crossed to define the datasets.
celltypes = ['CD4_Memory', 'CD4_Naive']
conditions = ['Th0', 'Th2', 'Th17', 'iTreg']
# Stimulation time labels. NOTE(review): not referenced in the visible cells.
times = ['5d', '16h']

In [7]:
# One label per (cell type, condition) pair, formatted '<celltype>-<condition>';
# conditions vary fastest within each cell type.
datasets = ['-'.join(pair) for pair in itertools.product(celltypes, conditions)]

In [8]:
# cell.type  cytokine.condition
# Memory     Th0                   4766
#            Th17                  5267
#            Th2                   2893
#            UNS                   3110
#            iTreg                 6131
# Naive      Th0                   2543
#            Th17                  5615
#            Th2                   4040
#            UNS                   2159
#            iTreg                 6588

In [9]:
# Display the generated dataset labels.
datasets

['CD4_Memory-Th0',
 'CD4_Memory-Th2',
 'CD4_Memory-Th17',
 'CD4_Memory-iTreg',
 'CD4_Naive-Th0',
 'CD4_Naive-Th2',
 'CD4_Naive-Th17',
 'CD4_Naive-iTreg']

### Organize bulk dataset

In [116]:
# Raw bulk RNA-seq count table; its columns are later selected by sample_id.
bulk_data = pd.read_csv(
    data_path + 'raw_data/NCOMMS-19-7936188_bulk_RNAseq_raw_counts.txt',
    sep='\t',
)

In [117]:
# Turn donor ids into string labels with a leading 'd'.
# NOTE: not idempotent -- re-running this cell prepends another 'd'.
bulk_metadata['donor_id'] = 'd' + bulk_metadata['donor_id'].astype(str)

In [118]:
# Write one metadata table and one count table per dataset under bulk/.
for dataset in datasets:
    ct, stim = dataset.split('-')

    # Each dataset pairs the stimulated samples with the 'Resting' samples
    # of the same cell type.
    stims = [stim, 'Resting']
    selection = f'cell_type=="{ct}" & cytokine_condition in @stims'
    dataset_metadata = bulk_metadata.query(selection)

    # Keep only the count columns belonging to the selected samples.
    dataset_counts = bulk_data[dataset_metadata.sample_id]

    dataset_metadata.to_csv(data_path + f'bulk/{dataset}_metadata.csv', index=False)
    dataset_counts.to_csv(data_path + f'bulk/{dataset}_counts.csv')

In [155]:
# Display the metadata left over from the last dataset the bulk loop processed.
dataset_metadata

Unnamed: 0,sample_id,cell_type,cytokine_condition,stimulation_time,donor_id,sex,age,sequencing_batch,cell_culture_batch,rna_integrity_number
24,I0735,CD4_Naive,Resting,5d,d254,Male,58,1,3,8.0
38,I0751,CD4_Naive,Resting,5d,d257,Male,38,1,3,9.9
102,I0815,CD4_Naive,Resting,5d,d255,Male,53,2,3,8.3
103,I0816,CD4_Naive,iTreg,5d,d254,Male,58,2,3,10.0
108,I0821,CD4_Naive,iTreg,5d,d257,Male,38,2,3,9.8
165,I0878,CD4_Naive,iTreg,5d,d255,Male,53,2,3,10.0


### Make adata and pseudobulks

In [75]:
# Gene and barcode tables of the single-cell matrix: header-less TSV files
# whose first column becomes the index.
# NOTE(review): `barcodes` is not used by the visible cells (obs comes from
# the metadata table instead).
genes = pd.read_csv(
    data_path + 'raw_data/NCOMMS-19-7936188_scRNAseq_genes.tsv',
    sep='\t',
    header=None,
    index_col=0,
)
barcodes = pd.read_csv(
    data_path + 'raw_data/NCOMMS-19-7936188_scRNAseq_barcodes.tsv',
    sep='\t',
    header=None,
    index_col=0,
)

In [76]:
# Per-cell annotations; used as the obs table of the AnnData object.
sc_metadata = pd.read_csv(
    data_path + 'raw_data/NCOMMS-19-7936188_metadata.txt',
    sep='\t',
)

In [61]:
# Single-cell count matrix in Matrix Market format.
matrix = io.mmread(
    data_path + 'raw_data/NCOMMS-19-7936188_scRNAseq_matrix.mtx'
)

In [104]:
# Assemble the AnnData object. The matrix is transposed so that its rows line
# up with `sc_metadata` (obs) and its columns with `genes` (var).
# NOTE(review): presumably the .mtx file is stored genes x cells, and the rows
# of `sc_metadata` are in the same order as the matrix columns -- confirm.
adata = sc.AnnData(X=matrix.tocsr().T, obs=sc_metadata, var=genes)

In [135]:
# Label the var (gene) index 'Gene'.
adata.var.index.name= 'Gene'

In [121]:
# Prefix the cell-type labels with 'CD4_' so they can be compared with the
# 'CD4_Memory' / 'CD4_Naive' names used in the dataset labels.
# NOTE: not idempotent -- re-running this cell prepends another 'CD4_'.
adata.obs['cell.type'] = 'CD4_' + adata.obs['cell.type']

In [132]:
# One-off subset for a single dataset: cells of cell type `ct` under either
# the `stim` or the 'UNS' condition.
# NOTE(review): `ct` and `stim` are not defined in this cell; it relies on the
# values left behind by a previously executed loop.
dataset_adata = adata[(adata.obs['cell.type']==ct) & adata.obs['cytokine.condition'].isin([stim, 'UNS'])]

In [159]:
# For every dataset, write:
#   * single_cell/<dataset>_1.h5ad  -- the cells of all retained
#     donor/condition groups, concatenated;
#   * pseudobulks/<dataset>_1.csv   -- summed counts, genes x groups;
#   * pseudobulks/<dataset>_meta_1.csv -- condition and donor of each group.
for dataset in datasets:

    ct, stim = dataset.split('-')

    # Cells of this cell type under either the stimulation or the 'UNS' condition.
    dataset_adata = adata[(adata.obs['cell.type']==ct) & adata.obs['cytokine.condition'].isin([stim, 'UNS'])]

    pseudobulks = []
    names = []
    adata_list = []
    meta = []
    for ind in ['D1', 'D2', 'D3', 'D4']:
        for s in [stim, 'UNS']:

            group_adata = dataset_adata[(dataset_adata.obs['cytokine.condition']==s) & (dataset_adata.obs['donor.id']==ind)]
            # Donor/condition groups with fewer than 100 cells are left out
            # of both the single-cell and the pseudobulk outputs.
            if group_adata.shape[0] < 100:
                continue
            adata_list.append(group_adata.copy())
            # Sum counts over cells. np.asarray(...).ravel() yields a flat
            # vector whether the sum comes back as an np.matrix or an ndarray.
            pseudobulks.append(np.asarray(group_adata.X.sum(axis=0)).ravel())
            names.append(s + '_' + ind)
            meta.append((s, ind))

    # Fail with an explicit message instead of an opaque error from the
    # concatenation below when no group passed the size filter.
    if not adata_list:
        raise ValueError(
            'No donor/condition group with at least 100 cells for dataset {}'.format(dataset)
        )

    # NOTE(review): `AnnData.concatenate` is deprecated in favour of
    # `anndata.concat`; it is kept here because migrating could change the
    # layout of the written object -- confirm before switching.
    sc_data = sc.AnnData.concatenate(*adata_list)
    pseudobulks = np.vstack(pseudobulks)
    # Transpose so that rows are genes and columns are '<condition>_<donor>' groups.
    pseudobulks = pd.DataFrame(pseudobulks.T, columns=names, index=adata.var.index.tolist())
    meta = pd.DataFrame(meta, columns=['cytokine_condition', 'donor_id'], index=names)

    sc_data.write(data_path + 'single_cell/{}_1.h5ad'.format(dataset))
    pseudobulks.to_csv(data_path + 'pseudobulks/{}_1.csv'.format(dataset))
    meta.to_csv(data_path + 'pseudobulks/{}_meta_1.csv'.format(dataset))


See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


### Generate the dense matrices for MAST

In [10]:
# Export each dataset's single-cell object as dense CSV tables for MAST:
# the expression matrix (cells x genes) plus the obs and var tables.
for dataset in datasets:

    # Use a dedicated name so the full `adata` object that the pseudobulk
    # cell reads from is not overwritten by the per-dataset file.
    mast_adata = sc.read(data_path + 'single_cell/{}_1.h5ad'.format(dataset))

    # Densify the sparse count matrix, labelled by cell (rows) and gene (columns).
    expr_df = pd.DataFrame(
        mast_adata.X.toarray(),
        columns=mast_adata.var.index,
        index=mast_adata.obs.index,
    )

    expr_df.to_csv(data_path + 'single_cell/{}_1_expr.csv'.format(dataset))
    mast_adata.obs.to_csv(data_path + 'single_cell/{}_1_obs.csv'.format(dataset))
    mast_adata.var.to_csv(data_path + 'single_cell/{}_1_var.csv'.format(dataset))
		