# Prepare lupus dataset for DM comparisons

The bulk data are available from GEO (accession GSE164457): https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE164457

In [80]:
import pandas as pd
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt

# Directory that holds the bulk CSV read below and receives the CSV/h5ad files
# written by this notebook. Absolute path: adjust for the machine you run on.
data_path = '/data_volume/memento/method_comparison/lupus/'

### Read single cell and bulk data

# Resolve both input locations relative to `data_path` before loading anything.
sc_path = data_path + '../../lupus/Lupus_study_adjusted_counts.h5ad'
bulk_path = data_path + 'lupus_bulk.csv'

# Single-cell dataset.
adata = sc.read(sc_path)

# Bulk table; the first CSV column becomes the row index.
bulk = pd.read_csv(bulk_path, index_col=0)

def get_sc_ind(x):
    """Reduce a single-cell ``ind_cov`` label to a bare individual ID.

    The first rule that matches wins:

    * the label contains ``'-'``: return the second ``'-'``-separated field
      (``'HC-566'`` -> ``'566'``);
    * the label contains ``'_'``: return the text before the first ``'_'``
      (``'IGTB469_IGTB469'`` -> ``'IGTB469'``);
    * otherwise: return the label unchanged (``'FLARE006'``).
    """
    # (separator, index of the field to keep), checked in priority order.
    for sep, field in (('-', 1), ('_', 0)):
        if sep in x:
            return x.split(sep)[field]
    return x

# One row per distinct (ind_cov, Age, Sex, SLE_status) combination found in the
# single-cell observations, plus the individual ID derived from `ind_cov`.
meta = adata.obs[['ind_cov', 'Age', 'Sex', 'SLE_status']].drop_duplicates().reset_index(drop=True)
meta['ind'] = meta['ind_cov'].apply(get_sc_ind)

sc_inds = set(meta['ind'].tolist())

# Bulk column names are split on '_' and the second token is taken as the
# individual ID.
bulk_inds = set(bulk.columns.str.split('_').str[1].tolist())

# Individuals present in both datasets. Sorted because set iteration order for
# strings varies between interpreter sessions (hash randomization); a fixed
# order keeps any later sampling from `inds` reproducible once it is seeded.
inds = sorted(sc_inds & bulk_inds)

meta = meta[meta['ind'].isin(inds)]

# Genes present in both datasets. Sorted for the same reason, so files indexed
# by `genes` have a stable row order across runs.
genes = sorted(set(bulk.index) & set(adata.var.index))

## Create CD14 vs CD4 comparisons

### Sample individuals

For now, we'll stick to comparing CD14 (`cM`) vs CD4 (`T4`) cells.

# Draw 4 *distinct* individuals. np.random.choice samples with replacement by
# default, which could select the same individual twice and yield duplicate
# '<celltype>_<ind>' sample names downstream, so replace=False is explicit.
# It raises ValueError if fewer than 4 individuals are available.
# NOTE: the draw is not seeded here, so it differs from run to run.
sampled_inds = np.random.choice(inds, 4, replace=False)

### Create single cell data and pseudobulks

# Store the ID returned by get_sc_ind for every cell.
adata.obs['ind'] = adata.obs['ind_cov'].apply(get_sc_ind)

# Keep only cells that belong to a sampled individual and whose `cg_cov` label
# is T4 or cM; the columns are restricted to the shared gene list.
from_sampled_ind = adata.obs['ind'].isin(sampled_inds)
is_t4_or_cm = adata.obs['cg_cov'].isin(['T4', 'cM'])
sampled_adata = adata[from_sampled_ind & is_t4_or_cm, genes]

# `cg_cov` label -> prefix used in the sample (column) names.
ct_labels = {'T4': 'CD4', 'cM': 'CD14'}

pseudobulks = []
names = []
adata_list = []
for ind in sampled_inds:
    for ct, label in ct_labels.items():
        # Materialize this (individual, cell type) subset as its own AnnData
        # before subsampling it in place. One copy is enough: the name is
        # rebound on every iteration, so the list entries never alias.
        ind_ct_adata = sampled_adata[(sampled_adata.obs['ind'] == ind) & (sampled_adata.obs['cg_cov'] == ct)].copy()
        # Keep 100 cells per group.
        # NOTE(review): presumably fails when a group has fewer than 100 cells — confirm.
        sc.pp.subsample(ind_ct_adata, n_obs=100)
        adata_list.append(ind_ct_adata)
        # Pseudobulk = per-gene sum over the retained cells. np.asarray(...).ravel()
        # flattens the result whether `.X.sum` returns an np.matrix (scipy sparse
        # matrix) or a plain ndarray; `.A1` exists only on np.matrix.
        pseudobulks.append(np.asarray(ind_ct_adata.X.sum(axis=0)).ravel())
        names.append(label + '_' + ind)
sc_data = sc.AnnData.concatenate(*adata_list)
# Stack to (samples x genes), then transpose so rows are genes and columns are samples.
pseudobulks = pd.DataFrame(np.vstack(pseudobulks).T, columns=names, index=genes)

# Output locations, both under `data_path`.
pseudobulk_csv = data_path + 'T4_vs_cM.pseudobulk.csv'
single_cell_h5ad = data_path + 'T4_vs_cM.single_cell.h5ad'

# Genes x samples pseudobulk table.
pseudobulks.to_csv(pseudobulk_csv)

# Concatenated, subsampled single-cell data.
sc_data.write(single_cell_h5ad)

### Select bulk data

# Bulk column names for the sampled individuals: for each individual, the CD4
# (T4) column followed by the CD14 (cM) column.
names = [
    ('CD14' if ct == 'cM' else 'CD4') + '_' + ind
    for ind in sampled_inds
    for ct in ['T4', 'cM']
]

# Save the shared genes for exactly those bulk samples.
bulk.loc[genes, names].to_csv(data_path + 'T4_vs_cM.bulk.csv')

## Create low/high SLEDAI comparisons

In [77]:
# Does individual '1763_1763' have any cells whose Status is "Managed"?
managed_inds = adata.obs.loc[adata.obs['Status'] == "Managed", 'ind_cov'].drop_duplicates().values
'1763_1763' in managed_inds

True

In [54]:
# Distinct `ind_cov` labels that have at least one cell with Status "Flare".
flare_ind_cov = adata.obs.loc[adata.obs['Status'] == "Flare", 'ind_cov']
flare_ind_cov.drop_duplicates().values.tolist()

['FLARE006',
 'FLARE004',
 '1772_1772',
 'FLARE015',
 'FLARE011',
 'FLARE014',
 'FLARE017',
 '1586_1586',
 'FLARE020',
 'FLARE007',
 'FLARE016',
 'FLARE009',
 'FLARE001',
 '1130_1130',
 'FLARE013',
 '2132_2132',
 'FLARE010',
 'FLARE005',
 '1763_1763']

In [45]:
# Number of cells per Status value.
adata.obs['Status'].value_counts()

Managed    696626
Healthy    486418
Flare       55120
Treated     25512
Name: Status, dtype: int64

In [44]:
# Number of cells per (Status, ind_cov) pair; one individual can appear under
# more than one Status.
adata.obs[['Status','ind_cov']].value_counts()

Status   ind_cov          
Healthy  IGTB469_IGTB469      13543
         IGTB195_IGTB195      12768
         IGTB514_IGTB514      12491
         IGTB508_IGTB508      11178
         IGTB1540_IGTB1540    10382
                              ...  
Treated  FLARE011              1469
Flare    FLARE017              1189
Treated  1586_1586             1065
Flare    FLARE010               761
Healthy  HC-566                 456
Length: 274, dtype: int64