# Process the lupus data

- Filter and subset by cell type
- Create pseudobulks for Matrix eQTL

In [204]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import itertools
from pybedtools import BedTool
import statsmodels.formula.api as smf
import statsmodels.api as sm

import os
import pickle as pkl
%matplotlib inline
import qvalue

In [205]:
import sys
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.8-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq/')
import encode
import memento

In [206]:
# Root directory for every input read and every output written in this notebook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path  = '/data_volume/memento/lupus/'

### Read the lupus data

In [218]:
# Per-population genotype tables. The first file column becomes the row index
# (later matched against SNP ids); the remaining columns are later matched
# against individual ids.
asian_genos_file = data_path + 'mateqtl_input/asian/cm_cg.genos'
eur_genos_file = data_path + 'mateqtl_input/eur/cm.genos'
pos_asian, pos_eur = (
    pd.read_csv(genos_file, sep='\t', index_col=0)
    for genos_file in (asian_genos_file, eur_genos_file)
)

In [219]:
# Keep only the first row for every SNP id that appears more than once
# (Index.duplicated defaults to keep='first').
pos_asian = pos_asian.loc[~pos_asian.index.duplicated()]
pos_eur = pos_eur.loc[~pos_eur.index.duplicated()]

In [220]:
# SNPs present in both genotype tables, in the row order of pos_asian.
# A set intersection would give an order that changes from run to run (string
# hashing is randomised per process), which makes the genotype files written
# from this list non-reproducible; selecting with isin keeps a stable order.
common_snps = pos_asian.index[pos_asian.index.isin(pos_eur.index)].unique().tolist()

In [221]:
# Restrict both genotype tables to the shared SNPs, in the same row order.
pos_asian, pos_eur = (genos.loc[common_snps] for genos in (pos_asian, pos_eur))

In [231]:
# Write the SNP-matched genotype tables as tab-separated files.
asian_genos_out = data_path + 'mateqtl_input/asian_genos.tsv'
eur_genos_out = data_path + 'mateqtl_input/eur_genos.tsv'
pos_asian.to_csv(asian_genos_out, sep='\t')
pos_eur.to_csv(eur_genos_out, sep='\t')

In [232]:
# Sanity check: (rows, columns) of the Asian genotype table.
pos_asian.shape

(3285470, 97)

In [233]:
# Sanity check: (rows, columns) of the European genotype table.
pos_eur.shape

(3285470, 91)

In [259]:
# Load the lupus single-cell AnnData object; its .obs and .var tables are
# queried in the following cells.
adata = sc.read(data_path + 'Lupus_study_adjusted_counts.h5ad')

In [225]:
# Number of observations carrying each cg_cov cell-type label.
adata.obs.cg_cov.value_counts()

T4        129531
cM        111439
T8         70774
B          42419
NK         34552
ncM        18072
cDC         6129
Prolif      2542
pDC         1475
PB           382
Progen       318
Name: cg_cov, dtype: int64

### Read eQTLs and cell types from the OneK1K results

In [234]:
# First column of the chr:pos listing, as a plain Python list.
onek1k_pos = pd.read_csv('1k1k_snps_chrpos.txt', header=None).iloc[:, 0].tolist()

# First three columns of the SNP info table: chromosome, position, SNP name.
onek1k_info = pd.read_csv('1k1k_snp_info.txt', sep='\t', header=None).iloc[:, :3]
onek1k_info.columns = ['chr', 'pos', 'name']

# Despite its name, 'rsid' holds a "chr:pos" key (not an rs number).
chrom_str = onek1k_info['chr'].astype(str)
pos_str = onek1k_info['pos'].astype(str)
onek1k_info['rsid'] = chrom_str.str.cat(pos_str, sep=':')

In [235]:
# OneK1K eQTLs to be tested for replication. The trailing commented-out query
# (restricting to the Mono_C cell type) is not applied.
onek_replication = pd.read_csv(data_path + 'OneK1K_eqtls_for_replication.txt', sep='\t')#.query('cell_type == "Mono_C"')

In [236]:
# Attach the chr / pos / name / rsid columns by matching each eQTL's SNP_rsID to
# the SNP-info 'name' column. merge defaults to an inner join, so eQTLs whose
# SNP is absent from the info table are dropped. The trailing commented-out
# query is not applied.
onek_replication = onek_replication.merge(onek1k_info, left_on='SNP_rsID', right_on='name')#.query('cell_type == "Mono_C"')

In [237]:
# OneK1K cell-type labels grouped under the lupus cg_cov label they map to.
_onek_labels_by_cg_cov = {
    'T4': ['CD4_Naïve_CM', 'CD4_EM_TEMRA', 'CD4_SOX4'],
    'T8': ['CD8_EM_TEMRA', 'CD8_Naïve_CM', 'CD8_S100B'],
    'cM': ['Mono_C'],
    'ncM': ['Mono_NC'],
    'NK': ['NK', 'NK_recruiting'],
    'B': ['B_Imm_Naïve', 'BMem', 'Plasma'],
}

# Flat lookup: OneK1K cell-type label -> lupus cg_cov label.
ct_converter = {
    onek_label: cg_label
    for cg_label, onek_labels in _onek_labels_by_cg_cov.items()
    for onek_label in onek_labels
}

In [238]:
# Distinct lupus cell-type labels that have at least one OneK1K counterpart.
# Sorted so the processing order of the cell types is the same on every run
# (plain set iteration order is not).
cts = sorted(set(ct_converter.values()))

In [239]:
# Keep eQTLs whose SNP is in the genotype table, whose cell type has a lupus
# counterpart, and whose gene is present in the single-cell data.
has_genotype = onek_replication['rsid'].isin(pos_asian.index)
has_cell_type = onek_replication['cell_type'].isin(list(ct_converter))
has_gene = onek_replication['GeneID'].isin(adata.var.index)
onek_replication = (
    onek_replication[has_genotype & has_cell_type & has_gene]
    .rename(columns={'GeneID':'gene'})
)

In [240]:
# Translate each OneK1K cell-type label into its lupus cg_cov label
# (raises KeyError for a label missing from ct_converter).
onek_replication['cg_cov'] = onek_replication['cell_type'].map(lambda onek_label: ct_converter[onek_label])

In [241]:
# Save the filtered, cg_cov-annotated eQTL table (row index omitted).
onek_replication.to_csv(data_path + 'filtered_onek_eqtls.csv', index=False)

In [242]:
# Sanity check: (rows, columns) of the filtered eQTL table.
onek_replication.shape

(9382, 10)

### Make subsets and pseudobulks

In [265]:
# Reload the AnnData object from disk.
# NOTE(review): this reads the same file as the earlier load in this notebook —
# presumably to start from a fresh copy; confirm whether the re-read is needed.
adata = sc.read(data_path + 'Lupus_study_adjusted_counts.h5ad')

In [267]:
# Individuals (genotype-table columns) of each population. These must come from
# the population-specific tables: the previous `pos.columns` referenced a name
# that is never defined in this notebook.
asian_ind_list = pos_asian.columns.tolist()
eur_ind_list = pos_eur.columns.tolist()
pop_ind_lists = {'asian': asian_ind_list, 'eur': eur_ind_list}

# For every cell type and population:
#   1. write the single-cell subset to single_cell/<pop>_<ct>.h5ad
#   2. write a log(mean + 1) pseudobulk (individuals x genes) to pseudobulk/<pop>_<ct>.csv
for ct in cts:
    for pop, ind_list in pop_ind_lists.items():

        # Cells of this cell type that belong to genotyped individuals of this population.
        subset = adata[(adata.obs['cg_cov'] == ct) & (adata.obs['ind_cov'].isin(ind_list))].copy()
        subset.write(data_path + 'single_cell/{}_{}.h5ad'.format(pop, ct))

        means = []
        for ind in ind_list:
            view = subset[subset.obs.ind_cov == ind]
            if view.shape[0] == 0:
                # No cells for this individual: use the column sums of the empty
                # selection instead of taking a mean over zero cells.
                means.append(np.log(view.X.sum(axis=0).A1+1))
            else:
                means.append(np.log(view.X.mean(axis=0).A1+1))

        # One row per individual (genotype column order), one column per gene.
        pseudobulk = pd.DataFrame(np.vstack(means), columns=subset.var.index, index=ind_list)
        pseudobulk.to_csv(data_path + 'pseudobulk/{}_{}.csv'.format(pop, ct), sep='\t')
    

In [264]:
# Inspect the most recently created `subset`.
# NOTE(review): this cell's execution count (264) is lower than that of the loop
# cell that defines `subset` (267), so the displayed output may come from an
# earlier run.
subset

AnnData object with n_obs × n_vars = 60167 × 32738
    obs: 'batch_cov', 'ind_cov', 'Processing_Cohort', 'louvain', 'cg_cov', 'ct_cov', 'L3', 'ind_cov_batch_cov', 'Age', 'Sex', 'pop_cov', 'Status', 'SLE_status'
    var: 'gene_ids', 'feature_types-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0'

### Make covariate file

In [252]:
# Per-population covariate tables. Each is transposed after loading, so the
# rows of the file become columns (batch_cov is addressed as a column below).
asian_cov = pd.read_csv(data_path + 'mateqtl_input/asian/cm_cg.mateqtl_cov.txt', sep='\t', index_col=0).T
eur_cov = pd.read_csv(data_path + 'mateqtl_input/eur/cm.mateqtl_cov.txt', sep='\t', index_col=0).T

In [253]:
# Turn the numeric batch id into a string label ("b_<id>") so that it is
# treated as categorical when the covariates are one-hot encoded.
for cov_table in (asian_cov, eur_cov):
    cov_table['batch_cov'] = 'b_' + cov_table['batch_cov'].astype(int).astype(str)

In [254]:
# asian_cov['pop'] = 1.0
# eur_cov['pop'] = 0.0

In [255]:
# asian_cov = asian_cov.drop('batch_cov', axis=1)
# eur_cov = eur_cov.drop('batch_cov', axis=1)

In [256]:
# One-hot encode the non-numeric covariate columns, dropping the first level of
# each to avoid a redundant indicator.
asian_cov, eur_cov = (
    pd.get_dummies(cov_table, drop_first=True)
    for cov_table in (asian_cov, eur_cov)
)

In [257]:
# Transpose back (undoing the .T applied on load) and write the covariates as
# tab-separated files.
asian_cov.T.to_csv(data_path + 'mateqtl_input/asian_mateqtl_cov.txt', sep='\t')
eur_cov.T.to_csv(data_path + 'mateqtl_input/eur_mateqtl_cov.txt', sep='\t')

### Save a filtered copy of SNPs

In [203]:
# NOTE(review): `pos` is not defined anywhere in this notebook as shown (only
# pos_asian / pos_eur are), so this cell raises NameError on a fresh kernel.
# Its execution count (203) predates every other cell; it looks like a leftover
# from an earlier version. The following cell writes the per-population files.
pos.loc[onek_replication['rsid'].drop_duplicates().tolist()].to_csv(data_path + 'mateqtl_input/filtered_genos.tsv', sep='\t')



In [258]:
# Unique SNP keys referenced by the filtered eQTLs, in first-seen order.
eqtl_rsids = onek_replication['rsid'].drop_duplicates().tolist()

# Genotypes restricted to those SNPs, one file per population.
asian_filtered = pos_asian.loc[eqtl_rsids]
asian_filtered.to_csv(data_path + 'mateqtl_input/asian_filtered_genos.tsv', sep='\t')
eur_filtered = pos_eur.loc[eqtl_rsids]
eur_filtered.to_csv(data_path + 'mateqtl_input/eur_filtered_genos.tsv', sep='\t')

In [35]:
# Copy of the eQTL-SNP genotype rows that will be shuffled in place below.
# NOTE(review): `pos` is not defined anywhere in this notebook as shown (only
# pos_asian / pos_eur are); this cell fails on a fresh kernel until the intended
# genotype table is chosen.
shuffled_pos = pos.loc[onek_replication['rsid'].drop_duplicates().tolist()].copy()

In [36]:
def crazyshuffle(arr, rng=None):
    """Shuffle the entries of every row of a 2-D array independently.

    Each row receives its own random permutation of column positions, so values
    never move between rows.

    Parameters
    ----------
    arr : numpy.ndarray
        Two-dimensional array; it is not modified.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is used
        (seed it with ``np.random.seed`` for reproducible output).

    Returns
    -------
    numpy.ndarray
        New array of the same shape as ``arr`` with each row permuted.
    """
    permute = np.random.permutation if rng is None else rng.permutation
    n_rows, n_cols = arr.shape
    if n_rows == 0:
        # Nothing to permute; return a copy so the caller still gets a new array.
        return arr.copy()
    # One permutation of the column positions per row, drawn row by row.
    col_order = np.vstack([permute(n_cols) for _ in range(n_rows)])
    # A column vector of row numbers broadcasts against col_order, so row i is
    # read in the order given by col_order[i].
    row_idx = np.arange(n_rows)[:, None]
    return arr[row_idx, col_order]

In [37]:
# Permute the genotypes within every SNP row (across individuals).
# Seed NumPy's global random state first so the shuffled genotype file is
# identical on every run; crazyshuffle draws from that state by default.
SHUFFLE_SEED = 0
np.random.seed(SHUFFLE_SEED)
shuffled_pos[shuffled_pos.columns] = crazyshuffle(shuffled_pos.values)

In [42]:
# Write the row-wise shuffled genotype table as a tab-separated file.
shuffled_pos.to_csv(data_path + 'mateqtl_input/shuffled_filtered_genos.tsv', sep='\t')