# Process the lupus data

- Filter and subset by cell type
- Create pseudobulks for matrix eQTL

In [1]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import itertools
from pybedtools import BedTool
import statsmodels.formula.api as smf
import statsmodels.api as sm

import os
import pickle as pkl
%matplotlib inline
import qvalue

In [2]:
import sys
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.8-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq/')
import encode
import memento

In [3]:
# Root directory for the lupus inputs read, and the outputs written, by this notebook.
data_path = '/data_volume/memento/lupus/'

### Read the lupus data

In [4]:
# Genotype table: rows are SNPs (index taken from the first column), columns are individuals.
pos = pd.read_csv(
    data_path + 'mateqtl_input/cm_cg.genos',
    index_col=0,
    sep='\t',
)

In [5]:
# When a SNP position occurs more than once in the index, keep only its first row.
pos = pos.loc[~pos.index.duplicated(keep='first')]

In [6]:
# Load the expression data, then keep only the cells whose individual (`ind_cov`)
# has a column in the genotype table.
adata = sc.read(data_path + 'Lupus_study_adjusted_counts.h5ad')
adata = adata[adata.obs['ind_cov'].isin(pos.columns), :].copy()

  res = method(*args, **kwargs)


In [7]:
# Preview the first five SNP rows of the genotype table.
pos.head()

Unnamed: 0_level_0,HC-519,1240_1240,HC-022,1771_1771,1472_1472,HC-573,1294_1294,1754_1754,HC-571,1193_1193,...,1492_1492,1452_1452,1046_1046,HC-551,HC-574,1045_1045,1248_1248,HC-014,HC-011,HC-566
CHROM:POS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1:706368,1,0,1,0,1,1,1,0,0,0,...,0,1,1,1,0,1,1,1,1,0
1:713977,0,0,1,0,1,0,0,0,1,1,...,1,0,0,1,0,0,0,0,0,0
1:714439,0,0,1,0,1,0,0,0,1,1,...,1,0,0,1,0,0,0,0,0,0
1:723891,2,1,1,2,1,2,2,2,0,1,...,0,2,2,1,2,2,2,2,2,2
1:727655,0,0,1,0,1,0,0,0,1,1,...,2,0,0,1,0,0,0,0,0,0


In [8]:
# Number of cells per `cg_cov` cell-type label after the genotype filter.
adata.obs['cg_cov'].value_counts()

cM        123153
T4        118397
T8         97743
B          60167
NK         28881
ncM        18595
cDC         6272
Prolif      3652
pDC         1307
PB           676
Progen       290
Name: cg_cov, dtype: int64

### Read OneK1K eQTL results and map their cell types to the lupus labels

In [9]:
# First column of the OneK1K SNP list as a plain Python list
# (presumably 'chr:pos' strings, judging by the file name -- confirm).
onek1k_pos = pd.read_csv('1k1k_snps_chrpos.txt', header=None).iloc[:, 0].tolist()

# First three columns of the OneK1K SNP annotation table.
onek1k_info = pd.read_csv('1k1k_snp_info.txt', sep='\t', header=None).iloc[:, :3]
onek1k_info.columns = ['chr', 'pos', 'name']

# NOTE: despite its name, 'rsid' holds a '<chr>:<pos>' key; it is later matched
# against the index of the genotype table `pos`.
onek1k_info['rsid'] = onek1k_info['chr'].astype(str).str.cat(onek1k_info['pos'].astype(str), sep=':')

In [9]:
# OneK1K eQTLs to replicate, read for every cell type (no cell-type filter applied here).
onek_replication = pd.read_csv(
    data_path + 'OneK1K_eqtls_for_replication.txt',
    sep='\t',
)

In [10]:
# Attach the SNP annotation columns by matching each eQTL's rsID to the SNP name
# (default inner join, so eQTLs without an annotated SNP are dropped).
onek_replication = onek_replication.merge(
    onek1k_info,
    left_on='SNP_rsID',
    right_on='name',
)

In [12]:
# OneK1K cell-type labels, grouped by the `cg_cov` label each one is mapped to.
_onek_labels_by_cg = {
    'T4': ['CD4_Naïve_CM', 'CD4_EM_TEMRA', 'CD4_SOX4'],
    'T8': ['CD8_EM_TEMRA', 'CD8_Naïve_CM', 'CD8_S100B'],
    'cM': ['Mono_C'],
    'ncM': ['Mono_NC'],
    'NK': ['NK', 'NK_recruiting'],
    'B': ['B_Imm_Naïve', 'BMem', 'Plasma'],
}

# Flattened lookup used by the later cells: OneK1K label -> cg_cov label.
ct_converter = {
    onek_label: cg_label
    for cg_label, onek_labels in _onek_labels_by_cg.items()
    for onek_label in onek_labels
}

In [13]:
# Distinct cg_cov labels that have at least one OneK1K counterpart (order is arbitrary).
cts = list(set(ct_converter.values()))

In [14]:
# Keep only eQTLs whose SNP key is in the genotype table, whose cell type has a
# cg_cov counterpart, and whose gene is present in the expression data; then
# rename the gene column.
onek_replication = (
    onek_replication
    .query('rsid in @pos.index.tolist() & cell_type in @ct_converter.keys() & GeneID in @adata.var.index')
    .rename(columns={'GeneID': 'gene'})
)

In [15]:
# Translate each OneK1K cell-type label into its cg_cov label
# (an unmapped label raises KeyError).
onek_replication['cg_cov'] = onek_replication['cell_type'].map(
    lambda onek_label: ct_converter[onek_label]
)

In [16]:
# Save the filtered eQTL table without the row index.
onek_replication.to_csv(
    data_path + 'filtered_onek_eqtls.csv',
    index=False,
)

### Make subsets and pseudobulks

In [22]:
# Individuals in genotype-column order, so every pseudobulk table has one row per
# genotyped individual in the same order as the columns of `pos`.
ind_list = pos.columns.tolist()

for ct in cts:

    # Write the single-cell subset for this cell type.
    subset = adata[adata.obs['cg_cov'] == ct].copy()
    subset.write(data_path + 'single_cell/{}.h5ad'.format(ct))

    means = []
    empty_donors = []
    for ind in ind_list:
        donor_cells = subset[subset.obs.ind_cov == ind]
        if donor_cells.n_obs == 0:
            # The mean over zero cells is undefined and would abort the whole loop.
            # Keep the individual as an all-NaN row so the rows stay aligned with the
            # genotype columns.
            # NOTE(review): confirm the downstream eQTL input tolerates missing values.
            means.append(np.full(subset.n_vars, np.nan))
            empty_donors.append(ind)
            continue
        # log(mean + 1) per gene; `.A1` flattens the matrix returned by the mean
        # (presumably X is a scipy sparse matrix -- confirm).
        means.append(np.log(donor_cells.X.mean(axis=0).A1 + 1))

    if empty_donors:
        print('{}: no cells for {} individual(s); pseudobulk rows left as NaN'.format(ct, len(empty_donors)))

    pseudobulk = pd.DataFrame(np.vstack(means), columns=subset.var.index, index=ind_list)

    pseudobulk.to_csv(data_path + 'pseudobulk/{}.csv'.format(ct))

### Save a filtered copy of SNPs

In [18]:
# Write the genotype rows for the distinct SNP keys found in the filtered eQTL table.
(
    pos
    .loc[onek_replication['rsid'].unique().tolist()]
    .to_csv(data_path + 'mateqtl_input/filtered_genos.tsv', sep='\t')
)

In [35]:
# Independent copy of the filtered genotype rows, to be permuted in the following cells.
shuffled_pos = pos.loc[onek_replication['rsid'].unique().tolist()].copy()

In [36]:
def crazyshuffle(arr, rng=None):
    """Return a copy of a 2-D array with the entries of each row permuted independently.

    Parameters
    ----------
    arr : array-like of shape (n_rows, n_cols)
        Values to shuffle; the input itself is not modified.
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. ``None`` (the default) draws from NumPy's global
        random state, so ``np.random.seed`` controls the result. An int seed or a
        ``Generator`` gives a reproducible shuffle without touching global state.

    Returns
    -------
    numpy.ndarray
        Array of the same shape in which every row contains the same values as
        the corresponding input row, in an order drawn separately for each row.
    """
    arr = np.asarray(arr)
    n_rows, n_cols = arr.shape
    if rng is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(rng).permutation
    # One independent column order per row; the reshape keeps the index array
    # two-dimensional even when there are no rows.
    col_order = np.array(
        [permute(n_cols) for _ in range(n_rows)], dtype=np.intp
    ).reshape(n_rows, n_cols)
    return np.take_along_axis(arr, col_order, axis=1)

In [37]:
# Permute the genotypes across individuals separately for every SNP row.
# crazyshuffle draws from NumPy's global random state, so fix the seed first:
# the shuffled table is written to disk and must be regenerable exactly.
# Re-running this cell then reproduces the same permutation of the unshuffled rows.
np.random.seed(0)
shuffled_pos[shuffled_pos.columns] = crazyshuffle(shuffled_pos.values)

In [42]:
# Save the permuted genotype table as a tab-separated file.
shuffled_pos.to_csv(
    data_path + 'mateqtl_input/shuffled_filtered_genos.tsv',
    sep='\t',
)