# Tutorial for performing eQTL analysis for a dataset with many individuals.

Most of this tutorial can also be used for testing any feature that is defined at the replicate/individual level. For example, comparing case vs. control would use a similar procedure, since the independent variable is defined for each person and not for each cell.



In [1]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import itertools
from pybedtools import BedTool
import statsmodels.formula.api as smf
import statsmodels.api as sm
import imp

import os
import pickle as pkl
%matplotlib inline

In [2]:
import sys
# Add a locally built memento egg to the import path, then import memento.
# NOTE(review): this absolute path is machine-specific; adjust it for your
# environment (or drop the line if memento is already importable).
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.8-py3.8.egg')
import memento

In [3]:
# Root directory of the input files. Later cells build file paths by plain
# string concatenation onto this value, so the trailing slash is required.
data_path  = '/data_volume/memento/lupus/'

### Read the variables of interest (SNPs) and covariates

For each of these dataframes, each row is an individual and columns are different variables of interest. 

For the tutorial's sake, we'll only keep 5 SNPs to test against all genes. This is highly customizable: you can optionally specify which gene-SNP pairs to test.

In [4]:
# Population label; it is substituted into the input file names read below.
pop = 'eur'

In [5]:
# Build the paths of the genotype and covariate tables for this population.
genos_path = data_path + 'mateqtl_input/{}_genos.tsv'.format(pop)
cov_path = data_path + 'mateqtl_input/{}_mateqtl_cov.txt'.format(pop)

# Both files are tab-separated with the first column as the index.
# Transpose after reading so that each row is an individual.
snps = pd.read_csv(genos_path, sep='\t', index_col=0).T
cov = pd.read_csv(cov_path, sep='\t', index_col=0).T


In [6]:
# For the tutorial's sake, keep only the first five SNP columns.
snps = snps.iloc[:, :5]
# Preview the first three individuals.
snps.head(3)

CHROM:POS,3:165182446,6:122682327,22:40561759,3:104381193,15:57107863
1132_1132,1,1,1,1,2
1285_1285,2,1,2,1,2
1961_1961,1,2,2,0,2


In [7]:
# Preview the first five rows of the covariate table.
cov.head(5)

Unnamed: 0,age,Female,status,PC1_e,PC2_e,PC3_e,PC4_e,PC5_e,PC6_e,PC7_e,...,batch_cov_b_14,batch_cov_b_15,batch_cov_b_2,batch_cov_b_3,batch_cov_b_4,batch_cov_b_5,batch_cov_b_6,batch_cov_b_7,batch_cov_b_8,batch_cov_b_9
1132_1132,45.0,0.0,0.0,19.067178,17.787198,10.275343,-2.82957,-3.546597,-1.269196,-2.183796,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1285_1285,39.0,0.0,0.0,14.471841,18.737343,12.465061,11.195105,-2.246129,-11.168822,2.230269,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1961_1961,43.0,0.0,0.0,-7.343628,38.241007,-7.431836,0.60722,-13.730105,-2.339229,-3.238375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
HC-526,58.0,0.0,1.0,0.495487,-17.795535,0.458286,5.384761,-10.269823,-2.239953,-4.240055,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1414_1414,41.0,1.0,0.0,-10.31384,-3.423322,1.635042,9.192646,-3.507571,11.446228,-3.834848,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Read h5ad object and set up memento

In [8]:
# Cell-type label; together with `pop` it selects the h5ad file loaded below.
ct = 'T4'

In [9]:
# Load the single-cell object for the chosen population and cell type.
h5ad_path = data_path + 'single_cell/{}_{}.h5ad'.format(pop, ct)
adata = sc.read(h5ad_path)

# Keep only cells from individuals that appear in the genotype table.
has_genotype = adata.obs['ind_cov'].isin(snps.index)
adata = adata[has_genotype].copy()

# Store a constant capture rate per cell and point memento at that column.
adata.obs['capture_rate'] = 0.1
memento.setup_memento(
    adata,
    q_column='capture_rate',
    trim_percent=0.1,
    filter_mean_thresh=0.05,
    estimator_type='mean_only',
)

  res = method(*args, **kwargs)


In [10]:
# Preview the per-cell metadata, which now includes the capture_rate column.
adata.obs.head(3)

Unnamed: 0,batch_cov,ind_cov,Processing_Cohort,louvain,cg_cov,ct_cov,L3,ind_cov_batch_cov,Age,Sex,pop_cov,Status,SLE_status,capture_rate,memento_size_factor
GTCACGGAGATTACCC-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-0-0-0-0-0,dmx_YE_8-2,1368_1368,2.0,1,T4,,0.0,1368_1368:dmx_YE_8-2,45.0,Male,European,Managed,SLE,0.1,1.006062
GTCATTTCAGAGTGTG-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0,dmx_YS-JY-22_pool5,HC-540,4.0,2,T4,T4_em,1.0,HC-540:dmx_YS-JY-22_pool5,68.0,Female,European,Healthy,Healthy,0.1,1.005296
AAAGATGGTTCACGGC-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-0-0,dmx_YS-JY-20_pool3,HC-006,4.0,1,T4,T4_naive,1.0,HC-006:dmx_YS-JY-20_pool3,53.0,Female,European,Healthy,Healthy,0.1,1.04514


### Run memento

In [11]:
# Define memento groups from the ind_cov column, i.e. one group per individual.
memento.create_groups(adata, label_columns=['ind_cov'])

In [12]:
# Compute per-group 1D moments.
# NOTE(review): min_perc_group=.9 presumably keeps only genes that pass the
# expression filter in at least 90% of groups; confirm against the memento docs.
memento.compute_1d_moments(adata, min_perc_group=.9)

  res = method(*args, **kwargs)


In [13]:
# Dimensions of adata at this point (AnnData reports n_obs x n_vars).
adata.shape

(129531, 1740)

### Set up the SNP and covariate dfs for hypothesis testing

In [14]:
# Table of memento groups; its ind_cov column gives the order of samples that
# memento expects, and is used below to align the SNP and covariate tables.
sample_order = memento.get_groups(adata)  # the order of samples that memento expects

In [16]:
# Preview the first five groups in memento's order.
sample_order.head(5)

Unnamed: 0,ind_cov
sg^1368_1368,1368_1368
sg^HC-540,HC-540
sg^HC-006,HC-006
sg^1219_1219,1219_1219
sg^1596_1596,1596_1596


In [20]:
# Reorder the covariate rows to follow sample_order, so the rows line up with
# the order of samples that memento expects.
cov_df = cov.loc[sample_order['ind_cov']]

In [18]:
# Reorder the genotype rows to follow sample_order, so the rows line up with
# the order of samples that memento expects.
snps_df = snps.loc[sample_order['ind_cov']]

In [19]:
# Preview the reordered covariate table.
cov_df.head(5)

Unnamed: 0,age,Female,status,PC1_e,PC2_e,PC3_e,PC4_e,PC5_e,PC6_e,PC7_e,...,batch_cov_b_14,batch_cov_b_15,batch_cov_b_2,batch_cov_b_3,batch_cov_b_4,batch_cov_b_5,batch_cov_b_6,batch_cov_b_7,batch_cov_b_8,batch_cov_b_9
1368_1368,45.0,1.0,0.0,-10.954459,-0.346637,-4.696598,-0.264279,5.929321,2.133823,-1.924456,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
HC-540,68.0,0.0,1.0,15.368271,-11.841817,8.918722,1.11295,-1.139613,0.202273,-4.628266,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HC-006,53.0,0.0,1.0,8.069081,1.665128,7.906502,-17.151256,5.877348,-2.601942,7.501512,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1219_1219,50.0,0.0,0.0,-20.517857,-3.298162,-6.890365,-6.691988,11.749869,-7.065778,3.629006,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1596_1596,66.0,0.0,0.0,3.838886,8.798288,12.784598,-2.669462,0.753539,1.592293,4.783922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Preview the reordered genotype table.
snps_df.head(5)

CHROM:POS,3:165182446,6:122682327,22:40561759,3:104381193,15:57107863
1368_1368,0,2,1,2,1
HC-540,0,2,1,1,2
HC-006,0,1,2,2,2
1219_1219,0,2,1,1,1
1596_1596,1,1,2,0,1


### Run hypothesis testing

In [21]:
# Run memento's 1D hypothesis test, passing the SNP genotypes as treatments
# and cov_df as covariates.
# NOTE(review): num_cpus=13 is machine-specific; set it to the number of cores
# available on your machine.
memento.ht_1d_moments(
    adata, 
    covariate=cov_df,
    treatment=snps_df,
    num_boot=3000, 
    verbose=1,
    num_cpus=13,
    resampling='bootstrap',
    approx=True,
    resample_rep=True)  # resample_rep implements the hierarchical bootstrap

[Parallel(n_jobs=13)]: Using backend LokyBackend with 13 concurrent workers.
[Parallel(n_jobs=13)]: Done  24 tasks      | elapsed:    7.6s
[Parallel(n_jobs=13)]: Done 174 tasks      | elapsed:   30.2s
[Parallel(n_jobs=13)]: Done 424 tasks      | elapsed:  1.1min
[Parallel(n_jobs=13)]: Done 774 tasks      | elapsed:  2.0min
[Parallel(n_jobs=13)]: Done 1224 tasks      | elapsed:  3.2min
[Parallel(n_jobs=13)]: Done 1740 out of 1740 | elapsed:  4.7min finished


In [22]:
# setup_memento was called with estimator_type='mean_only', so the last three
# columns (dv_coef, dv_se, dv_pval) are meaningless here.
memento.get_1d_ht_result(adata).head(10)

Unnamed: 0,gene,tx,de_coef,de_se,de_pval,dv_coef,dv_se,dv_pval
0,ISG15,3:165182446,-0.002745,0.012898,0.83152,0.0,4.9253660000000006e-17,1.0
1,ISG15,6:122682327,0.037353,0.012024,0.001893,0.0,5.4400240000000006e-17,1.0
2,ISG15,22:40561759,-0.004065,0.012909,0.752869,0.0,5.1891060000000004e-17,1.0
3,ISG15,3:104381193,0.028679,0.011838,0.015411,0.0,5.4651570000000005e-17,1.0
4,ISG15,15:57107863,-0.031084,0.012885,0.015921,0.0,6.101976000000001e-17,1.0
5,AURKAIP1,3:165182446,0.000213,0.003096,0.945264,0.0,2.5016960000000003e-17,1.0
6,AURKAIP1,6:122682327,0.002744,0.003196,0.392535,0.0,2.890471e-17,1.0
7,AURKAIP1,22:40561759,-0.000857,0.003002,0.775397,0.0,3.027066e-17,1.0
8,AURKAIP1,3:104381193,0.003885,0.002904,0.182294,0.0,2.5599500000000003e-17,1.0
9,AURKAIP1,15:57107863,0.002439,0.003655,0.505206,0.0,3.3889390000000006e-17,1.0
