# Simple eQTL analysis

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Directory holding the lupus input files read by the read_csv cells below.
data_path  = '/data_volume/memento/lupus/'

In [11]:
# Load the tab-separated cm_cg.genos table.
# NOTE(review): presumably per-sample genotypes keyed by the CHROM:POS column — confirm against the file.
pos = pd.read_csv(data_path + 'cm_cg.genos', sep='\t')

In [16]:
# List the distinct values of the `cell` column in the eQTL table.
# Read the column straight from Table6.txt rather than from `cbc_eqtls`:
# that frame is only built in a later cell, where it is filtered to "cm"
# and its `cell` column is renamed to `ct`, so it cannot be used here.
pd.read_csv(data_path + 'Table6.txt', sep='\t', skiprows=1, usecols=['cell'])['cell'].drop_duplicates()

0         pbmc
192054       b
245550     cdc
284295      cm
398147     ncm
452407      nk
499329     pdc
516907      t4
618590      t8
Name: cell, dtype: object

In [25]:
# Load the eQTL table (first file line skipped), keep only the "cm" rows,
# and drop the STD_FE column.
cbc_eqtls = (
    pd.read_csv(data_path + 'Table6.txt', sep='\t', skiprows=1)
    .query('cell == "cm"')
    .drop('STD_FE', axis=1)
)
# Positional rename of the four remaining columns.
cbc_eqtls.columns = ['ct', 'rsid_gene', 'pvalue', 'beta']

In [26]:
# (rows, columns) of the cm-only eQTL table.
cbc_eqtls.shape

(113852, 4)

In [28]:
# rsid_gene looks like '<CHROM:POS>_<gene>' (e.g. '10:100122640_HPS1').
# Split once, on the first underscore only, so a gene symbol that itself
# contains '_' is kept whole instead of being cut at its first underscore.
rsid_gene_parts = cbc_eqtls['rsid_gene'].str.split('_', n=1)
cbc_eqtls['gene'] = rsid_gene_parts.str[1]
cbc_eqtls['rsid'] = rsid_gene_parts.str[0]

In [33]:
# Number of eQTL rows per gene, most frequent first.
cbc_eqtls.gene.value_counts()

HLA-DQB1    1244
HLA-DRB1     842
HLA-B        816
HLA-DRB5     773
HLA-A        762
            ... 
GMCL1          1
RNF144A        1
TEX261         1
CDC14A         1
WDR61          1
Name: gene, Length: 2438, dtype: int64

### Development

In [1]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import itertools
from pybedtools import BedTool
import statsmodels.formula.api as smf
import statsmodels.api as sm

import os
import pickle as pkl
%matplotlib inline

In [2]:
# Show every column and up to 500 rows when displaying DataFrames.
# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option on newer pandas releases and raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [3]:
import sys
# Machine-specific absolute paths: a local memento 0.0.7 egg and the misc-seq
# helper directory are appended to the import path before importing from them.
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.7-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq/')
# NOTE(review): `encode` is presumably resolved from the miscseq directory added above — confirm.
import encode
import memento

In [4]:
import matplotlib.pylab as pylab

# Global matplotlib font sizes applied to every figure in this notebook.
# 'axes.labelsize' is listed exactly once, as 'large': a second entry for the
# same key with 'medium' would be dead, because the last duplicate key in a
# dict literal wins.
params = {
    'legend.fontsize': 'medium',
    'axes.labelsize': 'large',
    'axes.titlesize': 'medium',
    'figure.titlesize': 'medium',
    'xtick.labelsize': 'medium',
    'ytick.labelsize': 'medium',
}
pylab.rcParams.update(params)

In [5]:
# Point data_path at the HBEC dataset directory.
# NOTE(review): this rebinds the same name used for the lupus directory, so any
# lupus read_csv cell re-run after this line will look in the HBEC directory.
data_path = '/data_volume/memento/hbec/'

In [6]:
# Load the HBEC counts file (.h5ad) with scanpy.
adata = sc.read(data_path + 'HBEC_type_I_filtered_counts_deep.h5ad')

In [7]:
# Short codes for the cell-type labels ('club' maps to itself).
converter = {
    'basal/club': 'BC',
    'basal': 'B',
    'ciliated': 'C',
    'goblet': 'G',
    'ionocyte/tuft': 'IT',
    'neuroendo': 'N',
    'club': 'club',
}

In [8]:
# Add a `ct` column holding the short code for each cell's `cell_type`.
# The lambda indexes `converter` directly, so a label missing from it raises KeyError.
adata.obs['ct'] = adata.obs['cell_type'].apply(lambda x: converter[x])
# Disabled: the same mapping for `adata_processed`, which no visible cell defines.
# adata_processed.obs['ct'] = adata_processed.obs['cell_type'].apply(lambda x: converter[x])

In [9]:
def assign_q(batch):
    """Return the q value for a batch label.

    Batches 0, 1 and 2 each have their own rate; any other label uses the
    fallback rate. The rate is multiplied by 0.25 before being returned.
    """
    fallback_rate = 0.417
    for label, rate in ((0, 0.387), (1, 0.392), (2, 0.436)):
        if batch == label:
            return rate*0.25
    return fallback_rate*0.25

In [10]:
# Per-cell q value derived from each cell's batch label.
# NOTE(review): assign_q compares against the integers 0/1/2; if `batch` holds
# string labels every cell silently gets the fallback value — confirm the dtype.
adata.obs['q'] = adata.obs['batch'].apply(assign_q)

In [11]:
# Initialise memento on `adata`, pointing it at the per-cell `q` column created above.
memento.setup_memento(adata, q_column='q', trim_percent=0.1)

In [12]:
# Keep only cells labelled 'C' in the beta or control conditions.
# Slice first and copy once: copying the full object before slicing only
# duplicates data that is thrown away immediately.
adata_stim = adata[
    adata.obs.ct.isin(['C']) & adata.obs.stim.isin(['beta', 'control']), :
].copy()

# Encode the condition as a 0/1 indicator (control=0, beta=1).
stim_converter = {'control': 0, 'beta': 1}
adata_stim.obs['stim_indicator'] = adata_stim.obs['stim'].apply(lambda x: stim_converter[x])

# Group cells by condition and donor, then compute memento's 1D moments.
memento.create_groups(adata_stim, label_columns=['stim_indicator', 'donor'])
memento.compute_1d_moments(adata_stim, min_perc_group=.5)
print(adata_stim.shape)



  res = method(*args, **kwargs)


(4620, 10773)


In [13]:
# Run memento's 1D-moment test with stim_indicator as the treatment column,
# using 10000 bootstrap resamples spread over 93 workers.
ht_options = dict(
    formula_like='1 + stim_indicator',
    treatment_col='stim_indicator',
    num_boot=10000,
    verbose=1,
    num_cpus=93,
    resampling='bootstrap',
    approx=False,
)
memento.ht_1d_moments(adata_stim, **ht_options)

   stim_indicator  donor
0               1  d2513
1               0  d2513
2               1  d2614
3               0  d2614


[Parallel(n_jobs=93)]: Using backend LokyBackend with 93 concurrent workers.
[Parallel(n_jobs=93)]: Done  14 tasks      | elapsed:    4.2s
[Parallel(n_jobs=93)]: Done 264 tasks      | elapsed:    5.6s
[Parallel(n_jobs=93)]: Done 614 tasks      | elapsed:    7.8s
[Parallel(n_jobs=93)]: Done 1064 tasks      | elapsed:   10.2s
[Parallel(n_jobs=93)]: Done 1614 tasks      | elapsed:   13.1s
[Parallel(n_jobs=93)]: Done 2264 tasks      | elapsed:   16.4s
[Parallel(n_jobs=93)]: Done 3014 tasks      | elapsed:   20.3s
[Parallel(n_jobs=93)]: Done 3864 tasks      | elapsed:   24.9s
[Parallel(n_jobs=93)]: Done 4814 tasks      | elapsed:   29.8s
[Parallel(n_jobs=93)]: Done 5864 tasks      | elapsed:   35.3s
[Parallel(n_jobs=93)]: Done 7014 tasks      | elapsed:   41.6s
[Parallel(n_jobs=93)]: Done 8264 tasks      | elapsed:   48.0s
[Parallel(n_jobs=93)]: Done 9614 tasks      | elapsed:   54.9s
[Parallel(n_jobs=93)]: Done 10773 out of 10773 | elapsed:  1.1min finished


In [29]:
# Peek at the first two rows of the cm eQTL table.
cbc_eqtls.head(2)

Unnamed: 0,ct,rsid_gene,pvalue,beta,gene,rsid
284295,cm,10:100122640_HPS1,0.000773,-0.06658,HPS1,10:100122640
284296,cm,10:100156113_HPS1,0.000105,-0.072121,HPS1,10:100156113


In [30]:
# (rows, columns) of the table loaded from cm_cg.genos.
pos.shape

(4078216, 98)

In [6]:
# Look up the row(s) of `pos` whose CHROM:POS equals 10:100156113.
pos.loc[pos['CHROM:POS'].eq('10:100156113')]

Unnamed: 0,CHROM:POS,HC-519,1240_1240,HC-022,1771_1771,1472_1472,HC-573,1294_1294,1754_1754,HC-571,...,1492_1492,1452_1452,1046_1046,HC-551,HC-574,1045_1045,1248_1248,HC-014,HC-011,HC-566
2570210,10:100156113,1,0,1,0,0,1,0,1,0,...,0,1,0,0,1,1,0,0,2,0


In [5]:
# First five rows of the table loaded from cm_cg.genos.
pos.head(5)

Unnamed: 0,CHROM:POS,HC-519,1240_1240,HC-022,1771_1771,1472_1472,HC-573,1294_1294,1754_1754,HC-571,...,1492_1492,1452_1452,1046_1046,HC-551,HC-574,1045_1045,1248_1248,HC-014,HC-011,HC-566
0,1:706368,1,0,1,0,1,1,1,0,0,...,0,1,1,1,0,1,1,1,1,0
1,1:713977,0,0,1,0,1,0,0,0,1,...,1,0,0,1,0,0,0,0,0,0
2,1:714439,0,0,1,0,1,0,0,0,1,...,1,0,0,1,0,0,0,0,0,0
3,1:723891,2,1,1,2,1,2,2,2,0,...,0,2,2,1,2,2,2,2,2,2
4,1:727655,0,0,1,0,1,0,0,0,1,...,2,0,0,1,0,0,0,0,0,0


In [12]:
# Allocates a 25000 x 25000 array of ones; at numpy's default float64 that is
# 25000 * 25000 * 8 bytes = 5 GB of memory.
# NOTE(review): only the shape check below uses `mat` in the visible cells — looks
# like a memory scratch test; confirm whether it should stay in the notebook.
mat = np.ones((25000, 25000))

In [None]:
# Shape of the array of ones allocated above.
mat.shape