# Simple eQTL analysis

### Development

In [1]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import itertools
from pybedtools import BedTool
import statsmodels.formula.api as smf
import statsmodels.api as sm

import os
import pickle as pkl
%matplotlib inline

In [2]:
pd.set_option('max_columns', None)
pd.set_option('display.max_rows', 500)

In [3]:
import sys
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.8-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq/')
import encode
import memento

In [4]:
import matplotlib.pylab as pylab
params = {'legend.fontsize': 'medium',
         'axes.labelsize': 'medium',
         'axes.titlesize':'medium',
         'figure.titlesize':'medium',
         'xtick.labelsize':'medium',
         'ytick.labelsize':'medium',
         'axes.labelsize': 'large'}
pylab.rcParams.update(params)

In [5]:
data_path = '/data_volume/memento/hbec/'

In [6]:
adata = sc.read(data_path + 'HBEC_type_I_filtered_counts_deep.h5ad')

In [7]:
converter = {'basal/club':'BC', 'basal':'B', 'ciliated':'C', 'goblet':'G', 'ionocyte/tuft':'IT', 'neuroendo':'N', 'club':'club'}

In [8]:
adata.obs['ct'] = adata.obs['cell_type'].apply(lambda x: converter[x])
# adata_processed.obs['ct'] = adata_processed.obs['cell_type'].apply(lambda x: converter[x])

In [9]:
def assign_q(batch):
    
    if batch == 0:
        return 0.387*0.25
    elif batch == 1:
        return 0.392*0.25
    elif batch == 2:
        return 0.436*0.25
    else:
        return 0.417*0.25

In [10]:
adata.obs['q'] = adata.obs['batch'].apply(assign_q)

In [11]:
memento.setup_memento(adata, q_column='q', trim_percent=0.1)

KeyboardInterrupt: 

In [None]:
adata_stim = adata.copy()[
    adata.obs.ct.isin(['C']) & \
    adata.obs.stim.isin(['beta', 'control']), :].copy()
stim_converter={'control':0, 'beta':1}
adata_stim.obs['stim_indicator'] = adata_stim.obs['stim'].apply(lambda x: stim_converter[x])

memento.create_groups(adata_stim, label_columns=['stim_indicator', 'donor'])
memento.compute_1d_moments(adata_stim, min_perc_group=.5, gene_list=['ISG15', 'HES4'])
print(adata_stim.shape)



In [13]:
tx = pd.DataFrame([(1, 0), (0, 1),(1, 0), (0, 1)], columns=['stim', 'ctrl'])
# tx = pd.DataFrame(np.random.random((4, 1000)))
cov = pd.DataFrame([(1), (1), (1), (1)], columns=['intercept'])
# cov = pd.DataFrame(np.random.random((2, 1000)))


In [14]:
adata_stim.uns['memento']['groups']

['sg^1^d2513', 'sg^0^d2513', 'sg^1^d2614', 'sg^0^d2614']

In [15]:
%%time

# with a thousand treatment variables 

memento.ht_1d_moments(
    adata_stim, 
    covariate=cov,
    treatment=tx,
    treatment_for_gene={'ISG15':['stim'], 'HES4':['stim', 'ctrl']},
    num_boot=10000, 
    verbose=1,
    num_cpus=1,
    resampling='bootstrap',
    approx=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: user 11.6 s, sys: 1.18 s, total: 12.8 s
Wall time: 3.11 s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s finished


In [16]:
adata_stim.var

Unnamed: 0,gene_ids,feature_types,genome,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts
HES4,ENSG00000188290,Gene Expression,GRCh38,False,67040,5.536824,16.87951,446567.0
ISG15,ENSG00000187608,Gene Expression,GRCh38,False,78984,52.471233,2.070573,4232015.0


In [17]:
adata_stim.uns['memento']['1d_ht']

{'treatment_for_gene': {'ISG15': ['stim'], 'HES4': ['stim', 'ctrl']},
 'treatment':    stim  ctrl
 0     1     0
 1     0     1
 2     1     0
 3     0     1,
 'covariate':    intercept
 0          1
 1          1
 2          1
 3          1,
 'mean_coef': array([ 0.81339619, -0.81339619,  2.75586696]),
 'mean_se': array([0.04206331, 0.04206331, 0.06892971]),
 'mean_asl': array([2.69178607e-83, 2.69178607e-83, 0.00000000e+00]),
 'var_coef': array([ 0.28314822, -0.28314822, -2.12782651]),
 'var_se': array([0.12275353, 0.12275353, 0.16763688]),
 'var_asl': array([2.25209618e-02, 2.25209618e-02, 6.05261295e-35])}

In [18]:
memento.get_1d_ht_result(adata_stim)

Unnamed: 0,gene,tx,de_coef,de_se,de_pval,dv_coef,dv_se,dv_pval
0,HES4,stim,0.813396,0.042063,2.691786e-83,0.283148,0.122754,0.02252096
1,HES4,ctrl,-0.813396,0.042063,2.691786e-83,-0.283148,0.122754,0.02252096
0,ISG15,stim,2.755867,0.06893,0.0,-2.127827,0.167637,6.052613e-35


In [47]:
%%time

# with a single treatment variable

memento.ht_1d_moments(
    adata_stim, 
    covariate=cov,
    treatment=tx, 
    num_boot=10000, 
    verbose=1,
    num_cpus=93,
    resampling='bootstrap',
    approx=True)

[Parallel(n_jobs=93)]: Using backend LokyBackend with 93 concurrent workers.
[Parallel(n_jobs=93)]: Done  14 tasks      | elapsed:    4.6s
[Parallel(n_jobs=93)]: Done 264 tasks      | elapsed:    6.0s
[Parallel(n_jobs=93)]: Done 614 tasks      | elapsed:    8.1s
[Parallel(n_jobs=93)]: Done 1064 tasks      | elapsed:   10.5s
[Parallel(n_jobs=93)]: Done 1614 tasks      | elapsed:   13.5s
[Parallel(n_jobs=93)]: Done 2264 tasks      | elapsed:   17.1s
[Parallel(n_jobs=93)]: Done 3014 tasks      | elapsed:   21.1s
[Parallel(n_jobs=93)]: Done 3864 tasks      | elapsed:   25.7s
[Parallel(n_jobs=93)]: Done 4814 tasks      | elapsed:   30.8s
[Parallel(n_jobs=93)]: Done 5864 tasks      | elapsed:   36.7s
[Parallel(n_jobs=93)]: Done 7014 tasks      | elapsed:   43.1s
[Parallel(n_jobs=93)]: Done 8264 tasks      | elapsed:   49.7s
[Parallel(n_jobs=93)]: Done 9614 tasks      | elapsed:   57.1s
[Parallel(n_jobs=93)]: Done 10773 out of 10773 | elapsed:  1.1min finished


CPU times: user 30.8 s, sys: 4.15 s, total: 35 s
Wall time: 1min 9s


In [17]:
a = np.ones(2)*np.nan

In [18]:
a[0:2] = np.array([1, 2])

In [19]:
a

array([1., 2.])

In [18]:
def corr2_coeff(A, B, sample_weight):
    # Rowwise mean of input arrays & subtract from input arrays themeselves
    A_mA = A - np.average(A, axis=0, weights=sample_weight)
    B_mB = B - np.average(B, axis=0, weights=sample_weight)
    
    print(A_mA.shape)

    # Sum of squares across rows
    ssA = np.average(A_mA**2, axis=0, weights=sample_weight)

    # Finally get corr coeff
    return A_mA.T.dot(np.diag(sample_weight)).dot(B_mB)/sample_weight.sum() / ssA[:, None]

In [18]:
corr2_coeff(
    np.array([0.43951831, -2.30821457])[:, None],
    np.array([0.15995671, -0.84004329])[:, None],
    np.array([3881, 739])
)

(2, 1)


array([[0.3639364]])

In [33]:
adata_stim.uns['memento']['1d_ht']

{'treatment':    stim  ctrl
 0     1     0
 1     0     1,
 'covariate':    intercept
 0          1
 1          1,
 'mean_coef': array([4.91818148e-50, 4.91818148e-50]),
 'mean_se': array([4.91818148e-50, 4.91818148e-50]),
 'mean_asl': array([4.91818148e-50, 4.91818148e-50]),
 'var_coef': array([4.91818148e-50, 4.91818148e-50]),
 'var_se': array([4.91818148e-50, 4.91818148e-50]),
 'var_asl': array([4.91818148e-50, 4.91818148e-50])}

In [12]:
mat = np.ones((25000, 25000))

In [None]:
mat.shape

### Alternative coefficient calcuation

In [20]:
Y = np.array([1, 5])[:, None]
X = np.array([0, 1])[:, None]
cov = np.array([1, 1])[:, None]

In [21]:

weights1 = np.array([50, 10])
weights2 = np.ones(50)

In [22]:
from sklearn.linear_model import LinearRegression

### Using sklearn

In [26]:
reg = LinearRegression().fit(np.hstack([X, cov]), Y, sample_weight=weights1)
reg.coef_

array([[4., 0.]])

In [27]:
reg = LinearRegression().fit(np.hstack([X[:, [1]], cov]), Y, sample_weight=weights1)
reg.coef_

IndexError: index 1 is out of bounds for axis 1 with size 1

### Using orthogonalization and correlation

In [28]:
def corr2_coeff(A, B, sample_weight):
    # Rowwise mean of input arrays & subtract from input arrays themeselves
    A_mA = A - np.average(A, axis=0, weights=sample_weight)
    B_mB = B - np.average(B, axis=0, weights=sample_weight)
    
    print(A_mA.shape)

    # Sum of squares across rows
    ssA = np.average(A_mA**2, axis=0, weights=sample_weight)

    # Finally get corr coeff
    return A_mA.T.dot(np.diag(sample_weight)).dot(B_mB)/sample_weight.sum() / ssA[:, None]

In [30]:
Y_tilde = Y-LinearRegression().fit(cov, Y, weights1).predict(cov)
X_tilde = X-LinearRegression().fit(cov, X, weights1).predict(cov)

In [31]:
Y_tilde

array([[-0.66666667],
       [ 3.33333333]])

In [32]:
X_tilde

array([[-0.16666667],
       [ 0.83333333]])

In [34]:
corr2_coeff(X_tilde, Y_tilde, weights1)

(2, 1)


array([[4.]])

In [29]:
Y.mean(axis=0)

array([0.52119791, 0.50160106, 0.47426136])

In [27]:
LinearRegression().fit(  np.ones((50,1)),   Y  ).predict(np.ones((50,1)))

array([[0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.52119791, 0.50160106, 0.47426136],
       [0.