# Introduction

Preparing for downstream notebook by reading in genotype information and matrix eQTL results, and adjusting for plotting figures.

# Setup

In [1]:
import scanpy as sc
import anndata
from anndata.utils import make_index_unique
import pandas as pd
import seaborn as sns
from IPython.display import display 
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
import matplotlib as mpl
import pickle as pkl
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from multiprocessing import cpu_count
import itertools as it
import subprocess
from glob import glob
import gzip
import sys
import os
import pybedtools as pybed
import re
import warnings
from nero import Hector as hct
from nero import Nero as nr

In [2]:
mountpoint = '/data/clue/'
prefix_prod = mountpoint + 'prod/'
prefix_pks = mountpoint + 'amo/atac/peaks/'
prefix_eqtl = mountpoint + 'prod/eqtl/'

# VCF

In [None]:
vcf = hct.loadVCF(prefix_eqtl + 'mateqtl/vcfs/all_inds.vcf.gz')

In [110]:
vcf.obs.head()

Unnamed: 0_level_0,AF,MAF,R2,ER2,chr,pos,ref,alt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1:693731:A:G,0.12943,0.12943,0.69831,,1,693731.0,A,G
1:706368:A:G,0.43793,0.43793,0.38773,,1,706368.0,A,G
1:729679:C:G,0.73155,0.26845,0.60783,,1,729679.0,C,G
1:731718:T:C,0.13954,0.13954,0.76517,,1,731718.0,T,C
1:734349:T:C,0.13897,0.13897,0.76455,,1,734349.0,T,C


Add `chr` to the beginning of the chromosomes and IDs so that they match the liftover chain file we have.

In [111]:
hct.adjust_chr(vcf, add='chr')

In [112]:
vcf.obs.head()

Unnamed: 0_level_0,AF,MAF,R2,ER2,chr,pos,ref,alt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chr1:693731:A:G,0.12943,0.12943,0.69831,,chr1,693731.0,A,G
chr1:706368:A:G,0.43793,0.43793,0.38773,,chr1,706368.0,A,G
chr1:729679:C:G,0.73155,0.26845,0.60783,,chr1,729679.0,C,G
chr1:731718:T:C,0.13954,0.13954,0.76517,,chr1,731718.0,T,C
chr1:734349:T:C,0.13897,0.13897,0.76455,,chr1,734349.0,T,C


## Perform Liftover

Perform the liftover, which uses LiftoverBed as backend.

In [113]:
hct.liftover(vcf, genomes=('hg19', 'hg38'), 
         chain_abs_path='/data/codec/pilot.amo.oct19/atac/beds/liftover/hg19ToHg38.over.chain',
         output_dir='/data/clue/prod/eqtl/tmp'
        )

In [114]:
hct.adjust_chr(vcf, replace=({'hg19': ('chr', ''), 'hg38': ('chr', '')}))

In [115]:
vcf.obs.head()

Unnamed: 0_level_0,hg38_ID,AF,MAF,R2,ER2,hg19_chr,hg19_pos,ref,alt,hg38_chr,hg38_pos
hg19_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1:693731:A:G,1:758351:A:G,0.12943,0.12943,0.69831,,1,693731.0,A,G,1,758351.0
1:706368:A:G,1:770988:A:G,0.43793,0.43793,0.38773,,1,706368.0,A,G,1,770988.0
1:729679:C:G,1:794299:C:G,0.73155,0.26845,0.60783,,1,729679.0,C,G,1,794299.0
1:731718:T:C,1:796338:T:C,0.13954,0.13954,0.76517,,1,731718.0,T,C,1,796338.0
1:734349:T:C,1:798969:T:C,0.13897,0.13897,0.76455,,1,734349.0,T,C,1,798969.0


## Subset to Individuals, Add Demographic Info

In [8]:
new_var_names = ['ASB' + i[-4:] if i.startswith('AS') else i for i in vcf.var_names]
var_name_dict = dict(zip(new_var_names, vcf.var_names))

In [9]:
inv_var_name_dict = dict([(v, k) for k, v in var_name_dict.items()])
vcf.var_names = new_var_names

In [10]:
vcf.var['original_vcf_names'] = vcf.var.index.map(var_name_dict)

In [11]:
demo_ids = pd.read_csv(prefix_prod + 'vals/demo_ids.csv', index_col=0)

Subset to common names:

In [12]:
common_names = np.intersect1d(vcf.var_names.values, demo_ids.index.values)

In [13]:
vcf = vcf[:, common_names].copy()
demo_ids = demo_ids.loc[common_names].copy()

In [14]:
vcf.var = vcf.var.join(demo_ids)

In [15]:
var_name_dict = dict(zip(vcf.var_names, vcf.var['original_vcf_names']))
inv_var_name_dict = dict([(v, k) for k, v in var_name_dict.items()])

## Note About IGTB1622

Donor ID IGTB1622 has missing information in the demographics file. This is because in the experiment this donor was recorded as IGTB1266 (according to the deconvolution), but their genotype information matched to donor IGTB1622 in the orthogonal genotyping validation. Because of the similarity in names, we hypothesize that it was a a clerical error that caused their ID to be recorded as IGTB1266 (or vice versa). However, we decide to not move forward with this donor. I will remove this donor here. Coincidentally (or not?), the donor also ends up being an outlier in the smartpca analysis just below, so their information is removed anyway.

In [16]:
vcf.var.loc['IGTB1622']

original_vcf_names        IGTB1622
free_id                          6
well_code             2-4-6-7-9-11
exp_id                          10
match_score                 1556.8
genetics_include              True
shipment                       NaN
YOB                            NaN
gender                         NaN
race                           NaN
ethnicity                      NaN
age                            NaN
height                         NaN
weight                         NaN
bmi                            NaN
viability_eq_ctrl             87.3
Name: IGTB1622, dtype: object

In [18]:
vcf = vcf[:, [i for i in vcf.var_names if i != 'IGTB1622']].copy()

## Create `hg19_ID_nuc_code`
Create another column in the `.obs` with ambiguous nucleotide IUPAC codes. This will be helpful for generating the haplotypes later under `mateqtl_df`.

In [27]:
nuc_code_dict = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 
                 'A/G': 'R', 'C/T': 'Y', 'G/T': 'K', 'A/C': 'M', 'C/G': 'S', 'A/T': 'W', 
                 'C/G/T': 'B', 'A/G/T': 'D', 'A/C/T': 'H', 'A/C/G': 'V', 
                 'A/C/G/T': 'N'}

In [28]:
vcf.obs['hg19_chr_pos'] = vcf.obs_names.str.split(':').str[:2].str.join(':')
vcf.obs.reset_index(drop=False, inplace=True)
vcf.obs.set_index('hg19_chr_pos', inplace=True)

In [29]:
refalt_dict = (vcf.obs['ref'].str[:] + ':' + vcf.obs['alt'].str[:]).rename('ref:alt').reset_index()

Sort lexographically so that alt alleles appear in the same order as they do in `nuc_code_dict`. It does not matter that chr10 will come before chr1, since I'll use the index to replace `'ref:alt'` in `refalt_dict`.

In [30]:
dupe_bool = refalt_dict['hg19_chr_pos'].duplicated(keep=False)
mapper = refalt_dict.set_index('hg19_chr_pos')[~dupe_bool.values].to_dict()['ref:alt']
dupes = refalt_dict[dupe_bool].sort_values(by=['hg19_chr_pos', 'ref:alt'])

In [31]:
for chr_pos in tqdm(dupes['hg19_chr_pos'].unique()):
    subdupe = dupes[dupes['hg19_chr_pos'] == chr_pos]
    ref = subdupe['ref:alt'].iloc[0].split(':')[0]
    alts = subdupe['ref:alt'].str.split(':').str[1]
    nuc_code = '/'.join(subdupe['ref:alt'].str.split(':').str[1])
    mapper[subdupe.iloc[0].loc['hg19_chr_pos']] = ref + ':' + nuc_code_dict[nuc_code]

  0%|          | 0/752 [00:00<?, ?it/s]

In [32]:
vcf.obs['hg19_nuc_code'] = vcf.obs.index.map(mapper).values
vcf.obs['hg19_nuc_code'] = vcf.obs['hg19_nuc_code'].astype('category')
vcf.obs['hg19_ID_nuc_code'] = vcf.obs_names + ':' + vcf.obs['hg19_nuc_code'].str[:]

In [33]:
vcf.obs['hg19_nuc_code'].cat.categories

Index(['A:B', 'A:C', 'A:G', 'A:K', 'A:S', 'A:T', 'A:Y', 'C:A', 'C:D', 'C:G',
       'C:K', 'C:R', 'C:T', 'C:W', 'G:A', 'G:C', 'G:H', 'G:M', 'G:T', 'G:W',
       'G:Y', 'T:A', 'T:C', 'T:G', 'T:M', 'T:R', 'T:S'],
      dtype='object')

In [5]:
vcf.obs.set_index('hg19_ID', inplace=True)

## Export

In [9]:
# vcf.write_h5ad(prefix_eqtl + 'mateqtl/vcfs/vcf.h5ad')
vcf = anndata.read_h5ad(prefix_eqtl + 'mateqtl/vcfs/vcf.h5ad')

# `mateqtl`

In [68]:
output_files = [fn for fn in os.listdir(prefix_eqtl + 'mateqtl/vals/') if fn.startswith('mateqtl_')]
mateqtl_df = {}
for output_file in tqdm(output_files):
    cond_cell_type = output_file.replace('mateqtl_', '').replace('_all_cis.csv', '').split('_', maxsplit=1)
    cond = cond_cell_type[0]
    cell_type = cond_cell_type[1]
    mateqtl_df[output_file] = pd.read_csv(prefix_eqtl + 'mateqtl/vals/' + output_file, index_col=0, sep=" ")
    mateqtl_df[output_file]["cond"] = cond
    mateqtl_df[output_file]["ct"] = cell_type
mateqtl_df = pd.concat(mateqtl_df.values())

  0%|          | 0/77 [00:00<?, ?it/s]

In [4]:
# calculate number of tests
snppos_file = prefix_eqtl + 'mateqtl/vals/all_chr_snpspos.txt'
gene_locs_file = prefix_eqtl + 'mateqtl/vals/gene_locs.txt'

snppos = pd.read_csv(snppos_file, header=0, sep="\t")
gene_locs = pd.read_csv(gene_locs_file, sep="\t", names=["gene", "chr", "start", "stop"])

In [21]:
max_jobs = cpu_count() - 2 # don't use all the processors just in case

In [51]:
def get_num_tests(sub_gene_locs):
    num_tests = 0
    for i, gene in sub_gene_locs.iterrows():
        num_tests += snppos[(snppos["chr"] == gene["chr"]) & (snppos["pos"] >= gene["start"] - 100000) & (snppos["pos"] <= gene["stop"] + 100000)].shape[0]
    return num_tests

In [52]:
idxs = np.int64(np.linspace(0, gene_locs.shape[0], num=max_jobs + 1)) # +1 because fencepost
slices = [slice(idxs[i-1], idxs[i]) for i in range(1, len(idxs))]
assert len(slices) == max_jobs 

In [53]:
par_outs = Parallel(n_jobs=max_jobs)(delayed(get_num_tests)(gene_locs[sl]) for sl in slices)

In [55]:
num_tests = sum(par_outs)

In [56]:
num_tests

3722461

In [7]:
FDR = 0.05
mateqtl_df = mateqtl_df.sort_values("pvalue")
mateqtl_df["BH"] = (np.arange(1, mateqtl_df.shape[0]+1)/(num_tests*len(output_files)))*FDR

In [8]:
mateqtl_df = mateqtl_df.set_index('snps')

In [9]:
mateqtl_df.shape

(30924710, 8)

## Liftover to hg38

In [6]:
original_len = mateqtl_df.shape[0]

In [7]:
all_mateqtl_bed = [['chr' +i.split(':')[0], i.split(':')[1], 
                    str(int(i.split(':')[1]) + 1),  # this plus 1 was necessary for samtools depth
                    i] for i in mateqtl_df.index.unique()]

In [8]:
all_mateqtl_bed_hg38 = hct.liftover_bed(all_mateqtl_bed, chain_abs_path=prefix_eqtl + 'hg19ToHg38.over.chain', 
                                    output_dir='/data/clue/prod/eqtl/tmp/', keep_files=False)

In [9]:
all_mateqtl_lifted_snps = dict([(i[3], i[0].replace('chr', '') + ':' + i[1]) for i in all_mateqtl_bed_hg38])

The following `.dropna()` will reduce the number of total eQTLs by some.

In [10]:
mateqtl_df['hg38_snps'] = mateqtl_df.index.map(all_mateqtl_lifted_snps)
mateqtl_df.dropna(inplace=True)
mateqtl_df['hg19_snps'] = mateqtl_df.index.to_series()

In [11]:
mateqtl_df.shape[0]/original_len

0.9997917522912907

Still retaining 99.98% of all reported eQTLs.

## Adjustments

In [18]:
mateqtl_df['abs(beta)'] = np.abs(mateqtl_df['beta'])
mateqtl_df['-log10FDR'] = -np.log10(mateqtl_df['FDR'])
mateqtl_df['-log10p'] = -np.log10(mateqtl_df['pvalue'])

In [19]:
mateqtl_df_split = mateqtl_df.index.str.split(':', expand=True).to_frame()
mateqtl_df_split.index = mateqtl_df.index
mateqtl_df_split.columns = ['hg19_snp_chrom', 'hg19_snp_pos']
mateqtl_df = pd.concat([mateqtl_df, mateqtl_df_split], axis=1)
mateqtl_df[['hg19_snp_chrom', 'hg19_snp_pos']] = mateqtl_df[['hg19_snp_chrom', 'hg19_snp_pos']].astype(int)

In [10]:
categorical_cols = ['gene', 'cond', 'ct', 'hg19_snp_chrom']

In [11]:
for cat_col in tqdm(categorical_cols):
    mateqtl_df[cat_col] = mateqtl_df[cat_col].astype('category')

  0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
mateqtl_df.sort_values('pvalue', inplace=True)

In [14]:
sig_thresh = np.argmax(mateqtl_df['pvalue'] > mateqtl_df['BH']) - 1
mateqtl_df_sig = mateqtl_df.iloc[:sig_thresh].copy()

In [18]:
sig_thresh

2.764614278341353e-05

In [15]:
mateqtl_df['BH_sig'] = mateqtl_df['pvalue'] < mateqtl_df['BH']

In [16]:
print(list(mateqtl_df_sig["gene"].unique()[:20]))

['UBE2D4', 'PCSK5', 'IPPK', 'MICU3', 'YES1', 'BCL2', 'TCF7L2', 'HIF1AN', 'AL157938.3', 'DET1', 'ZNF628', 'ZNF324B', 'ZNF525', 'CA11', 'SPACA6', 'ZNF845', 'LINC00926', 'ABHD14A', 'DUSP7', 'TUBA4A']


In [18]:
mateqtl_df_sig.shape

(158444, 16)

## Generate Haplotypes

In [39]:
vcf = vcf[~vcf.obs['hg19_ID_nuc_code'].duplicated()].copy()

In [40]:
vcf.obs.index.is_unique

True

In [26]:
mateqtl_df_haplo = list()
# could've parallelized this but too much trouble
for gene in tqdm(mateqtl_df['gene'].cat.categories):
    snps = mateqtl_df[mateqtl_df['gene'] == gene].index.unique()
    dupe_per_gene = vcf[snps].to_df().duplicated()
    keep_snps = dupe_per_gene.index[~dupe_per_gene]
    mateqtl_df_haplo.append(mateqtl_df[mateqtl_df['gene'] == gene].loc[keep_snps])
mateqtl_df_haplo = pd.concat(mateqtl_df_haplo, axis=0)

  0%|          | 0/10134 [00:00<?, ?it/s]

In [29]:
mateqtl_df_haplo.shape[0]/mateqtl_df.shape[0]

0.3368975042911521

In [33]:
mateqtl_df_haplo = mateqtl_df_haplo.sort_values(by='pvalue')

## Export

In [4]:
# mateqtl_df.to_pickle(prefix_eqtl + 'mateqtl/pkls/mateqtl_df.pkl')
mateqtl_df = pd.read_pickle(prefix_eqtl + 'mateqtl/pkls/mateqtl_df.pkl')

In [14]:
# mateqtl_df_sig.to_pickle(prefix_eqtl + 'mateqtl/pkls/mateqtl_df_sig.pkl')
mateqtl_df_sig = pd.read_pickle(prefix_eqtl + 'mateqtl/pkls/mateqtl_df_sig.pkl')

In [34]:
# mateqtl_df_haplo.to_pickle(prefix_eqtl + 'mateqtl/pkls/mateqtl_df_haplo.pkl')
mateqtl_df_haplo = pd.read_pickle(prefix_eqtl + 'mateqtl/pkls/mateqtl_df_haplo.pkl')

In [5]:
mateqtl = dict()
for cond in tqdm(mateqtl_df['cond'].cat.categories):
    mateqtl[cond] = dict()
    for ct in mateqtl_df['ct'].cat.categories:
        mateqtl[cond][ct] = mateqtl_df[(mateqtl_df['cond'] == cond) & (mateqtl_df['ct'] == ct)].copy()

  0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
# with open(prefix_eqtl + 'mateqtl/pkls/mateqtl.pkl', 'wb') as file:
#     pkl.dump(mateqtl, file, protocol=4)

with open(prefix_eqtl + 'mateqtl/pkls/mateqtl.pkl', 'rb') as file:
    mateqtl = pkl.load(file)

# `mdsr`

I want to create a new object that has the same data as `mateqtl_df` but restructured according to condition and cell type and indexed on eQTLs (chromosome + position + gene) where condition-celltypes with the same eQTL was observed will be in the same row. I'll call it `mdsr` (`mateqtl_df_sorted_reindex`).

In [13]:
mateqtl_df.head()

Unnamed: 0_level_0,gene,statistic,pvalue,FDR,beta,cond,ct,BH,hg38_snps,hg19_snps,abs(beta),-log10FDR,-log10p,hg19_snp_chrom,hg19_snp_pos,BH_sig
snps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7:43989516,UBE2D4,inf,2.225074e-308,2.733331e-304,0.68257,G,pDC,1.744412e-10,7:43949917,7:43989516,0.68257,303.563308,307.652656,7,43989516,True
9:78853916,PCSK5,inf,2.225074e-308,2.733331e-304,1.617403,G,pDC,3.610933e-08,9:76239000,9:78853916,1.617403,303.563308,307.652656,9,78853916,True
9:78854489,PCSK5,inf,2.225074e-308,2.733331e-304,1.617403,G,pDC,3.593489e-08,9:76239573,9:78854489,1.617403,303.563308,307.652656,9,78854489,True
9:78855343,PCSK5,inf,2.225074e-308,2.733331e-304,1.617403,G,pDC,3.576045e-08,9:76240427,9:78855343,1.617403,303.563308,307.652656,9,78855343,True
9:95276587,IPPK,inf,2.225074e-308,2.733331e-304,1.83298,G,pDC,3.558601e-08,9:92514305,9:95276587,1.83298,303.563308,307.652656,9,95276587,True


In [14]:
unique_eQTLs = (mateqtl_df['cond'].str[:] + ':' + 
                mateqtl_df['ct'].str[:] + ':' + 
                mateqtl_df.index.str[:] + ':' + 
                mateqtl_df['gene'].str[:]).reset_index(drop=True)

In [15]:
repeated = dict()
for i in tqdm(np.argwhere(unique_eQTLs.duplicated(keep=False).values).flatten()):
    eQTL = unique_eQTLs.iloc[i]
    if unique_eQTLs.iloc[i] in repeated.keys():
        unique_eQTLs.iloc[i] = unique_eQTLs.iloc[i] + '-' + str(repeated[eQTL])
        repeated[eQTL] += 1
    else:
        repeated[eQTL] = 0 

  0%|          | 0/4396 [00:00<?, ?it/s]

Grab unique eQTLs:

Okay, on a per condition-celltype basis, there are sometimes repeated locus-gene combinations. The only way the same locus and gene combination would be reported twice is in the case of multiallelic sites. Although I've confirmed that at most there are 3 per duplicated eQTL (which would make sense for multi-allelic sites), the alleles were not reported in the matrix eQTL results and I haven't checked the input VCF to confirm. In any case, I need to find those and label them differently with a suffix.

In [7]:
mateqtl_df['hg19_snps:gene_unique'] = unique_eQTLs.str.split(':').str[2:].str.join(':').values

In [9]:
cols_to_keep = ['-log10p']
cols_to_pivot = ['cond', 'ct']

In [10]:
mds = mateqtl_df.sort_values(by=['hg19_snp_chrom', 'hg19_snp_pos'])[cols_to_keep + cols_to_pivot + ['hg19_snps:gene_unique']].reset_index(drop=True) # mds = mateqtl_df_sorted
# get TypeError: cannot insert an item into a CategoricalIndex that is not already an existing category
# at the pivot() command further below if cond and ct are categorical type, so converting back to str
for col in ['cond', 'ct']:
    mds[col] = mds[col].astype(str)

Intermediate export if not enough free memory:

In [11]:
# # mds.to_pickle(prefix_eqtl + 'mateqtl/pkls/mds.pkl')
# mds = pd.read_pickle(prefix_eqtl + 'mateqtl/pkls/mds.pkl')

Reduce memory usage before the next cell if necessary:

In [12]:
# mds['-log10p'] = mds['-log10p'].astype(np.float32)

Create `mdsr`:

In [13]:
mdsr = mds.set_index(['hg19_snps:gene_unique'] + cols_to_pivot, append=True) # mdsr = mateqtl_df_sorted_reindexed

In [None]:
mdsr = mdsr.unstack(level='ct', fill_value=0)
mdsr = mdsr.unstack(level='cond', fill_value=0)

Further reduce memory if not enough:

In [None]:
mdsr = mdsr.astype(np.float16)

In [None]:
mdsr = mdsr.groupby('hg19_snps:gene_unique').sum()

Intermediate export if not enough free memory:

In [None]:
# # mdsr.to_pickle(prefix_eqtl + 'mateqtl/pkls/mdsr.pkl')
# mdsr = pd.read_pickle(prefix_eqtl + 'mateqtl/pkls/mdsr.pkl')

In [None]:
df_join = mdsr.index.str.split(':', expand=True).to_frame().set_index(mdsr.index)
df_join[[0, 1]] = df_join[[0, 1]].astype(int)

In [None]:
df_join.columns = ['snp_chrom', 'snp_pos', 'gene']

In [None]:
for r in tqdm(repeated):
    r_idx = ":".join(r.split(":")[2:])
    for i in it.count():
        try:
            gene_name = df_join.loc[r_idx + '-' + str(i), 'gene']
            df_join.loc[r_idx + '-' + str(i), 'gene'] = '-'.join(gene_name.split('-')[:-1])
        except KeyError:
            break

In [None]:
df_join.columns = pd.MultiIndex.from_product([df_join.columns, [''], ['']])

In [None]:
mdsr = mdsr.join(df_join)

In [None]:
mdsr = mdsr.sort_values(by=['snp_chrom', 'snp_pos'])

## Export

In [16]:
# unique_eQTLs.to_pickle(prefix_eqtl + 'mateqtl/pkls/unique_eQTLs.pkl')
unique_eQTLs = pd.read_pickle(prefix_eqtl + 'mateqtl/pkls/unique_eQTLs.pkl')

In [13]:
# mdsr.to_pickle(prefix_eqtl + 'mateqtl/pkls/mdsr.pkl')
mdsr = pd.read_pickle(prefix_eqtl + 'mateqtl/pkls/mdsr.pkl')

# Pseudobulks

In [4]:
pbulk_dir = prefix_eqtl + 'mateqtl/pbulks/'
pbulk_fnames = ! ls $pbulk_dir

In [5]:
pbulk_fnames_split = [i.split('.')[0] for i in pbulk_fnames if i.endswith('.expr')]
pbulk_combos = [tuple(i.split('_', 1)) for i in pbulk_fnames_split]

In [6]:
pbulks = dict()
for cond in np.unique([i[0] for i in pbulk_combos]):
    pbulks[cond] = dict()

In [7]:
for comb in tqdm(pbulk_combos):
    pbulks[comb[0]][comb[1]] = pd.read_csv(pbulk_dir + '_'.join(comb) + '.expr', sep='\t', index_col=0)

  0%|          | 0/78 [00:00<?, ?it/s]

## Adjustments

Map names:

In [8]:
exp_id_subject_mapper = dict(zip(vcf.var['exp_id'], vcf.var_names))

Rename columns in `pbulks`, subset to common_names by removing `nan`s.

In [9]:
for cond in tqdm(pbulks):
    for ct in pbulks[cond]:
        pbulks[cond][ct].columns = pbulks[cond][ct].columns.astype(int)
        pbulks[cond][ct].columns = pbulks[cond][ct].columns.map(exp_id_subject_mapper)
        pbulks[cond][ct] = pbulks[cond][ct].loc[:, ~pbulks[cond][ct].columns.isna()].copy()

  0%|          | 0/6 [00:00<?, ?it/s]

## Export

In [10]:
# with open(prefix_eqtl + 'mateqtl/pkls/pbulks.pkl', 'wb') as file:
#     pkl.dump(pbulks, file, protocol=4)

with open(prefix_eqtl + 'mateqtl/pkls/pbulks.pkl', 'rb') as file:
    pbulks = pkl.load(file)