In [1]:
from helpers.getpaths import *
from helpers.ldlink import *

In [75]:
import os
import shutil
import subprocess

import pandas as pd


# We only want a subset of the eQTL Catalogue files; the studies were hand-picked
# to focus on immune and blood cell types.

paths = get_paths(root="/media/")

# Names of all credible-set files available locally.
all_files = os.listdir(paths['eqtl_cat'] + 'credible_sets')

# The study names come from the `Study` column of the first .csv file found in
# the eQTL Catalogue directory.
meta = [i for i in os.listdir(paths['eqtl_cat']) if i.split(".")[-1]=="csv"]
meta = paths['eqtl_cat'] + meta[0]
meta = pd.read_csv(meta).Study.tolist()

# Keep a file when its name contains any of the study names. any() records each
# file exactly once, even when the Study column repeats a name or when several
# study names match the same file (the nested loops appended one copy per match).
files = [
    paths['eqtl_cat'] + 'credible_sets/' + i
    for i in all_files
    if any(m in i for m in meta)
]
            

# Copy the selected files to a separate folder for further processing.

out_dir = "/media/cbio3/data/eQTL_DB/curated_crediblesets/"

# Create the destination up front; copying into a missing directory would fail.
os.makedirs(out_dir, exist_ok=True)

for path in files:
    name = path.split("/")[-1]
    out_path = out_dir + name
    # shutil.copy replaces the shell `cp` command that was assembled by string
    # concatenation: the path is passed verbatim (no shell parsing) and a failed
    # copy raises instead of being silently ignored.
    shutil.copy(path, out_path)

We would like to tabix-index these files. tabix requires its input to be sorted by chromosome and position and compressed with bgzip; unfortunately these files are not sorted. So I do the following:

    1) unzip all the files using gunzip
    2) sort each by chromosome and position
    3) compress all the files with bgzip
    4) run tabix on the bgzipped files

In [76]:
# all files have been gunzipped, now sort using pandas 

# Sort rank for chromosome labels: the strings '0'..'22' map to their integer
# value and 'X' ranks after them. Labels that are not listed here have no rank.
sort_dict = {str(number): number for number in range(23)}
sort_dict['X'] = 23

# Only the uncompressed .txt tables are sorted and later compressed. Without the
# extension filter, re-running this cell after the bgzip/tabix steps would also
# pick up the .gz outputs and any other files that live in out_dir.
files = [out_dir + i for i in os.listdir(out_dir) if i.split(".")[-1] == "txt"]


In [77]:
# Sort every file by chromosome rank and then by position, overwriting it in place.

for file in files:
    x = (
        pd.read_csv(file, sep='\t')
        # Chromosome labels are compared as strings so they match the sort_dict keys.
        .assign(chromosome=lambda frame: frame.chromosome.astype(str))
        .assign(int_chrom=lambda frame: frame.chromosome.map(sort_dict))
        .sort_values(by=['int_chrom', 'position'])
    )
    # The helper column int_chrom is written out together with the data.
    x.to_csv(file, sep='\t', index=False)


In [82]:
# Spot-check one rewritten file by previewing its first rows.

preview = pd.read_csv(files[1], sep='\t')
preview.head()

Unnamed: 0,molecular_trait_id,variant,chromosome,position,ref,alt,cs_id,cs_index,finemapped_region,pip,z,cs_min_r2,cs_avg_r2,cs_size,posterior_mean,posterior_sd,cs_log10bf,int_chrom
0,ENST00000304952,chr1_989148_C_A,1,989148,C,A,ENST00000304952_L1,L1,chr1:172-2000172,0.088759,-11.038267,0.794414,0.908261,10,-0.078922,0.254003,23.243713,1
1,ENST00000428771,chr1_989148_C_A,1,989148,C,A,ENST00000428771_L1,L1,chr1:172-2000172,0.224966,13.740977,0.995728,0.997834,3,0.276131,0.516782,40.848565,1
2,ENST00000304952,chr1_992967_GGGAGGGTCCATGTGTCCGTCATCTGA_G,1,992967,GGGAGGGTCCATGTGTCCGTCATCTGA,G,ENST00000304952_L1,L1,chr1:172-2000172,0.048185,-10.903157,0.794414,0.908261,10,-0.042205,0.188385,23.243713,1
3,ENST00000428771,chr1_992967_GGGAGGGTCCATGTGTCCGTCATCTGA_G,1,992967,GGGAGGGTCCATGTGTCCGTCATCTGA,G,ENST00000428771_L1,L1,chr1:172-2000172,0.248802,13.78157,0.995728,0.997834,3,0.302779,0.530263,40.848565,1
4,ENST00000304952,chr1_999842_C_A,1,999842,C,A,ENST00000304952_L1,L1,chr1:172-2000172,0.25461,-11.276897,0.794414,0.908261,10,-0.22441,0.386022,23.243713,1


In [85]:
# Compress the files with bgzip (not plain gzip) so that tabix can index them.
for file in files:
    # An argument list passes the path verbatim instead of through a shell string.
    result = subprocess.run(["bgzip", file])
    if result.returncode != 0:
        # os.system ignored failures silently; report them but keep going.
        print("bgzip failed (exit code " + str(result.returncode) + ") for " + file)
    
    

In [95]:
# Show what is currently in the curated credible-sets directory.
curated_dir = paths['eqtl_cat']+ '/curated_crediblesets'
os.listdir(curated_dir)

['Nedelec_2016_tx_macrophage_naive.purity_filtered.txt',
 'Schmiedel_2018_exon_monocyte_naive.purity_filtered.txt',
 'Schmiedel_2018_ge_Treg_naive.purity_filtered.txt',
 'Nedelec_2016_tx_macrophage_naive.purity_filtered.txt.gz',
 'Alasoo_2018_exon_macrophage_IFNg+Salmonella.purity_filtered.txt.gz',
 'Quach_2016_ge_monocyte_R848.purity_filtered.txt.gz',
 'Schmiedel_2018_ge_CD8_T-cell_naive.purity_filtered.txt.gz',
 'Nedelec_2016_tx_macrophage_Listeria.purity_filtered.txt.gz',
 'Alasoo_2018_ge_macrophage_IFNg+Salmonella.purity_filtered.txt.gz',
 'BLUEPRINT_SE_ge_neutrophil.purity_filtered.txt',
 'Nedelec_2016_txrev_macrophage_Listeria.purity_filtered.txt.gz',
 'Alasoo_2018_exon_macrophage_IFNg+Salmonella.purity_filtered.txt',
 'Alasoo_2018_tx_macrophage_naive.purity_filtered.txt',
 'BLUEPRINT_SE_txrev_neutrophil.purity_filtered.txt',
 'Bossini-Castillo_2019_exon_Treg_naive.purity_filtered.txt.gz',
 'GENCORD_ge_fibroblast.purity_filtered.txt.gz',
 'CEDAR_microarray_monocyte_CD14.purity_fi

In [91]:
# Now tabix-index all of the compressed files in the curated credible-sets directory.

files = [out_dir + i for i in os.listdir(out_dir) if i.split(".")[-1]=="gz"]

# -s 3 / -b 4 / -e 4: in the sorted tables the chromosome is column 3 and the
#   position is column 4, used as both start and end.
# -c m: lines starting with 'm' are treated as comments, which skips the header
#   line (it begins with "molecular_trait_id").
tabix_args = ["tabix", "-c", "m", "-s", "3", "-b", "4", "-e", "4"]

for file in files:
    # An argument list passes the path verbatim instead of through a shell string.
    result = subprocess.run(tabix_args + [file])
    if result.returncode != 0:
        # os.system ignored failures silently; report them but keep going.
        print("tabix failed (exit code " + str(result.returncode) + ") for " + file)



In [50]:
# Scratch step: coerce the chromosome labels of the frame in `x` to strings.
x['chromosome'] = x['chromosome'].astype(str)

In [54]:
# Build the output path "<out_dir>sorted_<file name>" for the first .txt entry in `files`.
txt_paths = [entry for entry in files if entry.rsplit(".", 1)[-1] == "txt"]
name = out_dir + "sorted_" + txt_paths[0].split("/")[-1]

In [57]:
# Number of paths currently held in `files`.
len(files)

154