## Introduction

This code integrates into the existing Open Targets pipeline. It calculates two things: firstly, the pairwise LD between a list of SNP ids, calculated from indexed 1000 Genomes BCF files; secondly, the pairwise LD within a window around a specified SNP. This module currently takes approximately 15 seconds in the current pipeline, but by using $\texttt{tabix}$ to regionally index the files, the necessary SNPs can be extracted much more quickly with $\texttt{bcftools}$. Both functions now complete in a wall time of $<0.5$s. I use the ENSEMBL API to look up SNP regions, and then $\texttt{tabix}$ and $\texttt{bcftools}$ to extract the necessary SNPs. Finally, I use $\texttt{plink2}$ to efficiently calculate linkage disequilibrium.

In [23]:
import subprocess
import json
import requests
import pdb
import re

In [24]:
def get_region(SNP_ids):
    """
    Given a list of SNP ids, return the smallest region that encompasses them all.

    The SNP locations are fetched with a single POST request to the ENSEMBL REST API.
    Advised only for small SNP set sizes < 10.

    Parameters:
        SNP_ids: list of SNP id strings (e.g. ['rs74509095']).

    Returns:
        A string 'chrom:start-end', where start is the smallest start position and
        end is the largest end position over the first mapping of every returned SNP.

    Raises:
        requests.exceptions.HTTPError: if the ENSEMBL request fails.
        ValueError: if no SNP location is returned, or if the SNPs map to more
            than one chromosome (no single region can contain them).
    """

    ### Extract list of SNP locations from ENSEMBL REST API
    SNP_json = json.dumps({"ids": SNP_ids})
    server = "http://rest.ensembl.org"
    ext = "/variation/homo_sapiens"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    r = requests.post(server + ext, headers=headers, data=SNP_json)

    if not r.ok:
        print("Regions of one of more SNPs could not be retieved")
        # r.ok is False exactly when raise_for_status() raises, so nothing
        # after this call is reached on the failure path.
        r.raise_for_status()

    decoded = r.json()
    if not decoded:
        raise ValueError("No locations were returned for the requested SNPs")

    ### Parse every location ('chrom:start-end') into numbers. Comparing the raw
    ### strings would order positions lexicographically ('1:999' after '1:7637119').
    chromosomes = set()
    starts = []
    ends = []
    for record in decoded.values():
        chromosome, span = record['mappings'][0]['location'].rsplit(':', 1)
        start, end = span.split('-')
        chromosomes.add(chromosome)
        starts.append(int(start))
        ends.append(int(end))

    if len(chromosomes) != 1:
        raise ValueError(
            "SNPs map to more than one chromosome: {}".format(sorted(chromosomes)))

    ### Return the union of all regions.
    return '{}:{}-{}'.format(chromosomes.pop(), min(starts), max(ends))

In [41]:
def calculate_LD_window(SNP_id, window_size,db=0):
    """
    Given a SNP id, calculate the pairwise LD between all SNPs within window_size base pairs.

    The SNP is located through get_region, the surrounding window is extracted from
    the chromosome's BCF with bcftools into 'region.vcf', and plink2 writes the square
    LD matrix to 'LDwindow.ld'. All intermediate files live in the current working
    directory under fixed names.

    Parameters:
        SNP_id: id of the SNP at the centre of the window.
        window_size: total window width in base pairs; half is added on each side.
        db: pass 1 to keep the intermediate files for debugging; any other value
            removes them.

    Returns:
        (SNPs_order, LD_array): the ID column of every record in the extracted VCF,
        in file order, and the rows of the plink2 matrix as lists of strings.

    Raises:
        subprocess.CalledProcessError: if bcftools or plink2 exits with a non-zero status.
    """

    ### Get the SNP location from ENSEMBL
    loc = get_region([SNP_id])

    ### Define the necessary region.
    chromosome, span = loc.split(':')
    start, end = span.split('-')
    # Floor division keeps the coordinates integral; '/' would put '.0' into the
    # region string under true division.
    half_window = int(window_size) // 2
    # Positions are 1-based, so never let the window start before 1.
    from_pos = max(1, int(start) - half_window)
    to_pos = int(end) + half_window
    region = '{}:{}-{}'.format(chromosome, from_pos, to_pos)

    ### Extract this region out from the 1000 genomes BCF
    bcf_path = "../data/processed/CEPH.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.nodup.bcf.gz".format(chromosome)
    # check_call: a failed extraction must not fall through to stale files.
    subprocess.check_call(
        ["bcftools", "view", "-r", region, bcf_path, "-O", "v", "-o", "region.vcf"])

    ### Find the order of SNPs in the VCF
    # Use the ID column (third field) of each record, one label per record, so the
    # labels line up with the matrix rows even for variants without an rs id.
    with open('region.vcf', 'r') as region_file:
        SNPs_order = [line.split('\t', 3)[2] for line in region_file
                      if line.strip() and not line.startswith('#')]

    ### Calculate the pairwise LD using plink2
    subprocess.check_call(
        ["plink2", "--vcf", "region.vcf", "--r", "square", "--out", "LDwindow"])

    ### Remove intermediate region VCF file
    if db != 1:
        subprocess.call(['rm', '-f', 'region.vcf'])

    with open('LDwindow.ld', 'r') as LD_file:
        LD_array = [row.split('\t') for row in LD_file.read().splitlines()]

    ### Remove intermediate LD file
    if db != 1:
        # -f: some of these files may not exist, which is not an error here.
        subprocess.call(['rm', '-f', 'LDwindow.ld', 'LDwindow.log', 'LDwindow.nosex', 'out.log'])

    return SNPs_order, LD_array



In [40]:
%%time
res = calculate_LD_window('rs74509095', 10000)

CPU times: user 11.1 ms, sys: 14.8 ms, total: 26 ms
Wall time: 172 ms


In [42]:
def calculate_pairwise_LD(SNPs_filepath=None,SNP_ids=None, region=None,db=0):
    """
    Calculate the pairwise LD between a set of SNPs.

    For large numbers of SNPs, best to specify SNP region with chrom:from-to, e.g. 1:7654947-8155562
    For small numbers (<10), regions are extracted from ENSEMBL REST API.
    SNPs can be inputted in a list or from a file with one SNP id per line.

    Parameters:
        SNPs_filepath: path to a file with one SNP id per line. Takes precedence
            over SNP_ids when both are given.
        SNP_ids: list of SNP id strings, used when no file is given.
        region: 'chrom:from-to' region containing all the SNPs; looked up with
            get_region when omitted.
        db: pass 1 to keep the intermediate files for debugging (this also
            suppresses printing the region); any other value removes them.

    Returns:
        (SNPs_order, LD_array): the ID column of every record kept by vcftools,
        in file order, and the rows of the plink2 matrix as lists of strings.

    Raises:
        AssertionError: if neither SNPs_filepath nor SNP_ids is provided.
        subprocess.CalledProcessError: if bcftools, vcftools or plink2 exits with
            a non-zero status.
    """

    assert SNPs_filepath or SNP_ids, "SNPs must be inputted either from a file or a list"

    ### If a SNP file is provided, use it. Otherwise continue with the provided SNP ids.
    snps_list_path = SNPs_filepath
    wrote_snps_list = False
    if SNPs_filepath:
        with open(SNPs_filepath, 'r') as SNPs_file:
            SNP_ids = SNPs_file.read().splitlines()
    else:
        # vcftools --snps needs a file, so materialise the list; otherwise the
        # command would be run with the literal path 'None'.
        snps_list_path = 'snp_ids.txt'
        with open(snps_list_path, 'w') as SNPs_file:
            SNPs_file.write('\n'.join(SNP_ids) + '\n')
        wrote_snps_list = True

    ### If a region is not specified, extract it using ENSEMBL REST API. If large amount of SNPs, manually specify this region.
    if not region:
        region = get_region(SNP_ids)

    if db != 1:
        print(region)

    ### Extract the required region from the VCF
    chromosome = region.split(':')[0]
    bcf_path = "../data/processed/CEPH.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.nodup.bcf.gz".format(chromosome)
    region_file = "region.vcf.gz"
    # check_call: a failed extraction must not fall through to stale files.
    subprocess.check_call(
        ["bcftools", "view", "-r", region, bcf_path, "-O", "z", "-o", region_file])

    ### Extract the list of SNP ids from this region
    # Stream the vcftools output straight into snps.vcf; this avoids holding the
    # VCF in memory and avoids writing process output through a text-mode handle.
    with open('snps.vcf', 'w') as snps_vcf:
        subprocess.check_call(
            ["vcftools", "--gzvcf", region_file, "--snps", snps_list_path,
             "--recode", "--stdout"],
            stdout=snps_vcf)

    ### Remove intermediate region VCF file
    if db != 1:
        subprocess.call(['rm', '-f', region_file])
        if wrote_snps_list:
            subprocess.call(['rm', '-f', snps_list_path])

    ### Extract out the order of SNPs
    # Use the ID column (third field) of each record, one label per record, so the
    # labels line up with the matrix rows.
    with open('snps.vcf', 'r') as snps_vcf:
        SNPs_order = [line.split('\t', 3)[2] for line in snps_vcf
                      if line.strip() and not line.startswith('#')]

    ### Use plink2 to calculate pairwise LD between these SNPs.
    subprocess.check_call(
        ["plink2", "--vcf", "snps.vcf", "--r", "square", "--out", "LD"])

    ### Remove intermediate SNPs VCF file
    if db != 1:
        subprocess.call(['rm', '-f', 'snps.vcf'])

    ### Read from the generated results file and output an array.
    with open('LD.ld', 'r') as LD_file:
        LD_array = [row.split('\t') for row in LD_file.read().splitlines()]

    ### Remove intermediate LD
    if db != 1:
        # -f: some of these files may not exist, which is not an error here.
        subprocess.call(['rm', '-f', 'LD.ld', 'LD.log', 'LD.nosex', 'out.log'])

    return (SNPs_order, LD_array)
    


In [43]:
time calculate_pairwise_LD("../data/raw/smallrsIDs.txt", db=0)

1:7637119-7968778
CPU times: user 4.25 ms, sys: 13 ms, total: 17.2 ms
Wall time: 431 ms


(['rs6661496',
  'rs79544751',
  'rs148043253',
  'rs74509095',
  'rs4908705',
  'rs4908708'],
 [['1', '1', '-0.0256579', '-0.0413742', '-0.0729325', '-0.0729325'],
  ['1', '1', '-0.0256579', '-0.0413742', '-0.0729325', '-0.0729325'],
  ['-0.0256579', '-0.0256579', '1', '0.325107', '0.248882', '0.248882'],
  ['-0.0413742', '-0.0413742', '0.325107', '1', '-0.0778898', '-0.0778898'],
  ['-0.0729325', '-0.0729325', '0.248882', '-0.0778898', '1', '1'],
  ['-0.0729325', '-0.0729325', '0.248882', '-0.0778898', '1', '1']])