# Discover expansions

Authors
- Egor Dolzhenko, PacBio, edolzhenko@pacificbiosciences.com
- Giulia Del Gobbo, CHEO, GDelGobbo@cheo.on.ca
- Madeline Couse, SickKids, madeline.couse@sickkids.ca
- Tom Mokveld, PacBio, tmokveld@pacificbiosciences.com
- Khi Pin Chua, PacBio, kpin@pacificbiosciences.com


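This notebook looks for tandem repeat alleles that are unusually long in case samples compared to controls. It (1) collects the allele sequences of every sample from the TRGT VCF files into a single table, (2) sorts the table so that all records of a repeat are adjacent, (3) estimates, for each repeat, a length cutoff by resampling the 95% quantile of the control allele lengths, and (4) reports every case allele whose length exceeds that cutoff.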

In [1]:
import os
import gzip
from collections import namedtuple
import itertools
import numpy as np
from pathlib import Path, PosixPath
import time
from typing import Any, Callable

In [68]:
# Directory with VCF files generated by TRGT.
# resolve(strict=True) raises FileNotFoundError right away if it is missing.
vcf_path = Path("input/vcfs/").resolve(strict=True)

# Ids of cases; all other samples are assumed to be controls.
# Sample ids are derived from the VCF file names (name without ".vcf.gz").
case_ids = ["HG00280", "HG00438", "HG00621"]

In [69]:
# Make sure the directories for scratch files and final output exist;
# directories that are already present are left untouched.
for dir_name in ("scratch", "output"):
    Path(dir_name).mkdir(exist_ok=True)

In [6]:
def skip_header(file_handle: gzip.GzipFile, prefix: bytes = b'#') -> None:
    """Advance ``file_handle`` past the leading lines that start with ``prefix``.

    On return the handle is positioned at the start of the first line that
    does not begin with ``prefix`` (or at the end of the file if every line
    does), so the next read returns that line in full.
    """
    while True:
        # Remember where the upcoming line starts so we can step back to it
        # once it turns out not to be a header line.
        line_start = file_handle.tell()
        if not file_handle.readline().startswith(prefix):
            break
    file_handle.seek(line_start)

def get_alleles(path: PosixPath):
    """Yield ``(trid, alleles)`` for each genotyped record of a TRGT VCF.

    Args:
        path: Path to a gzip-compressed VCF file. The genotype is read from
            the last column and INFO from the third-to-last column, so a
            single sample column is assumed.

    Yields:
        Tuples ``(trid, alleles)``. ``trid`` is the first ``;``-separated
        INFO entry with its ``TRID=`` prefix removed. ``alleles`` is a list
        of allele sequences selected from REF/ALT according to the genotype
        (two entries for diploid genotypes, one for haploid ones). Records
        whose genotype is ``.`` are skipped.

    Raises:
        AssertionError: If a record has a genotype other than ``0/0``,
            ``0/1``, ``1/2``, ``1/1``, ``1`` or ``0``.
    """
    trid_prefix = "TRID="

    # Maps a genotype to a function that picks the allele sequences out of
    # the REF and ALT columns. Passing ref/alt explicitly avoids relying on
    # late-bound closure variables.
    gt_actions = {
        "0/0": lambda ref, alt: [ref, ref],
        "0/1": lambda ref, alt: [ref, alt],
        "1/2": lambda ref, alt: alt.split(","),
        "1/1": lambda ref, alt: [alt, alt],
        "1": lambda ref, alt: [alt],
        "0": lambda ref, alt: [ref],
    }

    with gzip.open(path, 'r') as f_in:
        skip_header(f_in)
        for line in f_in:
            sl = line.decode("utf8").split()
            if not sl:
                # Tolerate blank lines (e.g. a trailing newline).
                continue

            gt = sl[-1].split(":")[0]
            if gt == '.':
                continue

            assert gt in gt_actions, f"Unknown gt: {gt}"

            ref, alt = sl[3], sl[4]

            # Remove the "TRID=" prefix explicitly. str.lstrip("TRID=") would
            # strip any leading run of the characters T, R, I, D and '=',
            # corrupting ids such as "DMPK" or "RFC1".
            trid = sl[-3].split(";")[0]
            if trid.startswith(trid_prefix):
                trid = trid[len(trid_prefix):]

            yield trid, gt_actions[gt](ref, alt)
            
            
# Collect the alleles of all samples into one gzip-compressed table with one
# line per (repeat, sample): "<trid> <sample> <allele>[,<allele>]".
vcf_suffix = ".vcf.gz"
with gzip.open("scratch/alleles_db.gz", "w", compresslevel=6) as f_out:
    for path in vcf_path.glob("*" + vcf_suffix):
        # Cut the suffix off by length. str.rstrip(".vcf.gz") would strip any
        # trailing run of the characters '.', 'v', 'c', 'f', 'g' and 'z' and
        # so truncate sample names that happen to end in one of them.
        sample = path.name[:-len(vcf_suffix)]
        for trid, alleles in get_alleles(path):
            allele_field = ",".join(alleles)
            f_out.write(f"{trid} {sample} {allele_field}\n".encode())

In [7]:
%%bash

# Sort the allele table by repeat id (column 1) so that all records of a
# repeat end up on adjacent lines, then replace the unsorted table with the
# sorted one. get_repeat_recs groups consecutive lines by repeat id.
zcat < scratch/alleles_db.gz | sort -k 1,1 | gzip > scratch/alleles_db.sorted.gz
mv scratch/alleles_db.sorted.gz scratch/alleles_db.gz
# Preview the first records of the sorted table.
zcat < scratch/alleles_db.gz | head

chr10_100000834_100000912_A HG00099 TTTAGAAAGTGACACCTGTTATGGCAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA,TTTAGAAAGTGACACCTGTTATGGCAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA
chr10_100000834_100000912_A HG00280 TTTAGAAAGTGACACCTGTTATGGCAAAAAAAAAAAAAAAAAAACAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA,TTTAGAAAGTGACACCTGTTATGGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA
chr10_100000834_100000912_A HG00323 TTTAGAAAGTGACACCTGTTATGGCAAAAAAAAAAAAAAAAAAACAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA,TTTAGAAAGTGACACCTGTTATGGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA
chr10_100000834_100000912_A HG00423 TTTAGAAAGTGACACCTGTTATGGCAAAAAAAAAAAAAAAAAAACAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA,TTTAGAAAGTGACACCTGTTATGGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA
chr10_100000834_100000912_A HG00438 TTTAGAAAGTGACACCTGTTATGGCAAAAACAAAAACAAAAAAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA,TTTAGAAAGTGACACCTGTTATGGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCAGCAGGGTACAGGAATGGGGAA
chr10_100000834_100000912_A 

In [70]:
RepeatRec = namedtuple("RepeatRec", "sample short_allele long_allele")

def get_repeat_recs(path):
    """Yield ``(trid, repeat_recs)`` for each repeat in the allele table.

    ``path`` is a gzip-compressed file with whitespace-separated lines of the
    form ``<trid> <sample> <allele>[,<allele>]``. Consecutive lines sharing
    the same ``trid`` form one group, so the records of a repeat must be
    adjacent in the file.

    For every group, ``repeat_recs`` is a list with one ``RepeatRec`` per
    line holding the sample name and the lengths of its shortest and longest
    allele sequences.
    """
    with gzip.open(path, "r") as db:
        # Decode and split each line once; the repeat id is the first field.
        rows = (raw.decode("utf8").split() for raw in db)
        for trid, rows_of_repeat in itertools.groupby(rows, key=lambda fields: fields[0]):
            repeat_recs = []
            for fields in rows_of_repeat:
                sample = fields[1]
                lengths = [len(seq) for seq in fields[2].split(",")]
                repeat_recs.append(RepeatRec(sample, min(lengths), max(lengths)))

            yield trid, repeat_recs

In [71]:
def resample_quantiles(counts, num_resamples):
    """Return the 95% quantile of each of ``num_resamples`` resamples of ``counts``.

    A single pool of ``len(counts) * num_resamples`` values is drawn from
    ``counts`` with ``np.random.choice`` and cut into ``num_resamples``
    equally sized chunks; the result is a list with the 0.95 quantile of
    every chunk.

    Based on https://github.com/Illumina/ExpansionHunterDenovo/blob/master/scripts/core/common.py
    """
    pool_size = len(counts) * num_resamples
    pool = np.random.choice(counts, pool_size)
    return [np.quantile(chunk, 0.95) for chunk in np.split(pool, num_resamples)]


def get_counts_with_finite_std(counts):
    """Return ``counts`` with a guaranteed non-zero spread.

    If all values in the list are identical, a new list is returned whose
    last element is increased by 0.1; otherwise ``counts`` itself is
    returned unchanged.
    """
    if len(set(counts)) != 1:
        return counts

    nudged_last = counts[-1] + 0.1
    return counts[:-1] + [nudged_last]


def get_cutoff(quantiles):
    """Return the mean of ``quantiles`` plus their standard deviation.

    The standard deviation is replaced by 1 unless it is greater than 1, so
    the cutoff always lies at least 1 above the mean.
    """
    spread = np.std(quantiles)
    # Same selection as max(1, spread): keep the spread only if it exceeds 1.
    margin = spread if spread > 1 else 1
    return np.mean(quantiles) + margin

In [None]:
# One reported outlier allele: the repeat id, the case sample, which allele
# of the sample was tested ("long" or "short"), the length of that allele,
# and the (min, max) length of the corresponding control alleles.
HitRec = namedtuple("HitRec", "trid sample allele_type allele_len control_range")

def get_hits(allele_type, repeat_recs):
    """Yield case samples whose allele at one repeat exceeds the control cutoff.

    Args:
        allele_type: ``"long"`` to test each sample's longer allele or
            ``"short"`` to test the shorter one.
        repeat_recs: Records of a single repeat, one per sample, exposing
            ``sample`` and the short/long allele lengths at index 1 and 2
            (as ``RepeatRec`` does). Samples listed in the module-level
            ``case_ids`` are cases; all others are controls.

    Yields:
        Tuples ``(case, allele_len, control_range)`` for every case whose
        allele length is greater than the cutoff derived from 100 resampled
        95% quantiles of the control lengths. ``control_range`` is the
        ``(min, max)`` of the control lengths.

    Raises:
        AssertionError: If ``allele_type`` is neither ``"long"`` nor
            ``"short"``.
    """
    assert allele_type in ["long", "short"]
    allele_index = 2 if allele_type == "long" else 1
    cases, controls = {}, []
    for rec in repeat_recs:
        if rec.sample in case_ids:
            cases[rec.sample] = rec[allele_index]
        else:
            controls.append(rec[allele_index])

    # Without a case there is nothing to report, and without controls no
    # cutoff can be derived (resampling an empty list yields NaN quantiles or
    # an error, and min()/max() of the controls would fail). Stop early in
    # both situations; this also avoids resampling for repeats with no cases.
    if not cases or not controls:
        return

    quantiles = resample_quantiles(controls, 100)
    quantiles = get_counts_with_finite_std(quantiles)
    cutoff = get_cutoff(quantiles)

    # The control range is the same for every hit of this repeat.
    control_range = (min(controls), max(controls))

    for case, allele_len in cases.items():
        if allele_len > cutoff:
            yield case, allele_len, control_range


# Scan every repeat in the sorted allele table and collect outlier case
# alleles, testing the longer and the shorter allele of each sample in turn.
hits = []
for trid, repeat_recs in get_repeat_recs("scratch/alleles_db.gz"):
    for hit_type in ("long", "short"):
        hits.extend(
            HitRec(trid, case, hit_type, allele_len, control_range)
            for case, allele_len, control_range in get_hits(hit_type, repeat_recs)
        )


print(f"Found {len(hits)} hits")