In [1]:
import os
import pandas as pd
import numpy as np

from tqdm import tqdm
import sys

sys.path.append('/private/groups/shapirolab/brock/Software')
from py3_functions import *

from IPython.display import display

!mkdir -p /private/groups/shapirolab/brock/mutation
os.chdir('/private/groups/shapirolab/brock/mutation')
!mkdir -p cmds logs

### R stuff
from rpy2 import rinterface
#from jupyter_helpers import rpy2_autocompletion
%load_ext rpy2.ipython

samples = []
with open('poplists/samples.txt') as infile:
    for line in infile:
        line = line.strip()
        samples.append(line)
infile.close()

AB_sizes = {}
with open('white_abalone.fasta.fai','r') as infile:
    for line in infile:
        chrom, size, total, b, c = line.split('\t')
        if int(size) > 1e6:
            AB_sizes[chrom] = int(size)
infile.close()
AB_sizes

{'HiC_scaffold_1': 88721917,
 'HiC_scaffold_2': 79379215,
 'HiC_scaffold_3': 74701305,
 'HiC_scaffold_4': 73032783,
 'HiC_scaffold_5': 70278957,
 'HiC_scaffold_6': 70171953,
 'HiC_scaffold_7': 69140750,
 'HiC_scaffold_8': 63626884,
 'HiC_scaffold_9': 62461566,
 'HiC_scaffold_10': 62032692,
 'HiC_scaffold_11': 60969504,
 'HiC_scaffold_12': 60416524,
 'HiC_scaffold_13': 59927272,
 'HiC_scaffold_14': 59492018,
 'HiC_scaffold_15': 54906828,
 'HiC_scaffold_16': 53887023,
 'HiC_scaffold_17': 51816475,
 'HiC_scaffold_18': 45703695}

# Whatshap

**Warning**: attempting to use pedigree information for phasing with `whatshap phase --ped` will cause whatshapp to skip Mendelian violations, which are exactly the variants I want to phase. Currently can't find a workaround, so I phase without pedigree information to assist

In [18]:
!mkdir -p whatshap whatshap/bams/ whatshap/phased whatshap/vcfs
 
    
AB_sizes = {}
with open('white_abalone.fasta.fai','r') as infile:
    for line in infile:
        chrom, size, total, b, c = line.split('\t')
        if int(size) > 1e6:
            AB_sizes[chrom] = int(size)
infile.close()
AB_sizes


#### This code creates a single slurm script PER CHROMOSOME ####
#### but parallelizes within the slurm script using subregions ####
#### and multithreading ####
for CHROM, SIZE in AB_sizes.items():
    outfile = open(f"cmds/{CHROM}_whatshap_phase.txt",'w')
    ##
    SIZE = int(SIZE)
    for I in range(1,SIZE,5000000):
        
        ### REGION SETUP AND CHECKS ###
        START = I
        END = I + 5000000 - 1
        if END > SIZE:
            END = SIZE
        if os.path.exists('whatshap/phased/%s:%s-%s.whatshap.vcf.gz' % (CHROM, START, END)): continue
        REGION = '%s:%s-%s' % (CHROM, START, END)
        baminput = ''
        subfile = open(f"cmds/{REGION}.whatshap.sh", 'w')
        
        ### VCF CMD ###
        cmd = f"bcftools view -O z -r {REGION} vcfs/biallelic.vcf.gz > whatshap/vcfs/{REGION}.vcf.gz && tabix -f whatshap/vcfs/{REGION}.vcf.gz\n"
        subfile.write(cmd)
        
        ### SUBBAMS CMDS ###
        for SAMPLE in samples:
            BAM = f"aln/{SAMPLE}.mdup.bam"
            SUBBAM = f"whatshap/bams/{SAMPLE}.{REGION}.bam"
            if not os.path.exists(SUBBAM + '.bai'):
                samcmd = f"samtools view -b -1 -M {BAM} {REGION} > {SUBBAM} && samtools index -b {SUBBAM}\n"
                subfile.write(samcmd)
            baminput += '%s ' % SUBBAM
        
        ### PHASE CMD ###
        phasecmd = ('''\nwhatshap phase '''
                    '''--ped poplists/plink.ped '''
                    '''whatshap/vcfs/{REGION}.vcf.gz '''
                    '''{baminput} '''
                    '''-o whatshap/phased/{REGION}.whatshap.vcf '''
                    '''--reference=white_abalone.fasta '''
                    #'''--ignore-read-groups '''
                    '''--chromosome {CHROM} '''
                    '''&& bgzip -f whatshap/phased/{REGION}.whatshap.vcf && tabix -f whatshap/phased/{REGION}.whatshap.vcf.gz &> logs/{REGION}.whatshap.log\n''').format(baminput = baminput, REGION = REGION, CHROM = CHROM)
        subfile.write(phasecmd)
        
        ### CLEANUP CMD ###
        cleancmd = ('''\nif [ -f whatshap/phased/{REGION}.whatshap.vcf.gz.tbi ];then '''
                    '''rm -f bams/subset/*{REGION}*bam*;'''
                    '''fi\n''').format(REGION = REGION)
        subfile.write(cleancmd)
        subfile.close()
        outfile.write(f"bash cmds/{REGION}.whatshap.sh\n")
        
    ### SLURM ###
    slurm_cmd = f"ls cmds/{CHROM}:*whatshap.sh | sed 's/^/bash /g' | parallel --progress --jobs 10"
    slurm = make_slurm(echo = True, id = f"{CHROM}.whatshap",cmd_string = slurm_cmd, mem = '200000',time = '120:00:00', c = '10', p = 'long')
    


#!/bin/bash
#SBATCH -p long
#SBATCH -t 120:00:00
#SBATCH --mem=200000
#SBATCH -n 1
#SBATCH --array=1-1%1
#SBATCH -e ./logs/HiC_scaffold_1.whatshap.e
#SBATCH -o ./logs/HiC_scaffold_1.whatshap.o
#SBATCH -c 10
#SBATCH -N 1
#SBATCH -J HiC_scaffold_1.whatshap


ls cmds/HiC_scaffold_1:*whatshap.sh | sed 's/^/bash /g' | parallel --progress --jobs 10

#!/bin/bash
#SBATCH -p long
#SBATCH -t 120:00:00
#SBATCH --mem=200000
#SBATCH -n 1
#SBATCH --array=1-1%1
#SBATCH -e ./logs/HiC_scaffold_2.whatshap.e
#SBATCH -o ./logs/HiC_scaffold_2.whatshap.o
#SBATCH -c 10
#SBATCH -N 1
#SBATCH -J HiC_scaffold_2.whatshap


ls cmds/HiC_scaffold_2:*whatshap.sh | sed 's/^/bash /g' | parallel --progress --jobs 10

#!/bin/bash
#SBATCH -p long
#SBATCH -t 120:00:00
#SBATCH --mem=200000
#SBATCH -n 1
#SBATCH --array=1-1%1
#SBATCH -e ./logs/HiC_scaffold_3.whatshap.e
#SBATCH -o ./logs/HiC_scaffold_3.whatshap.o
#SBATCH -c 10
#SBATCH -N 1
#SBATCH -J HiC_scaffold_3.whatshap


ls cmds/HiC_scaffold_3:*whatshap.sh | sed 's/^/bash

Run for small interstitial regions to facilitate ligation

In [15]:
!mkdir -p whatshap whatshap/bams/ whatshap/phased whatshap/vcfs
 
    
AB_sizes = {}
with open('white_abalone.fasta.fai','r') as infile:
    for line in infile:
        chrom, size, total, b, c = line.split('\t')
        if int(size) > 1e6:
            AB_sizes[chrom] = int(size)
infile.close()
AB_sizes


#### This code creates a single slurm script PER CHROMOSOME ####
#### but parallelizes within the slurm script using subregions ####
#### and multithreading ####
for CHROM, SIZE in AB_sizes.items():
    ##
    SIZE = int(SIZE)
    for I in range(1,SIZE,5000000):
        
        ### REGION SETUP AND CHECKS ###
        START = I
        END = I + 5000000 - 1
        if END > SIZE:
            END = SIZE
        seg_start = END - 100000
        seg_end = END + 100000
        REGION = '%s:%s-%s' % (CHROM, seg_start, seg_end)
        if os.path.exists(f"whatshap/phased/{REGION}.vcf.gz.tbi"): continue
        baminput = ''
        subfile = open(f"cmds/{REGION}.inter.sh", 'w')
        
        ### VCF CMD ###
        cmd = f"bcftools view -O z -r {REGION} vcfs/biallelic.vcf.gz > whatshap/vcfs/{REGION}.vcf.gz && tabix -f whatshap/vcfs/{REGION}.vcf.gz\n"
        subfile.write(cmd)
        
        ### SUBBAMS CMDS ###
        for SAMPLE in samples:
            BAM = f"aln/{SAMPLE}.mdup.bam"
            SUBBAM = f"whatshap/bams/{SAMPLE}.{REGION}.bam"
            if not os.path.exists(SUBBAM + '.bai'):
                samcmd = f"samtools view -b -1 -M {BAM} {REGION} > {SUBBAM} && samtools index -b {SUBBAM}\n"
                subfile.write(samcmd)
            baminput += '%s ' % SUBBAM
        
        ### PHASE CMD ###
        phasecmd = ('''\nwhatshap phase '''
                    '''--ped poplists/plink.ped '''
                    '''whatshap/vcfs/{REGION}.vcf.gz '''
                    '''{baminput} '''
                    '''-o whatshap/phased/{REGION}.whatshap.vcf '''
                    '''--reference=white_abalone.fasta '''
                    #'''--ignore-read-groups '''
                    '''--chromosome {CHROM} '''
                    '''&& bgzip -f whatshap/phased/{REGION}.whatshap.vcf && tabix -f whatshap/phased/{REGION}.whatshap.vcf.gz &> logs/{REGION}.whatshap.log\n''').format(baminput = baminput, REGION = REGION, CHROM = CHROM)
        subfile.write(phasecmd)
        
        ### CLEANUP CMD ###
        cleancmd = ('''\nif [ -f whatshap/phased/{REGION}.whatshap.vcf.gz.tbi ];then '''
                    '''rm -f bams/subset/*{REGION}*bam*;'''
                    '''fi\n''').format(REGION = REGION)
        subfile.write(cleancmd)
        subfile.close()
        
    ### SLURM ###
    slurm_cmd = f"ls cmds/{CHROM}:*inter.sh | sed 's/^/bash /g' | parallel --progress --jobs 10"
    slurm = make_slurm(echo = False, id = f"{CHROM}.inter",cmd_string = slurm_cmd, mem = '60000',time = '08:00:00', c = '10', p = 'long')




Ligate 

In [31]:
!mkdir -p whatshap/ligated

def checkinput(CHROM):
    chunksfile = open('whatshap/%s_chunks.txt' % CHROM, 'w')
    SIZE = int(AB_sizes[CHROM])
    for I in range(1,SIZE,5000000):
        START = I
        END = I + 5000000 - 1
        if END > SIZE:
            END = SIZE
            INTER_REGION = None
        else:
            seg_start = END - 100000
            seg_end = END + 100000
            INTER_REGION = '%s:%s-%s' % (CHROM, seg_start, seg_end)

        REGION = '%s:%s-%s' % (CHROM, START, END)
        
        if not os.path.exists('whatshap/phased/%s.whatshap.vcf.gz' % REGION): 
            return
        else:
            chunksfile.write('whatshap/phased/%s.whatshap.vcf.gz\n' % REGION)
        if INTER_REGION:
            chunksfile.write('whatshap/phased/%s.whatshap.vcf.gz\n' % INTER_REGION)
    
    chunksfile.close()
    return('done')

for CHROM, SIZE in AB_sizes.items():
    if checkinput(CHROM):
        if os.path.exists('whatshap/ligated/%s.ligated.vcf.gz.tbi' % CHROM):
            continue
        print('%s done, ready to ligate' % CHROM)
        cmd = ('''bcftools concat --ligate '''
               '''-f whatshap/{CHROM}_chunks.txt '''
               '''-O z > whatshap/ligated/{CHROM}.ligated.vcf.gz && tabix -f whatshap/ligated/{CHROM}.ligated.vcf.gz\n''').format(CHROM = CHROM)
        slurm = make_slurm(run = True, id = f"{CHROM}.ligate", cmd_string = cmd, mem = '5000', time = '24:00:00', p = 'long')

HiC_scaffold_1 done, ready to ligate
Submitted batch job 3336728
HiC_scaffold_2 done, ready to ligate
Submitted batch job 3336729
HiC_scaffold_3 done, ready to ligate
Submitted batch job 3336730
HiC_scaffold_4 done, ready to ligate
Submitted batch job 3336731
HiC_scaffold_5 done, ready to ligate
Submitted batch job 3336732
HiC_scaffold_6 done, ready to ligate
Submitted batch job 3336733
HiC_scaffold_7 done, ready to ligate
Submitted batch job 3336734
HiC_scaffold_8 done, ready to ligate
Submitted batch job 3336735
HiC_scaffold_9 done, ready to ligate
Submitted batch job 3336736
HiC_scaffold_10 done, ready to ligate
Submitted batch job 3336737
HiC_scaffold_11 done, ready to ligate
Submitted batch job 3336738
HiC_scaffold_12 done, ready to ligate
Submitted batch job 3336739
HiC_scaffold_13 done, ready to ligate
Submitted batch job 3336740
HiC_scaffold_14 done, ready to ligate
Submitted batch job 3336741
HiC_scaffold_15 done, ready to ligate
Submitted batch job 3336742
HiC_scaffold_16 don

Concatenate

In [34]:
catcmd = 'bcftools concat -O z -o whatshap/biallelic.phased.vcf.gz '
for CHROM in AB_sizes.keys():
    catcmd += f"whatshap/ligated/{CHROM}.ligated.vcf.gz "
!$catcmd

Checking the headers and starting positions of 18 files
Concatenating whatshap/ligated/HiC_scaffold_1.ligated.vcf.gz	11.427196 seconds
Concatenating whatshap/ligated/HiC_scaffold_2.ligated.vcf.gz	11.308870 seconds
Concatenating whatshap/ligated/HiC_scaffold_3.ligated.vcf.gz	12.645885 seconds
Concatenating whatshap/ligated/HiC_scaffold_4.ligated.vcf.gz	8.985294 seconds
Concatenating whatshap/ligated/HiC_scaffold_5.ligated.vcf.gz	12.284028 seconds
Concatenating whatshap/ligated/HiC_scaffold_6.ligated.vcf.gz	13.187475 seconds
Concatenating whatshap/ligated/HiC_scaffold_7.ligated.vcf.gz	19.020247 seconds
Concatenating whatshap/ligated/HiC_scaffold_8.ligated.vcf.gz	8.479397 seconds
Concatenating whatshap/ligated/HiC_scaffold_9.ligated.vcf.gz	10.283542 seconds
Concatenating whatshap/ligated/HiC_scaffold_10.ligated.vcf.gz	10.326275 seconds
Concatenating whatshap/ligated/HiC_scaffold_11.ligated.vcf.gz	10.409046 seconds
Concatenating whatshap/ligated/HiC_scaffold_12.ligated.vcf.gz	20.642290 sec

Cleanup

In [35]:
%%bash
rm whatshap/phased/* whatshap/ligated/*

# POOHA

In [68]:
sample_by_candidates = {}
with open('final/all_sample_by_candidates.txt','r') as infile:
    for line in infile:
        parts = line.split('\t')
        sample,chrom,start,end = parts[0:4]
        if sample not in sample_by_candidates.keys():
            sample_by_candidates[sample] = [[chrom,start,end]]
        else:
            sample_by_candidates[sample].append([chrom,start,end])
infile.close()

sample_2_parents = {}
with open('poplists/child_2_parent.txt','r') as infile:
    for line in infile:
        line = line.strip()
        child, parentstring = line.split('\t')
        sample_2_parents[child] = parentstring.split(',')
infile.close()


!mkdir -p haplotypes

allcmds = 'rm -f haplotypes/pooha_output.txt\n'
for SAMPLE, CANDIDATES in sample_by_candidates.items():
    CHILD = SAMPLE
    P0, P1 = sample_2_parents[CHILD]
    for CAN in CANDIDATES:
        chrom, start, end = CAN
        left = int(start) - 50000
        right = int(end) + 50000
        cmd = ('''/private/groups/shapirolab/brock/Software/POOHA/POOHA '''
               '''--verbose --min-parents-GQ 60 --min-child-GQ 60 '''
               '''--max-marker-distance 10000 '''
               '''--region {chrom}:{left}-{right} '''
               '''--conflicting_pairs haplotypes/{CHILD}_{chrom}_{end}_conflicting.txt '''
               '''--output_variants germline '''
               '''vcfs/biallelic.vcf.gz '''
               '''{P0} {P1} {CHILD} '''
               '''aln/{CHILD}.mdup.bam | sed "s/ P / {P0} /;s/ M / {P1} /" '''
               '''>> haplotypes/pooha_output.txt'''
               '''\n''').format(CHILD = CHILD, P0 = P0, P1 = P1, chrom = chrom, end = end, left = left, right = right)
        allcmds += cmd
slurm = make_slurm(id = f"pooha", cmd_string = allcmds, mem = '10000', time = '04:00:00', p = 'long')

Results

In [8]:
%%R
library(data.table)
library(tidyverse)
library(magrittr)
library(ggplot2)

In [10]:
%%R
ph = fread(cmd = 'grep -v "chrom" haplotypes/pooha_output.txt')
ph %<>% set_colnames(c('chrom', 'pos', 'ref', 'alt', 'child', 'parent', 'method', 'parent_qual', 'variant_type', 'n_informative_reads', 'n_informative_vars', 'n_likely_somatic_reads', 'n_conflicting_reads' ))

candidates = fread('final/all_sample_by_candidates.txt') 
candidates %<>% set_colnames(c('child','chrom','bedstart','pos','ref','alt'))

cd_parents = 
    left_join(candidates, ph) %>%
    filter(parent != 'U')


Joining with `by = join_by(child, chrom, pos, ref, alt)`


In [13]:
%%R
left_join(candidates, ph)

Joining with `by = join_by(child, chrom, pos, ref, alt)`
            child           chrom bedstart      pos    ref    alt   parent
           <char>          <char>    <int>    <int> <char> <char>   <char>
 1:          FG29  HiC_scaffold_2 67338091 67338092      T      A        U
 2:          FG29  HiC_scaffold_2 71549310 71549311      G      A        U
 3:          FG29  HiC_scaffold_3 31079938 31079939      G      C     Y121
 4:          FG29  HiC_scaffold_6 20837340 20837341      C      G        U
 5:          FG29  HiC_scaffold_7 33841068 33841069      C      T Green312
 6:          FG29  HiC_scaffold_9 55177073 55177074      T      G        U
 7:          FG29 HiC_scaffold_10  6202422  6202423      G      A        U
 8:          FG29 HiC_scaffold_10 54470578 54470579      G      A        U
 9:          FG29 HiC_scaffold_11 47449978 47449979      G      T     Y121
10:          FG29 HiC_scaffold_14 31764071 31764072      G      A        U
11:          FG29 HiC_scaffold_15 10189581 

In [12]:
%%R
cd_parents

            child           chrom bedstart      pos    ref    alt   parent
           <char>          <char>    <int>    <int> <char> <char>   <char>
 1:          FG29  HiC_scaffold_3 31079938 31079939      G      C     Y121
 2:          FG29  HiC_scaffold_7 33841068 33841069      C      T Green312
 3:          FG29 HiC_scaffold_11 47449978 47449979      G      T     Y121
 4:          FG29 HiC_scaffold_15 31631911 31631912      C      A     Y121
 5:          FG29 HiC_scaffold_17 35289449 35289450      G      T     Y121
 6:          FG30 HiC_scaffold_13  2046468  2046469      T      A     Y121
 7:          FG30 HiC_scaffold_16 43758122 43758123      C      A     Y121
 8:          FG33  HiC_scaffold_4  6842929  6842930      T      C     Y121
 9:          FG33  HiC_scaffold_4  8785696  8785697      C      T     Y121
10:          FG33  HiC_scaffold_6 49235550 49235551      C      G     Y121
11:          FG33  HiC_scaffold_7 31541473 31541474      G      C     Y121
12:          FG33 HiC_sca

In [65]:
%%R
#Patrick family
res <- prop.test(x = c(13, 2), n = c(15, 15))
print(res)

#Norbert family
res <- prop.test(x = c(4, 2), n = c(6, 6))
print(res)

#Toothless family
res <- prop.test(x = c(6, 4), n = c(10, 10))
print(res)


	2-sample test for equality of proportions with continuity correction

data:  c(13, 2) out of c(15, 15)
X-squared = 13.333, df = 1, p-value = 0.0002607
alternative hypothesis: two.sided
95 percent confidence interval:
 0.423383 1.000000
sample estimates:
   prop 1    prop 2 
0.8666667 0.1333333 


	2-sample test for equality of proportions with continuity correction

data:  c(4, 2) out of c(6, 6)
X-squared = 0.33333, df = 1, p-value = 0.5637
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.366768  1.000000
sample estimates:
   prop 1    prop 2 
0.6666667 0.3333333 


	2-sample test for equality of proportions with continuity correction

data:  c(6, 4) out of c(10, 10)
X-squared = 0.2, df = 1, p-value = 0.6547
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.3294066  0.7294066
sample estimates:
prop 1 prop 2 
   0.6    0.4 



In [63]:
%%R
candidates %>% filter(grepl('Toothless',child))

            child           chrom bedstart      pos    ref    alt
           <char>          <char>    <int>    <int> <char> <char>
 1: ToothlessL824  HiC_scaffold_4 72889403 72889404      A      C
 2: ToothlessL824  HiC_scaffold_6 41367342 41367343      T      C
 3: ToothlessL824  HiC_scaffold_7 24369086 24369087      C      A
 4: ToothlessL824 HiC_scaffold_11 51469515 51469516      A      G
 5: ToothlessL824 HiC_scaffold_16 16565415 16565416      A      G
 6: ToothlessL960  HiC_scaffold_6 22617314 22617315      C      T
 7: ToothlessL960  HiC_scaffold_8 50474106 50474107      G      A
 8: ToothlessL960  HiC_scaffold_9 42092584 42092585      C      G
 9: ToothlessL960 HiC_scaffold_10 44646255 44646256      A      G
10: ToothlessL960 HiC_scaffold_11 30035456 30035457      T      C
11: ToothlessL960 HiC_scaffold_11 44319500 44319501      A      G
12: ToothlessL960 HiC_scaffold_16  7894734  7894735      T      C
13: ToothlessL960 HiC_scaffold_17 22058068 22058069      A      G
14: Toothl