# Leave-one-out approach

Mask SNPs to simulate lower SNP density. The example provided here is for masking 5% of SNPs.

In [None]:
#!/bin/bash -e

#SBATCH --job-name      target-thin
#SBATCH --mem           2G
#SBATCH --time          5:00:00
#SBATCH --cpus-per-task=12
#SBATCH --account       project

# Purpose: randomly mask 5% of the SNPs in the full high-coverage call set
# (each site is kept at rate 0.95), then bgzip-compress and index the result.

# tools
module load PLINK/2.00a2.3
module load BCFtools/1.19-GCC-11.3.0
module load SAMtools/1.19-GCC-12.3.0
module load vcflib/1.0.1-GCC-9.2.0

# all outputs are written to the target directory
maindir=/nesi/nobackup/project/project_2/imputation-highCov-allTests/target
cd $maindir

# full call set (input) and file stem of the thinned call set (output)
full_vcf=/nesi/nobackup/project/project_2/imputation-input/ref/hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele.vcf.gz
thin_stem=hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele-thin5

# thin to a SNP density of 2.73K SNPs/Mb
# NOTE(review): no seed is passed to vcfrandomsample, so the retained sites
# presumably differ between runs - confirm whether that is acceptable.
vcfrandomsample -r 0.95 "$full_vcf" > "${thin_stem}.vcf"

# compress and index the thinned VCF (the uncompressed copy is left in place)
bcftools view "${thin_stem}.vcf" -Oz -o "${thin_stem}.vcf.gz"
tabix -f "${thin_stem}.vcf.gz"

Create target files with one sample selected as the target each time

In [None]:
#!/bin/bash -e

#SBATCH --job-name      target-3-extract_sample
#SBATCH --mem           20G
#SBATCH --time          5:00:00
#SBATCH --cpus-per-task=12
#SBATCH --account       project

# Purpose: write one single-sample target VCF (thinned call set) for every
# sample name listed in ref.list.

# tools
module load PLINK/2.00a2.3
module load BCFtools/1.19-GCC-11.3.0
module load SAMtools/1.19-GCC-12.3.0

# work inside the target directory
maindir=/nesi/nobackup/project/project_2/imputation-highCov-allTests/target
cd $maindir

# thinned multi-sample call set and the list of sample names (one per line)
thin_vcf=hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele-thin5.vcf.gz
sample_list=/nesi/nobackup/project/project_2/imputation-highCov-allTests/codes/ref.list

# keep only sample ${s} in each output file, then index it
while read s; do
  single_vcf=target-${s}-thin5.vcf.gz
  bcftools view -s ${s} $thin_vcf -Oz -o $single_vcf
  tabix -f $single_vcf
done < "$sample_list"


Create corresponding reference panels for each target sample

In [None]:
#!/bin/bash -e

#SBATCH --job-name      ref-1-make_ref
#SBATCH --mem           20G
#SBATCH --time          5:00:00
#SBATCH --cpus-per-task=12
#SBATCH --account       project

# Purpose: build one leave-one-out reference panel per sample, i.e. the full
# call set with that single sample excluded.

# tools
module load PLINK/2.00a2.3
module load BCFtools/1.19-GCC-11.3.0
module load SAMtools/1.19-GCC-12.3.0

# project directory (panels are written to its ref/ sub-directory)
maindir=/nesi/nobackup/project/project_2/imputation-highCov-allTests/
cd $maindir

# full (unthinned) call set and the list of sample names
full_vcf=/nesi/nobackup/project/project_2/imputation-input/ref/hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele.vcf.gz
sample_list=/nesi/nobackup/project/project_2/imputation-highCov-allTests/codes/ref.list

# the leading ^ makes bcftools drop sample ${s} and keep every other sample
while read s; do
  panel=$maindir/ref/ref-${s}.vcf.gz
  bcftools view -s ^${s} \
  "$full_vcf" -Oz -o $panel
  tabix -f $panel
done < "$sample_list"


# Imputation tests & evaluation

## Tests of imputation parameters

### Default parameters

In [None]:
#!/bin/bash -e

#SBATCH --job-name      test-1-default
#SBATCH --mem           20G
#SBATCH --time          5:00:00
#SBATCH --array         1-30
#SBATCH --cpus-per-task 12
#SBATCH --account       project

# Purpose: impute one leave-one-out target (picked by the array task ID) with
# Beagle run on default settings, then evaluate the imputed genotypes against
# the validation call set.

# tools for imputation
module load Beagle/5.4-22Jul22.46e
module load SAMtools/1.19-GCC-12.3.0

# project directory
maindir=/nesi/nobackup/project/project_2/imputation-highCov-allTests
cd $maindir

# the config file maps array task IDs (column 1) to sample names (column 2)
config=/nesi/nobackup/project/project_2/imputation-highCov-allTests/codes/array.config
sample=$(awk -v ArrayTaskID=$SLURM_ARRAY_TASK_ID '$1==ArrayTaskID {print $2}' $config)

# inputs
target=$maindir/target/target-${sample}-thin5.vcf.gz
refphased=$maindir/ref/ref-${sample}_phased.vcf.gz
validate=$maindir/validate/validate-${sample}.vcf.gz

# outputs
outdir=$maindir/test_1-default
out_prefix=$outdir/target-${sample}-thin5_beagle5-default
imputed=${out_prefix}.vcf.gz

# --- impute -----------------------------------------------------------------
beagle gt=${target} ref=${refphased} gp=false out=${out_prefix}
tabix -f ${imputed}

# --- evaluate ---------------------------------------------------------------
module purge
module load VCFtools/0.1.15-GCC-9.2.0-Perl-5.30.1
module load SAMtools/1.19-GCC-12.3.0
module load BCFtools/1.19-GCC-11.3.0

# genotype concordance between validation and imputed calls
vcf-compare $validate $imputed > $outdir/concordance-${sample}-default

# per-site table including Beagle's DR2, AF and IMP annotations
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%DR2\t%AF\t%IMP\n' $imputed > ${out_prefix}.r2

# r2 & IQS (the comparison script is run from the test directory)
module purge
module load Python/3.6.3-gimkl-2017a
module load cyvcf2
cd $outdir
python3 /home/user/imputation/Compare_imputation_to_WGS.py \
--ga $target \
--imputed $imputed \
--wgs $validate \
--ref $refphased


### Effective population size (<i>Ne</i>)

In [None]:
#!/bin/bash -e

#SBATCH --job-name      test_3-ne
#SBATCH --mem           20G
#SBATCH --time          12:00:00
#SBATCH --array         1-10
#SBATCH --cpus-per-task 12
#SBATCH --account       project

# Purpose: for one leave-one-out sample (picked by the array task ID), phase
# the reference panel and impute the thinned target at each tested value of
# Beagle's effective population size (ne), then evaluate every imputed call set.
#
# NOTE(review): this array covers tasks 1-10 only, whereas the other imputation
# tests in this notebook use 1-30 - confirm that this is intentional.

# everything below that does not depend on ${ne} is set up once, before the loop
base=/nesi/nobackup/project/project_2/imputation-highCov-allTests

# the config file maps array task IDs (column 1) to sample names (column 2)
config=$base/codes/array.config
sample=$(awk -v ArrayTaskID=$SLURM_ARRAY_TASK_ID '$1==ArrayTaskID {print $2}' $config)

# set working directory (all outputs are written here with relative names)
maindir=$base/test_3-ne
cd $maindir

# inputs
ref=$base/ref/ref-${sample}.vcf.gz
target=$base/target/target-${sample}-thin5.vcf.gz
validate=$base/validate/validate-${sample}.vcf.gz

for ne in 10 100 1000 10000 100000 1000000
do
    # (re)load the imputation tools; the evaluation steps below purge them
    module load Beagle/5.4-22Jul22.46e
    module load SAMtools/1.19-GCC-12.3.0

    # per-ne outputs
    refphased=ref-${sample}_phased-ne${ne}.vcf.gz
    imputed=target-${sample}-thin5_beagle5-ne${ne}.vcf.gz

    # phase the reference panel; em=false keeps ne at the supplied value
    beagle gt=$ref ne=${ne} em=false out=ref-${sample}_phased-ne${ne} nthreads=12
    tabix -f ${refphased}

    # impute the target against the phased panel with the same ne
    beagle gt=${target} ref=${refphased} gp=false ne=${ne} em=false out=target-${sample}-thin5_beagle5-ne${ne}
    tabix -f ${imputed}

    # evaluate
    module purge
    module load VCFtools/0.1.15-GCC-9.2.0-Perl-5.30.1
    module load SAMtools/1.19-GCC-12.3.0
    module load BCFtools/1.19-GCC-11.3.0

    # genotype concordance
    vcf-compare $validate $imputed > concordance-${sample}-ne${ne}

    # extract dr2
    bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%DR2\t%AF\t%IMP\n' $imputed > target-${sample}-thin5_beagle5-ne${ne}.r2

    # IQS
    # "module load" (not "module spider", which only searches for a module)
    # is required to actually put this Python on the PATH after the purge
    module purge
    module load Python/3.9.9-gimkl-2020a
    module load cyvcf2
    python3 /home/user/imputation/Compare_imputation_to_WGS.py \
    --ga $target \
    --imputed $imputed \
    --wgs $validate \
    --ref $refphased

done

### Combination of parameters

In [None]:
#!/bin/bash -e

#SBATCH --job-name      test_4-combination
#SBATCH --mem           40G
#SBATCH --time          60:00:00
#SBATCH --array         1-30
#SBATCH --cpus-per-task 12
#SBATCH --account       project

# Purpose: for one leave-one-out sample (picked by the array task ID), test
# every combination of Beagle window, phasing iterations, imp-states and
# imp-segment, and evaluate each imputed call set.
#
# The reference panel is phased once per (window, iterations) pair: phasing
# does not use imp-states or imp-segment, so repeating it inside the inner
# loops would only rewrite the same file at considerable cost.

base=/nesi/nobackup/project/project_2/imputation-highCov-allTests

# the config file maps array task IDs (column 1) to sample names (column 2)
config=$base/codes/array.config
sample=$(awk -v ArrayTaskID=$SLURM_ARRAY_TASK_ID '$1==ArrayTaskID {print $2}' $config)

# set working directory (all outputs are written here with relative names)
maindir=$base/test_4-combination
cd $maindir

# inputs
ref=$base/ref/ref-${sample}.vcf.gz
target=$base/target/target-${sample}-thin5.vcf.gz
validate=$base/validate/validate-${sample}.vcf.gz

for window in 40 60 80
    do

    for iter in 12 24
        do

        # load modules for phasing
        module load Beagle/5.4-22Jul22.46e
        module load SAMtools/1.19-GCC-12.3.0

        # phase the reference panel using Beagle 5, then index it
        refphased=ref-${sample}_phased-w${window}-i${iter}.vcf.gz
        beagle gt=$ref window=${window} iterations=${iter} out=ref-${sample}_phased-w${window}-i${iter} nthreads=12
        tabix -f ${refphased}

        for impState in 500 1000 1600
            do

            for impSeg in 6 20 50
                do

                # (re)load the imputation tools; the evaluation steps below purge them
                module load Beagle/5.4-22Jul22.46e
                module load SAMtools/1.19-GCC-12.3.0

                # file stem shared by all outputs of this parameter combination
                tag=w${window}-i${iter}-st${impState}-sg${impSeg}
                imputed=target-${sample}-thin5_beagle5-${tag}.vcf.gz

                # impute
                beagle gt=${target} ref=${refphased} gp=false window=${window} imp-states=${impState} imp-segment=${impSeg} out=target-${sample}-thin5_beagle5-${tag}
                tabix -f ${imputed}

                # evaluate
                module purge
                module load VCFtools/0.1.15-GCC-9.2.0-Perl-5.30.1
                module load SAMtools/1.19-GCC-12.3.0
                module load BCFtools/1.19-GCC-11.3.0

                # genotype concordance
                vcf-compare $validate $imputed > concordance-${sample}-${tag}

                # extract dr2
                bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%DR2\t%AF\t%IMP\n' $imputed > target-${sample}-thin5_beagle5-${tag}.r2

                # IQS
                # "module load" (not "module spider", which only searches for a
                # module) is required to actually load this Python after the purge
                module purge
                module load Python/3.9.9-gimkl-2020a
                module load cyvcf2
                python3 /home/user/imputation/Compare_imputation_to_WGS.py \
                --ga $target \
                --imputed $imputed \
                --wgs $validate \
                --ref $refphased
                done
            done
        done
    done


### Inclusion of linkage maps

Example given is for the analysis using a linkage map constructed using CRI-MAP v2.4 and with parameters changed from default.

In [None]:
#!/bin/bash -e

#SBATCH --job-name      test_2-map-crimap
#SBATCH --mem           40G
#SBATCH --time          5:00:00
#SBATCH --array         1-30
#SBATCH --cpus-per-task 12
#SBATCH --account       project

# Purpose: for one leave-one-out sample (picked by the array task ID), phase
# the reference panel and impute the thinned target while supplying the
# CRI-MAP linkage map to Beagle, then evaluate the imputed call set.

# load modules
module load Beagle/5.4-22Jul22.46e
module load SAMtools/1.19-GCC-12.3.0

base=/nesi/nobackup/project/project_2/imputation-highCov-allTests

# the config file maps array task IDs (column 1) to sample names (column 2)
config=$base/codes/array.config
sample=$(awk -v ArrayTaskID=$SLURM_ARRAY_TASK_ID '$1==ArrayTaskID {print $2}' $config)

# set working directory (all outputs are written here with relative names)
maindir=$base/test_2-map-crimap
cd $maindir

# set parameters
iterations=24
window=60
impSeg=20

# inputs
map=/nesi/nobackup/project/project_2/imputation-input/hihi-autosomes-crimap.map
ref=$base/ref/ref-${sample}.vcf.gz
target=$base/target/target-${sample}-thin5.vcf.gz
validate=$base/validate/validate-${sample}.vcf.gz

# output stems
phased_stem=ref-${sample}_phased-map-crimap-i${iterations}-w${window}
imputed_stem=target-${sample}-thin5_beagle5-map-crimap-i${iterations}-w${window}-sg${impSeg}
refphased=${phased_stem}.vcf.gz
imputed=${imputed_stem}.vcf.gz

# phase using Beagle 5, then index
beagle gt=$ref out=${phased_stem} \
nthreads=12 iterations=${iterations} window=${window} \
map=${map}
tabix -f ${refphased}

# impute, then index
beagle gt=${target} ref=${refphased} \
nthreads=12 iterations=${iterations} window=${window} imp-segment=${impSeg} \
map=${map} \
out=${imputed_stem}
tabix -f ${imputed}

# evaluate
module purge
module load VCFtools/0.1.15-GCC-9.2.0-Perl-5.30.1
module load SAMtools/1.19-GCC-12.3.0
module load BCFtools/1.19-GCC-11.3.0

# genotype concordance
vcf-compare $validate $imputed > concordance-${sample}-map-crimap-i${iterations}-w${window}-sg${impSeg}

# extract dr2
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%DR2\t%AF\t%IMP\n' $imputed > ${imputed_stem}.r2

# IQS
# "module load" (not "module spider", which only searches for a module) is
# required to actually load this Python after the purge
module purge
module load Python/3.9.9-gimkl-2020a
module load cyvcf2
python3 /home/user/imputation/Compare_imputation_to_WGS.py \
--ga $target \
--imputed $imputed \
--wgs $validate \
--ref $refphased


## Tests of input data structure

### Exclusion of distant samples

Calculate genetic distance

In [None]:
#!/bin/bash -e

#SBATCH --job-name      vcf-pairwise_relate
#SBATCH --mem           10G
#SBATCH --time          5:00:00
#SBATCH --cpus-per-task=8
#SBATCH --account       project

# Purpose: compute a pairwise KING relatedness table for all samples in the
# full call set; the table is used to identify genetically distant samples.

# tools
module load PLINK/2.00a2.3

# outputs go to the imputation-input directory
maindir=/nesi/nobackup/project/project_2/imputation-input
cd $maindir

# input call set and the prefix PLINK uses for its output files
full_vcf=/nesi/nobackup/project/project_2/imputation-input/ref/hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele.vcf.gz
out_prefix=hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele

# --chr-set / --allow-extra-chr let PLINK accept this non-human chromosome set
plink2 --vcf "$full_vcf" \
--make-king-table --chr-set 95 --allow-extra-chr \
--out "$out_prefix"


Prepare reference panels with distant samples excluded

In [None]:
#!/bin/bash -e

#SBATCH --job-name      ref-distant
#SBATCH --mem           20G
#SBATCH --time          5:00:00
#SBATCH --cpus-per-task 12
#SBATCH --account       project

# Purpose: drop the five genetically distant samples from the full call set,
# then build one leave-one-out reference panel per remaining sample.

# tools
module load PLINK/2.00a2.3
module load BCFtools/1.19-GCC-11.3.0
module load SAMtools/1.19-GCC-12.3.0

# panels are written to the ref directory
maindir=/nesi/nobackup/project/project_2/imputation-highCov-allTests/ref/
cd $maindir

# full call set, the call set without distant samples, and the sample list
full_vcf=/nesi/nobackup/project/project_2/imputation-input/ref/hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele.vcf.gz
nodistant_vcf=hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele-distant.vcf.gz
sample_list=/nesi/nobackup/project/project_2/imputation-highCov-allTests/codes/ref-distant.list # samples without the distant individuals

# the leading ^ excludes the listed distant samples (n=5)
bcftools view -s ^distant1,distant2,distant3,distant4,distant5 \
"$full_vcf" -Oz -o "$nodistant_vcf"
tabix -f "$nodistant_vcf"

# leave-one-out panels: exclude sample ${s} from the distant-free call set
while read s; do
  panel=ref-distant-${s}.vcf.gz
  bcftools view -s ^${s} "$nodistant_vcf" -Oz -o $panel
  tabix -f $panel
done < "$sample_list"


Run imputation test

In [None]:
#!/bin/bash -e

#SBATCH --job-name      test_5-distant
#SBATCH --mem           20G
#SBATCH --time          5:00:00
#SBATCH --array         1-30
#SBATCH --cpus-per-task 12
#SBATCH --account       project

# Purpose: for one leave-one-out sample (picked by the array task ID), phase
# the reference panel that excludes the distant samples, impute the thinned
# target against it, and evaluate the imputed call set.

# load modules
module load Beagle/5.4-22Jul22.46e
module load SAMtools/1.19-GCC-12.3.0

base=/nesi/nobackup/project/project_2/imputation-highCov-allTests

# the config file maps array task IDs (column 1) to sample names (column 2)
config=$base/codes/array.config
sample=$(awk -v ArrayTaskID=$SLURM_ARRAY_TASK_ID '$1==ArrayTaskID {print $2}' $config)

# set working directory (all outputs are written here with relative names)
maindir=$base/test_5-distant
cd $maindir

# inputs
ref=$base/ref/ref-distant-${sample}.vcf.gz
target=$base/target/target-${sample}-thin5.vcf.gz
validate=$base/validate/validate-${sample}.vcf.gz

# outputs
refphased=ref-distant-${sample}_phased.vcf.gz
imputed=target-${sample}-thin5_beagle5-distant.vcf.gz

# phase using Beagle 5, then index
beagle gt=$ref out=ref-distant-${sample}_phased nthreads=12
tabix -f ${refphased}

# impute (gp=true also writes genotype probabilities), then index
beagle gt=${target} ref=${refphased} gp=true out=target-${sample}-thin5_beagle5-distant
tabix -f ${imputed}

# evaluate
module purge
module load VCFtools/0.1.15-GCC-9.2.0-Perl-5.30.1
module load SAMtools/1.19-GCC-12.3.0
module load BCFtools/1.19-GCC-11.3.0

# genotype concordance
vcf-compare $validate $imputed > concordance-${sample}-distant

# extract dr2
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%DR2\t%AF\t%IMP\n' $imputed > target-${sample}-thin5_beagle5-distant.r2

# IQS
# "module load" (not "module spider", which only searches for a module) is
# required to actually load this Python after the purge
module purge
module load Python/3.9.9-gimkl-2020a
module load cyvcf2
python3 /home/user/imputation/Compare_imputation_to_WGS.py \
--ga $target \
--imputed $imputed \
--wgs $validate \
--ref $refphased


### Number of individuals in reference panels

Individuals were randomly sampled from all 30 high-coverage samples, with 10 repeats per reference panel size.

In [None]:
#!/bin/bash -e

#SBATCH --job-name      test_8-ref_thin-v2
#SBATCH --mem           20G
#SBATCH --time          60:00:00
#SBATCH --array         1-26
#SBATCH --cpus-per-task 12
#SBATCH --account       project

# Purpose: for one leave-one-out sample (picked by the array task ID), build
# reference panels of 5, 10, 15 and 20 randomly chosen individuals (10 random
# draws per size), phase each panel, impute the thinned target against it and
# evaluate the imputed call set.
#
# NOTE(review): shuf is not seeded, so the random subsets presumably differ
# between runs; the chosen sample names are saved alongside each panel.

base=/nesi/nobackup/project/project_2/imputation-highCov-allTests

# the config file maps array task IDs (column 1) to sample names (column 2)
config=$base/codes/array-Tiritiri.config
sample=$(awk -v ArrayTaskID=$SLURM_ARRAY_TASK_ID '$1==ArrayTaskID {print $2}' $config)

# set working directory (phased panels and results are written here)
maindir=$base/test_8-ref_thin
cd $maindir

# inputs; subset panels are written next to the full leave-one-out panel
refdir=$base/ref
full_panel=$refdir/ref-${sample}.vcf.gz
target=$base/target/target-${sample}-thin5.vcf.gz
validate=$base/validate/validate-${sample}.vcf.gz

for j in 5 10 15 20
do
    for i in {1..10}
    do
        module purge
        module load Beagle/5.4-22Jul22.46e
        module load SAMtools/1.19-GCC-12.3.0
        module load BCFtools/1.19-GCC-11.3.0

        # files for draw ${i} of panel size ${j}
        keep_list=$refdir/ref_thin${j}-${sample}_${i}.txt
        ref=$refdir/ref_thin${j}-${sample}_${i}.vcf.gz
        refphased=ref_thin${j}-${sample}_${i}_phased.vcf.gz
        imputed=target-${sample}-thin5_beagle5-ref_thin${j}_${i}.vcf.gz

        # draw ${j} sample names at random from the panel and sort them;
        # "shuf -n" selects the subset directly, with no assumption about
        # how many samples the panel holds
        bcftools query -l ${full_panel} | shuf -n ${j} | sort > ${keep_list}

        # subset the reference panel to the drawn samples, then index
        bcftools view -S ${keep_list} \
        ${full_panel} -Oz -o ${ref}
        tabix -f ${ref}

        # phase using Beagle 5, then index
        beagle gt=$ref out=ref_thin${j}-${sample}_${i}_phased nthreads=12
        tabix -f ${refphased}

        # impute (gp=true also writes genotype probabilities), then index
        beagle gt=${target} ref=${refphased} gp=true out=target-${sample}-thin5_beagle5-ref_thin${j}_${i}
        tabix -f ${imputed}

        # evaluate
        module purge
        module load VCFtools/0.1.15-GCC-9.2.0-Perl-5.30.1
        module load SAMtools/1.19-GCC-12.3.0
        module load BCFtools/1.19-GCC-11.3.0

        # genotype concordance
        vcf-compare $validate $imputed > concordance-${sample}-ref_thin${j}_${i}

        # extract dr2
        bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%DR2\t%AF\t%IMP\n' $imputed > target-${sample}-thin5_beagle5-ref_thin${j}_${i}.r2

        # IQS
        # "module load" (not "module spider", which only searches for a
        # module) is required to actually load this Python after the purge
        module purge
        module load Python/3.9.9-gimkl-2020a
        module load cyvcf2
        python3 /home/user/imputation/Compare_imputation_to_WGS.py \
        --ga $target \
        --imputed $imputed \
        --wgs $validate \
        --ref $refphased

    done
done


### SNP densities

Subsample SNPs in target file to simulate lower SNP densities

In [None]:
#!/bin/bash -e

#SBATCH --job-name      target-thin
#SBATCH --mem           2G
#SBATCH --time          5:00:00
#SBATCH --cpus-per-task=12
#SBATCH --account       project
#SBATCH --partition=milan
#SBATCH --output=/nesi/nobackup/project/project_2/imputation-highCov-allTests/target/logs/%x-%j.out

# Purpose: thin the 5%-masked call set step by step to a series of lower SNP
# densities, then split every thinned call set into single-sample target files.

# load modules
module load PLINK/2.00a2.3
module load BCFtools/1.19-GCC-11.3.0
module load SAMtools/1.19-GCC-12.3.0
module load vcflib/1.0.1-GCC-9.2.0

# set working directory
maindir=/nesi/nobackup/project/project_2/imputation-highCov-allTests/target
cd $maindir

# file stem shared by every thinned call set
stem=hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele

# thin_vcf RATE FROM TO
#   Randomly keep sites of ${stem}-thinFROM.vcf.gz at rate RATE, write the
#   result to ${stem}-thinTO.vcf, then bgzip-compress and index it.
#   The uncompressed intermediate .vcf is left in place.
# NOTE(review): vcfrandomsample is given no seed, so the retained sites
# presumably differ between runs.
thin_vcf() {
    local rate=$1 from=$2 to=$3
    vcfrandomsample -r "${rate}" "${stem}-thin${from}.vcf.gz" > "${stem}-thin${to}.vcf"
    bcftools view "${stem}-thin${to}.vcf" -Oz -o "${stem}-thin${to}.vcf.gz"
    tabix -f "${stem}-thin${to}.vcf.gz"
}

# Each step thins the output of the previous one, so the rates multiply
# (the starting thin5 file is at 2.73K SNPs/Mb).
thin_vcf 0.77 5       27        # 2.10K SNPs/Mb
thin_vcf 0.75 27      45        # 1.58K SNPs/Mb
thin_vcf 0.69 45      62        # 1.09K SNPs/Mb
thin_vcf 0.18 62      93        # 197 SNPs/Mb
# The SNP-chip density is derived from thin93: 197 x 0.25 is ~49 SNPs/Mb.
# Thinning thin5 at this rate would instead give ~680 SNPs/Mb.
thin_vcf 0.25 93      Snpchip   # 48.9 SNPs/Mb (SNP-chip)
thin_vcf 0.4  Snpchip Radseq    # 19.6 SNPs/Mb (RADseq)

# extract each sample into its own file, index it and record summary stats
while read s; do
    for sub in 27 45 62 93 Snpchip Radseq
    do
        bcftools view -s ${s} ${stem}-thin${sub}.vcf.gz -Oz -o target-${s}-thin${sub}.vcf.gz
        tabix -f target-${s}-thin${sub}.vcf.gz
        bcftools stats -s "-" target-${s}-thin${sub}.vcf.gz > target-${s}-thin${sub}.stats
    done
done < /nesi/nobackup/project/project_2/imputation-highCov-allTests/codes/ref.list


Run imputation test

In [None]:
#!/bin/bash -e

#SBATCH --job-name      test_7-thin-gp
#SBATCH --mem           40G
#SBATCH --time          15:00:00
#SBATCH --array         1-30
#SBATCH --cpus-per-task 12
#SBATCH --account       project

# Purpose: for one leave-one-out sample (picked by the array task ID), impute
# the lower-density target files against the reference panel already phased
# by the parameter-combination test, and evaluate each imputed call set.
# The parameter loops hold a single value each; add values to test more.

base=/nesi/nobackup/project/project_2/imputation-highCov-allTests

# the config file maps array task IDs (column 1) to sample names (column 2)
config=$base/codes/array.config
sample=$(awk -v ArrayTaskID=$SLURM_ARRAY_TASK_ID '$1==ArrayTaskID {print $2}' $config)

# set working directory (all outputs are written here with relative names)
maindir=$base/test_7-thin-gp
cd $maindir

# validation call set for this sample
validate=$base/validate/validate-${sample}.vcf.gz

for sub in 93 Snpchip Radseq
do
    for window in 60
    do

        for iter in 24
        do

            for impState in 1600
            do

                for impSeg in 6
                do
                    # (re)load the imputation tools; the evaluation steps below purge them
                    module load Beagle/5.4-22Jul22.46e
                    module load SAMtools/1.19-GCC-12.3.0

                    # import files (the phased panel comes from test_4-combination)
                    target=$base/target/target-${sample}-thin${sub}.vcf.gz
                    refphased=$base/test_4-combination/ref-${sample}_phased-w${window}-i${iter}.vcf.gz

                    # file stem shared by all outputs of this run
                    tag=w${window}-i${iter}-st${impState}-sg${impSeg}-gp
                    imputed=target-${sample}-thin${sub}_beagle5-${tag}.vcf.gz

                    # impute (gp=true also writes genotype probabilities), then index
                    beagle gt=${target} ref=${refphased} gp=true window=${window} imp-states=${impState} imp-segment=${impSeg} out=target-${sample}-thin${sub}_beagle5-${tag}
                    tabix -f ${imputed}

                    # evaluate
                    module purge
                    module load VCFtools/0.1.15-GCC-9.2.0-Perl-5.30.1
                    module load SAMtools/1.19-GCC-12.3.0
                    module load BCFtools/1.19-GCC-11.3.0

                    # genotype concordance
                    vcf-compare $validate $imputed > concordance-${sample}-thin${sub}-${tag}

                    # extract dr2
                    bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%DR2\t%AF\t%IMP\n' $imputed > target-${sample}-thin${sub}_beagle5-${tag}.r2

                    # IQS
                    # "module load" (not "module spider", which only searches for
                    # a module) is required to actually load this Python after the purge
                    module purge
                    module load Python/3.9.9-gimkl-2020a
                    module load cyvcf2
                    python3 /home/user/imputation/Compare_imputation_to_WGS.py \
                    --ga $target \
                    --imputed $imputed \
                    --wgs $validate \
                    --ref $refphased
                done
            done
        done
    done
done


### Sporadic missingness

Simulate sporadic missingness

In [None]:
# Purpose: rename the samples of the full call set, then subset it to the
# 69K and 788K SNP sets and report per-sample / per-site missingness.

# set working directory
cd /nesi/nobackup/project/project_2/imputation-wgs-ref-sporadic-highCov

full_vcf=/nesi/nobackup/project/project_2/imputation-input/ref/hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele.vcf.gz
renamed_vcf=hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele-renameSamples.vcf.gz

# rename samples in target
# The reheadered stream is re-written by "bcftools view -Oz" so that the
# result is bgzip-compressed exactly once, whatever compression reheader
# emits for a .vcf.gz input (writing it to a ".vcf" name and running bgzip
# on that risks a double-compressed file).
bcftools reheader --samples ./codes/rename-samples.txt "$full_vcf" \
| bcftools view -Oz -o "$renamed_vcf" -

# downsample file to 69K/788K (keep only the SNP IDs listed in each file)
# 69K
bcftools view --include 'ID==@./codes/target-69k.txt' "$renamed_vcf" -Oz -o target-69K.vcf.gz
plink2 --vcf target-69K.vcf.gz --missing --chr-set 95 --allow-extra-chr --out target-69K

# 788K
bcftools view --include 'ID==@./codes/target-788k.txt' "$renamed_vcf" -Oz -o target-788K.vcf.gz
plink2 --vcf target-788K.vcf.gz --missing --chr-set 95 --allow-extra-chr --out target-788K

## introduce sporadic missingness using R code (example shown for 788K SNPs allowing 50% missingness)
library(pgirmess)
library(dplyr)

# set working directory
setwd("/nesi/nobackup/project/project_2/imputation-wgs-ref-sporadic-highCov")

# fix the random number generator so the masked cells are reproducible
set.seed(1)

# set missingness threshold (proportion of genotype cells to mask)
missingThreshold <- 0.5
missingPc <- missingThreshold * 100

# input / output files
vcf_in  <- "target-788K.vcf"
vcf_out <- paste0("target-788K-sporadic", missingPc, ".vcf")

# The preceding shell step only writes target-788K.vcf.gz, so the file must be
# decompressed first, e.g.: bgzip -dc target-788K.vcf.gz > target-788K.vcf
if (!file.exists(vcf_in)) {
  stop(vcf_in, " not found: decompress target-788K.vcf.gz before running this step")
}

# read the VCF header (leading lines starting with '#'); read.table() below
# skips these lines, so they are kept here and written back to the output
con <- file(vcf_in, "r")
header_lines <- character(0)
repeat {
  line <- readLines(con, n = 1)
  if (length(line) == 0 || !startsWith(line, "#")) break
  header_lines <- c(header_lines, line)
}
close(con)

# read in the VCF records; every field is kept as text so that it is written
# back exactly as it was read
vcf <- read.table(vcf_in, header = FALSE, fill = TRUE, sep = "\t", quote = "",
                  colClasses = "character")

# trim down to just genotypes (columns 1-9 are the fixed VCF columns)
geno <- as.matrix(vcf[, -c(1:9)])

# calculate the number of cells to set as missing (around X%/0.X of all cells)
total_cells <- nrow(geno) * ncol(geno)
num_cells_to_replace <- round(missingThreshold * total_cells)

# randomly select cells and set them as missing; all other cells keep their
# original genotype
cells_to_replace <- sample(total_cells, num_cells_to_replace)
geno[cells_to_replace] <- "./."

# put first few columns back in
final_vcf <- cbind(vcf[, c(1:9)], as.data.frame(geno, stringsAsFactors = FALSE))

# output vcf: original header first, then the records without a column-name row
writeLines(header_lines, vcf_out)
write.table(final_vcf, file = vcf_out, append = TRUE, col.names = FALSE,
            row.names = FALSE, quote = FALSE, sep = "\t")

# bgzip-compress and index each sporadic-missingness VCF
# (the uncompressed .vcf is kept because bgzip -c writes to stdout)
for set in 69K-sporadic40 788K-sporadic50
do
    bgzip -c target-${set}.vcf > target-${set}.vcf.gz
    tabix -f target-${set}.vcf.gz
done

# recompute missingness on the compressed files
for set in 69K-sporadic40 788K-sporadic50
do
    plink2 --vcf target-${set}.vcf.gz --missing --chr-set 95 --allow-extra-chr --out target-${set}
done


Run imputation test. Example shown for 788K SNPs allowing 50% missingness.

In [None]:
#!/bin/bash -e

#SBATCH --job-name      impute-ne-788K-sporadic
#SBATCH --mem           40G
#SBATCH --time          5:00:00
#SBATCH --cpus-per-task=12
#SBATCH --account       project

# Purpose: impute the 788K-SNP target carrying 50% sporadic missingness
# against the phased reference panel, then evaluate the imputed genotypes.

# tools for imputation
module load Beagle/5.4-22Jul22.46e
module load SAMtools/1.19-GCC-12.3.0

# Beagle parameters
window=60
iter=24
impState=1600
impSeg=6

# work inside the sporadic-missingness directory
maindir=/nesi/nobackup/project/project_2/imputation-wgs-ref-sporadic-highCov
cd $maindir

# inputs
target=target-788K-sporadic50-remove25A.vcf.gz
refphased=ref-noMissing_phased-w${window}-i${iter}-remove25A.vcf.gz
validate=hihi_wgs_filter_highcov-autosomes-snpID-noMissing-noSingleton-2allele-renameSamples.vcf.gz

# outputs: one stem is shared by the imputed VCF, the concordance report
# (written to the bare stem) and the .r2 table
out_stem=target-788K-sporadic50-remove25A_beagle5-w${window}-i${iter}-st${impState}-sg${impSeg}
imputed=${out_stem}.vcf.gz

# --- impute -----------------------------------------------------------------
beagle gt=$target ref=$refphased \
impute=true gp=false window=${window} imp-states=${impState} imp-segment=${impSeg} nthreads=12 \
out=${out_stem}
tabix -f ${imputed}

# --- evaluate ---------------------------------------------------------------
module purge
module load VCFtools/0.1.15-GCC-9.2.0-Perl-5.30.1
module load SAMtools/1.19-GCC-12.3.0
module load BCFtools/1.19-GCC-11.3.0

# genotype concordance between validation and imputed calls
vcf-compare $validate $imputed > ${out_stem}

# per-site table including Beagle's DR2, AF and IMP annotations
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%DR2\t%AF\t%IMP\n' $imputed > ${out_stem}.r2

# IQS
module purge
module load Python/3.6.3-gimkl-2017a
module load cyvcf2
python3 /home/user/imputation/Compare_imputation_to_WGS.py \
--ga $target \
--imputed $imputed \
--wgs $validate \
--ref $refphased
