In [1]:
from collections import OrderedDict
import sys

import numpy as np

import cyvcf2

print(f"cyvcf2 {cyvcf2.__version__}")

cyvcf2 0.30.14


In [25]:
def compute_maf_from_vcf(vcf_file,
                         categorize = False,
                         verbose = False):
    num_singletons = 0
    
    maf_dict = OrderedDict()
    
    for variant in cyvcf2.VCF(vcf_file):
        # Assume that there are only biallelic sites.
        num_genotypes_00 = 0
        num_genotypes_01 = 0 # Includes genotype 10
        num_genotypes_11 = 0
        num_alleles_0 = 0
        num_alleles_1 = 0
        num_alleles_2 = 0
        
        for genotype in variant.genotypes:
            # Tally up genotypes.
            if   genotype[0] == 0 and genotype[0] == 0:
                num_genotypes_00 += 1
            elif genotype[0] == 1 and genotype[1] == 1:
                num_genotypes_11 += 1
            else:
                num_genotypes_01 += 1 # Or 10
                
            # Tally up alleles in genotype.
            if   genotype[0] == 0:
                num_alleles_0 += 1
            elif genotype[0] == 1:
                num_alleles_1 += 1
            elif genotype[0] == 2:
                num_alleles_2 += 1
                if verbose:
                    print(f"WARNING: Multi-alleic site at {variant.start} " +\
                          f"with genotype {genotype}.")
            else:
                if verbose:
                    print(f"WARNING: Allele value in {genotype}" +\
                          f"at {variant.start} is not recognized.")
                
            # Tally up alleles in genotype.
            if   genotype[1] == 0:
                num_alleles_0 += 1
            elif genotype[1] == 1:
                num_alleles_1 += 1
            elif genotype[1] == 2:
                num_alleles_2 += 1
                if verbose:
                    print(f"WARNING:" + " " +\
                          f"Multi-alleic site at {variant.start}" + " " +\
                          f"with genotype {genotype}.")
            else:
                if verbose:
                    print(f"WARNING:" + " " +\
                          f"Allele value in {genotype}" + " " +\
                          f"at {variant.start} is not recognized.")
                
        #assert min(num_alleles_0, num_alleles_1) > num_alleles_2,\
        #    "Allele 2 occurs at higher frequency than alleles 0 and 1."
        
        num_alleles_total = num_alleles_0 + num_alleles_1 + num_alleles_2
        # MAF is defined as the frequency of the SECOND most common allele.
        minor_allele = 0 if num_alleles_0 < num_alleles_1 else 1
        maf = float(min(num_alleles_0, num_alleles_1)) / float(num_alleles_total)
        
        # Key: Site position
        # Val: (MAF or MAF category, minor allele,)
        if categorize:
            #if maf < 0.0000002:
            #    maf_dict[variant.start] = ('(0.00000%, 0.00002%)', minor_allele)
            #elif maf >= 0.0000002 and maf < 0.0000005:
            #    maf_dict[variant.start] = ('[0.00002%, 0.00005%)', minor_allele)
            #elif maf >= 0.0000005 and maf < 0.000001:
            #    maf_dict[variant.start] = ('[0.00005%,  0.0001%)', minor_allele)
            #elif maf >= 0.000001 and maf < 0.000002:
            #    maf_dict[variant.start] = ('[ 0.0001%,  0.0002%)', minor_allele)
            #elif maf >= 0.000002 and maf < 0.000005:
            #    maf_dict[variant.start] = ('[ 0.0002%,  0.0005%)', minor_allele)
            #elif maf >= 0.000005 and maf < 0.00001:
            #    maf_dict[variant.start] = ('[ 0.0005%,  0.0010%)', minor_allele)
            #elif maf >= 0.00001 and maf < 0.00002:
            #    maf_dict[variant.start] = ('[ 0.0010%,  0.0020%)', minor_allele)
            #elif maf >= 0.00002 and maf < 0.00005:
            #    maf_dict[variant.start] = ('[ 0.0020%,  0.0050%)', minor_allele)
            #elif maf >= 0.00005 and maf < 0.00010:
            #    maf_dict[variant.start] = ('[ 0.0050%,  0.0100%)', minor_allele)
            if maf < 0.00010:
                maf_dict[variant.start] = ('( 0.0000%,  0.0100%)', minor_allele)
            elif maf >= 0.00010 and maf < 0.00100:
                maf_dict[variant.start] = ('[ 0.0100%,  0.1000%)', minor_allele)
            elif maf >= 0.00100 and maf < 0.00200:
                maf_dict[variant.start] = ('[ 0.1000%,  0.2000%)', minor_allele)
            elif maf >= 0.00200 and maf < 0.00300:
                maf_dict[variant.start] = ('[ 0.2000%,  0.3000%)', minor_allele)
            elif maf >= 0.00300 and maf < 0.00400:
                maf_dict[variant.start] = ('[ 0.3000%,  0.4000%)', minor_allele)
            elif maf >= 0.00400 and maf < 0.00500:
                maf_dict[variant.start] = ('[ 0.4000%,  0.5000%)', minor_allele)
            elif maf >= 0.00500 and maf < 0.01000:
                maf_dict[variant.start] = ('[ 0.5000%,  1.0000%)', minor_allele)
            elif maf >= 0.01000 and maf < 0.02000:
                maf_dict[variant.start] = ('[ 1.0000%,  2.0000%)', minor_allele)
            elif maf >= 0.02000 and maf < 0.05000:
                maf_dict[variant.start] = ('[ 2.0000%,  5.0000%)', minor_allele)
            elif maf >= 0.05000 and maf < 0.10000:
                maf_dict[variant.start] = ('[ 5.0000%, 10.0000%)', minor_allele)
            elif maf >= 0.10000 and maf < 0.20000:
                maf_dict[variant.start] = ('[10.0000%, 20.0000%)', minor_allele)
            elif maf >= 0.20000 and maf < 0.30000:
                maf_dict[variant.start] = ('[20.0000%, 30.0000%)', minor_allele)
            elif maf >= 0.30000 and maf < 0.40000:
                maf_dict[variant.start] = ('[30.0000%, 40.0000%)', minor_allele)
            elif maf >= 0.40000 and maf <= 0.50000:
                maf_dict[variant.start] = ('[40.0000%, 50.0000%]', minor_allele)
            else:
                print(f"MAF value {maf} is out of recognized range.")
        else:
            maf_dict[variant.start] = (maf, minor_allele)
        
        if verbose:
            print(f"MAF is {round(maf, 4)}"\
                  f" at {variant.start}"\
                  f" with {variant.aaf}"\
                  f" in {maf_dict[variant.start]}")
        
        if min(num_alleles_0, num_alleles_1) == 1:
            num_singletons += 1
            if verbose:
                print(f"Singleton at position {variant.start}")
                
    if verbose:
        print(f"Number of singletons in vcf file {i} is {num_singletons}")
    
    return(maf_dict)

In [26]:
def parse_vcf_file(vcf_file):
    """
    If gt_types = False, then 0=HOM_REF, 1=HET, 2=UNKNOWN, the coordinates are 0-based.
    It returns a list of dictionaries, each containing a VCF record.
    """
    parsed_vcf = []
    
    for variant in cyvcf2.VCF(vcf_file,
                              gts012 = False, # 0=HOM_REF, 1=HET, 2=UNKNOWN, 3=HOM_ALT
                              strict_gt = True):
        record = {
            'ref': variant.REF,
            'alt': variant.ALT,
            'ctg': variant.CHROM, # Contig id/name
            'pos': int(variant.start),
            'aa' : variant.INFO.get('AA'), # Ancestral allele
            'gt' : variant.genotypes
        }
        
        parsed_vcf.append(record)
        
    return(parsed_vcf)


def compare_vcf(vcf_1, vcf_2):
    assert len(vcf_1) == len(vcf_2),\
        "vcf_1 and vcf_2 have different number of records."
    
    for i in range(len(vcf_1)):
        is_valid_ref = vcf_1[i].get('ref') == vcf_2[i].get('ref')
        #is_valid_alt = vcf_1[i].get('alt') == vcf_2[i].get('alt')
        is_valid_ctg = vcf_1[i].get('ctg') == vcf_2[i].get('ctg')
        is_valid_pos = vcf_1[i].get('pos') == vcf_2[i].get('pos')
        #is_valid_aa  = vcf_1[i].get('aa' ) == vcf_2[i].get('aa' )
        is_all_valid = np.all([is_valid_ref,
                               #is_valid_alt,
                               is_valid_ctg,
                               is_valid_pos
                               #is_valid_aa
                              ])
        
        if not is_all_valid:
            pos_1 = vcf_1[i].get('pos')
            pos_2 = vcf_2[i].get('pos')
            print(f"{is_valid_ref} {is_valid_alt} {is_valid_ctg} {is_valid_pos}")
            print(f"{vcf_1[i].get('alt')} {vcf_2[i].get('alt')}")
            print(f"Incomparable records at {pos_1} and {pos_2}.")
            return(False)
        
    return(True)


def get_common_positions_in_vcf(vcf_1, vcf_2):
    pos_1 = []
    pos_2 = []
    for i, record in enumerate(vcf_1):
        pos_1.append(record.get('pos'))
    for i, record in enumerate(vcf_2):
        pos_2.append(record.get('pos'))
        
    # All positions should be unique.
    assert len(pos_1) == len(set(pos_1)),\
        "The positions in vcf_1 are not all unique."
    assert len(pos_2) == len(set(pos_2)),\
        "The positions in vcf_2 are not all unique."
    
    common_pos = list(set.intersection(set(pos_1), set(pos_2)))
    
    return(common_pos)


def compare_variants(true_vcf_file,
                     miss_vcf_file,
                     imputed_vcf_file,
                     maf_dict,
                     ploidy_level,
                     verbose = False):
    assert ploidy_level == 1 or ploidy_level == 2,\
        f"ploidy_level {ploidy_level} is not recognized."
    
    if ploidy_level == 1:
        MISSING_GENOTYPE_CONSTANT = [-1, False]
    else:
        MISSING_GENOTYPE_CONSTANT = [-1, -1, True]
    
    true_vcf    = parse_vcf_file(true_vcf_file)
    miss_vcf    = parse_vcf_file(miss_vcf_file)
    imputed_vcf = parse_vcf_file(imputed_vcf_file)
    
    new_true_vcf, new_miss_vcf, new_imputed_vcf = filter_non_biallelic_sites(true_vcf, miss_vcf, imputed_vcf)
    
    # Reencode genotype from 0|0, 0|1, 1|0, and 1|1 to 1, 2, 2, and 3, respectively.
    reencode_map = {
        (0, 0, True) : 1, # AA
        (0, 1, True) : 2, # AB, treated as equal to BA
        (1, 0, True) : 2, # BA
        (1, 1, True) : 3, # BB
        (0, False)   : 1, # A
        (1, False)   : 2, # B
    }
    
    # Imputed VCF file must have at most the number of positions as the true/miss VCF files.
    nbr_gt_missing_all = 0 # Number of genotypes imputed, correctly or not.
    nbr_gt_correct_all = 0 # Number of instances of genotypes correctly imputed.
    
    # Key   : MAF category
    # Value : (number of total, number of correctly imputed, percent correctly imputed)
    maf_categories = OrderedDict()
    #maf_categories['(0.00000%, 0.00002%)'] = [0.0, 0.0, 0.0]
    #maf_categories['[0.00002%, 0.00005%)'] = [0.0, 0.0, 0.0]
    #maf_categories['[0.00005%,  0.0001%)'] = [0.0, 0.0, 0.0]
    #maf_categories['[ 0.0001%,  0.0002%)'] = [0.0, 0.0, 0.0]
    #maf_categories['[ 0.0002%,  0.0005%)'] = [0.0, 0.0, 0.0]
    #maf_categories['[ 0.0005%,  0.0010%)'] = [0.0, 0.0, 0.0]
    #maf_categories['[ 0.0010%,  0.0020%)'] = [0.0, 0.0, 0.0]
    #maf_categories['[ 0.0020%,  0.0050%)'] = [0.0, 0.0, 0.0]
    #maf_categories['[ 0.0050%,  0.0100%)'] = [0.0, 0.0, 0.0]
    maf_categories['( 0.0000%,  0.0100%)'] = [0.0, 0.0, 0.0]
    maf_categories['[ 0.0100%,  0.1000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[ 0.1000%,  0.2000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[ 0.2000%,  0.3000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[ 0.3000%,  0.4000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[ 0.4000%,  0.5000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[ 0.5000%,  1.0000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[ 1.0000%,  2.0000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[ 2.0000%,  5.0000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[ 5.0000%, 10.0000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[10.0000%, 20.0000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[20.0000%, 30.0000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[30.0000%, 40.0000%)'] = [0.0, 0.0, 0.0]
    maf_categories['[40.0000%, 50.0000%]'] = [0.0, 0.0, 0.0]
    
    for i in range(len(new_imputed_vcf)):
        position = new_imputed_vcf[i]['pos']
        
        if position not in maf_dict:
            continue
            
        maf, minor_allele = maf_dict[position]
        
        missing_bool   = [x == MISSING_GENOTYPE_CONSTANT for x in new_miss_vcf[i]['gt']]
        true_gt_oi     = [x for x, y in zip(new_true_vcf[i]['gt'],    missing_bool) if y]
        imputed_gt_oi  = [x for x, y in zip(new_imputed_vcf[i]['gt'], missing_bool) if y]
        
        assert len(true_gt_oi) == len(imputed_gt_oi),\
            f"true_gt_oi {len(true_gt_oi)}" + " and " +\
            f"imputed_gt_oi {len(imputed_gt_oi)}" + " " +\
            f"are not of the same length."
        
        true_gt_oi_minor    = [x
                               for x, y
                               in zip(true_gt_oi, imputed_gt_oi)
                               if x == [minor_allele, False]
                              ]
        imputed_gt_oi_minor = [y
                               for x, y
                               in zip(true_gt_oi, imputed_gt_oi)
                               if x == [minor_allele, False]
                              ]
        
        nbr_gt_missing = len(imputed_gt_oi_minor)
        if nbr_gt_missing == 0:
            continue
            
        true_gt_reencoded    = [reencode_map[tuple(x)] for x in true_gt_oi_minor]
        imputed_gt_reencoded = [reencode_map[tuple(x)] for x in imputed_gt_oi_minor]
        
        nbr_gt_correct       = np.count_nonzero([x == y
                                                 for x, y
                                                 in zip(true_gt_reencoded, imputed_gt_reencoded)])
        
        if verbose:
            if nbr_gt_missing != nbr_gt_correct:
                print(f"{nbr_gt_missing},\
                        {nbr_gt_correct},\
                        {round(float(nbr_gt_correct) / float(nbr_gt_missing), 6)},\
                        {position},\
                        {maf[position]}")
                
        maf_categories[maf][0] += nbr_gt_missing
        maf_categories[maf][1] += nbr_gt_correct
        
        nbr_gt_missing_all += nbr_gt_missing # Update overall tally.
        nbr_gt_correct_all += nbr_gt_correct # Update overall tally.
        
    for i, category in enumerate(maf_categories):
        if float(maf_categories[category][0]) == 0:
            concordance_rate = -1
        else:
            concordance_rate = round(maf_categories[category][1] / maf_categories[category][0], 6)
        maf_categories[category][2] = concordance_rate
        
    overall_concordance_rate = float(nbr_gt_correct_all) / float(nbr_gt_missing_all)
    
    return((nbr_gt_missing_all, nbr_gt_correct_all, overall_concordance_rate, maf_categories))

In [27]:
def filter_non_biallelic_sites(vcf_1, vcf_2, vcf_3):
    assert compare_vcf(vcf_1, vcf_2),\
        "vcf_1 and vcf_2 are not comparable."
    assert compare_vcf(vcf_2, vcf_3),\
        "vcf_2 and vcf_3 are not comparable."
    
    new_vcf_1 = []
    new_vcf_2 = []
    new_vcf_3 = []
    
    for i in range(len(vcf_1)):
        is_biallelic_1 = len(set(vcf_1[i]['alt']) - {'.'}) == 1
        is_biallelic_2 = len(set(vcf_2[i]['alt']) - {'.'}) == 1
        is_biallelic_3 = len(set(vcf_3[i]['alt']) - {'.'}) == 1
        
        if is_biallelic_1 and is_biallelic_2 and is_biallelic_3:
            new_vcf_1.append(vcf_1[i])
            new_vcf_2.append(vcf_2[i])
            new_vcf_3.append(vcf_3[i])
            
    assert  len(new_vcf_1) == len(new_vcf_2)\
        and len(new_vcf_1) == len(new_vcf_3),\
        "The number of site positions in " +\
        "new_vcf_1, new_vcf_2, and new_vcf_3" +\
        "are different."
        
    return(new_vcf_1, new_vcf_2, new_vcf_3)

In [28]:
base_dir = "../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/"

In [34]:
maf_dict = []
for i in np.arange(10):
    vcf_file = base_dir + "ref/ref." + str(i) + ".vcf"
    print(f"{vcf_file}")
    maf_dict.append(compute_maf_from_vcf(vcf_file,
                                         categorize = True,
                                         verbose = False))

../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.0.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.1.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.2.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.3.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.4.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.5.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.6.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.7.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.8.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.9.vcf


In [38]:
method = "tsonly"
imputed_dir = "imputed" + "_" + method + "/"

results = []

for i in range(10):
    true_vcf_file    = base_dir + "true/" + "true."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss."  + str(i) + ".vcf"
    imputed_vcf_file = base_dir + imputed_dir + "imputed." + str(i) + ".vcf"
    print(f"{imputed_vcf_file}")
    
    stats = compare_variants(true_vcf_file,
                             miss_vcf_file,
                             imputed_vcf_file,
                             maf_dict = maf_dict[i],
                             ploidy_level = 1,
                             verbose = False)
    
    results.append(stats)

../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.0.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.1.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.2.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.3.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.4.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.5.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.6.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.7.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.8.vcf
../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_tsonly/imputed.9.vcf


In [39]:
for i, result in enumerate(results):
    print("\n".join([';'.join([str(i), method, k, str(v[0]), str(v[1]), str(v[2])])
                     for k, v
                     in result[3].items()]))

0;tsonly;( 0.0000%,  0.0100%);86.0;76.0;0.883721
0;tsonly;[ 0.0100%,  0.1000%);601.0;599.0;0.996672
0;tsonly;[ 0.1000%,  0.2000%);545.0;545.0;1.0
0;tsonly;[ 0.2000%,  0.3000%);338.0;337.0;0.997041
0;tsonly;[ 0.3000%,  0.4000%);434.0;434.0;1.0
0;tsonly;[ 0.4000%,  0.5000%);229.0;229.0;1.0
0;tsonly;[ 0.5000%,  1.0000%);977.0;977.0;1.0
0;tsonly;[ 1.0000%,  2.0000%);473.0;473.0;1.0
0;tsonly;[ 2.0000%,  5.0000%);133.0;133.0;1.0
0;tsonly;[ 5.0000%, 10.0000%);247.0;247.0;1.0
0;tsonly;[10.0000%, 20.0000%);577.0;577.0;1.0
0;tsonly;[20.0000%, 30.0000%);520.0;520.0;1.0
0;tsonly;[30.0000%, 40.0000%);628.0;628.0;1.0
0;tsonly;[40.0000%, 50.0000%];4203.0;4203.0;1.0
1;tsonly;( 0.0000%,  0.0100%);102.0;94.0;0.921569
1;tsonly;[ 0.0100%,  0.1000%);549.0;549.0;1.0
1;tsonly;[ 0.1000%,  0.2000%);461.0;461.0;1.0
1;tsonly;[ 0.2000%,  0.3000%);487.0;487.0;1.0
1;tsonly;[ 0.3000%,  0.4000%);146.0;146.0;1.0
1;tsonly;[ 0.4000%,  0.5000%);197.0;197.0;1.0
1;tsonly;[ 0.5000%,  1.0000%);330.0;329.0;0.99697
1;tsonly;[ 