In [9]:
import vcf # type: ignore
import random
def snp_vcf_to_genepop_dna(vcf_file, output_file):
    """Convert a SNP VCF file to GenePop format, encoding bases as 3-digit allele codes.

    The VCF is read once with ``vcf.Reader``. Every record becomes one locus named
    ``SNP_<POS>``; if that name is already taken, a deterministic ``_2``, ``_3``, ...
    suffix is appended so that repeated runs produce identical output. Each
    genotype allele index is resolved to its REF/ALT base and written as
    ``001`` (A), ``002`` (C), ``003`` (G) or ``004`` (T). Anything that cannot be
    resolved to one of those bases (missing call, ALT of ``.``, non-SNP allele)
    is written as ``000``.

    Samples are grouped into GenePop ``POP`` sections by the part of the sample
    name before the first underscore, in order of first appearance.

    Args:
        vcf_file: Path of the input VCF, passed to ``vcf.Reader(filename=...)``.
        output_file: Path of the GenePop file to write; opened in ``'w'`` mode.

    Returns:
        None. The result is written to ``output_file``.
    """
    # Mapping DNA bases to numbers for GenePop
    dna_to_number = {'A': '001', 'C': '002', 'G': '003', 'T': '004'}
    missing_allele = '000'

    locus_names = []
    unique_ids = set()
    # sample name -> one genotype string per locus, in record order
    samples_genotypes = {}

    # Single pass over the VCF: collect locus names and genotypes together
    # instead of parsing the whole file twice.
    for record in vcf.Reader(filename=vcf_file):
        base_id = f"SNP_{record.POS}"
        locus_id = base_id
        occurrence = 1
        while locus_id in unique_ids:
            # Deterministic disambiguation keeps the output reproducible.
            occurrence += 1
            locus_id = f"{base_id}_{occurrence}"
        unique_ids.add(locus_id)
        locus_names.append(locus_id)

        # Index 0 is REF, 1 and above are ALT alleles. An ALT entry may be None
        # (ALT column '.') or an object without a plain sequence; keep its slot
        # so indices stay aligned, but store None so it encodes as missing.
        alleles_map = [record.REF]
        for alt in (record.ALT or []):
            alleles_map.append(getattr(alt, 'sequence', None))

        for sample in record.samples:
            genotypes = samples_genotypes.setdefault(sample.sample, [])
            gt_alleles = sample.gt_alleles
            if not gt_alleles:
                # Default for missing data (two alleles missing)
                genotypes.append(missing_allele * 2)
                continue

            codes = []
            for allele in gt_alleles:
                base = None
                # Skip missing ('.'/None) or otherwise non-numeric allele indices.
                if allele is not None and str(allele).isdigit():
                    index = int(allele)
                    if index < len(alleles_map):
                        base = alleles_map[index]
                if isinstance(base, str):
                    # Upper-case first so lowercase bases are still recognised.
                    codes.append(dna_to_number.get(base.upper(), missing_allele))
                else:
                    codes.append(missing_allele)
            genotypes.append(''.join(codes))

    # Organize samples by prefix (text before the first underscore)
    prefix_groups = {}
    for sample, genotypes in samples_genotypes.items():
        prefix_groups.setdefault(sample.split('_')[0], []).append((sample, genotypes))

    with open(output_file, 'w') as genepop:
        # Title line, then one locus name per line
        genepop.write("SNPvcf2genepop\n")
        genepop.write("\n".join(locus_names) + "\n")

        # Write genotypes with a POP label for each group
        for samples in prefix_groups.values():
            genepop.write("POP\n")
            for sample, genotypes in samples:
                genepop.write(f"{sample} ,  " + " ".join(genotypes) + "\n")


In [10]:
# Example usage (this call runs when the cell executes): read vcf/bass_snp_clean.vcf
# and write the GenePop output to SNP.filted.gen.
snp_vcf_to_genepop_dna("vcf/bass_snp_clean.vcf", "SNP.filted.gen")