# Preprocess genotype data

In [7]:
import vcf
import csv
import os

# Convert every uncompressed VCF file in `vcf_dir` into a CSV genotype matrix
# written to `csv_dir`: one row per variant record, a leading SNP_ID column,
# then one column per sample in the order given by the VCF header.

# Directory containing VCF files
vcf_dir = '/Users/itscclemon/Desktop/CSE 284/'

# Output directory for CSV files
csv_dir = '/Users/itscclemon/Desktop/CSE 284/'

# Create the output directory if it doesn't exist. exist_ok=True replaces the
# separate existence check, which could race with another process creating it.
os.makedirs(csv_dir, exist_ok=True)

# Get the VCF files in the directory. os.listdir returns entries in arbitrary
# order, so sort them to make the processing order reproducible between runs.
vcf_files = sorted(f for f in os.listdir(vcf_dir) if f.endswith('.vcf'))

for vcf_file in vcf_files:
    vcf_path = os.path.join(vcf_dir, vcf_file)
    # Output name: <input stem>_snp_individual_ids.csv
    csv_file = os.path.join(csv_dir, os.path.splitext(vcf_file)[0] + '_snp_individual_ids.csv')

    with open(vcf_path, 'r') as vcf_input:
        vcf_reader = vcf.Reader(vcf_input)

        # Extract individual IDs from the VCF header
        individual_ids = vcf_reader.samples

        # The CSV is opened only after the reader has been constructed, so a
        # failure while creating the reader does not leave an empty CSV behind.
        with open(csv_file, 'w', newline='') as csv_output:
            csv_writer = csv.writer(csv_output)

            # Write header row with SNP ID and individual IDs
            header_row = ['SNP_ID'] + individual_ids
            csv_writer.writerow(header_row)

            # Iterate over each record in the VCF file
            for record in vcf_reader:
                # Records without an ID are written with the VCF missing marker '.'
                snp_id = record.ID if record.ID else '.'

                # record.samples holds one call per sample in header order, so
                # iterating it directly avoids a by-name lookup per individual.
                # NOTE(review): PyVCF appears to return None from gt_bases for
                # uncalled genotypes, which csv.writer emits as an empty cell.
                genotype_data = [call.gt_bases for call in record.samples]

                # Write SNP ID and genotype data to CSV row
                csv_writer.writerow([snp_id] + genotype_data)

# Only claim success when at least one input file was actually converted.
if vcf_files:
    print("CSV files have been generated successfully.")
else:
    print(f"No .vcf files found in {vcf_dir!r}; no CSV files were generated.")


CSV files have been generated successfully.
