In [1]:
import pandas as pd
import os
from Bio import SeqIO
import zarr

In [2]:
from sgkit.io.vcf import vcf_to_zarr

In [3]:
wgs_sample_manifest = pd.read_table('/QRISdata/Q6151/dennistpw/far_hin_1.x/production/sample_manifest.txt')
ref_path = '/QRISdata/Q6151/dennistpw/far_hin_1.x/data/reference/VectorBase-54_AfarautiFAR1_Genome.fasta'

In [26]:
files = wgs_sample_manifest.gvcf_path + '/' + wgs_sample_manifest.gvcf
extant_files = []
for file_path in files:
    if os.path.exists(file_path):
        extant_files.append(file_path)
    else:
        print(f"{file_path} does not exist.")

/QRISdata/Q6151/6_gvcf/1_An_farauti/1_gvcf/BC123.g.vcf.gz does not exist.


TypeError: stat: path should be string, bytes, os.PathLike or integer, not float

In [27]:
# check indices and write files that look ok
file_path = "/QRISdata/Q6151/dennistpw/far_hin_1.x/work/gvcf_list_forgenotyping.txt"

needs_indexing = []
with open(file_path, "w") as file:
    for file_path in files:
        if os.path.exists(file_path+'.tbi'): 
            file.write(str(file_path) + "\n")
        else:
            print(f"{file_path}.tbi does not exist.")
            needs_indexing.append(file_path)

/QRISdata/Q6151/6_gvcf/1_An_farauti/1_gvcf/BC123.g.vcf.gz.tbi does not exist.


TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [4]:
# now let's get a list of the contigs that we are going to call over
contig_lengths = {}
for record in SeqIO.parse(ref_path, "fasta"):
    seq_id = record.id
    seq_length = len(record.seq)
    contig_lengths[seq_id] = seq_length



In [7]:
# how many contigs are there
print(f"there are {sum(contig_lengths.values())} bp in  {len(contig_lengths)} contigs")
# let's prune this

there are 183103254 bp in  310 contigs


In [6]:
# Get contigs over eg 1e5 bp
filtered_contigs = {k: v for k, v in sorted(contig_lengths.items(), key=lambda item: item[1], reverse=True) if v > 100000}
print(f"there are {sum(filtered_contigs.values())} bp in  {len(filtered_contigs)} filtered contigs")
# we lose 3 million bp for a fairly substantial loss of small contigs - let's call on these larger contigs only

there are 180662415 bp in  46 filtered contigs


In [8]:
filtered_contigs

{'KI915040': 30017264,
 'KI915041': 22738934,
 'KI915042': 16089237,
 'KI915043': 15719670,
 'KI915044': 12895223,
 'KI915045': 12084465,
 'KI915046': 7675037,
 'KI915047': 6913162,
 'KI915048': 6651167,
 'KI915049': 6084899,
 'KI915050': 6025392,
 'KI915051': 3202847,
 'KI915052': 3111553,
 'KI915053': 2944040,
 'KI915054': 2584459,
 'KI915055': 2376283,
 'KI915056': 1893860,
 'KI915057': 1856345,
 'KI915058': 1744624,
 'KI915059': 1738253,
 'KI915060': 1664982,
 'KI915061': 1567260,
 'KI915062': 1435676,
 'KI915063': 1423879,
 'KI915064': 1392011,
 'KI915065': 1359364,
 'KI915066': 1336591,
 'KI915067': 1024229,
 'KI915068': 850718,
 'KI915069': 738023,
 'KI915070': 576774,
 'KI915071': 445017,
 'KI915072': 338598,
 'KI915073': 300166,
 'KI915074': 274521,
 'KI915075': 267210,
 'KI915076': 177854,
 'KI915077': 160886,
 'KI915078': 148821,
 'KI915079': 142905,
 'AXCN02002502': 132425,
 'KI915080': 116697,
 'KI915081': 115073,
 'KI915082': 111152,
 'KI915083': 107467,
 'KI915084': 1074

In [None]:
def create_bed_intervals(contigs_dict, num_intervals, output_file):
    """Create evenly distributed BED intervals from a dictionary of contigs."""
    # Calculate total genome size and bases per interval
    total_size = sum(contigs_dict.values())
    bases_per_interval = total_size / num_intervals
    
    bed_data = []
    remaining_intervals = num_intervals
    
    # Distribute intervals across contigs
    for contig, length in contigs_dict.items():
        # Calculate number of intervals for this contig
        contig_intervals = max(1, round(length / bases_per_interval))
        contig_intervals = min(contig_intervals, remaining_intervals)
        
        if contig_intervals <= 0:
            continue
            
        remaining_intervals -= contig_intervals
        interval_size = length / contig_intervals
        
        # Create intervals
        for i in range(contig_intervals):
            start = int(i * interval_size)  # 0-based for BED
            end = int((i + 1) * interval_size)
            
            # Ensure last interval covers to end of contig
            if i == contig_intervals - 1:
                end = length
                
            name = f"{contig}:{start+1}-{end}"  # 1-based in name for readability
            bed_data.append([contig, start, end, name])
            
        if remaining_intervals <= 0:
            break
    
    # Create DataFrame and write to BED file
    df = pd.DataFrame(bed_data, columns=['chrom', 'start', 'end', 'name'])
    df.to_csv(output_file, sep='\t', header=False, index=False)
    
    return len(bed_data)

In [None]:
create_bed_intervals(filtered_contigs, 50, "/QRISdata/Q6151/dennistpw/far_hin_1.x/work/contigs_over_1e5.bed")

In [None]:
# More simply, we can output a list of contigs and use that as an input to a single genotypegvcfs call
file_path = "/QRISdata/Q6151/dennistpw/far_hin_1.x/work/contigs_to_call.txt"

needs_indexing = []
with open(file_path, "w") as file:
    for c in filtered_contigs.keys():
            file.write(str(c) + "\n")


In [23]:
# Let's have a look at zarr conversion now


# Load vcf names and drop na
vcfs_for_zarr = wgs_sample_manifest['genotyped_vcf_name'].dropna()



inputs = [f'/QRISdata/Q6151/dennistpw/far_hin_1.x/production/genotyped_vcf/{vcfs_for_zarr[70]}',f'/QRISdata/Q6151/dennistpw/far_hin_1.x/production/genotyped_vcf/{vcfs_for_zarr[50]}']


['/QRISdata/Q6151/dennistpw/far_hin_1.x/production/genotyped_vcf/far_nNG_LR-176BO2.vcf.gz',
 '/QRISdata/Q6151/dennistpw/far_hin_1.x/production/genotyped_vcf/far_Man-2.vcf.gz']

In [44]:
vcf

'far_nNG_LR-176BO2.vcf.gz'

In [45]:
%%time
from sgkit.io.vcf import partition_into_regions, vcf_to_zarr

vcf = f'/QRISdata/Q6151/dennistpw/far_hin_1.x/production/genotyped_vcf/{vcfs_for_zarr[70]}'

regions = partition_into_regions(vcf, num_parts=10)
#vcf_to_zarr(regions, "/QRISdata/Q6151/dennistpw/far_hin_1.x/production/output.zarr")

CPU times: user 249 ms, sys: 68.8 ms, total: 318 ms
Wall time: 257 ms


In [10]:
t = zarr.open('/scratch/user/uqtdenni/zarr_by_contig/KI915066.zarr')
x = t['call_AD'][:]
x.shape

(4009773, 1, 4)

In [None]:
vcf = f'/QRISdata/Q6151/dennistpw/far_hin_1.x/production/genotyped_vcf/{vcfs_for_zarr[70]}'


In [14]:
import sgkit as sg
import allel

In [6]:
test = sg.load_dataset('/QRISdata/Q6151/dennistpw/far_hin_1.x/production/tests/test_output.zarr')

In [10]:
t = test.call_genotype.compute()

In [32]:
%time
from sgkit.io.vcf import partition_into_regions, vcf_to_zarr
vcf = "/QRISdata/Q6151/dennistpw/far_hin_1.x/production/tests/test.vcf.gz"

regions = partition_into_regions(vcf, num_parts=10)

#vcf_to_zarr(vcf, "/QRISdata/Q6151/dennistpw/far_hin_1.x/production/tests/test_output.zarr", regions = regions, fields = ['FORMAT/GQ', 'FORMAT/AD', 'FORMAT/GT','INFO/MQ'])


CPU times: user 2 μs, sys: 1 μs, total: 3 μs
Wall time: 6.44 μs


In [34]:
%%time
vcf_to_zarr(vcf, "/QRISdata/Q6151/dennistpw/far_hin_1.x/production/tests/test_output.zarr", regions = regions, fields = ['FORMAT/GQ', 'FORMAT/AD', 'FORMAT/GT','INFO/MQ'])


CPU times: user 23.9 s, sys: 8.68 s, total: 32.6 s
Wall time: 27.6 s
