In [1]:
# Import libs and setup environment

import allel
import seaborn as sns
import zarr
import xarray as xr
import plotly.express as px
import dask.array as da
from collections import Counter
import numpy as np
import json
import hashlib
import numba
import pandas as pd
import matplotlib.pyplot as plt

from Bio import SeqIO
from pathlib import Path

# Global configuration shared by the cells below: zarr path template, contig inventory,
# sample metadata and the QC mask.

# Path template for the per-contig zarr stores. The doubled braces make the f-string emit a
# literal "{contig}" placeholder, which is filled in later with .format(contig=...).
zarr_base_path = f"/scratch/user/uqtdenni/afar_production_bunya/curation/uq-beebe-001/combined_zarr/{{contig}}.zarr"

# Reference genome FASTA, used to enumerate the contigs we are going to call over.
ref_path = '/scratch/user/uqtdenni/afar_production_bunya/reference/VectorBase-54_AfarautiFAR1_Genome.fasta'

# Map every contig id in the reference to its sequence length.
contig_lengths = {}
for record in SeqIO.parse(ref_path, "fasta"):
    seq_id, seq_length = record.id, len(record.seq)
    contig_lengths[seq_id] = seq_length

# Keep only contigs longer than 100 kb, ordered from longest to shortest.
filtered_contigs = {
    name: length
    for name, length in sorted(contig_lengths.items(), key=lambda item: item[1], reverse=True)
    if length > 100000
}

# Because these data are unstaged, two metadata tables are needed:
#   * the interim table (samples passing sequence QC), and
#   * the final cleaned table (samples passing all QC).
df_samples_dirty = pd.read_csv(
    '/scratch/user/uqtdenni/far_hin_1.x/work/metadata_development_20250702/sample_metadata_interim_seq_qc_pass.txt',
    index_col = 'derived_sample_id',
)
df_samples = pd.read_csv(
    '/scratch/user/uqtdenni/far_hin_1.x/work/metadata_development_20250702/sample_metadata_pass_qc.txt',
    index_col = 'derived_sample_id',
)

# Boolean mask over the interim table: True where the sample also appears in the cleaned table.
# It removes the samples dropped before the (not yet done) staging step of QC.
# NOTE(review): downstream code applies this mask to the zarr sample axis and then applies
# df_samples-based masks on top, which presumably relies on the zarr, df_samples_dirty and
# df_samples sample orders all agreeing - confirm.
qc_bool = df_samples_dirty.index.isin(df_samples.index)

In [5]:
# Define helper functions

# Path template for the per-contig zarr stores. The doubled braces make the f-string emit a
# literal "{contig}" placeholder; the helpers below fill it in with .format(contig=...).
# NOTE(review): this repeats the assignment made in the setup cell with the same value.
zarr_base_path = f"/scratch/user/uqtdenni/afar_production_bunya/curation/uq-beebe-001/combined_zarr/{{contig}}.zarr"

def hash_params(*args, **kwargs):
    """Return a stable MD5 hex digest identifying a set of analysis parameters.

    The positional and keyword arguments are serialised to JSON with sorted keys,
    so the digest does not depend on keyword/dict ordering. All arguments must be
    JSON-serialisable.
    """
    payload = json.dumps({'args': args, 'kwargs': kwargs}, sort_keys=True)
    return hashlib.md5(payload.encode()).hexdigest()

def load_genotype_array(contig, qc_bool=qc_bool, df_samples=df_samples, sample_query = None):
    """Load the genotype calls for one contig, restricted to QC-passing samples.

    Parameters
    ----------
    contig : str
        Contig name; used both to locate the zarr store and as the group name inside it.
    qc_bool : boolean array
        Mask applied along axis 1 of the genotype array to drop samples that failed QC.
    df_samples : pandas.DataFrame
        Cleaned sample metadata against which `sample_query` is evaluated.
    sample_query : str, optional
        Expression passed to `df_samples.eval`; when given, only samples for which it
        is true are kept.

    Returns
    -------
    Genotype array after the QC mask and, if requested, the query mask.
    """
    store = zarr.open(zarr_base_path.format(contig = contig))
    genotypes = allel.GenotypeChunkedArray(store[f"{contig}/calldata/GT"]).compress(qc_bool, axis=1)

    # Without an extra query every QC-passing sample is returned.
    if sample_query is None:
        return genotypes

    # The query mask is applied after the QC mask.
    # NOTE(review): assumes df_samples rows line up with the QC-filtered samples - confirm.
    keep = np.array(df_samples.eval(sample_query))
    return genotypes.compress(keep, axis=1)

def compute_fst(ac1, ac2, scheme):
    """Print the block-jackknife Hudson Fst between two populations.

    Variants are first ascertained according to `scheme`, then Fst and its standard
    error are estimated with `allel.blockwise_hudson_fst` using blocks of 10000 variants.

    Parameters
    ----------
    ac1, ac2 : allele counts arrays
        Allele counts for the first and second population over the same variants.
    scheme : {'first', 'second', 'either', 'both'}
        Which population(s) a variant must be segregating in to be used.

    Raises
    ------
    ValueError
        If `scheme` is not one of the supported values.
    """
    # Validate before touching the data: previously an unknown scheme fell through the
    # if/elif chain and failed later with an UnboundLocalError on `loc_asc`.
    valid_schemes = ('first', 'second', 'either', 'both')
    if scheme not in valid_schemes:
        raise ValueError(
            'unknown ascertainment scheme %r; expected one of %s' % (scheme, ', '.join(valid_schemes))
        )

    if scheme == 'first':
        loc_asc = ac1.is_segregating()
    elif scheme == 'second':
        loc_asc = ac2.is_segregating()
    elif scheme == 'either':
        loc_asc = ac1.is_segregating() | ac2.is_segregating()
    else:  # 'both'
        loc_asc = ac1.is_segregating() & ac2.is_segregating()
    n_snps = np.count_nonzero(loc_asc)

    # Restrict both populations to the ascertained variants.
    ac1 = ac1.compress(loc_asc, axis=0)
    ac2 = ac2.compress(loc_asc, axis=0)

    fst, se, _, _ = allel.blockwise_hudson_fst(ac1, ac2, blen=10000)

    print('%.04f +/- %.04f (using %s SNPs segregating in %s population)' % (fst, se, n_snps, scheme))

def compute_ac(contig, qc_bool=qc_bool, df_samples=df_samples, sample_query = None):
    """Count alleles for one contig over the samples selected by `sample_query`.

    Loads the genotypes through `load_genotype_array` (same arguments) and returns
    the result of `count_alleles()` on them.
    """
    genotypes = load_genotype_array(
        contig,
        qc_bool=qc_bool,
        df_samples=df_samples,
        sample_query=sample_query,
    )
    allele_counts = genotypes.count_alleles()
    return allele_counts
    
def fst_gwss(sample_query_a, sample_query_b, contig, window_size):
    """Windowed Hudson's Fst between two sample cohorts along one contig.

    Parameters
    ----------
    sample_query_a, sample_query_b : str
        Metadata queries selecting the two cohorts (see `load_genotype_array`).
    contig : str
        Contig to scan.
    window_size : int
        Passed as `size` to `allel.moving_hudson_fst` and `allel.moving_statistic`.

    Returns
    -------
    (x, fst) : tuple of arrays
        `x` is the mean variant position in each window and `fst` the per-window
        Fst, clipped to the interval [0, 1].
    """
    store = zarr.open(zarr_base_path.format(contig = contig))

    # Allele counts for each cohort, using the module-level QC mask and metadata.
    ac_a = compute_ac(contig, qc_bool, df_samples, sample_query_a)
    ac_b = compute_ac(contig, qc_bool, df_samples, sample_query_b)

    positions = np.array((store[f"{contig}/variants/POS"]))

    # Fst can come out very slightly below zero; clip for simplicity.
    windowed_fst = np.clip(
        allel.moving_hudson_fst(ac_a, ac_b, size=window_size),
        a_min=0.0,
        a_max=1,
    )

    # Summarise each window by the mean position of the variants it contains.
    window_mean_pos = allel.moving_statistic(positions, statistic=np.mean, size=window_size)

    return (window_mean_pos, windowed_fst)
    
def fst_analysis(sample_query_a,sample_query_b, contig, window_size, results_dir='results_cache'):
    """Run `fst_gwss` for one contig, caching the result on disk.

    The cache key is a hash of the analysis parameters, so changing any of them
    triggers a fresh computation.

    Parameters
    ----------
    sample_query_a, sample_query_b : str
        Metadata queries selecting the two cohorts.
    contig : str
        Contig to scan.
    window_size : int
        Window size forwarded to `fst_gwss`.
    results_dir : str or path-like
        Directory holding the cached `.npy` files; created if it does not exist.

    Returns
    -------
    (fst, x) : tuple of arrays
        Per-window Fst and the matching window positions. Note that this is the
        reverse of the `(x, fst)` order returned by `fst_gwss`.
    """
    params = dict(
            sample_query_a=sample_query_a,
            sample_query_b=sample_query_b,
            contig=contig,
            window_size=window_size,
        )

    # construct a key to save the results under
    results_key = hash_params(
        params
    )

    # define paths for results files
    cache_dir = Path(results_dir)
    fst_path = cache_dir / f'{results_key}-fst.npy'
    x_path = cache_dir / f'{results_key}-x.npy'
    # Earlier versions saved Fst under '<key>-fst.csv'; np.save silently appended '.npy',
    # so the file landed at '<key>-fst.csv.npy' while the loader looked for '<key>-fst.csv'
    # and the cache never hit. Still honour those files so old results are reused.
    legacy_fst_path = cache_dir / f'{results_key}-fst.csv.npy'

    for candidate_fst_path in (fst_path, legacy_fst_path):
        try:
            # try to load previously generated results
            fst = np.load(candidate_fst_path)
            x = np.load(x_path)
            return (fst, x)
        except FileNotFoundError:
            continue

    # no previous results available, need to run analysis
    print(f'running analysis: {results_key}')

    print('setting up inputs')

    x, fst = fst_gwss(**params)

    # np.save does not create missing directories.
    cache_dir.mkdir(parents=True, exist_ok=True)
    np.save(fst_path, fst)
    np.save(x_path, x)
    print(f'saved results: {results_key}')

    return (fst, x)


def plot_fst(sample_query_a,
             sample_query_b,
             winsize,
             title
             ):
    """Plot a genome-wide windowed Fst scan between two sample cohorts.

    Runs `fst_analysis` on every contig in `filtered_contigs`, lays the contigs
    end to end (largest observed window position first) and shows an interactive
    plotly line plot with alternating contig colours.

    Parameters
    ----------
    sample_query_a, sample_query_b : str
        Metadata queries selecting the two cohorts.
    winsize : int
        Window size forwarded to `fst_analysis`.
    title : str
        Figure title.
    """

    def _contig_scan(name):
        # fst_analysis returns (fst, positions), in that order.
        fst_values, window_pos = fst_analysis(sample_query_a, sample_query_b, name, winsize)
        return pd.DataFrame({'contig': name, 'pos': window_pos, 'fst': fst_values})

    # One frame per contig, stacked into a single table.
    df = pd.concat([_contig_scan(name) for name in filtered_contigs.keys()])

    # Order contigs by their largest window position (a proxy for length), longest first,
    # and give each one an x-offset equal to the summed extent of those before it.
    contig_extent = df.groupby('contig')['pos'].max().sort_values(ascending=False)
    sorted_contigs = contig_extent.index.tolist()
    contig_offsets = contig_extent.cumsum().shift(fill_value=0)

    df = (
        df.assign(contig=pd.Categorical(df['contig'], categories=sorted_contigs, ordered=True))
          .sort_values(['contig', 'pos'])
          .copy()
    )
    offset = df['contig'].map(contig_offsets).astype(float)
    df['contig_offset'] = offset
    df['genome_position'] = df['pos'] + offset

    # Alternate two shades so neighbouring contigs are distinguishable.
    shades = ('lightblue', 'steelblue')
    color_map = {name: shades[rank % 2] for rank, name in enumerate(sorted_contigs)}
    df['color'] = df['contig'].map(color_map)

    plot_kwargs = dict(
        x='genome_position',
        y='fst',
        color='contig',
        color_discrete_map=color_map,
        labels={'genome_position': 'Genomic Position', 'fst': "Hudson's Fst"},
        hover_data = ['pos'],
        template = 'simple_white',
        title=title,
    )
    fig = px.line(df, **plot_kwargs)

    fig.update_traces(marker=dict(size=3))
    fig.update_layout(showlegend=False)

    fig.show(renderer="iframe")



In [None]:
# Genome-wide windowed Hudson's Fst between the hinesorum samples from SB-WE and SB-GU.
# NOTE(review): winsize is forwarded as `size` to allel.moving_hudson_fst, which per the
# scikit-allel docs counts variants per window rather than base pairs - confirm 100_000 is intended.
plot_fst(sample_query_a = 'species_pca == "hinesorum" & admin1_iso == "SB-WE"',
         sample_query_b = 'species_pca == "hinesorum" & admin1_iso == "SB-GU"',
         winsize=100_000,
         title = 'Fst, SB-WE:SB-GU')

In [7]:
# Genome-wide windowed Hudson's Fst between the hinesorum samples from SB-WE and PG-MBA.
# NOTE(review): winsize is forwarded as `size` to allel.moving_hudson_fst, which per the
# scikit-allel docs counts variants per window rather than base pairs - confirm 50_000 is intended.
plot_fst(sample_query_a = 'species_pca == "hinesorum" & admin1_iso == "SB-WE"',
         sample_query_b = 'species_pca == "hinesorum" & admin1_iso == "PG-MBA"',
         winsize=50_000,
         title = 'Fst, SB-WE:PG-MBA')

running analysis: af32f806db78cd5926faf571b3e84b2b
setting up inputs
saved results: af32f806db78cd5926faf571b3e84b2b
running analysis: ea6ea0cab40eecbfdd6ee232a631eea5
setting up inputs
saved results: ea6ea0cab40eecbfdd6ee232a631eea5
running analysis: 9c9fc73c8fa7172911b1780f4b9f7ed3
setting up inputs
saved results: 9c9fc73c8fa7172911b1780f4b9f7ed3
running analysis: 787b65c21106e95ae9722dc6692332ce
setting up inputs
saved results: 787b65c21106e95ae9722dc6692332ce
running analysis: 4e8c1294b0ed5c839157c49d847cb21c
setting up inputs
saved results: 4e8c1294b0ed5c839157c49d847cb21c
running analysis: 693fddb3984a46f892a9a2094c3418b5
setting up inputs
saved results: 693fddb3984a46f892a9a2094c3418b5
running analysis: 80c59f39afbf6ac4d9750d901f3e7e37
setting up inputs
saved results: 80c59f39afbf6ac4d9750d901f3e7e37
running analysis: 097f7732bc8f329c21f4dc36e6d874c1
setting up inputs
saved results: 097f7732bc8f329c21f4dc36e6d874c1
running analysis: e8ad02f62a3b5c52e75ecf3ad1a39cbf
setting up in