In [10]:
import allel
import seaborn as sns
import zarr
import xarray as xr
import plotly.express as px
import dask.array as da
import numpy as np
import json
import hashlib
import pandas as pd
import bokeh.plotting as bkplt
import bokeh.io as bkio
import bokeh.palettes as bokpalet
import bokeh.models as bkmod
import matplotlib.pyplot as plt
import cartopy
from Bio import SeqIO
from pathlib import Path
import sys

In [11]:
import kaleido
kaleido.get_chrome_sync()  

PosixPath('/home/uqtdenni/.conda/envs/far_hin_1.x/lib/python3.13/site-packages/choreographer/cli/browser_exe/chrome-linux64/chrome')

In [47]:
# Define metadata and qc bool globally to start
#define useful variables
zarr_base_path = f"/scratch/user/uqtdenni/afar_production_bunya/curation/uq-beebe-001/staged_zarr/{{contig}}.zarr"

# Let's start by converting zarrs for the 5 largest contigs - we can do QC on them...
ref_path = '/scratch/user/uqtdenni/afar_production_bunya/reference/VectorBase-54_AfarautiFAR1_Genome.fasta'
# now let's get a list of the contigs that we are going to call over
contig_lengths = {}
for record in SeqIO.parse(ref_path, "fasta"):
    seq_id = record.id
    seq_length = len(record.seq)
    contig_lengths[seq_id] = seq_length
filtered_contigs = {k: v for k, v in sorted(contig_lengths.items(), key=lambda item: item[1], reverse=True) if v > 100000}

# Because these data are unstaged, we need to faff about a bit more and load the unstaged metadata to exclude extra dud samples
#df_samples = pd.read_csv('/scratch/user/uqtdenni/far_hin_1.x/work/metadata_development_20250702/sample-metadata-master.txt', sep = '\t',index_col = 'partner_sample_id')
# And load the final (cleaned) metadata
#qcpass_data = pd.read_table('/scratch/user/uqtdenni/far_hin_1.x/work/metadata_development_20250702/sample_metadata_pass_qc_fix.txt', index_col = 'partner_sample_id')

df_samples = pd.read_csv('/scratch/user/uqtdenni/far_hin_1.x/work/metadata_development_20250702/metadata-staged-speciesconfirmed-20251011.txt',sep='\t')

# Mask removing samples we removed before the staging step of QC (that I haven't done yet)
#qc_bool = df_samples_dirty.index.isin(df_samples.index)

# Zarr location
zarr_base_path = f"/scratch/user/uqtdenni/afar_production_bunya/curation/uq-beebe-001/combined_zarr/{{contig}}.zarr"


In [14]:
# Define helper functions

def hash_params(*args, **kwargs):
    """Helper function to hash analysis parameters."""
    o = {
        'args': args,
        'kwargs': kwargs
    }
    s = json.dumps(o, sort_keys=True).encode()
    h = hashlib.md5(s).hexdigest()
    return h

# Define helper functions
def load_genotype_array(contig, df_samples=df_samples, sample_query = None, n_snps=None):
    # Load gts and remove failed qc samples
    z = zarr.open(zarr_base_path.format(contig=contig))
    
    # Variant-level mask: punctulatus_group_filter_pass
    filter_mask = z[f"{contig}/filter_pass"][:]

    gt = allel.GenotypeChunkedArray(z[f"{contig}/calldata/GT"])
    
    # Apply combined variant mask
    gt = allel.GenotypeChunkedArray(z[f"{contig}/calldata/GT"])
    gt = gt.compress(filter_mask, axis=0)    # Filter variants
    
    # If an additional mask is supplied to subset the data from the finished metadata, apply, else return all samples
    if sample_query is not None:
        bool_query = np.array(df_samples.eval(sample_query))
        gt = gt.compress(bool_query, axis=1)
    if n_snps is not None:
            gt = select_random_genotypes_sorted(gt, n_snps)

    return gt


def select_random_elements_sorted(g, x, replace=False, seed=None):
    """
    Select x random rows from a 2D array, returned in sorted order.

    Parameters:
    - array: array 2d, shape (n_genotypes, n_features)
    - x: int, number of rows to select
    - replace: bool, whether sampling is with replacement (default: False)
    - seed: int, random seed for reproducibility (default: None)

    Returns:
    - np.ndarray of shape (x, n_features)
    """

    # Select random sites from that set
    rng = np.random.default_rng(seed)
    n_rows = g.shape[0]
    if not replace and x > n_rows:
        raise ValueError(f"Cannot select {x} rows without replacement from {n_rows} total rows.")
    indices = rng.choice(n_rows, size=x, replace=replace)
    sorted_indices = np.sort(indices)

    return g[sorted_indices]

def compute_ac(contig, is_biallelic=True, is_segregating=True, min_minor_ac=1, n_snps=None, sample_query=None, to_alt = None):
    
    g = load_genotype_array(contig=contig, sample_query=sample_query)  
    
    ac = g.count_alleles()
    
    mask = None
    
    # Apply biallelic filter
    if is_biallelic:
        biallelic_mask = ac.is_biallelic()
        mask = biallelic_mask if mask is None else mask & biallelic_mask
    
    # Apply segregating filter
    if is_segregating:
        segregating_mask = ac.is_segregating()
        mask = segregating_mask if mask is None else mask & segregating_mask
    
    # Apply minor allele count filter
    if min_minor_ac is not None:
        an = ac.sum(axis=1)
    # Apply minor allele count condition.
        ac_minor = ac[:, 1:].sum(axis=1)
        if isinstance(min_minor_ac, float):
            ac_minor_frac = ac_minor / an
            loc_minor_mask = ac_minor_frac >= min_minor_ac
        else:
            loc_minor_mask = ac_minor >= min_minor_ac
        mask = loc_minor_mask if mask is None else mask & loc_minor_mask
    
    # Apply all filters at once
    if mask is not None:
        gt = g.compress(mask)
    
    # Random selection (if needed)
    if n_snps is not None:  # Fixed: 'if' instead of 'is'
        gt = select_random_elements_sorted(gt, n_snps)
    
    if to_alt is not None:
        return gt.to_n_alt()
    else:
        return gt.count_alleles()
    
def run_pca(contig, sample_df=df_samples, analysis_name='pca', n_snps=50_000, sample_query = None, results_dir='results_cache', min_minor_ac=1, n_components=10):

        # construct a key to save the results under
    results_key = hash_params(
        contig=contig,
        sample_query=sample_query,
        analysis_name=analysis_name,
        min_minor_ac=min_minor_ac,
        n_snps=n_snps,
        n_components=n_components,
    )

    # define paths for results files
    data_path = f'{results_dir}/{results_key}-data.csv'
    evr_path = f'{results_dir}/{results_key}-evr.npy'

    try:
        # try to load previously generated results
        data = pd.read_csv(data_path)
        evr = np.load(evr_path)
        return data, evr
    except FileNotFoundError:
        # no previous results available, need to run analysis
        print(f'running analysis: {results_key}')
    
    print('setting up inputs')

    # Load data

    # Prepare input matrix
    g = load_genotype_array(contig=contig, sample_query=sample_query) # Load genotypes
    ac = g.count_alleles() #Count alleles
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1) # Remove singletons and multiallelics
    gf = g.compress(flt, axis=0) #Apply filter
    gn = select_random_elements_sorted(gf, n_snps).to_n_alt() # Select random n snps

    # Subset sample df
    if sample_query is not None:
        df=sample_df.query(sample_query)
    else:
        df=sample_df

    # Run PCA
    coords, model = allel.pca(gn, n_components=10, scaler='patterson') # Run PCA
    df_coords = pd.DataFrame(
        {f"PC{i + 1}": coords[:, i] for i in range(coords.shape[1])}
    )    
    data = pd.concat([df.reset_index(), df_coords.reset_index(drop=True)], axis=1)

    #Save output
    evr = model.explained_variance_ratio_
    data.to_csv(data_path, index=False)
    np.save(evr_path, evr)
    print(f'saved results: {results_key}')

    return data, evr

def plot_pca(data, evr, i='PC1',j='PC2', colourfac='species_pca', palette = px.colors.qualitative.T10, plotname='pca'):


        pc_list = [f"PC{i+1}" for i in range(0, len(evr))]
        pc_dict = dict(zip(pc_list, evr.tolist()))

        xlab = f"{i}, {pc_dict[i] * 100:.2f}%"
        ylab = f"{j}, {pc_dict[j] * 100:.2f}%"


        fig = px.scatter(data,
                    x=i,
                    y=j,
                    color=colourfac,
                    template='simple_white',
                    hover_data = ['derived_sample_id','species','country','admin1_iso','admin1_name','divergence','median_cov', 'latitude','longitude'],
                    color_discrete_map = palette,
                    width = 600, height=500,
        )
        # Add axis titles
        fig.update_layout(
                xaxis_title=xlab,
                yaxis_title=ylab
        )
        fig.update_traces(marker=dict(size=10, opacity=0.7))

        fig.show(renderer="iframe")

        fig.write_image(f"{plotname}.svg")  # This should work with kaleido



In [16]:
# Set colour dict to standardise across plots

iso_colors = {
    'PG-GPK': '#1f77b4', 'PG-NSB': '#ff7f0e', 'SB-GU': '#2ca02c',
    'PG-MRL': '#d62728', 'AU-NT': '#9467bd', 'AU-QLD': '#8c564b',
    'SB-MK': '#e377c2', 'VU-SEE': '#7f7f7f', 'SB-WE': '#bcbd22',
    'PG-NPP': '#17becf', 'PG-MPL': '#aec7e8', 'PG-MPM': '#ffbb78',
    'PG-ESW': '#98df8a', 'PG-WPD': '#ff9896', 'PG-MBA': '#c5b0d5',
    'PG-CPM': '#c49c94', 'PG-HLA': '#f7b6d3', 'PG-SHM': '#c7c7c7',
    'PG-UNK': '#dbdb8d'
}



In [17]:
model_all, evr_all = run_pca(contig = "KI915040", n_snps=50_000)
plot_pca(model_all, evr_all, colourfac = 'species_pca', palette=iso_colors)

In [63]:
# Now let's map our species groupings / potential mis-id back to the metadata
# We can comment all of this out now
# New DF with updated 
df_samples_pass_speciesupdates = df_samples.copy()
#df_samples_pass_speciesupdates.set_index('partner_sample_id', inplace=True)
df_samples_pass_speciesupdates["species_pca"] =df_samples_pass_speciesupdates["species"]
hine_possibles = list(model_all[(model_all["PC2"] < -90) & (model_all.species == 'farauti_ss')].partner_sample_id)
hine_possibles = hine_possibles + ["tor_wQLD_Por_52","far_WPSI_sol160","hin_WPSL_sol135","hin_WPSL_sol160"]
df_samples_pass_speciesupdates.loc[df_samples_pass_speciesupdates.index.isin(hine_possibles), 'species_pca'] = 'hinesorum'
df_samples_pass_speciesupdates.loc[df_samples_pass_speciesupdates.index.isin(["hin_WPSL_sol86"]), 'species_pca'] = 'farauti_ss'


In [64]:
df_samples_pass_speciesupdates.query('partner_sample_id == "hin_WPSL_sol86"')

Unnamed: 0_level_0,la_sample_id,species,group,population,site,latitude,longitude,admin1_name,admin1_iso,country,country_iso,collection_year,collection_method,notes,Unnamed: 15,Unnamed: 16,species_pca
partner_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
hin_WPSL_sol86,hin_WPSL_sol86,hinesorum,y,,,-8.407,157.338,Western,SB-WE,Solomon Islands,SB,2023,Larval,,,,farauti_ss


In [65]:
df_samples_pass_speciesupdates.to_csv('../metadata_development_20250702/metadata-staged-speciesconfirmed-20251011.txt', sep = '\t')

In [42]:
model_far, evr_far = run_pca(contig = "KI915040", n_snps=50_000, sample_query='species_pca == "farauti_ss"')
plot_pca(model_far, evr_far, colourfac='admin1_iso', palette = iso_colors, plotname = 'farauti_all')

In [43]:
model_far_png, evr_far_png = run_pca(contig = "KI915040", n_snps=50_000, sample_query='species_pca == "farauti_ss" & country == "Papua New Guinea" & admin1_iso != "PG-NSB" & admin1_iso != "PG-MRL" & admin1_iso != "PG-WPD" & partner_sample_id != "far_sPP_GR14-BC138"')
plot_pca(model_far_png, evr_far_png, colourfac='admin1_iso', palette = iso_colors)

In [44]:
model_far_au, evr_far_au = run_pca(contig = "KI915040", n_snps=50_000, sample_query='species_pca == "farauti_ss" & (country_iso == "AU" | admin1_iso == "PG-WPD")')
plot_pca(model_far_au, evr_far_au, colourfac='admin1_iso', palette = iso_colors, plotname = 'farauti_aus')

In [10]:
# Ghastly code to delineate PCA cohorts - for provisional analysis

# based on global pca
cohort_tsi_ng = model_far['partner_sample_id'][(model_far['PC1'] > 79) & (model_far['PC2'] < 10)] # TSI and southern PNG
cohort_au_mld = model_far['partner_sample_id'][(model_far['PC1'] > 95) & (model_far['PC2'] > 10)] # N QLD
cohort_manus = model_far['partner_sample_id'][(model_far['admin1_iso'] == 'PG-MRL')] #manus
cohort_si = model_far['partner_sample_id'][(model_far['country_iso'] == "SB") & (model_far['admin1_iso'] == "PG-NSB")] #solomons


# based on png pca
cohort_huon = model_far_png['partner_sample_id'][(model_far_png['admin1_iso'] == 'PG-MPL') & (model_far_png['admin1_iso'] == 'PG-NPP')] #around huon gulf
cohort_se = model_far_png['partner_sample_id'][(model_far_png['admin1_iso'] == 'PG-CPM') & (model_far_png['admin1_iso'] == 'PG-GPK')] # southeast coast
cohort_mba1 = model_far_png['partner_sample_id'][(model_far_png['PC2'] < 100)] #MBA1
cohort_mba2 = model_far_png['partner_sample_id'][(model_far_png['PC1'] < 50) & (model_far_png['PC1'] > 35)] #MBA2

# Define col for pca group
df_samples['pca_group'] = None
df_samples.loc[df_samples['partner_sample_id'].isin(cohort_tsi_ng), 'pca_group'] = 'tsi_png'
df_samples.loc[df_samples['partner_sample_id'].isin(cohort_au_mld), 'pca_group'] = 'aus_mld'
df_samples.loc[df_samples['partner_sample_id'].isin(cohort_manus), 'pca_group'] = 'manus_i'
df_samples.loc[df_samples['partner_sample_id'].isin(cohort_si), 'pca_group'] = 'sol_is'
df_samples.loc[df_samples['partner_sample_id'].isin(cohort_huon), 'pca_group'] = 'huon_g'
df_samples.loc[df_samples['partner_sample_id'].isin(cohort_se), 'pca_group'] = 'se_png'
df_samples.loc[df_samples['partner_sample_id'].isin(cohort_mba1), 'pca_group'] = 'mba_a'
df_samples.loc[df_samples['partner_sample_id'].isin(cohort_mba2), 'pca_group'] = 'mba_b'




KeyError: 'partner_sample_id'

In [None]:
df_samples.pca_group.unique()

In [10]:
model_hin, evr_hin= run_pca(contig = "KI915040", n_snps=50_000, sample_query='species_pca == "hinesorum"', analysis_name='fixed_again')
plot_pca(model_hin, evr_hin, colourfac='country',palette = iso_colors)

In [11]:
plot_pca(model_hin, evr_hin, colourfac='admin1_iso',palette = iso_colors)

In [15]:
model_hin, evr_hin= run_pca(contig = "KI915040", n_snps=50_000, sample_query='species_pca in ["hinesorum","oreios"]', analysis_name='fixed_again')
plot_pca(model_hin, evr_hin, colourfac='species_pca',palette = iso_colors)

In [17]:
plot_pca(model_hin, evr_hin, i='PC4', j='PC3',colourfac='species_pca',palette = iso_colors)

In [None]:
df_samples.species_pca

In [None]:
for c in list(filtered_contigs.keys())[:10]:
    model_hin, evr_hin= run_pca(contig = c, n_snps=50_000, sample_query='species_pca == "hinesorum"', analysis_name='fixed_again')
    plot_pca(model_hin, evr_hin, colourfac='admin1_iso',palette = iso_colors)

In [None]:
model_hin_sol, evr_hin_sol= run_pca(contig = "KI915040", n_snps=100_000, sample_query='species_pca == "hinesorum" & (country_iso == "SB" | admin1_iso == "PG-NSB")')
plot_pca(model_hin_sol, evr_hin_sol, colourfac='admin1_iso',palette = iso_colors)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(3, 3))  # square figure

# With more options
sns.scatterplot(data=model_hin, x='PC1', y='PC2', 
                color='#808080', alpha=0.7)
sns.despine()
#plt.title('Your Title')
plt.xlabel('')
plt.ylabel('')
plt.title('')
plt.xticks([])
plt.yticks([])
#plt.show()
plt.savefig('all_hin.svg',format='svg', bbox_inches='tight')

In [None]:
model_far_au['size']=10

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(3, 3))  # square figure

# With more options
sns.scatterplot(data=model_far_au, x='PC2', y='PC1', 
                hue='admin1_iso', palette = iso_colors, alpha=0.7)
sns.despine()
#plt.title('Your Title')
plt.xlabel('')
plt.ylabel('')
plt.title('')
plt.xticks([])
plt.yticks([])
#plt.show()
plt.savefig('hin_sb.svg',format='svg', bbox_inches='tight')