In [3]:
import allel
import zarr
import numpy as np
#import malariagen_data
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm
import sgkit as sg
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SeqIO

# Set plot style
sns.set(style="whitegrid")


In [4]:
# Define metadata and qc bool globally to start
#define useful variables
zarr_base_path = f"/scratch/user/uqtdenni/afar_production_bunya/curation/uq-beebe-001/combined_zarr/{{contig}}.zarr"

# Let's start by converting zarrs for the 5 largest contigs - we can do QC on them...
ref_path = '/scratch/user/uqtdenni/afar_production_bunya/reference/VectorBase-54_AfarautiFAR1_Genome.fasta'
# now let's get a list of the contigs that we are going to call over
contig_lengths = {}
for record in SeqIO.parse(ref_path, "fasta"):
    seq_id = record.id
    seq_length = len(record.seq)
    contig_lengths[seq_id] = seq_length
filtered_contigs = {k: v for k, v in sorted(contig_lengths.items(), key=lambda item: item[1], reverse=True) if v > 100000}

# Because these data are unstaged, we need to faff about a bit more and load the unstaged metadata to exclude extra dud samples
df_samples_dirty = pd.read_csv('/scratch/user/uqtdenni/far_hin_1.x/work/metadata_development_20250702/sample_metadata_interim_seq_qc_pass.txt', index_col = 'derived_sample_id')
# And load the final (cleaned) metadata
df_samples = pd.read_table('/scratch/user/uqtdenni/far_hin_1.x/work/metadata_development_20250702/sample_metadata_pass_qc_fix.txt', index_col = 'derived_sample_id')

# Mask removing samples we removed before the staging step of QC (that I haven't done yet)
qc_bool = df_samples_dirty.index.isin(df_samples.index)

# Zarr location
zarr_base_path = f"/scratch/user/uqtdenni/afar_production_bunya/curation/uq-beebe-001/combined_zarr/{{contig}}.zarr"


In [23]:
# Define helper functions
def load_genotype_array(contig, qc_bool=qc_bool, df_samples=df_samples, sample_query = None, n_snps=None, is_segregating=None):
    # Load gts and remove failed qc samples
    z = zarr.open(zarr_base_path.format(contig=contig))
    
    # Variant-level mask: punctulatus_group_filter_pass
    filter_mask = z[f"{contig}/punctulatus_group_filter_pass"][:]
    
    # Apply combined variant mask
    gt = allel.GenotypeChunkedArray(z[f"{contig}/calldata/GT"])
    gt = gt.compress(qc_bool, axis=1)          # Filter samples by QC
    gt = gt.compress(filter_mask, axis=0)    # Filter variants
    
    # If an additional mask is supplied to subset the data from the finished metadata, apply, else return all samples
    if sample_query is not None:
        bool_query = np.array(df_samples.eval(sample_query))
        gt = gt.compress(bool_query, axis=1)

    if is_segregating is not None:
            is_seg = gt.count_alleles()[:].is_segregating()
            gt = gt.compress(is_seg)

    if n_snps is not None:
            gt = select_random_genotypes_sorted(gt, n_snps)

    return gt

def load_position(contig):
        z = zarr.open(zarr_base_path.format(contig = contig))
        filter_mask = z[f"{contig}/punctulatus_group_filter_pass"][:]
        pos =  np.array((z[f"{contig}/variants/POS"]))
        return pos.compress(filter_mask)

def pc1_along_genotype_array(contig, window_size,n_snps_per_window=None, sample_set=None, sample_query=None, metadata=df_samples, colour=None, title=None):
    """
    Calculates PC1 along a chrom, using a stepping window of size `window_size`. Either a chrom 
    ('2L', '3RL', etc) and a sample_set can be passed, 
    or a tuple of genotype array and position array can be passed.
    
    Parameters
    ----------
    chrom: str or tuple
        Either a string contig ('2L', '3RL', etc) and sample_set/sample_query can be passed which will invoke
        malariagen_data, or a list of both a genotype array and an array of genomic positions.
    window_size: int
        Size of the sliding window in snps.
    n_snps_per_window: int
        Number of snps to use in each window. If None, all snps are used.
    sample_set: str
        Sample set to use. If None, all samples are used.
    sample_query: str
        Sample query to use. If None, all samples in the sample set are used.
    metadata: pandas dataframe
        Metadata for the samples. Must be same length and order as genotype array. Columns can be used for plotting colours
    colour: str
        Column name in metadata to use for colouring the plot.
    """

    # Subset metadata if we want to
    if sample_query:
        metadata = metadata.query(sample_query)
    else:
        metadata = metadata
    
    # Load and filter genotypes
    g = load_genotype_array(contig=contig, qc_bool = qc_bool, sample_query=sample_query) # Load genotypes
    ac = g.count_alleles() #Count alleles
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1) # Remove singletons and keep only biiallelics
    gf = g.compress(flt, axis=0) #Apply filter
    gn_alt = gf.to_n_alt()
    
    #print("removing any invariant sites")
    #loc_var = np.any(gn_alt != gn_alt[:, 0, np.newaxis], axis=1)
    #gn_var = np.compress(loc_var, gn_alt, axis=0)

    #Load pos
    pos = load_position(contig = contig)
    
    print("computing pcs")
    pc1s = allel.moving_statistic(values=gn_alt, statistic=pca, size=window_size, n_snps_per_window=n_snps_per_window)
    midpoints = allel.moving_statistic(values=pos[flt], statistic=np.median, size=window_size)

    # Align to first window so all are consistent
    pc1s = align_pc1s_across_windows(pc1s)

    pc_df = pd.DataFrame(pc1s).T
    pc_df.columns = midpoints   
    pc_df = pd.concat([metadata.reset_index(), pc_df.reset_index(drop=True)], axis=1)
    pc_df = pc_df.melt(id_vars=metadata.columns, var_name='midpoint', value_name='PC1')
    pc_df['PC1_scaled'] = pc_df.groupby('midpoint', group_keys=False)['PC1'].apply(lambda x: (x-np.mean(x))/(max(x)-min(x)))

    # Set plot style
    #sns.set(style="ticks")

    # Create the plot
    #plt.figure(figsize=(17, 5))
    #sns.lineplot(data=pc_df, 
    #            x="midpoint", 
    #            y="PC1_scaled", 
    #            hue=colour, 
    #            units="sample_id",  # Ensures each individual gets its own line
    #            estimator=None,      # Prevents aggregation
    #            linewidth=0.5,       # Thinner lines
    #            palette=pop_code_cols, # Apply custom colors
    #            alpha=0.5)           # Transparency

    #sns.despine()
    # Improve legend placement
    #plt.legend(title="Cohort", bbox_to_anchor=(1.05, 1), loc='upper left')

    # Show plot
    #plt.xlabel("Position")
    #plt.ylabel("PC1_scaled")
    #plt.title(title)
    #plt.savefig(f'/Users/dennistpw/Projects/AsGARD/figures/{title}.svg', format='svg')
    #plt.show()

    return(pc_df)

def pca(x, n_components=1, n_snps_per_window=None):

    """
    Performs PCA and returns the first principal component.
    """
    # optionally thin snps
    if n_snps_per_window:
        i_mask = np.random.choice(x.shape[0], n_snps_per_window)
        x = x.take(i_mask, axis=0)

    coords, _ = allel.pca(x, n_components=n_components)
    
    # flip axes back so PC1 is same orientation in each window 
    for i in range(n_components):
        c = coords[:, i]
    if np.abs(c.min()) > np.abs(c.max()):
        coords[:, i] = c * -1
                
    return(coords[:, 0])
    
def align_pc1s_across_windows(pc1s):
    """
    Ensures consistent PC1 orientation across windows by aligning to the first window.
    """
    reference_pc1 = pc1s[0]  # Use the first window as reference

    for i in range(1, len(pc1s)):
        if np.corrcoef(reference_pc1, pc1s[i])[0, 1] < 0:
            pc1s[i] *= -1  # Flip sign if correlation is negative

    return pc1s
    

In [24]:
pc_test = pc1_along_genotype_array(contig = "KI915040", window_size = 50_000)

computing pcs


TypeError: Could not convert string 'far8_98_89_10far8_98_89_18far8_98_89_25far8_98_89_26far8_98_89_27far8_98_89_3far8_98_89_5far8_98_89_9far_BOU_MB2300_1far_BOU_MB2300_2far_BOU_MB2300_3far_BOU_MB2300_4far_BOU_T13-2far_BOU_T13-3far_BOU_T13-5far_BOU_Tonu9far_BOU_buka-7far_BOU_buka-8far_BOU_tuna50far_BOU_tuna53far_BOU_tuna56far_BOU_tuna61far_GUA-GIL-85-5far_GUA-LUN-85-3far_GUA-MAK-85-21far_GUA-MAK-85-30far_GUA-VAS-85-2far_GUA1812_19far_GUA1815_3far_GUA1815_4far_GUA1815_8far_GUA1815_9far_GUA183_132far_GUA183_22far_GUA185_2far_GUA187_1far_GUA_HBFW16far_GUA_HBFW25far_GUA_HBFW33far_GUA_HBFW34far_GUA_HBFW4far_Man-2far_Man_Mn_1far_Man_Mn_10far_Man_Mn_2far_Man_Mn_3far_Man_Mn_4far_Man_Mn_5far_Man_Mn_6far_Man_Mn_7far_Man_Mn_8far_Man_Mn_9far_Manus-8-1far_Manus-8-2far_Manus-8-3far_Manus-8-4far_Manus-8-5far_Manus-8-6far_Manus-8-7far_Manus-8-8far_NT-PC35far_NT-cas3far_NT-cas8far_NT103-4far_NT163-2far_NT1680-1far_NT1680-3far_NT2124-2far_NT4-1far_NT_NT103-2far_NT_Nhu_10far_NT_Nhu_10afar_NT_Nhu_11afar_NT_Nhu_15afar_NT_Nhu_1bfar_NT_Nhu_24afar_NT_Nhu_3far_NT_Nhu_4far_NT_Nhu_6afar_NT_Nhu_9far_NT_PK_1far_NT_PK_13afar_NT_PK_14afar_NT_PK_18afar_NT_PK_19afar_NT_PK_3far_NT_PK_5afar_NT_PK_6afar_NT_PK_7afar_NT_PK_9afar_QLD_CB_10far_QLD_CB_11far_QLD_CB_15far_QLD_CB_16far_QLD_CB_17far_QLD_CB_21far_QLD_CB_23far_QLD_CB_7far_QLD_CB_8far_QLD_CB_9far_QLD_CNS-122far_QLD_CNS-125far_QLD_CNS-131far_QLD_CNS-139far_QLD_CNS-148far_QLD_CNS-156far_QLD_CNS-165far_QLD_CNS-67far_QLD_CNS-69far_QLD_CNS-70far_QLD_MK_12far_QLD_MK_13far_QLD_MK_14far_QLD_MK_18far_QLD_MK_24far_QLD_MK_9far_QLD_Mac-18-13far_QLD_Mac-18-19far_QLD_Mac-18-24far_QLD_Mac-18-29far_QLD_Mac1far_QLD_Mac2far_QLD_Mac3far_QLD_NP_1far_QLD_NP_2far_QLD_NP_4far_QLD_NP_5far_QLD_NP_6far_QLD_NP_8far_QLD_NP_9far_QLD_Q673-3far_QLD_Q673-4far_QLD_Q885-2far_QLD_Q885-4far_QLD_Yaps_5far_TSI_324-2far_TSI_354747-2far_TSI_354747_1far_TSI_354747_14far_TSI_354747_15far_TSI_354747_5far_TSI_354747_6far_TSI_354747_7far_TSI_413-1far_TSI_413-2far_TSI_9-2far_TSI_9-6far_TSI_SB20far_TSI_SB22far_TSI_SB25far_TSI_SB_10far_TSI_SB_14far_TSI_SB_15far_TSI_SB_16far_TSI_SB_18far_TSI_SB_19far_TSI_SB_2far_TSI_SB_21far_TSI_SB_28far_TSI_SB_3far_TSI_SB_30far_TSI_SB_6far_TSI_TI354747_12far_TSI_TI354747_9far_Ula_U10far_Ula_U6far_Ula_U7far_Ula_U8far_Ula_U9far_Van_VTB20far_Van_VTB21far_Van_VTB28far_Van_VTB29far_Van_VTB30far_WPSI_HLCNG23_28_1far_WPSI_HLCNG23_28_2far_WPSI_HLCNG23_28_3far_WPSI_HLCNG23_28_4far_WPSI_HLCNG23_28_5far_WPSI_HLCNG_23_5far_WPSI_sol278far_WPSI_sol291far_WPSI_sol292far_WPSI_sol294far_WPSI_sol311far_WPSI_sol315far_WPSI_sol335far_WPSI_sol99far_WPSI_sol_24far_WPSI_sol_25far_WPSI_sol_31far_WPSI_sol_33far_nNG_LR-176BO1far_nNG_LR-176BO2far_nNG_LR-176BO3far_nNG_LR-176BO4far_nNG_LR-176G14far_nNG_LR-176G17far_nNG_LR-88-G6far_nNG_LR-88G5far_nNG_MP-94-2far_nNG_MP-94-4far_nNG_SR-93-92-11far_nNG_SR-93-92-12far_nNG_SR-93-92-13far_nNG_SR-93-92-2far_nNG_f1-LR109-1far_nNG_f1-LR109-2far_nNG_f1-LR109-3far_nNG_f1-LR166-1far_nNG_f1-LR166-2far_nNG_f1-LR166-3far_nNG_f1-LR166-4far_nNG_f1-LR166-5far_nNG_f1-LR166-6far_nPP_96_88_11far_nPP_96_88_18far_nPP_96_88_7far_nPP_f1_96_88_10far_nPP_f1_96_88_14far_nPP_f1_96_88_22far_nPP_f1_96_88_26far_nPP_f1_96_88_6far_sNG_92_13_1far_sNG_92_15_3far_sNG_92_15_4far_sNG_92_17_11far_sNG_92_17_12far_sNG_92_17_13far_sNG_92_17_17far_sNG_92_17_19far_sNG_92_17_2far_sNG_92_17_20far_sNG_92_17_3far_sNG_WP21-347far_sNG_WP21-353far_sNG_WP21-359far_sNG_WP21-363far_sNG_WP21-BC393far_sNG_WP21-BC401far_sNG_WP21-BC404far_sNG_WP465-1far_sNG_WP465-2far_sNG_WP943-1far_sNG_WP943-2far_sNG_f192-15-5far_sNG_f192-15-6far_sNG_f192-15-7far_sNG_f1_92_13_2far_sNG_f1_92_13_3far_sNG_f1_92_13_4far_sNG_f1_92_13_5far_sNG_f1_92_15_1far_sNG_f1_92_15_2far_sNG_f1_92_15_5far_sNG_f1_92_17_1far_sNG_f1_92_17_16far_sPP_99-1far_sPP_99-10far_sPP_99-2far_sPP_99-3far_sPP_99-4far_sPP_99-7far_sPP_99-8far_sPP_99-9far_sPP_BC125far_sPP_BC129far_sPP_BC133far_sPP_BC136far_sPP_CP-98-5-10far_sPP_CP-98-5-11far_sPP_CP-98-5-12far_sPP_CP-98-5-4far_sPP_CP-98-5-9far_sPP_CP98_100_11far_sPP_CP98_100_18far_sPP_CP98_100_25far_sPP_CP98_100_34far_sPP_CP98_100_35far_sPP_CP98_100_4far_sPP_CP98_100_40far_sPP_GR14-BC126far_sPP_GR14-BC138far_sPP_GR14-BC141far_sPP_GR14-BC147far_sPP_GR14-BC157far_sPP_f1-CP54-1far_sPP_f1-CP54-2far_sPP_f1-CP54-3far_sPP_f1-CP54-4far_sPP_f1-CP54-5far_sPP_f1-CP54-6far_sPP_f1-LR171-10far_sPP_f1-LR171-4far_sPP_f1-LR171-7far_sPP_f1-LR171-9hin_BOU_A1hin_BOU_A10hin_BOU_A3hin_BOU_A5hin_BOU_A_6hin_BOU_F2_BOU8hin_BOU_T10hin_BOU_T6hin_BOU_T7hin_BOU_T9hin_BOU_T_5hin_GUA18-3-19hin_GUA18-3-5hin_GUA18-9-18hin_GUA18-9-8hin_GUA1816_18hin_GUA1816_24hin_GUA1818_1hin_GUA181_8hin_GUA183_7hin_GUA183_8hin_GUA188_1hin_GUA189_6hin_GUA_MC-5hin_PP_97_89_13hin_PP_97_89_7hin_PP_98_8_6hin_PP_98_8_7hin_PP_CP113_4hin_PP_CP119_1hin_PP_CP94_1hin_PP_CP98_100_39hin_PP_f2_98_89_11hin_PP_f2_98_89_14hin_PP_f2_98_89_21hin_PP_f2_98_89_2bhin_PP_f2_98_89_8hin_PP_f2_98_8_1bhin_PP_f2_98_8_2hin_PP_f2_98_8_3hin_PP_f2_98_8_4hin_PP_f2_98_8_5hin_QLD_F2_QLD20hin_QLD_MM10hin_QLD_MM11hin_QLD_MM7hin_QLD_MM8hin_QLD_MM9hin_QLD_MtMol_13_2hin_QLD_MtMol_13_8hin_QLD_MtMol_13_9hin_QLD_WCditch_1hin_WPSI_HB13hin_WPSI_HB14hin_WPSI_HB2hin_WPSI_HB3hin_WPSI_HB5hin_WPSI_HB6hin_WPSI_HB7hin_WPSI_HB9hin_WPSI_HB_1hin_WPSI_HLC_30hin_WPSL_sol135hin_WPSL_sol160hin_WPSL_sol241hin_WPSL_sol244hin_WPSL_sol247hin_WPSL_sol251hin_WPSL_sol261hin_WPSL_sol308hin_WPSL_sol86hin_WPSL_sol_19hin_WPSL_sol_23hin_WPSL_sol_29hin_WPSL_sol_30hin_WPSL_sol_76hin_cNG-7-10hin_cNG-7-2hin_cNG-7-6hin_cNG-7-9hin_cNG-8-4hin_cNG-8-8hin_cNG-8-9hin_cNG-9-10hin_cNG-9-7hin_cNG-9-9hin_cNG_94_109_2hin_cNG_94_109_5hin_cNG_94_109_6hin_cNG_94_113_2hin_cNG_94_113_6hin_cNG_94_118_1hin_cNG_94_118_2hin_cNG_94_118_3hin_cNG_94_97_3hin_cNG_94_97_5hin_cNG_94_97_9hin_cNG_f2_94_109_3hin_cNG_f2_94_109_7hin_cNG_f2_94_97_1hin_cNG_f2_94_97_7hin_nNG_MP-76-2hin_nNG_MP-76-4hin_nNG_MP76_3hin_nNG_SR-Nal16hin_nNG_SR-Nal2hin_nNG_SR93-R93Khin_nPP_4-169hin_nPP_BO132_1hin_nPP_BO132_10hin_nPP_BO132_12hin_nPP_BO132_2hin_nPP_BO132_3hin_nPP_BO132_4hin_nPP_BO132_5hin_nPP_BO132_6hin_nPP_BO132_7hin_nPP_BO132_9hin_sNG_92_27_12hin_sNG_92_62_1hin_sNG_92_62_2hin_sNG_92_62_4hin_sNG_92_62_7hin_sNG_GI31_3hin_sNG_GI31_6hin_sNG_S10_10hin_sNG_S10_11hin_sNG_S10_12hin_sNG_S10_13hin_sNG_S10_14hin_sNG_S10_5hin_sNG_S10_8hin_sNG_S1_15hin_sNG_S1_16hin_sNG_S1_17hin_sNG_S1_18hin_sNG_S1_20hin_sNG_WP66_4hin_sNG_f2_92_27_10hin_sNG_f2_92_27_20hin_sNG_f2_92_27_4hin_sNG_f2_92_27_8hin_sNG_f2_92_41_2hin_sNG_f2_92_41_4hin_sNG_f2_92_41_7hin_sNG_f2_92_41_8hin_sNG_f2_92_41_9hin_sNG_f2_92_62_3hin_wQLD_Por_41hin_wQLD_Por_43hin_wQLD_Por_44hin_wQLD_Por_45hin_wQLD_Por_48hin_wQLD_Por_49iren_GUA1812_13ireniren_GUA1812_16ireniren_GUA1826_11ireniren_GUA182_3ireniren_GUA182_6ireniren_GUA183_2ireniren_GUA184_23ireniren_GUA184_36ireniren_GUA187_4ireniren_GUA188_30ireniren_GUA24_3ore_IJ98_10_1ore_IJ98_12_1ore_IJ98_12_3ore_IJ98_12_4ore_IJ98_12_5ore_IJ98_12_6ore_IJ98_2_1ore_IJ98_2_2ore_IJ98_2_3ore_IJ98_2_4ore_IJ_2_6punpun_BO-25-1-OB99pun_SI98-12-2pun_SI98-21-1pun_SI98-8-2pun_SIWP98-13-1tor_NT_Ala_3ator_NT_Ala_4ator_NT_Aly_1tor_NT_Aly_1ator_NT_Aly_2tor_NT_Aly_2ator_NT_Aly_3tor_NT_Aly_4tor_NT_Aly_5tor_NT_Aly_7tor_wQLD_Por_12tor_wQLD_Por_13tor_wQLD_Por_19tor_wQLD_Por_27tor_wQLD_Por_40tor_wQLD_Por_50tor_wQLD_Por_52' to numeric

In [None]:
pc_test