In [1]:
import sgkit as sg
import allel
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import glob
from sklearn.mixture import GaussianMixture
from sklearn.utils.extmath import row_norms

TreeMix takes a tab-delimited file of allele counts per population. Let's define our populations here, based on the countries and the PCA.

In [12]:
# Open the combined-cohorts variant store for contig CM023248.
ds_chr2 = sg.load_dataset(
    '/Users/dennistpw/Projects/AsGARD/data/variants_combined_cohorts/combined_cohorts.CM023248.zarr'
)

# Read the sample metadata table. No filtering is applied in this cell.
df_samples = pd.read_csv(
    '/Users/dennistpw/Projects/AsGARD/metadata/cease_combinedmetadata_noqc.20250212.csv'
)

# Wrap the genotype calls in a scikit-allel GenotypeArray.
# NOTE(review): `gt` is reassigned from the subsetted dataset in a later cell,
# so this full-contig array looks unused - confirm before removing.
gt = allel.GenotypeArray(ds_chr2.call_genotype)


# next: LD prune. this takes a wee while

In [13]:
#define stevegen1000 functions
def select_random_genos(
        ds,
        numgenos,
        seed=None):
    """Thin a dataset down to a random subset of its variants.

    Parameters
    ----------
    ds
        Dataset exposing ``call_genotype`` (variants along axis 0) and an
        ``isel`` method that accepts a ``variants`` indexer.
    numgenos
        Number of variants to keep; coerced with ``int``.
    seed
        Optional seed for a reproducible draw. With the default ``None`` the
        draw uses NumPy's global random state, exactly as before.

    Returns
    -------
    The result of ``ds.isel(variants=...)`` for the drawn indices, in
    ascending index order.

    Raises
    ------
    ValueError
        Raised by NumPy when ``numgenos`` exceeds the number of variants
        (sampling is without replacement).
    """
    keep_no = int(numgenos)
    n_variants = ds.call_genotype.shape[0]

    # Both the legacy module and a Generator expose a compatible ``choice``.
    rng = np.random if seed is None else np.random.default_rng(seed)
    keep_indices = rng.choice(n_variants, keep_no, replace=False)
    keep_indices.sort()

    # Index with the drawn positions themselves. The previous ``~keep_indices``
    # applied bitwise NOT to integers (i -> -(i + 1)), which selected mirrored
    # positions counted from the end of the axis, in descending order.
    thinned_callset = ds.isel(variants=keep_indices)
    return thinned_callset
                    

def load_geno_ds(chrom,
                 sample_query=None,
                 numgenos=None,
                 sample_list=None,
                 start=None,
                 end=None,
                 min_minor_ac=0,
                 df_samples=df_samples):
    """Load one contig's genotype dataset and subset it by sample and variant.

    Steps, in order: optional sample subsetting, optional random thinning of
    variants, optional region subsetting, an allele-count filter, and an
    accessibility filter.

    Parameters
    ----------
    chrom
        Contig name, interpolated into the zarr store path.
    sample_query
        Expression evaluated with ``df_samples.eval`` to select samples.
        Takes precedence over ``sample_list``.
    numgenos
        If truthy, number of variants to draw at random via
        ``select_random_genos``. Thinning happens before the region and
        allele-count filters, so fewer variants may be returned.
    sample_list
        Collection of ``sample_id`` values to keep. Ignored when empty.
    start, end
        Bounds of the region passed to ``slice(start, end)`` on
        ``variant_position``. Either may be ``None`` for an open end.
    min_minor_ac
        Minimum count required in column 1 of the allele counts.
        NOTE(review): column 1 is the count of allele index 1, which is not
        necessarily the minor allele - confirm this is the intended filter.
    df_samples
        Sample metadata. Its rows are assumed to be in the same order as the
        dataset's ``samples`` dimension - TODO confirm.

    Returns
    -------
    tuple
        ``(df_samples, ds_analysis)``: the (possibly subsetted) metadata and
        the filtered dataset.
    """
    # load ds
    ds = sg.load_dataset(f'/Users/dennistpw/Projects/AsGARD/data/variants_combined_cohorts/combined_cohorts.{chrom}.zarr')

    # if sample query or list are specified, subset accordingly
    if sample_query:
        loc_samples = df_samples.eval(sample_query).values
        df_samples = df_samples.loc[loc_samples, :]
        ds = ds.isel(samples=loc_samples)
    elif sample_list is not None and len(sample_list) > 0:
        # Explicit length check so array-like inputs do not raise on
        # truthiness; plain ndarray mask so both branches index identically.
        loc_samples = df_samples['sample_id'].isin(sample_list).values
        df_samples = df_samples.loc[loc_samples, :]
        ds = ds.isel(samples=loc_samples)

    # if numgenos is set, subset
    if numgenos:
        ds_analysis = select_random_genos(ds, numgenos=numgenos)
    else:
        ds_analysis = ds

    # if a region is set, subset to it. Compare against None so that start=0
    # or an end-only region is honoured rather than silently skipped.
    if start is not None or end is not None:
        print(f"subsetting genos to range {chrom}:{start}-{end}")
        # NOTE(review): contig index 0 is hard-coded; presumably each store
        # holds a single contig - confirm.
        ds_analysis = ds_analysis.set_index(variants=("variant_contig", "variant_position")).sel(variants=(0, slice(start, end)))

    # keep variants whose allele-1 count reaches the threshold
    print('subsetting to segregating sites')
    ac = allel.GenotypeArray(ds_analysis['call_genotype']).count_alleles()
    macbool = np.asarray(ac[:, 1] >= min_minor_ac)
    print(f'selected {np.sum(macbool)} sites with a min mac >= {min_minor_ac}')
    # Positional (isel) masking works whether or not the region step above
    # put a MultiIndex on the variants dimension.
    ds_analysis = ds_analysis.isel(variants=macbool)

    # get accessible only
    print('subsetting to accessible sites only')
    accmask = ds_analysis['is_accessible'].compute()
    ds_analysis = ds_analysis.isel(variants=accmask.values)

    # return completed ds
    return (df_samples, ds_analysis)


In [14]:
# Load contig CM023248: metadata into `df`, filtered genotype dataset into `ds`.
# Variants are thinned to one million at random and need an allele-1 count of at least 1.
df, ds = load_geno_ds(
    chrom='CM023248',
    numgenos=1_000_000,
    min_minor_ac=1,
    df_samples=df_samples,
)

subsetting to segregating sites
selected 1000000 sites with a min mac > 1
subsetting to accessible sites only


In [None]:
# Map each population code to the row positions of its samples.
# `df` is the metadata returned together with `ds`, so it stays row-aligned
# with the dataset even when load_geno_ds subsets samples. Positions (not
# index labels) are what the genotype array's sample axis is indexed by.
pop_dict = {}

# Iterate through the unique levels of the 'pop_code' column
for level in df['pop_code'].unique():
    # Row positions where 'pop_code' matches the current level
    indices = np.flatnonzero((df['pop_code'] == level).to_numpy()).tolist()

    # Store the positions in the dictionary with the level as the key
    pop_dict[level] = indices

# Drop populations with fewer than 5 samples
pop_dict = {key: value for key, value in pop_dict.items() if len(value) >= 5}

#get all pops too
#pop_dict['all']  = list(range(len(df)))

In [21]:
# Genotypes of the filtered dataset as a scikit-allel array
gt = allel.GenotypeArray(ds['call_genotype'])
# Allele counts per population, keyed like pop_dict
ac_subpop = gt.count_alleles_subpops(pop_dict)


In [25]:
#count alleles by pop & convert to table
ac_subpop = gt.count_alleles_subpops(pop_dict)
#get segregating variants only
#is_seg = ac_subpop['all'].is_segregating()[:]

# One column per population; each cell is "<allele 0 count>,<allele 1 count>".
combined_data = {}

for name, array in ac_subpop.items():
    #get seg sites
    #array = array.compress(is_seg)

    # Two allele columns are needed to build a count pair
    if array.shape[1] < 2:
        raise ValueError(f"Array {name} does not have at least two columns")

    # Build the strings with vectorised NumPy operations rather than a Python
    # loop over every variant. Only columns 0 and 1 are used; counts for any
    # further alleles are not written out.
    counts = np.asarray(array)
    first_counts = counts[:, 0].astype(str)
    second_counts = counts[:, 1].astype(str)
    combined_array = np.char.add(np.char.add(first_counts, ","), second_counts)

    # Store the resulting array under the population name
    combined_data[name] = combined_array

# Convert the combined data into a DataFrame
ac_df = pd.DataFrame(combined_data)

##subsample randomly instead of ld pruning
#df_sample = ac_df.sample(frac=0.1)

#df_sample = df_sample.drop('all', axis=1)


In [19]:
# TODO: try to permute across different randomly chosen sets (10 times?)
# Write the per-population allele-count table as tab-separated text, without the index.
# NOTE(review): quoting=False equals 0, which is csv.QUOTE_MINIMAL, not
# csv.QUOTE_NONE - confirm which one is intended.
ac_df.to_csv(
    '/Users/dennistpw/Projects/AsGARD/data/TreeMix_20240926/afs_bypop.txt',
    sep='\t',
    quoting=False,
    index=False,
)


In [21]:
#Now let's download and install TreeMix
# Fetches the TreeMix 1.13 source tarball and unpacks it into the current
# working directory.
# NOTE(review): this cell only downloads and extracts; no build/install step
# is run here - presumably done outside the notebook, confirm.
!wget https://bitbucket.org/nygcresearch/treemix/downloads/treemix-1.13.tar.gz
!tar -xvf treemix-1.13.tar.gz

--2024-09-26 13:43:08--  https://bitbucket.org/nygcresearch/treemix/downloads/treemix-1.13.tar.gz
Resolving bitbucket.org (bitbucket.org)... 185.166.142.21, 185.166.142.23, 185.166.142.22, ...
Connecting to bitbucket.org (bitbucket.org)|185.166.142.21|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://bbuseruploads.s3.amazonaws.com/915ee537-3309-4916-8689-5a97111dc17c/downloads/bb0e7778-4379-4820-9e81-9ac3a0c926dc/treemix-1.13.tar.gz?response-content-disposition=attachment%3B%20filename%3D%22treemix-1.13.tar.gz%22&response-content-encoding=None&AWSAccessKeyId=ASIA6KOSE3BNALVYUHQT&Signature=5%2B7Kze7D3y4MvHiDZmAh1scGdbw%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEMb%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJIMEYCIQCz20AUt%2BfPJ1RgOHtVEV1PEi7M2mODPcyz1whTIg3omwIhANtj5t6qSEQRDpSqRCIFRYG96tQOG%2FAXXzw0cfKv8t4OKrACCP%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEQABoMOTg0NTI1MTAxMTQ2Igy10wr62f%2B%2BBOJBCooqhALR0MouypDvKBW7jkXae5n6VDb7gSCJSUhbfeI8oRdOHOW0WPcj5a%2FHe

In [None]:
# Run TreeMix using the shell scripts.
# Find the optimal number of migration edges (M), if any, using the R script.
# Optimal M looks like 0 or 1?
# High data robustness... may plot without migration edges.