In [1]:
import sgkit
import allel
import pandas as pd
import numpy as np
import zarr
import plotly.express as px

In [130]:
# Per-sample metadata table; its 'pop_code' column is what the population grouping
# further down keys on.
# NOTE(review): absolute, user-specific path — the notebook will not run elsewhere as-is.
df_samples = pd.read_csv(
    filepath_or_buffer='/Users/dennistpw/Projects/AsGARD/metadata/cease_combinedmetadata_noqc.20250212.csv'
)

# Colour palette: population code -> hex colour.
# Insertion order matters: it is the order in which populations are iterated below.
pop_code_cols = dict(
    SAE='#6a3d9a',  # dark purple
    SAR='#cab2d6',  # lighter purple
    IRH='#c57fc9',  # pink
    IRS='#c27a88',  # salmon
    APA='#ff7f00',  # orange
    INB='#96172e',  # dark red
    INM='#f03e5e',  # light red
    DJI='#507d2a',  # sap green
    ETW='#a6cee3',  # cerulean
    ETB='#007272',  # cobalt turquoise
    ETS='#33a02c',  # green
    SUD='#fccf86',  # pale ochre
    YEM='#CC7722',  # ochre / burnt orange
)

# Map each population code to the df_samples index labels of its samples, keeping
# only populations represented by at least 5 samples.
# NOTE(review): these labels are later handed to scikit-allel as sample indices, which
# presumably relies on df_samples having a default 0..n-1 index in the same order as
# the samples in the genotype arrays — confirm against how the zarr store was built.
pop_dict = {}
for level in pop_code_cols:
    indices = df_samples.index[df_samples['pop_code'].eq(level)].tolist()
    if len(indices) >= 5:
        pop_dict[level] = indices



In [131]:
# Open the annotated variant store read-only: this notebook only ever reads from it,
# and mode='r' makes a wrong path fail immediately instead of (with zarr's default
# append mode) silently creating a new, empty store at that location.
# NOTE(review): absolute, user-specific path — the notebook will not run elsewhere as-is.
annzarr = zarr.open(
    '/Users/dennistpw/Projects/AsGARD/data/variants_combined_cohorts/annotations.zarr',
    mode='r',
)

In [12]:
def getvariantsbyrange(chrom, start, end, feature):
    """Fetch annotations and genotypes for all variants in a genomic interval.

    Reads from the module-level ``annzarr`` store.

    Parameters
    ----------
    chrom : str
        Top-level group name in ``annzarr`` (the contig).
    start, end : int
        Interval bounds, passed to ``SortedIndex.locate_range``.
    feature : str
        Name of the array under ``{chrom}/variants/`` to report in the
        'effect' column (e.g. 'ANN_HGVS_p').

    Returns
    -------
    tuple
        ``(annreg, gt_region)`` where ``annreg`` is a DataFrame with columns
        'effect', 'type', 'id', 'pos', 'chrom', 'major', 'minor' (one row per
        variant) and ``gt_region`` is the ``allel.GenotypeArray`` for the
        same variants.
    """
    pos = allel.SortedIndex(annzarr[f'{chrom}/variants/POS'])
    loc_region = pos.locate_range(start, end)
    # Previously named `range`, which shadowed the builtin inside this function.
    region_pos = pos[loc_region]

    gt_region = allel.GenotypeArray(annzarr[f'{chrom}/calldata/GT'][loc_region])

    ft_region = pd.DataFrame(annzarr[f'{chrom}/variants/{feature}'][loc_region])
    im_region = pd.DataFrame(annzarr[f'{chrom}/variants/ANN_Annotation'][loc_region])
    ft_id = pd.DataFrame(annzarr[f'{chrom}/variants/ANN_Gene_ID'][loc_region])

    annreg = pd.concat([ft_region, im_region, ft_id], axis=1)
    annreg['pos'] = region_pos.values
    annreg['chrom'] = chrom
    annreg.columns = ['effect', 'type', 'id', 'pos', 'chrom']

    # max_allele=1 pins the result to exactly two columns (allele 0 and allele 1).
    # Without it the column count follows the highest allele called in the region,
    # so the two fixed labels below would raise on an all-reference region.
    # Despite the names, 'major'/'minor' are the counts of allele 0 and allele 1;
    # they are not sorted by frequency.
    acframe = pd.DataFrame(gt_region.count_alleles(max_allele=1), columns=['major', 'minor'])

    annreg = pd.concat([annreg, acframe], axis=1)
    return annreg, gt_region


In [13]:
# Target regions for the heatmap: gene label -> contig and interval bounds.
genes_for_heatmap = {
    'Ace1': dict(chrom='CM023248', start=60904014, end=60973023),
    'Vgsc': dict(chrom='CM023249', start=42804885, end=42848176),
    'Rdl': dict(chrom='CM023249', start=8345440, end=8411439),
    'Cyp9k1': dict(chrom='CM023250', start=9713374, end=9729212),
    'Cyp6': dict(chrom='CM023248', start=67470071, end=67514071),
}

In [16]:
def return_plotting_df(chrom, start, end, min_maf=0.05):
    """Build a missense-variant x population allele-frequency table for a region.

    Uses ``getvariantsbyrange`` for the annotations and genotypes, and the
    module-level ``pop_dict`` for the population -> sample-index grouping.

    Parameters
    ----------
    chrom, start, end
        Region to query; passed straight to ``getvariantsbyrange``.
    min_maf : float, default 0.05
        A variant is kept only if its frequency (after rounding to 2 decimals)
        is strictly greater than this in at least one population.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'effect' (the ANN_HGVS_p value), one column per population
        holding the frequency of allele 1, restricted to rows annotated as
        'missense_variant'.
    """
    annotations, gt_region = getvariantsbyrange(chrom, start, end, feature='ANN_HGVS_p')

    # max_allele=1 guarantees that columns 0 and 1 exist even when no alternate
    # allele is called anywhere in the region; only those two columns are used.
    subpop_counts = gt_region.count_alleles_subpops(pop_dict, max_allele=1)

    # Frequency of allele 1 among allele-0 and allele-1 calls, per population.
    # A population with no such calls at a variant yields NaN (0/0) for that row.
    freqs = {}
    for key in subpop_counts:
        alt = subpop_counts[key][:, 1]
        ref = subpop_counts[key][:, 0]
        freqs[key] = alt / (alt + ref)

    mafdf = pd.concat([annotations, pd.DataFrame(freqs)], axis=1)

    gene_df = (
        mafdf[mafdf['type'] == 'missense_variant']
        .round(2)
        .drop(['id', 'pos', 'chrom', 'major', 'minor', 'type'], axis=1)
        .set_index('effect')
    )

    # The threshold is applied to the rounded frequencies (NaN never passes).
    gene_df = gene_df[(gene_df > min_maf).any(axis=1)]

    # Preferred display order. pop_dict drops populations with too few samples,
    # so select only the ones actually present rather than raising KeyError.
    pop_order = ['SAE', 'SAR', 'IRH', 'IRS', 'APA', 'INM', 'INB',
                 'DJI', 'ETW', 'ETB', 'ETS', 'SUD', 'YEM']
    present = [pop for pop in pop_order if pop in gene_df.columns]
    return gene_df[present]


In [17]:
# Build one frequency table per target gene, tag each with its gene label,
# stack them and write the combined table to disk.
dflist = []
for gene, region in genes_for_heatmap.items():
    df = return_plotting_df(
        chrom=region['chrom'],
        start=region['start'],
        end=region['end'],
    )
    df['gene'] = gene
    dflist.append(df)
irdf = pd.concat(dflist)
irdf.to_csv('../data/irgenes_snps.csv')

In [15]:
# Metabolic-resistance candidate genes: gene ID -> contig on which it lies.
# Insertion order is preserved because it determines the row order of the output.
metabolics = {
    'ASTEI20_043445': 'CM023248',
    'ASTEI20_031508': 'CM023249',
    'ASTEI20_035158': 'CM023249',
    'ASTEI20_040637': 'CM023249',
    'ASTEI20_041705': 'CM023249',
    'ASTEI20_043207': 'CM023249',
    'ASTEI20_044800': 'CM023249',
    'ASTEI20_031989': 'CM023249',
    'ASTEI20_033264': 'CM023249',
    'ASTEI20_035228': 'CM023249',
    'ASTEI20_041777': 'CM023249',
    'ASTEI20_032647': 'CM023248',
    'ASTEI20_035171': 'CM023248',
    'ASTEI20_035508': 'CM023248',
    'ASTEI20_035875': 'CM023248',
    'ASTEI20_036300': 'CM023248',
    'ASTEI20_036499': 'CM023248',
    'ASTEI20_038121': 'CM023248',
    'ASTEI20_040013': 'CM023248',
    'ASTEI20_041713': 'CM023248',
    'ASTEI20_043191': 'CM023248',
    'ASTEI20_031614': 'CM023250',
    'ASTEI20_032673': 'CM023249',
    'ASTEI20_036053': 'CM023249',
    'ASTEI20_037434': 'CM023249',
    'ASTEI20_040345': 'CM023249',
    'ASTEI20_044864': 'CM023249',
    'ASTEI20_044946': 'CM023249',
    'ASTEI20_045507': 'CM023249',
}

In [152]:
# Example gene ID (one of the keys of `metabolics`).
# NOTE(review): this module-level value does not appear to be read by any visible
# cell — get_missense_snps takes its own `gene_id` argument — so it looks like a
# leftover from interactive development; confirm before removing.
gene_id = 'ASTEI20_043445'


def get_missense_snps(gene_id,
                      chrom=None,
                      min_maf=0.05
                      ):
    """Per-population allele frequencies of the missense variants in one gene.

    Reads annotations and genotypes from the module-level ``annzarr`` store and
    groups samples using the module-level ``pop_dict``.

    Parameters
    ----------
    gene_id : str
        Value to match against ``{chrom}/variants/ANN_Gene_ID``.
    chrom : str, optional
        Contig (top-level group in ``annzarr``) to search. Previously this
        argument was silently overwritten by ``metabolics[gene_id]``; it is now
        honoured, and the ``metabolics`` lookup is used only when it is None.
    min_maf : float, default 0.05
        A variant is kept only if its frequency is strictly greater than this
        in at least one population.

    Returns
    -------
    pandas.DataFrame
        Columns 'chrom', 'pos', 'gene_id', 'annotation', 'effect' followed by
        one allele-1 frequency column per population in ``pop_dict``; only rows
        annotated as 'missense_variant' that pass the ``min_maf`` filter.

    Raises
    ------
    KeyError
        If ``chrom`` is None and ``gene_id`` is not a key of ``metabolics``.
    """
    if chrom is None:
        chrom = metabolics[gene_id]

    # Boolean mask over every variant on the contig: True where the gene matches.
    id_mask = annzarr[f'{chrom}/variants/ANN_Gene_ID'][:] == gene_id

    # Position, protein-level effect and annotation type of the matching variants.
    pos = pd.DataFrame(annzarr[f'{chrom}/variants/POS'][:][id_mask])
    eff = pd.DataFrame(annzarr[f'{chrom}/variants/ANN_HGVS_p'][:][id_mask])
    ann = pd.DataFrame(annzarr[f'{chrom}/variants/ANN_Annotation'][:][id_mask])

    eff_df = pd.concat([pos, eff, ann], axis=1)
    eff_df['gene_id'] = gene_id
    eff_df['chrom'] = chrom
    eff_df.columns = ['pos', 'effect', 'annotation', 'gene_id', 'chrom']
    eff_df = eff_df[['chrom', 'pos', 'gene_id', 'annotation', 'effect']]

    # NOTE: this reads the contig's whole genotype array before subsetting.
    gt_region = allel.GenotypeArray(annzarr[f'{chrom}/calldata/GT']).compress(id_mask)

    # max_allele=1 guarantees that columns 0 and 1 exist even when no alternate
    # allele is called in the gene; only those two columns are used below.
    subpop_counts = gt_region.count_alleles_subpops(pop_dict, max_allele=1)

    # Frequency of allele 1 among allele-0 and allele-1 calls, per population.
    # Columns are labelled from the populations actually counted (pop_dict), not
    # from the full palette, so a population filtered out of pop_dict can no
    # longer cause a column-count mismatch.
    freqs = {}
    for key in subpop_counts:
        alt = subpop_counts[key][:, 1]
        ref = subpop_counts[key][:, 0]
        freqs[key] = alt / (alt + ref)
    maf_df = pd.DataFrame(freqs)
    pop_cols = list(maf_df.columns)

    maf_df_all = pd.concat([eff_df, maf_df], axis=1)

    # Keep variants exceeding min_maf in at least one population (NaN never passes).
    maf_df_all = maf_df_all[(maf_df_all[pop_cols] > min_maf).any(axis=1)]

    return maf_df_all[maf_df_all['annotation'] == 'missense_variant']
    


In [153]:
# Try the function on a single gene before running it over the whole gene list.
test = get_missense_snps(gene_id='ASTEI20_036300', chrom='CM023248')

In [155]:
# One missense-SNP frequency table per metabolic gene, in `metabolics` order.
metabolic_resistance_df_list = []
for gene, gene_chrom in metabolics.items():
    df = get_missense_snps(gene_id=gene, chrom=gene_chrom)
    metabolic_resistance_df_list.append(df)


In [159]:
# Stack the per-gene tables row-wise and write the combined table to disk.
metabolic_ir_df = pd.concat(objs=metabolic_resistance_df_list)
metabolic_ir_df.to_csv(path_or_buf='~/Projects/AsGARD/data/metabolic_snps.csv')