In [9]:
import sgkit
import allel
import pandas as pd
import numpy as np
import zarr
import plotly.express as px

In [10]:
df_samples = pd.read_csv('/Users/dennistpw/Projects/AsGARD/metadata/cease_combinedmetadata_noqc.20250212.csv')
pop_codes = df_samples['pop_code']

# Map each population code to the row labels of its samples. read_csv is called
# without index_col, so these labels are the default 0..n-1 row numbers.
pop_dict = {}
for level in pop_codes.unique():
    indices = df_samples.index[pop_codes == level].tolist()
    # Populations represented by fewer than 5 samples are left out.
    if len(indices) >= 5:
        pop_dict[level] = indices

# One extra cohort holding every sample, whatever its population code.
pop_dict['all'] = df_samples.index.tolist()

In [11]:
# Open the annotation store read-only. zarr.open defaults to mode='a', which
# creates an empty group when the path does not exist, so a mistyped path would only
# surface later as a KeyError on the first array lookup. mode='r' fails immediately
# and also protects the store from accidental writes.
annzarr = zarr.open('/Users/dennistpw/Projects/AsGARD/data/variants_combined_cohorts/annotations.zarr', mode='r')

In [12]:
def getvariantsbyrange(chrom, start, end, feature):
    """Collect annotations, allele counts and genotypes for the variants in a window.

    All data is read from the module-level ``annzarr`` store.

    Parameters
    ----------
    chrom : str
        Name of the top-level group in ``annzarr`` to read from.
    start, end : int
        Bounds handed to ``SortedIndex.locate_range`` (documented by scikit-allel
        as inclusive).
    feature : str
        Name of the array under ``{chrom}/variants`` whose values fill the
        ``effect`` column.

    Returns
    -------
    tuple
        ``(annotations, genotypes)``: a DataFrame with the columns ``effect``,
        ``type``, ``id``, ``pos``, ``chrom``, ``major`` and ``minor``, and the
        ``allel.GenotypeArray`` for the same variants.
    """
    positions = allel.SortedIndex(annzarr[f'{chrom}/variants/POS'])
    window = positions.locate_range(start, end)
    genotypes = allel.GenotypeArray(annzarr[f'{chrom}/calldata/GT'][window])

    # One single-column frame per annotation array, in the order the column
    # names are assigned below: effect, type, id.
    annotation_parts = [
        pd.DataFrame(annzarr[f'{chrom}/variants/{feature}'][window]),
        pd.DataFrame(annzarr[f'{chrom}/variants/ANN_Annotation'][window]),
        pd.DataFrame(annzarr[f'{chrom}/variants/ANN_Gene_ID'][window]),
    ]
    annotations = pd.concat(annotation_parts, axis=1)
    annotations['pos'] = positions[window].values
    annotations['chrom'] = chrom

    # 'major'/'minor' are simply the two columns returned by count_alleles();
    # the constructor is given exactly two names, so this assumes two alleles
    # per variant -- TODO confirm the store holds biallelic calls only.
    allele_counts = pd.DataFrame(genotypes.count_alleles(), columns=['major', 'minor'])

    annotations.columns = ['effect', 'type', 'id', 'pos', 'chrom']
    combined = pd.concat([annotations, allele_counts], axis=1)
    return combined, genotypes


In [13]:
# Windows to plot in the heatmap, one per gene: (gene, chrom, start, end).
# Each entry becomes {'chrom': ..., 'start': ..., 'end': ...} keyed by gene name.
genes_for_heatmap = {
    gene: {'chrom': chrom, 'start': start, 'end': end}
    for gene, chrom, start, end in (
        ('Ace1', 'CM023248', 60904014, 60973023),
        ('Vgsc', 'CM023249', 42804885, 42848176),
        ('Rdl', 'CM023249', 8345440, 8411439),
        ('Cyp9k1', 'CM023250', 9713374, 9729212),
        ('Cyp6', 'CM023248', 67470071, 67514071),
    )
}

In [16]:
def return_plotting_df(chrom, start, end, min_freq=0.05, populations=None):
    """Tabulate per-population allele frequencies of missense variants in a window.

    Variants are fetched with ``getvariantsbyrange`` (feature ``ANN_HGVS_p``) and
    allele counts are taken for every cohort in the module-level ``pop_dict``.
    The frequency of a variant in a cohort is
    ``count(allele 1) / (count(allele 0) + count(allele 1))``.

    Parameters
    ----------
    chrom, start, end
        Window to query; passed straight to ``getvariantsbyrange``.
    min_freq : float, default 0.05
        A variant is kept when its frequency is strictly greater than this in
        at least one cohort of ``pop_dict``. Every cohort counts, including
        ``'all'`` and any cohort not listed in ``populations``.
    populations : sequence of str, optional
        Columns, in order, of the returned table. Defaults to the 13 populations
        used for the heatmap.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``effect``, one column per entry of ``populations``, values
        rounded to 2 decimals.

    Raises
    ------
    KeyError
        If a requested population has no column, i.e. it is not a key of
        ``pop_dict``.
    """
    if populations is None:
        populations = ['SAE', 'SAR', 'IRH', 'IRS', 'APA', 'INM', 'INB',
                       'DJI', 'ETW', 'ETB', 'ETS', 'SUD', 'YEM']
    populations = list(populations)

    annotations, genotypes = getvariantsbyrange(chrom, start, end, feature='ANN_HGVS_p')

    # One frequency column per cohort. A cohort with no counted alleles at a
    # site divides 0 by 0 and so yields NaN, which never passes the threshold.
    subpop_counts = genotypes.count_alleles_subpops(pop_dict)
    freqs = pd.DataFrame({
        pop: counts[:, 1] / (counts[:, 1] + counts[:, 0])
        for pop, counts in subpop_counts.items()
    })

    variants = pd.concat([annotations, freqs], axis=1)
    missense = variants[variants['type'] == 'missense_variant']
    gene_df = (
        missense
        .drop(['id', 'pos', 'chrom', 'major', 'minor', 'type'], axis=1)
        .set_index('effect')
    )

    missing = [pop for pop in populations if pop not in gene_df.columns]
    if missing:
        raise KeyError(f"no allele counts for populations {missing}; check pop_dict")

    # Threshold on the unrounded frequencies. Rounding first would turn e.g.
    # 0.053 into 0.05 and wrongly drop a variant that is above min_freq.
    gene_df = gene_df[(gene_df > min_freq).any(axis=1)]

    # Round only for presentation, after filtering and column selection.
    return gene_df[populations].round(2)


In [17]:
# Build one frequency table per gene, tag each with its gene name, then stack
# them and save the combined table.
dflist = []
for gene, region in genes_for_heatmap.items():
    # region carries exactly the chrom/start/end keyword arguments.
    df = return_plotting_df(**region).assign(gene=gene)
    dflist.append(df)
irdf = pd.concat(dflist)
irdf.to_csv('../data/irgenes_snps.csv')

In [None]:
# Display the combined per-gene frequency table built in the previous cell.
irdf

Unnamed: 0_level_0,SAE,SAR,IRN,APA,INM,INB,DJI,ETW,ETB,ETS,SUD,YEM,gene
effect,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Leu417Met,0.07,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Ace1
Val197Ile,0.00,0.00,0.01,0.08,0.07,0.06,0.02,0.00,0.00,0.00,0.00,0.05,Ace1
Asn185Asp,1.00,1.00,0.93,0.95,1.00,0.81,1.00,0.99,1.00,1.00,1.00,1.00,Ace1
Gly94Ser,0.00,0.00,0.00,0.06,0.57,0.25,0.43,0.65,0.47,0.87,0.61,0.55,Ace1
Gly60Ser,0.07,0.00,0.00,0.01,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Ace1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Glu70Lys,0.00,0.00,0.00,0.00,0.07,0.00,0.00,0.00,0.00,0.00,0.01,0.02,Cyp6
Asp58Tyr,0.95,0.57,0.10,0.05,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Cyp6
Leu54Met,1.00,1.00,0.29,0.14,0.07,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Cyp6
His50Tyr,0.00,0.00,0.57,0.67,0.29,0.50,0.64,0.66,0.71,0.84,0.56,0.66,Cyp6
