In [1]:
import malariagen_data
import allel
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import numpy as np
import seaborn as sns

In [2]:
ag3 = malariagen_data.Ag3(
    results_cache="/Users/dennistpw/Projects/drc_genomic_surveillance/data/",
    pre=True
)
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact support@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact support@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,gs://vo_agam_release/
Data releases available,"3.0, 3.1, 3.10, 3.11, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9"
Results cache,/Users/dennistpw/Projects/drc_genomic_surveillance/data
Cohorts analysis,20240717
AIM analysis,20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 0.0.0.post782+5e502bf
Client location,"Queensland, Australia"


In [6]:
#Define all the sample sets we want to include and exclude based on Poppy's analyses
drc_sample_sets = [
    '1264-VO-CD-WATSENGA-VMF00161',
    '1264-VO-CD-WATSENGA-VMF00164',
]
ref_sample_sets = [
    # 'AG1000G-CD',
    'AG1000G-CF',
    'AG1000G-TZ',
    'AG1000G-UG',
#    'AG1000G-GA-A',
#    'AG1000G-MZ',
    'AG1000G-CM-A',
    'AG1000G-BF-A',
    '1273-VO-ZM-MULEBA-VMF00176',
]
sample_sets = drc_sample_sets + ref_sample_sets

locations_blacklist = [
    'Mayos',
    'Daiguene',
    'Gbadolite',
    'Muheza',
]

samples_blacklist = [
    "AB0203-C",
    "AB0177-C",
    "AN0085-C",
    "AN0107-C",
    "AC0223-C",
    "AC0240-C",
    "AB0207-C",
    "AB0158-Cx",
]

drc_samples_blacklist = ['VBS48708-6367STDY9888400', 'VBS48725-6367STDY9888417']

cat_ordered = [
    'Burkina Faso',
    'Cameroon',
    'Central African Republic',
    'Uganda',
    'Tanzania',
    'Zambia',
    'DRC Far North',
    'DRC South East',
    'DRC South West',
    'DRC Forest'
]

In [7]:
# Now define sample analysis pop from metadata

drc_metadata = ag3.sample_metadata(sample_query=(
        "taxon == 'gambiae' and "
        f"sample_set in {sample_sets} and "
        f"location not in {locations_blacklist} and "
        f"sample_id not in {samples_blacklist + drc_samples_blacklist}"
    ))

# add coloring category
drc_metadata.loc[:, "group"] = drc_metadata["country"].copy()
drc_metadata.loc[(drc_metadata["admin1_name"] == "Nord-Ubangi"), "group"] = "DRC Far North"
drc_metadata.loc[(drc_metadata["admin1_name"] == "Upper Uele"), "group"] = "DRC Far North"
drc_metadata.loc[(drc_metadata["admin1_name"] == "Kongo-Central"), "group"] = "DRC South West"
drc_metadata.loc[(drc_metadata["admin1_name"] == "Kinshasa"), "group"] = "DRC South West"
drc_metadata.loc[(drc_metadata["admin1_name"] == "Mai-Ndombe"), "group"] = "DRC South West"
drc_metadata.loc[(drc_metadata["admin1_name"] == "Sankuru"), "group"] = "DRC South West"
drc_metadata.loc[(drc_metadata["admin1_name"] == "South Kivu"), "group"] = "DRC South East"
drc_metadata.loc[(drc_metadata["admin1_name"] == "Central Kasai"), "group"] = "DRC South East"
drc_metadata.loc[(drc_metadata["admin1_name"] == "Tanganyika"), "group"] = "DRC South East"
drc_metadata.loc[(drc_metadata["admin1_name"] == "Haut-Katanga"), "group"] = "DRC South East"
drc_metadata.loc[(drc_metadata["admin1_name"] == "Tshopo"), "group"] = "DRC Forest (Tshopo)"
drc_metadata.groupby("group").size()

group
Burkina Faso                 95
Cameroon                     95
Central African Republic     55
DRC Far North                46
DRC Forest (Tshopo)         118
DRC South East              320
DRC South West              187
Tanzania                     32
Uganda                      205
Zambia                      201
dtype: int64

In [10]:
def prep_cnv_tab(contig):

  disco_calls = ag3.cnv_discordant_read_calls(
      contig=contig,
      sample_sets = sample_sets
  )
  #reindex df
  #drc_df_samples.reset_index(inplace=True)

  cohort_dict = {}
  #define dict of indices
  for pop in drc_metadata['group'].unique():
    cohort_dict[pop] = drc_metadata[drc_metadata['group'] == pop].index.tolist()

  cnv_gt = allel.HaplotypeArray(disco_calls['call_genotype'])
  ac = cnv_gt.count_alleles()

  ac_bool = ac[:,1] > 0
  gt_seg = cnv_gt[ac_bool]

  dupids = disco_calls['variant_id'][ac_bool].compute()

  ac_bypop = gt_seg.count_alleles_subpops(cohort_dict, max_allele=1)
  #gt_bypop

  freq_dict = {}

  # Iterate over each key-value pair in the original dictionary
  for key, array in ac_bypop.items():
      # Calculate the sum of the columns
      row_sum = np.sum(array, axis=1)
      # Divide the second column by the sum of the corresponding row
      result = array[:, 1] / row_sum
      # Store the result in the new dictionary
      freq_dict[key] = result

  # Output the result dictionary

  df = pd.DataFrame(freq_dict)
  df['dup'] = dupids
  return(df)

In [12]:
list_dfout = []
for chrom in ['2L', '2R', '3R', 'X']:
  out_df = prep_cnv_tab(chrom)
  list_dfout.append(out_df)

cnv_df = pd.concat(list_dfout)
cnv_df = cnv_df.set_index('dup')

In [None]:
cnv_df.to_csv('')

Unnamed: 0_level_0,DRC Forest (Tshopo),DRC South East,DRC South West,DRC Far North,Zambia,Burkina Faso,Central African Republic,Cameroon,Tanzania,Uganda
dup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Coeaexf_Dup0,0.0,0.009375,0.0,0.021739,0.004975,0.000000,0.000000,0.000000,0.0,0.004878
Coeaexf_Dup1,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
Coeaexf_Dup2,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
Coeaexf_Dup3,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
Coeaexf_Dup4,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
Cyp9k1_Dup24,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
Cyp9k1_Dup25,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
Cyp9k1_Dup26,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
Cyp9k1_Dup27,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [15]:
fig = px.imshow(cnv_df, width=1200, height=1200)
fig.show()