In [1]:
import math
import warnings

import matplotlib.pyplot as plt
import numpy as np
import palettable
import pandas as pd
import pybedtools
import statsmodels.api as sm
from liftover import get_lifter
from matplotlib import patches
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D
from scipy import stats

# Qualitative "Bold" 10-color palette from palettable's CARTOColors set,
# exposed in matplotlib's color format.
bold_10 = palettable.cartocolors.qualitative.Bold_10.mpl_colors

# Project-local helpers (the `scripts` package lives alongside this notebook).
from scripts import aesthetics, sv_plot

# Presumably applies the matplotlib rcParams used for paper figures --
# see scripts/aesthetics for the exact settings.
aesthetics.activate_paper_rcParams()

# Let pandas display up to 100 columns and 100 rows before truncating.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# (Re)load IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes, so edits to the local `scripts` package are picked
# up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# MYCN and 11q Neuroblastoma

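Just as the EWSR1-FLI1 fusion is a recurrent somatic event in Ewing sarcoma, MYCN is somatically amplified in a subset of neuroblastomas (although MYCN amplification is considerably less prevalent in neuroblastoma than EWSR1-FLI1 is in Ewing sarcoma).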

The logic here is that perhaps something similar is happening. There are a few notable cases in the literature. In particular, [this paper](https://pubmed.ncbi.nlm.nih.gov/31474320/) implicates both a MYCN (chr2) amplification and a 16p11 deletion. We'll take a look at these loci.

__Note:__ The paper listed above actually collapsed large CNVs. We'll need to think about whether that is necessary here.

# Load in SVs and samples

In [2]:
# Validation-cohort SV table and its companion allele-dosage table (roles
# inferred from the file names). Both are tab-separated, gzipped BED files
# read straight from the analysis bucket.
svs_path = "gs://vanallen-pedsv-analysis/beds/PedSV.v1.1.validation_cohort.analysis_samples.wAFs.bed.gz"
dosages_path = "gs://vanallen-pedsv-analysis/beds/PedSV.v1.1.validation_cohort.analysis_samples.wAFs.allele_dosages.bed.gz"

svs = pd.read_csv(svs_path, sep="\t")

# index_col=False stops pandas from promoting the first column to the index.
dosages = pd.read_csv(dosages_path, sep="\t", index_col=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Sample-level metadata for the combined cohort (includes control assignments).
metadata = pd.read_csv(
    "gs://vanallen-pedsv-analysis/sample_info/gatk_sv_pediatric_cancers_combined_cohort_metadata_3_31_23.w_control_assignments.txt",
    sep="\t",
)

# Final validation-cohort sample IDs: a headerless, single-column list.
validation_samples = pd.read_csv(
    "gs://vanallen-pedsv-analysis/sample_info/PedSV.v1.validation_cohort_final_samples.list",
    header=None,
)[0].tolist()
validation_set = set(validation_samples)

# Neuroblastoma cases, restricted to samples present in the validation cohort.
is_nb_case = metadata["disease"] == "neuroblastoma"
nb_samples = sorted(
    validation_set.intersection(metadata.loc[is_nb_case, "entity:sample_id"])
)

# Matched neuroblastoma controls, restricted the same way.
is_nb_control = metadata["neuroblastoma_control"]
control_samples = sorted(
    validation_set.intersection(metadata.loc[is_nb_control, "entity:sample_id"])
)

# Sex covariate for modeling: 1 when the rounded chrX copy number is below 2
# (XY), otherwise 0.
metadata["sex"] = metadata["chrX_CopyNumber"].round().lt(2).astype(int)

len(nb_samples), len(control_samples)

(529, 2098)

# Identifying regions of interest

Unlike EWSR1-FLI1, we aren't simply looking at genes here. The germline amplification noted in that paper is incredibly large, as large as 10 Mb. The noted microdeletion on 16p11.2, by contrast, is under 1 Mb in size (roughly 0.6–0.95 Mb in the calls tabulated below).

This is tricky because I don't know how GATK handles CNVs--I assume it uses read depth, in which case these events should easily span MYCN and a narrow region on 16p, but we'll cast a large net either way.

Luckily, this paper did us a favor and categorized their large CNVs:

In [9]:
# Supplementary table mmc2 of the paper linked above (the directory name carries
# its DOI). The first three rows of the sheet are skipped so that the fourth
# row is read as the header.
large_cnv_table_path = "ref/10.1016-j.ajhg.2019.07.020-supplementary/1-s2.0-S0002929719303039-mmc2.xlsx"
large_nb_cnvs = pd.read_excel(large_cnv_table_path, skiprows=3)

In [13]:
# Published large CNV calls on chromosome 16 that fall in cytoband p11.2.
on_chr16 = large_nb_cnvs["Region"].str.contains("chr16")
in_p11_2 = large_nb_cnvs["Cytoband"] == "p11.2"
large_nb_cnvs[on_chr16 & in_p11_2]

Unnamed: 0,Chip Barcode,USIa,Group,Region,Event,Length,Cytoband,Probe Median,Probe Count,Minimum Size,Minimum Region,Maximum Size,Maximum Region,Call P-value
42,1552042589_A,PANWZA,Discovery Cohort Case,"chr16:29,529,922-30,270,939",CN Loss,741018,p11.2,-0.526034,36,530466,"chr16:29,647,342-30,177,807",951569,"chr16:29,412,503-30,364,071",1.06e-94
63,1557556242_A,PAMXAW,Discovery Cohort Case,"chr16:29,529,922-30,270,939",CN Gain,741018,p11.2,0.241063,36,530466,"chr16:29,647,342-30,177,807",951569,"chr16:29,412,503-30,364,071",4.54e-12
72,1557556599_A,PALDDZ,Discovery Cohort Case,"chr16:29,529,922-30,270,939",CN Loss,741018,p11.2,-0.662461,36,530466,"chr16:29,647,342-30,177,807",951569,"chr16:29,412,503-30,364,071",8.29e-106
103,1562865001_A,PANZTE,Discovery Cohort Case,"chr16:29,529,922-30,270,939",CN Loss,741018,p11.2,-0.511567,36,530466,"chr16:29,647,342-30,177,807",951569,"chr16:29,412,503-30,364,071",1.6400000000000002e-43
176,1800835549_A,PAIAIL,Discovery Cohort Case,"chr16:29,651,603-30,270,939",CN Loss,619337,p11.2,-0.588504,35,521944,"chr16:29,655,864-30,177,807",716730,"chr16:29,647,342-30,364,071",8.5e-124
205,1853216342_A,PAIMSZ,Discovery Cohort Case,"chr16:29,651,603-30,270,939",CN Loss,619337,p11.2,-0.5797,35,521944,"chr16:29,655,864-30,177,807",716730,"chr16:29,647,342-30,364,071",2.17e-148
249,4068230735_B,PALPHU,Discovery Cohort Case,"chr16:29,529,922-30,270,939",CN Loss,741018,p11.2,-0.481691,36,530466,"chr16:29,647,342-30,177,807",951569,"chr16:29,412,503-30,364,071",1.3499999999999998e-63
271,4079300011_B,PAPWZE,Discovery Cohort Case,"chr16:29,651,603-30,270,939",CN Loss,619337,p11.2,-0.43282,35,521944,"chr16:29,655,864-30,177,807",716730,"chr16:29,647,342-30,364,071",8.7e-47
396,4861473056_R02C02,PASGGD,Discovery Cohort Case,"chr16:29,529,922-30,270,939",CN Loss,741018,p11.2,-0.263059,36,530466,"chr16:29,647,342-30,177,807",951569,"chr16:29,412,503-30,364,071",1.7100000000000002e-28
413,5003155027_R02C02,PASFEU,Discovery Cohort Case,"chr16:29,529,922-30,270,939",CN Loss,741018,p11.2,-0.243013,36,530466,"chr16:29,647,342-30,177,807",951569,"chr16:29,412,503-30,364,071",5.550000000000001e-23


In [None]:
# Find every SV that overlaps a gene of interest, after padding each gene by
# 0.5 Mb on either side (1 Mb of total padding).
#
# NOTE(review): `gene_locs` is not defined in any cell shown before this one;
# it must be created earlier in the notebook for this cell to run on a fresh
# kernel. The code below uses its "start" and "end" columns.
window_size = 1e6
half_window = int(window_size / 2)

# Expand the gene "locations" to include this window. The padded start is
# clipped at 0: a gene within 0.5 Mb of a chromosome start would otherwise get
# a negative coordinate, which bedtools rejects as an invalid record.
expanded_gene_locs = gene_locs.copy()
expanded_gene_locs["start"] = (
    (expanded_gene_locs["start"] - half_window).clip(lower=0).astype(int)
)
expanded_gene_locs["end"] = (expanded_gene_locs["end"] + half_window).astype(int)

# Do the intersection with pybedtools. pybedtools requires either a temp file
# or a string, and we choose the latter.
expanded_gene_bed = pybedtools.BedTool(
    expanded_gene_locs.to_csv(sep="\t", index=False, header=False), from_string=True
)
sv_bed = pybedtools.BedTool(
    svs[["#chrom", "start", "end", "name"]].to_csv(sep="\t", index=False, header=False),
    from_string=True,
)

# wo=True writes, per overlapping pair, the SV record followed by the gene
# record and the overlap length.
sv_region_overlaps = sv_bed.intersect(expanded_gene_bed, wo=True)
svs_in_regions = sv_region_overlaps.to_dataframe()

# Keep the four SV columns (0-3) plus column 7, the fourth field of the gene
# record -- assumed to be the gene name; confirm against gene_locs' column order.
svs_in_regions = svs_in_regions.iloc[:, [0, 1, 2, 3, 7]].copy()
svs_in_regions.columns = ["chrom", "start", "end", "name", "gene"]