In [1]:
import gseapy as gp
import pandas as pd
import numpy as np
import scipy.stats as stats
import glob
from biomart import BiomartServer
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import gseapy as gp

from sklearn import decomposition
from sklearn import preprocessing

from scripts import aesthetics

aesthetics.activate_paper_rcParams()

pd.set_option("display.max_columns", 1000)

import warnings
warnings.filterwarnings('ignore')

# RNA SV Impact

In this notebook, we'll explore the effect of SVs on expression.

I'd like to try examining the impact of _ALL_ SVs on expression. This is going to take a little while, but it should give us a sense of how some of these SVs are operating.

# Read in data

In [2]:
# Input locations for the SV callset (tab-delimited, gzipped BED-style files):
#   sv_path      - the SV "sites" table
#   dosages_path - the matching "allele_dosages" table, which carries one
#                  column per sample in addition to the SV coordinates / ID
sv_path = "gs://vanallen-pedsv-analysis/beds/PedSV.v2.5.3.full_cohort.analysis_samples.sites.bed.gz"
dosages_path = "gs://vanallen-pedsv-analysis/beds/PedSV.v2.5.3.full_cohort.analysis_samples.allele_dosages.bed.gz"

# Sample-level metadata table (read below with sep="\t")
metadata_path = "gs://vanallen-pedsv-analysis/sample_info/PedSV.v2.5.3.cohort_metadata.w_control_assignments.tsv.gz"

In [3]:
# Annotated genes, as a plain list of the `value` column. Entries whose value
# starts with "ENSG00" (bare Ensembl IDs) are discarded.
gene_ref = pd.read_csv("ref/gencode_hg38_protein_coding_genes_for_annotation_7_31_23.txt")
gene_ref = gene_ref.loc[~gene_ref['value'].str.startswith('ENSG00'), 'value'].tolist()

Load metadata and SVs

In [4]:
metadata = pd.read_csv(metadata_path, sep="\t")

# Binary sex label derived from the rounded chrX copy number:
# 1 when fewer than two copies are called, 0 otherwise.
metadata["sex"] = metadata["chrX_CopyNumber"].round().lt(2).astype(int)

First, we define the samples for each analysis

In [5]:
cohort_labels = ['gmkf_neuroblastoma', 'stjude_neuroblastoma', 'stjude_ewing']
sample_dict = {}

for l in cohort_labels:
    # translate the "<study>_<disease>" label into the metadata vocabulary
    cohort = 'GMKF' if 'gmkf' in l else 'StJude'
    disease = 'neuroblastoma' if 'neuroblastoma' in l else 'ewing'

    # restrict to this study/disease, then to its cases and controls
    cohort_samples = (
        metadata[(metadata['study'] == cohort) & (metadata['disease'] == disease)]
        .query(f'{disease}_case == 1 | {disease}_control == 1')
    )

    sample_dict[l] = cohort_samples['entity:sample_id'].tolist()

# flat list of every sample ID across the three cohorts (cohort order preserved)
all_cohort_samples = []
for sample_ids in sample_dict.values():
    all_cohort_samples.extend(sample_ids)

Now we load the counts to establish what samples are in our dataset. We have three different datasets to keep track of.

In [6]:
# one expression matrix per (cohort, quantification) pair
labels = ['gmkf_neuroblastoma_normalized_counts', 'gmkf_neuroblastoma_tpms', 'gmkf_neuroblastoma_normalized_tpms',
          'stjude_neuroblastoma_normalized_counts', 'stjude_neuroblastoma_tpms', 'stjude_neuroblastoma_normalized_tpms',
          'stjude_ewing_normalized_counts', 'stjude_ewing_tpms', 'stjude_ewing_normalized_tpms']
paths = [f'data/{l}.csv' for l in labels]

# label -> DataFrame; the first CSV column becomes the row index
rna_dict = {l: pd.read_csv(p, index_col = 0) for p, l in zip(paths, labels)}

So some variation in number of genes expressed.

With that said, we now need to handle our samples (ugh). First, how many samples actually overlap our cases for each disease type?

In [8]:
# For each expression matrix, report: label, number of its samples that are
# cases for the matching disease in the metadata, and its total column count.
for l, data in rna_dict.items():
    disease = l.split('_')[1]  # second token of the label is the disease
    samples = set(data.columns)

    has_rna = metadata['entity:sample_id'].isin(samples)
    is_case = metadata[f'{disease}_case'] == True
    included_samples = metadata.loc[has_rna & is_case, 'entity:sample_id'].tolist()

    print(l, len(included_samples), len(data.columns))

gmkf_neuroblastoma_normalized_counts 89 209
gmkf_neuroblastoma_tpms 89 209
gmkf_neuroblastoma_normalized_tpms 89 209
stjude_neuroblastoma_normalized_counts 60 101
stjude_neuroblastoma_tpms 60 101
stjude_neuroblastoma_normalized_tpms 60 101
stjude_ewing_normalized_counts 18 24
stjude_ewing_tpms 18 24
stjude_ewing_normalized_tpms 18 24


So 167 samples total, and not great recovery. For now, we drop samples that aren't in our SV dataset (we could go back later and change this, but it makes things easy)

In [9]:
# Restrict every expression matrix to the samples that are cases for the
# matching disease in the metadata, and record the overall set of samples kept.
#
# `rna_dict` holds several matrices per cohort (counts / TPMs / normalized
# TPMs), so the same sample is encountered more than once across iterations.
# Collecting into a set keeps `total_samples` a list of unique sample IDs
# rather than repeating each ID once per matrix.
total_samples = set()
for l, data in rna_dict.items():
    disease = l.split('_')[1]  # second token of the label is the disease
    samples = set(data.columns)

    included_samples = metadata[(metadata['entity:sample_id'].isin(samples)) &
                                (metadata[f'{disease}_case'] == True)]['entity:sample_id'].tolist()

    total_samples.update(included_samples)

    # column subset, ordered as the samples appear in the metadata table
    rna_dict[l] = data[included_samples]

# sorted list of unique sample IDs
total_samples = sorted(total_samples)

These are the samples that are in our dosage matrix (others were removed upstream for poor QC). Now we load the SVs. We only keep SVs and dosages that are in our count matrix.

In [10]:
###############
### Dosages ###
###############
# SV coordinate / identifier columns that precede the per-sample columns
coord_cols = ['#chr', 'start', 'end', 'ID']

# read only the header so we know which sample columns the file contains
dosage_cols = pd.read_csv(dosages_path, sep="\t", index_col=False, nrows = 0)

# load the coordinate columns plus every cohort sample present in the file
usecols = coord_cols + [s for s in all_cohort_samples if s in dosage_cols.columns]
full_dosages = pd.read_csv(dosages_path, sep="\t", index_col=False, usecols = usecols)

# per-dataset dosage tables, restricted to the samples in each expression matrix
dosage_dict = {}
for l, data in rna_dict.items():

    samples = list(data.columns)
    cohort_dosages = full_dosages[coord_cols + samples]

    dosage_dict[l] = cohort_dosages
    cohort_dosages.to_csv(f'data/{l}-dosages-for-sv-rna-analysis.csv', index = False)

    # the dosage table should have exactly four more columns than the counts
    print(l, len(data.columns), len(cohort_dosages.columns))

gmkf_neuroblastoma_normalized_counts 89 93
gmkf_neuroblastoma_tpms 89 93
gmkf_neuroblastoma_normalized_tpms 89 93
stjude_neuroblastoma_normalized_counts 60 64
stjude_neuroblastoma_tpms 60 64
stjude_neuroblastoma_normalized_tpms 60 64
stjude_ewing_normalized_counts 18 22
stjude_ewing_tpms 18 22
stjude_ewing_normalized_tpms 18 22


Great, so all our samples are there. Now we do some filtering on the dosage matrix, removing SVs that are poorly genotyped.

In [11]:
# The SV table itself is not filtered here; we only record, per dataset,
# which SV IDs survive the dosage-based filters below.
svs_to_analyze = {}
for l, cohort_dosages in dosage_dict.items():

    # sample columns only (the first four columns are coordinates / ID)
    temp_dosages = cohort_dosages.iloc[:, 4:].copy()

    # exclude SVs whose dosage is missing in more than 20% of samples
    nan_svs = np.isnan(temp_dosages).mean(axis = 1) > 0.20

    # exclude SVs with no non-zero dosage in any sample (missing counts as zero)
    nocount_svs = ~(temp_dosages.fillna(0) != 0).any(axis = 1)

    kept_svs = ~nan_svs & ~nocount_svs

    cohort_dosages = cohort_dosages[kept_svs]
    svs_to_analyze[l] = cohort_dosages['ID'].tolist()

    # report how many SVs were removed, then store the filtered table
    print(l, len(temp_dosages) - len(cohort_dosages), 'SVs dropped')
    dosage_dict[l] = cohort_dosages

gmkf_neuroblastoma_normalized_counts 197277 SVs dropped
gmkf_neuroblastoma_tpms 197277 SVs dropped
gmkf_neuroblastoma_normalized_tpms 197277 SVs dropped
stjude_neuroblastoma_normalized_counts 196206 SVs dropped
stjude_neuroblastoma_tpms 196206 SVs dropped
stjude_neuroblastoma_normalized_tpms 196206 SVs dropped
stjude_ewing_normalized_counts 208887 SVs dropped
stjude_ewing_tpms 208887 SVs dropped
stjude_ewing_normalized_tpms 208887 SVs dropped


In [12]:
# union of the retained SV IDs across every dataset
svs_to_analyze_total = set().union(*svs_to_analyze.values())

In [13]:
# number of distinct SVs retained in at least one dataset
len(svs_to_analyze_total)

44136

In [14]:
###############
##### SVs #####
###############
svs = pd.read_csv(sv_path, sep="\t")

# keep only the SVs retained for at least one dataset, and save that subset
in_analysis = svs['name'].isin(svs_to_analyze_total)
svs = svs.loc[in_analysis].reset_index(drop = True)
svs.to_csv('data/svs-for-sv-rna-analysis.csv', index=False)

With that, let's get into it!

# Think briefly about what information we want to carry forward

We want to know how SVs affect the expression of genes around them. Most of our SVs will be singleton or very rare. We need to think about how to systematically examine gene expressions, keeping the info that we want. We have a bit of an expanding problem, in that:

1. A given SV can affect multiple genes. I expect the most interesting effects to be on single gene SVs, but we should look at all of them.
2. Multiple samples can have an SV. Common SVs can affect multiple samples and could probably be handled by a MWU, but singleton or rare SVs cannot.
3. SVs can have different dosages. We'll be tracking CNVs here, which can vary dramatically in terms of their dosages.

It's very difficult to combine all this information into a single dataframe. In addition, we occasionally want different information. For common SVs, we can directly compare expression, but for rare and singleton SVs, a rank-based approach is likely to work better.

I'm just going to make this up as I go along.

# Identify SVs that affect genes

Here, we'll identify SVs that nominally affect genes. At the end of this process, I want to end up with an SV x gene flat dataframe, where each row contains information about the SV and its relationship to the gene. This will carry forward a lot of redundant information about the SV, but that's ok.

In [15]:
# SV annotation columns, grouped by the kind of predicted gene effect
coding_cols = ['PREDICTED_COPY_GAIN', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_LOF', 'PREDICTED_PARTIAL_EXON_DUP']
noncoding_cols = ['PREDICTED_NEAREST_TSS', 'PREDICTED_INTRONIC', 'PREDICTED_PROMOTER', 'PREDICTED_UTR']

# coding-type columns that are not part of the CWAS but are still tracked here
unimportant_coding_cols = ['PREDICTED_DUP_PARTIAL', 'PREDICTED_MSV_EXON_OVERLAP', 'PREDICTED_TSS_DUP', 'PREDICTED_INV_SPAN', 'PREDICTED_BREAKEND_EXONIC']

# annotation column -> effect category; insertion order follows the three
# lists above (coding, noncoding, unimportant_coding)
gene_rel_lookup_dict = {
    a: label
    for label, assignments in [('coding', coding_cols),
                               ('noncoding', noncoding_cols),
                               ('unimportant_coding', unimportant_coding_cols)]
    for a in assignments
}

gene_rel_lookup_dict['PREDICTED_COPY_GAIN']

'coding'

In [16]:
# True for SVs with at least one populated gene-effect annotation column
svs_that_affect_genes = svs[coding_cols + noncoding_cols + unimportant_coding_cols].notna().any(axis = 1)

In [17]:
# SVs carrying any gene annotation; display their count and share of all SVs
gene_svs = svs[svs_that_affect_genes]
len(gene_svs), len(gene_svs) / len(svs)

(44136, 1.0)

Somewhat surprisingly, all of these SVs are associated with a gene (in some fashion). But I imagine these predicted effects are extremely broad, such that all SVs get some classification here.

In [18]:
# number of SVs with at least one coding annotation ...
coding_svs = svs[coding_cols].notna().any(axis = 1)
print(coding_svs.sum())

# ... and with at least one "unimportant coding" annotation
print(svs[unimportant_coding_cols].notna().any(axis = 1).sum())

902
334


That's better. Alright, let's cobble this together. Let's define the genes that are affected by the SVs, breaking them up into their own rows.

In [19]:
# Build a flat SV x gene table: one row per (SV, annotation column, gene).
# Each annotation cell is a comma-separated gene list, split into separate rows.
sv_gene_df = []

gene_effect_df = gene_svs[['name'] + list(gene_rel_lookup_dict.keys())].set_index('name').copy()
for sv_name, row in gene_effect_df.iterrows():
    # only the annotation columns that are populated for this SV
    row = row[~pd.isnull(row)]

    # `.items()` rather than `.iteritems()`: the latter was removed in pandas 2.0
    for genic_rel, gene_list in row.items():
        genic_cat = gene_rel_lookup_dict[genic_rel]

        for gene in gene_list.split(','):
            sv_gene_df.append([sv_name, genic_cat, genic_rel, gene])

sv_gene_df = pd.DataFrame(sv_gene_df, columns = ['name', 'sv_effect', 'genic_relationship', 'gene'])

# For every SV, count how many of its rows fall in each effect category and
# attach the count to all of that SV's rows (0 when it has none in a category).
for sv_effect in ['coding', 'noncoding', 'unimportant_coding']:
    counts = pd.DataFrame(sv_gene_df.query(f'sv_effect == "{sv_effect}"').groupby('name').size().astype(int),
                          columns = [f'sv_{sv_effect}_counts']).reset_index()
    # 'name' is the only shared column; naming it makes the join key explicit
    sv_gene_df = sv_gene_df.merge(counts, on = 'name', how = 'left')
    sv_gene_df[f'sv_{sv_effect}_counts'] = sv_gene_df[f'sv_{sv_effect}_counts'].fillna(0)

# add some info about the SVs themselves
sv_gene_df = sv_gene_df.merge(gene_svs[['#chrom', 'start', 'end', 'name', 'svtype']], on = ['name'], how = 'left')

In [20]:
# preview the first two rows of the flat SV x gene table
sv_gene_df.head(2)

Unnamed: 0,name,sv_effect,genic_relationship,gene,sv_coding_counts,sv_noncoding_counts,sv_unimportant_coding_counts,#chrom,start,end,svtype
0,PedSV.2.5.2_CNV_chr1_1,noncoding,PREDICTED_NEAREST_TSS,OR4F5,0.0,1.0,0.0,chr1,12000,30001,CNV
1,PedSV.2.5.2_DUP_chr1_1,noncoding,PREDICTED_NEAREST_TSS,OR4F5,0.0,1.0,0.0,chr1,12000,40001,DUP


In [21]:
# fraction of gene-annotated SVs that contribute exactly one row to sv_gene_df
(sv_gene_df.groupby(['name']).size() == 1).sum() / gene_svs.shape[0]

0.9891018669566793

## Remove SVs too far from their genes

Some SVs are too far from their genes to reasonably have an impact on expression. We remove those here.

In [22]:
# SV-to-gene distance table; `gene_name` is renamed to `gene` so it can be
# joined against sv_gene_df
intergenic_sv_distances = pd.read_csv('data/cwas-results/intergenic-sv-to-gene-distances.csv')
intergenic_sv_distances = intergenic_sv_distances.rename(columns = {'gene_name': 'gene'})

In [23]:
# attach the SV-to-gene distance; the left join keeps every SV-gene row, and
# pairs missing from the distance table get NaN
sv_gene_df = sv_gene_df.merge(intergenic_sv_distances[['name', 'gene', 'distance']], on = ['name', 'gene'], how = 'left')

In [24]:
# Nearest-TSS assignments are dropped when the SV is more than 5e5 away from
# the gene. A missing distance is filled with 1e6, so those rows are dropped too.
is_nearest_tss = sv_gene_df['genic_relationship'] == "PREDICTED_NEAREST_TSS"
is_too_far = sv_gene_df['distance'].fillna(1e6) > 5e5

rows_to_drop = sv_gene_df[is_nearest_tss & is_too_far]
rows_to_drop.head(2)

Unnamed: 0,name,sv_effect,genic_relationship,gene,sv_coding_counts,sv_noncoding_counts,sv_unimportant_coding_counts,#chrom,start,end,svtype,distance
357,PedSV.2.5.2_CPX_chr1_37,noncoding,PREDICTED_NEAREST_TSS,C10orf95,0.0,2.0,0.0,chr1,13427116,13427117,CPX,
358,PedSV.2.5.2_CPX_chr1_37,noncoding,PREDICTED_NEAREST_TSS,PRAMEF20,0.0,2.0,0.0,chr1,13427116,13427117,CPX,


In [25]:
# remove the too-distant nearest-TSS rows and renumber the index from zero
sv_gene_df = sv_gene_df.drop(index = rows_to_drop.index).reset_index(drop = True)

## Add SV info (AF, etc)

So we can see that the majority of SVs actually only affect one gene (and this remains true for coding genes too).

I'd like to add some info about AF in our cohort subset here. Calculating this is _EXTREMELY_ annoying, since we have mixed CNVs and short SVs. We have to do this for each cohort, since the number of samples in each is different.

In [26]:
# For every dataset, annotate its SV x gene rows with per-SV summary values:
#   cohort_af  - allele frequency over all cohort samples (non-CNV SVs only)
#   average_cn - mean dosage over all cohort samples (CNVs only)
#   num_samples_with_rna_affected - RNA samples with dosage > 0
cohort_sv_gene_dict = {}

for l in rna_dict.keys():
    print(l)

    # the first two tokens of the dataset label identify the cohort
    cohort = '_'.join(l.split('_')[:2])

    # all cohort samples (cases and controls) are used for the AF / mean CN
    cohort_samples = sample_dict[cohort]

    # dosages of just the RNA-eligible samples; only these SVs are analyzed
    cohort_rna_dosages = dosage_dict[l].iloc[:, 3:].set_index('ID')
    cohort_svs_to_analyze = list(cohort_rna_dosages.index)

    # full-cohort dosages, restricted to the SVs analyzed for this dataset
    cohort_dosages = full_dosages[['ID'] + cohort_samples].set_index('ID')
    cohort_dosages = cohort_dosages.loc[cohort_svs_to_analyze]

    sv_allele_fractions = []

    for sv_name in cohort_svs_to_analyze:

        # missing genotypes are ignored for both summaries
        sv_full_dosages = cohort_dosages.loc[sv_name].dropna()
        sv_rna_dosages = cohort_rna_dosages.loc[sv_name].dropna()

        if '_CNV' not in sv_name:
            # allele frequency: (#dosage-1 samples + 2 * #dosage-2 samples)
            # divided by two alleles per genotyped sample
            allele_counts = sv_full_dosages.value_counts().reindex([0, 1, 2]).fillna(0)
            alt = np.dot(allele_counts, [0, 1, 2])
            ref = 2 * len(sv_full_dosages)

            af, average_cn = alt / ref, np.nan

        else:
            # CNVs get a mean dosage instead of an allele frequency
            af, average_cn = np.nan, np.mean(sv_full_dosages)

        # number of RNA samples carrying the SV (dosage above zero)
        num_rna_samples_affected = (sv_rna_dosages > 0).sum()

        sv_allele_fractions.append([l, sv_name, num_rna_samples_affected, af, average_cn])

    sv_allele_fractions = pd.DataFrame(
        sv_allele_fractions,
        columns = ['cohort', 'name', 'num_samples_with_rna_affected', 'cohort_af', 'average_cn'],
    )

    # SV x gene rows for this dataset's SVs, joined with the per-SV summaries
    cohort_sv_gene_df = sv_gene_df[sv_gene_df['name'].isin(cohort_svs_to_analyze)].copy()
    cohort_sv_gene_dict[l] = cohort_sv_gene_df.merge(sv_allele_fractions, on = 'name', how = 'left')

gmkf_neuroblastoma_normalized_counts
gmkf_neuroblastoma_tpms
gmkf_neuroblastoma_normalized_tpms
stjude_neuroblastoma_normalized_counts
stjude_neuroblastoma_tpms
stjude_neuroblastoma_normalized_tpms
stjude_ewing_normalized_counts
stjude_ewing_tpms
stjude_ewing_normalized_tpms


# Time to look at the RNA

I want to do this fully systematically, and then afterwards we can go back and reassess. We have a few different issues that we need to handle:

1. Different numbers of samples affected. Some SVs affect many samples, and some affect very few.
2. SV dosages. While rare SVs will usually only have `0` or `1` as dosages, others will have `1/2`. 
3. `CNV`s have way crazier dosages. They should be modelled more holistically.
4. Modelling TPMs/counts as an outcome is really annoying (requiring something like DESeq optimally), and non-parametric approaches like the MWU cannot handle multiple covariates.

This is a ton to keep track of, as we want to handle these scenarios differently. Here's what we'll do. For each `SV` and `gene` pair, we'll generate the following:

1. The average rank of affected samples (non-zero allele). This will be incorrect for CNVs that can have a range of copy numbers that includes negative ones.
    * We also include the expression of affected vs. not samples
2. MWU test between affected (non-zero allele) and not. This will be incorrect for SVs that have few samples and for SVs that have more dosages.
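3. An ordinal logistic regression model, incorporating dosage and using the ranks of gene expression as the outcome. This will be broadly incorrect because it does not appropriately model counts. (Note: this step is currently disabled in the analysis loop below, so the results contain no ordinal-regression columns.)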
4. The average expression of the gene

In [27]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

In [28]:
# number of SVs retained for one of the datasets, as a sense of scale
len(svs_to_analyze['gmkf_neuroblastoma_tpms'])

31610

I hate this nested code structure, but anything else is too complicated. We do this for every cohort.

In [29]:
# Minimum number of affected samples (dosage > 0) required before the
# Mann-Whitney U test is attempted.
# NOTE(review): the previous comment at the exit check said SVs with "fewer
# than 3 affected samples" should be skipped, but the code only ever skipped
# singletons. The threshold is kept at 2 so existing p-values do not change;
# raise it to 3 if that was the real intent.
MIN_AFFECTED_FOR_MWU = 2

# For every dataset and every SV x gene pair, summarize the expression of the
# gene in samples carrying the SV versus those that do not.
results = []
for l, data in rna_dict.items():
    print(l)

    # SV x sample dosage matrix for this dataset, indexed by SV ID
    analysis_dosages = dosage_dict[l].iloc[:, 3:].set_index('ID')

    sv_gene_df_cohort = cohort_sv_gene_dict[l]

    # drop genes not present in our counts
    sv_gene_df_cohort = sv_gene_df_cohort[sv_gene_df_cohort['gene'].isin(data.index)]

    cohort_results = []

    base_row = [l]

    for i, (_, row) in enumerate(sv_gene_df_cohort.iterrows()):
        # lightweight progress indicator
        if i % 500 == 0:
            print(i, end = ', ')

        gene = row['gene']
        sv = row['name']
        gr = row['genic_relationship']

        storage_row = base_row + [sv, gene, gr]

        # dosages of this SV, restricted to samples with a genotype
        sv_dosages = analysis_dosages.loc[sv].dropna()

        # expression of the gene in those same samples
        gene_expression = data.loc[gene, sv_dosages.index]
        mean_expression = gene_expression.mean()

        storage_row.append(mean_expression)

        ###########################
        ### RANK-BASED APPROACH ###
        ###########################
        affected_samples = sv_dosages[sv_dosages > 0].index
        unaffected_samples = sv_dosages[sv_dosages <= 0].index

        # rank the expression: 0 = highest expression, scaled so that the
        # lowest-expressing sample gets 1 (this orientation is flipped in a
        # later cell)
        expression_ranks = (gene_expression.rank(ascending = False) - 1)
        norm_expression_ranks = expression_ranks / (len(gene_expression) - 1)

        avg_affected_rank = norm_expression_ranks.loc[affected_samples].mean()

        # store this data
        storage_row += [len(sv_dosages), len(affected_samples), avg_affected_rank]

        # store the expression of affected and unaffected
        gene_exp_affected = gene_expression.loc[affected_samples]
        gene_exp_unaffected = gene_expression.loc[unaffected_samples]
        storage_row += [gene_exp_affected.mean(), gene_exp_unaffected.mean()]

        ##################
        ### EXIT CHECK ###
        ##################

        # With too few affected samples a MWU test doesn't make sense, so the
        # p-value is recorded as missing. (With zero affected samples the test
        # has nothing to compare, so the p-value is NaN either way.)
        if len(affected_samples) < MIN_AFFECTED_FOR_MWU:
            storage_row += [np.nan]
            cohort_results.append(storage_row)
            continue

        ################
        ### MWU TEST ###
        ################
        try:
            p = stats.mannwhitneyu(gene_exp_affected, gene_exp_unaffected)[1]
        except ValueError:
            # scipy rejects degenerate inputs (e.g. an empty group); record the
            # test as not computable instead of aborting the whole run. Only
            # ValueError is caught so that a keyboard interrupt still stops
            # this long-running loop.
            p = np.nan

        storage_row += [p]

        ###################################
        ### LOGISTIC ORDINAL REGRESSION ###
        ###################################
        # Currently disabled: an ordinal logistic regression of expression
        # rank on dosage (statsmodels OrderedModel) was sketched here but is
        # not run, so no ordinal-regression columns are produced.
        # NOTE: if it is re-enabled, do not bind its working frame to `data` -
        # that name is this loop's expression matrix.

        cohort_results.append(storage_row)

    columns = ['cohort', 'name', 'gene', 'genic_relationship', 'mean_exp', 'num_rna_genotyped', 'num_greater_0_dosage', 'mean_greater_0_dosage_rank', 'mean_greater_0_exp', 'mean_leq_0_exp',
               'mwu_p']

    cohort_results = pd.DataFrame(cohort_results, columns = columns)

    # merge with the cohort's sv information; `num_samples_with_rna_affected`
    # is dropped after the merge
    sv_gene_df_cohort = sv_gene_df_cohort.merge(cohort_results, on = ['name', 'gene', 'genic_relationship', 'cohort'], how = 'left').drop(columns = 'num_samples_with_rna_affected')

    results.append(sv_gene_df_cohort)

# one long table covering every dataset
results = pd.concat(results)

gmkf_neuroblastoma_normalized_counts
0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 10500, 11000, 11500, 12000, 12500, 13000, 13500, 14000, 14500, 15000, 15500, 16000, 16500, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 22000, 22500, 23000, gmkf_neuroblastoma_tpms
0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 10500, 11000, 11500, 12000, 12500, 13000, 13500, 14000, 14500, 15000, 15500, 16000, 16500, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 22000, 22500, 23000, gmkf_neuroblastoma_normalized_tpms
0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 10500, 11000, 11500, 12000, 12500, 13000, 13500, 14000, 14500, 15000, 15500, 16000, 16500, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 22000, 225

Finally, I do want to swap the rank around, such that `0 = low | 1 = high`. 

In [30]:
# flip the normalized rank so that 0 = lowest and 1 = highest expression
# NOTE: not idempotent - running this cell a second time flips the values back
results['mean_greater_0_dosage_rank'] = 1 - results['mean_greater_0_dosage_rank']

In [31]:
# write the combined results for all datasets to CSV
# (assumes the output directory already exists - TODO confirm)
results.to_csv('data/sv-expression-results/sv-gene-rna-results-all-cohorts-all-analyses.csv', index=False)