In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import glob

from sklearn import decomposition
from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')

# RNA SV Impact

In this notebook, we explore the effect of SVs on expression. Rather than examine specifics right now, this notebook is dedicated to assessing the effect of _every_ SV on expression. Other notebooks will parse and examine specific questions.

# Read in data

In [3]:
# define SVs and dosages for discovery and validation
sv_path = "gs://vanallen-pedsv-analysis/beds/PedSV.v2.5.4.full_cohort.analysis_samples.sites.bed.gz"
dosages_path = "gs://vanallen-pedsv-analysis/beds/PedSV.v2.5.4.full_cohort.analysis_samples.allele_dosages.bed.gz"

# define metadata
metadata_path = "gs://vanallen-pedsv-analysis/sample_info/PedSV.v2.5.4.cohort_metadata.w_control_assignments.tsv.gz"

In [4]:
# list of genes that are annotated - drop ensembl IDs
gene_ref_path = "data/gencode_hg38_protein_coding_genes_for_annotation.txt"
gene_ref = pd.read_csv(gene_ref_path)
gene_ref = gene_ref[~gene_ref['value'].str.startswith('ENSG00')]
gene_ref = gene_ref['value'].tolist()

Load metadata and SVs

In [5]:
metadata = pd.read_csv(
    metadata_path,
    sep="\t",
)

# add a sex label to metadata
metadata["sex"] = (metadata["chrX_CopyNumber"].round() < 2).astype(int)

First, we define the samples for GMKF neuroblastoma

In [6]:
cohort_samples = metadata.query('study == "GMKF" & neuroblastoma_case == 1')

# get cases
cohort_samples = cohort_samples['entity:sample_id'].tolist()

In [7]:
len(cohort_samples)

276

Next we load the RNA data

In [8]:
rna_data = pd.read_csv('data/rna/gmkf_neuroblastoma_tpms.csv', index_col = 0)

First, we determine how many samples overlap our SV dataset and the expression dataset

In [9]:
samples = set(rna_data.columns)

rna_samples = list(set(samples) & set(cohort_samples))
len(rna_samples)
# included_samples = metadata[(metadata['entity:sample_id'].isin(samples)) &
#                             (metadata[f'{disease}_case'] == True)]['entity:sample_id'].tolist()

# print(l, len(included_samples), len(data.columns))

89

So 89/688 cases have RNA (or 89 out of 276 GMKF samples)

Now we load the SVs. We only keep SVs and dosages for samples in our 89 samples.

In [10]:
###############
### Dosages ###
###############

# get the dosage columns first
dosage_cols = pd.read_csv(
    dosages_path,
    sep="\t",
    index_col=False,
    nrows = 0
)

# read in the full data but only for the columns that we care about
usecols = ['#chr', 'start', 'end', 'ID'] + [s for s in rna_samples if s in dosage_cols.columns]
cohort_dosages = pd.read_csv(
                    dosages_path,
                    sep="\t",
                    index_col=False,
                    usecols = usecols
                )

# export
cohort_dosages.to_csv(f'data/rna/dosages-for-sv-rna-analysis.csv', index = False)

In [11]:
cohort_dosages.head(2)

Unnamed: 0,#chr,start,end,ID,PT_1X6CJ589,PT_2QB9MP9J,PT_2RZN4HR2,PT_4Y3P2N1P,PT_5E269C8Z,PT_7APMD0HG,...,PT_D5BYDHZ9,PT_6DHGCDRP,PT_70BK6DFW,PT_7BAFX5PZ,PT_HZ4VWQP5,PT_SDPQ63J1,PT_K0BJPWY9,PT_9A9Q2YB3,PT_AGYJR7PZ,PT_26E4RFYV
0,chr1,12001,30001,PedSV.2.5.2_CNV_chr1_1,0.0,0.0,1.0,1.0,0.0,0.0,...,2.0,2.0,1.0,0.0,1.0,0.0,1.0,3.0,1.0,2.0
1,chr1,12001,40001,PedSV.2.5.2_DUP_chr1_1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,0.0,,,0.0,


Now we do some filtering on the dosage matrix, removing SVs that are poorly genotyped.

In particular, SVs that are not genotyped in more than 20% of samples and ones that have no counts in cases (obviously) are removed.

In [12]:
# SVs that are poorly genotyped in more than 20% of samples will be excluded
print(f'{len(cohort_dosages)} SVs before dropping...', end = ' ')

temp = cohort_dosages.iloc[:, 4:]

nan_svs = np.isnan(temp).mean(axis = 1) > 0.20

# SVs that have no counts will be excluded
nocount_svs = (temp.fillna(0) != 0).sum(axis = 1) == 0

# only keep others
kept_svs = ~(nan_svs | nocount_svs)

cohort_dosages = cohort_dosages[kept_svs]

svs_to_analyze = cohort_dosages['ID'].tolist()

print(len(svs_to_analyze), 'after')

228858 SVs before dropping... 31610 after


Now we get ALL the data for these SVs.

In [13]:
###############
##### SVs #####
###############
svs = pd.read_csv(
    sv_path,
    sep="\t",
)

# subset down to all SVs in our cohort
svs = svs[svs['name'].isin(svs_to_analyze)].reset_index(drop = True)
svs.to_csv('data/rna/svs-for-sv-rna-analysis.csv', index=False)

With that, let's get into it!

# Identify SVs that affect genes

Here, we'll identify SVs that nominally affect genes. At the end of this process, we will end up with a SV x gene flate dataframe, where each row contains information abotu the SV and its relationship to the gene.

In [14]:
# the genic relationships that we'll examine--these are the ones that are defined in the CWAS (other gene-SV relationships are ignored)
coding_cols = ['PREDICTED_COPY_GAIN', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_LOF', 'PREDICTED_PARTIAL_EXON_DUP']
noncoding_cols = ['PREDICTED_NEAREST_TSS', 'PREDICTED_INTRONIC', 'PREDICTED_PROMOTER', 'PREDICTED_UTR']

# some coding columns are not included in the CWAS, but should be included here to fully assess "all coding SVs"
unimportant_coding_cols = ['PREDICTED_DUP_PARTIAL', 'PREDICTED_MSV_EXON_OVERLAP', 'PREDICTED_TSS_DUP', 'PREDICTED_INV_SPAN', 'PREDICTED_BREAKEND_EXONIC']

# transform this into a lookup
gene_rel_lookup_dict = {}
for label, assignments in zip(['coding', 'noncoding', 'unimportant_coding'], 
                              [coding_cols, noncoding_cols, unimportant_coding_cols]):
    for a in assignments:
        gene_rel_lookup_dict[a] = label
        
gene_rel_lookup_dict['PREDICTED_COPY_GAIN']

'coding'

Identify all the SVs that have an entry in at least one of these columns

In [15]:
svs_that_affect_genes = (~pd.isnull(svs[coding_cols + noncoding_cols + unimportant_coding_cols])).sum(axis = 1) > 0

In [16]:
# an example
svs[svs_that_affect_genes][coding_cols + noncoding_cols + unimportant_coding_cols].head(2)

Unnamed: 0,PREDICTED_COPY_GAIN,PREDICTED_INTRAGENIC_EXON_DUP,PREDICTED_LOF,PREDICTED_PARTIAL_EXON_DUP,PREDICTED_NEAREST_TSS,PREDICTED_INTRONIC,PREDICTED_PROMOTER,PREDICTED_UTR,PREDICTED_DUP_PARTIAL,PREDICTED_MSV_EXON_OVERLAP,PREDICTED_TSS_DUP,PREDICTED_INV_SPAN,PREDICTED_BREAKEND_EXONIC
0,,,,,OR4F5,,,,,,,,
1,,,,,OR4F5,,,,,,,,


The first two SVs "affect" ORF45 (these are noncoding SVs where the nearest TSS is for ORF45)

In [17]:
gene_svs = svs[svs_that_affect_genes]
gene_svs.shape[0], gene_svs.shape[0] / svs.shape[0] 

(31610, 1.0)

By this approach all SVs are associated with a gene. This is primarily due to noncoding effects. What about coding effects alone?

In [18]:
coding_svs = (~pd.isnull(svs[coding_cols])).sum(axis = 1) > 0
print(coding_svs.sum())

print(((~pd.isnull(svs[unimportant_coding_cols])).sum(axis = 1) > 0).sum())

594
272


We now convert this to a flat dataframe, and further break out SVs that affect more than one gene into their own rows

In [19]:
sv_gene_df = []

# extract out just the SVs and their predicted effect on genes
gene_effect_df = gene_svs[['name'] + list(gene_rel_lookup_dict.keys())].set_index('name').copy()

# loop through this dataframe
for sv_name, row in gene_effect_df.iterrows():
    row = row[~pd.isnull(row)]
    
    # look through all the possible genic interactions
    for genic_rel, gene_list in row.iteritems():
        genic_cat = gene_rel_lookup_dict[genic_rel]
        
        # if multiple genes are present, sequentially append them
        for gene in gene_list.split(','):
            sv_gene_df.append([sv_name, genic_cat, genic_rel, gene])

# combine this data all together
sv_gene_df = pd.DataFrame(sv_gene_df, columns = ['name', 'sv_effect', 'genic_relationship', 'gene'])

In [20]:
sv_gene_df.head(2)

Unnamed: 0,name,sv_effect,genic_relationship,gene
0,PedSV.2.5.2_CNV_chr1_1,noncoding,PREDICTED_NEAREST_TSS,OR4F5
1,PedSV.2.5.2_DUP_chr1_1,noncoding,PREDICTED_NEAREST_TSS,OR4F5


In [21]:
sv_gene_df.head(4)

Unnamed: 0,name,sv_effect,genic_relationship,gene
0,PedSV.2.5.2_CNV_chr1_1,noncoding,PREDICTED_NEAREST_TSS,OR4F5
1,PedSV.2.5.2_DUP_chr1_1,noncoding,PREDICTED_NEAREST_TSS,OR4F5
2,PedSV.2.5.2_DUP_chr1_4,coding,PREDICTED_COPY_GAIN,OR4F5
3,PedSV.2.5.2_CNV_chr1_2,unimportant_coding,PREDICTED_MSV_EXON_OVERLAP,OR4F5


Now we add some features that help us filter SVs in the future:

In [24]:
# for the classes of SV effects
for sv_effect in ['coding', 'noncoding', 'unimportant_coding']:
    
    # count how many times each SV affects a gene with an effect of that class (a little confusing)
    counts = pd.DataFrame(sv_gene_df.query(f'sv_effect == "{sv_effect}"').groupby('name').size().astype(int), 
                          columns = [f'sv_{sv_effect}_counts']).reset_index()
    sv_gene_df = sv_gene_df.merge(counts, how = 'left')
    sv_gene_df[f'sv_{sv_effect}_counts'] = sv_gene_df[f'sv_{sv_effect}_counts'].fillna(0)

# add some info about the SVs themselves
sv_gene_df = sv_gene_df.merge(gene_svs[['#chrom', 'start', 'end', 'name', 'svtype']], on = ['name'], how = 'left')

In [25]:
sv_gene_df.head(2)

Unnamed: 0,name,sv_effect,genic_relationship,gene,sv_coding_counts,sv_noncoding_counts,sv_unimportant_coding_counts,#chrom_x,start_x,end_x,svtype_x,#chrom_y,start_y,end_y,svtype_y
0,PedSV.2.5.2_CNV_chr1_1,noncoding,PREDICTED_NEAREST_TSS,OR4F5,0.0,1.0,0.0,chr1,12000,30001,CNV,chr1,12000,30001,CNV
1,PedSV.2.5.2_DUP_chr1_1,noncoding,PREDICTED_NEAREST_TSS,OR4F5,0.0,1.0,0.0,chr1,12000,40001,DUP,chr1,12000,40001,DUP


The point is that now we have a way to remove SVs that affect too many genes.

## Remove SVs too far from their genes

Some noncoding SVs are too far from their genes to reasonably have an impact on expression. We remove those here.

In [26]:
intergenic_sv_distances = pd.read_csv('data/cwas/intergenic-sv-to-gene-distances.csv').rename(columns = {'gene_name': 'gene'})

In [27]:
sv_gene_df = sv_gene_df.merge(intergenic_sv_distances[['name', 'gene', 'distance']], on = ['name', 'gene'], how = 'left')

In [28]:
rows_to_drop = sv_gene_df[(sv_gene_df['genic_relationship'] == "PREDICTED_NEAREST_TSS") &
                          (sv_gene_df['distance'].fillna(1e6) > 5e5)]
rows_to_drop.head(2)

Unnamed: 0,name,sv_effect,genic_relationship,gene,sv_coding_counts,sv_noncoding_counts,sv_unimportant_coding_counts,#chrom_x,start_x,end_x,svtype_x,#chrom_y,start_y,end_y,svtype_y,distance
281,PedSV.2.5.2_CPX_chr1_37,noncoding,PREDICTED_NEAREST_TSS,C10orf95,0.0,2.0,0.0,chr1,13427116,13427117,CPX,chr1,13427116,13427117,CPX,
282,PedSV.2.5.2_CPX_chr1_37,noncoding,PREDICTED_NEAREST_TSS,PRAMEF20,0.0,2.0,0.0,chr1,13427116,13427117,CPX,chr1,13427116,13427117,CPX,


In [29]:
sv_gene_df = sv_gene_df.drop(index = rows_to_drop.index).reset_index(drop = True)

# Synthesizing with the RNA

We'll be analyzing only singletons later, but we'll crunch all the data now. For each gene-SV pair, I want to calculate the average expression rank of samples that have the SV.

## An example

A simple example first, using a singleton SV.

In [30]:
# examine a deletion
test_sv = "PedSV.2.5.2_DEL_chr1_315"
sv_gene_df.query(f'name == "{test_sv}"')

Unnamed: 0,name,sv_effect,genic_relationship,gene,sv_coding_counts,sv_noncoding_counts,sv_unimportant_coding_counts,#chrom_x,start_x,end_x,svtype_x,#chrom_y,start_y,end_y,svtype_y,distance
63,PedSV.2.5.2_DEL_chr1_315,coding,PREDICTED_LOF,ATAD3B,1.0,0.0,0.0,chr1,1493988,1509995,DEL,chr1,1493988,1509995,DEL,


We extract the expression of this gene, the samples that are affected by the SV, and then calculate their average rank

In [31]:
# first extract the dosage of this SV in our 89 samples
sv_dosages = cohort_dosages.query(f'ID == "{test_sv}"').iloc[0, 4:]
sv_dosages

PT_1X6CJ589    0.0
PT_2QB9MP9J    0.0
PT_2RZN4HR2    0.0
PT_4Y3P2N1P    0.0
PT_5E269C8Z    0.0
              ... 
PT_SDPQ63J1    0.0
PT_K0BJPWY9    NaN
PT_9A9Q2YB3    NaN
PT_AGYJR7PZ    0.0
PT_26E4RFYV    NaN
Name: 222, Length: 89, dtype: object

In [32]:
affected_samples = sv_dosages[sv_dosages > 0].index
unaffected_samples = sv_dosages[sv_dosages <= 0].index

len(affected_samples), len(unaffected_samples)

(1, 80)

So this is a singleton SV - only one sample is affected (80 are not, and 8 are not genotyped)

Now get the gene expression of this gene:

In [33]:
gene_expression = rna_data.loc["ATAD3B", sv_dosages.index]
gene_expression

PT_1X6CJ589    15.08
PT_2QB9MP9J     1.65
PT_2RZN4HR2     8.09
PT_4Y3P2N1P     9.82
PT_5E269C8Z    15.19
               ...  
PT_SDPQ63J1    17.69
PT_K0BJPWY9     9.92
PT_9A9Q2YB3     8.81
PT_AGYJR7PZ    18.54
PT_26E4RFYV    10.69
Name: ATAD3B, Length: 89, dtype: float64

In [34]:
 # rank the expression
expression_ranks = (gene_expression.rank() - 1)
norm_expression_ranks =  expression_ranks / (len(gene_expression) - 1)

avg_affected_rank = norm_expression_ranks.loc[affected_samples].mean()

# ascending = False results in highest to lowest, so 
avg_affected_rank

0.09090909090909091

So the sample affected by this SV has a lower level of expression (it is in the 10th percentile for expression of this gene)

## Generalize to all SVs

We simply apply this process systematically

In [35]:
results = []
    
analysis_dosages = cohort_dosages.iloc[:, 3:].set_index('ID')

# drop genes not present in our counts
sv_gene_df = sv_gene_df[sv_gene_df['gene'].isin(rna_data.index)]

for i, (index, row) in enumerate(sv_gene_df.iterrows()):
    if i % 500 == 0:
        print(i, end = ', ')

    gene = row['gene']
    sv = row['name']
    gr = row['genic_relationship']

    storage_row = [sv, gene, gr]

    # get the dosages for this sv
    sv_dosages = analysis_dosages.loc[sv].dropna()

    # get the expression for this gene
    gene_expression = rna_data.loc[gene, sv_dosages.index]
    mean_expression = gene_expression.mean()

    storage_row.append(mean_expression)

    ###########################
    ### RANK-BASED APPROACH ###
    ###########################
    affected_samples = sv_dosages[sv_dosages > 0].index
    unaffected_samples = sv_dosages[sv_dosages <= 0].index

    # rank the expression
    expression_ranks = (gene_expression.rank() - 1)
    norm_expression_ranks =  expression_ranks / (len(gene_expression) - 1)

    avg_affected_rank = norm_expression_ranks.loc[affected_samples].mean()

    # store this data
    storage_row += [len(sv_dosages), len(affected_samples), avg_affected_rank]

    # store the expression of affected and unaffected
    gene_exp_affected = gene_expression.loc[affected_samples]
    gene_exp_unaffected = gene_expression.loc[unaffected_samples]
    storage_row += [gene_exp_affected.mean(), gene_exp_unaffected.mean()]

    ##################
    ### EXIT CHECK ###
    ##################

    # If there are fewer than 3 affected samples, a MWU
    # doesn't make sense. We exit then.
    if len(affected_samples) == 1:
        storage_row += [np.nan]
        results.append(storage_row)
        continue

    ################
    ### MWU TEST ###
    ################ 
    
    # although the paper does not examine SVs with more than one sample, I initially tried
    # a MWU test for this purpose (which I suspect was not statistically sound)
    try:
        p = stats.mannwhitneyu(gene_exp_affected, gene_exp_unaffected)[1]
    except:
        p = np.nan

    storage_row += [p]
    results.append(storage_row)

columns = ['name', 'gene', 'genic_relationship', 'mean_exp', 'num_rna_genotyped', 'num_greater_0_dosage', 'mean_greater_0_dosage_rank', 'mean_greater_0_exp', 'mean_leq_0_exp', 
           'mwu_p']

results = pd.DataFrame(results, columns = columns)

0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 10500, 11000, 11500, 12000, 12500, 13000, 13500, 14000, 14500, 15000, 15500, 16000, 16500, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 22000, 22500, 23000, 

In [36]:
results.head(2)

Unnamed: 0,name,gene,genic_relationship,mean_exp,num_rna_genotyped,num_greater_0_dosage,mean_greater_0_dosage_rank,mean_greater_0_exp,mean_leq_0_exp,mwu_p
0,PedSV.2.5.2_CNV_chr1_6,OR4F29,PREDICTED_NEAREST_TSS,0.266629,89,15,0.492045,0.247333,0.270541,0.912668
1,PedSV.2.5.2_CNV_chr1_7,OR4F29,PREDICTED_NEAREST_TSS,0.266629,89,62,0.489736,0.255484,0.292222,0.620135


In [37]:
# merge with the cohort's sv information
sv_gene_df = sv_gene_df.merge(results, on = ['name', 'gene', 'genic_relationship'], how = 'left')

In [38]:
sv_gene_df.head(4)

Unnamed: 0,name,sv_effect,genic_relationship,gene,sv_coding_counts,sv_noncoding_counts,sv_unimportant_coding_counts,#chrom_x,start_x,end_x,...,end_y,svtype_y,distance,mean_exp,num_rna_genotyped,num_greater_0_dosage,mean_greater_0_dosage_rank,mean_greater_0_exp,mean_leq_0_exp,mwu_p
0,PedSV.2.5.2_CNV_chr1_6,noncoding,PREDICTED_NEAREST_TSS,OR4F29,0.0,1.0,0.0,chr1,257666,262901,...,262901,CNV,187839.0,0.266629,89,15,0.492045,0.247333,0.270541,0.912668
1,PedSV.2.5.2_CNV_chr1_7,noncoding,PREDICTED_NEAREST_TSS,OR4F29,0.0,1.0,0.0,chr1,262333,288667,...,288667,CNV,162073.0,0.266629,89,62,0.489736,0.255484,0.292222,0.620135
2,PedSV.2.5.2_CNV_chr1_8,noncoding,PREDICTED_NEAREST_TSS,OR4F29,0.0,1.0,0.0,chr1,431000,436001,...,436001,CNV,14739.0,0.266629,89,7,0.330357,0.128571,0.278415,0.112705
3,PedSV.2.5.2_CNV_chr1_9,noncoding,PREDICTED_NEAREST_TSS,OR4F16,0.0,1.0,0.0,chr1,585988,596501,...,596501,CNV,89215.0,0.266629,89,6,0.532197,0.275,0.266024,0.78703


In [39]:
sv_gene_df.to_csv('data/rna/sv-gene-rna-results.csv', index=False)