### Selection of misexpression-associated SVs 

1. **Selection of misexpression-associated and control SVs**
    * Misexpression variants: 
        * To enrich for variants associated with misexpression, we selected variants with a median TPM > 0.5 and median z-score > 2 while removing variants where a carrier has expression < 0.1 TPM. 
        * This retains variants with some variability around the misexpression cutoff but removes variants with bimodal expression distributions that are likely non-causal. 

    * Control variants:
        * Selected variants in +\- 200kb in cis to gene that were misexpressed across all INTERVAL samples
        * Selected variants with a maximum TPM = 0 across all inactive genes within 200kb
    
2. **Annotation of misexpression-associated SVs with VEP**
3. **Annotation of misexpression-associated SVs by position**
4. **Comparison of misexpression-associated and control SV length**

In [1]:
import sys 
import pandas as pd 
import pysam
import numpy as np
from pathlib import Path
from collections import Counter
from scipy.stats import mannwhitneyu

In [2]:
wkdir="/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)
# inputs 
sv_info_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt"
express_info_dir = wkdir_path.joinpath("4_vrnt_enrich/sv_count_carriers/gene_body/200kb_window/express_carrier_info")
misexp_genes_path = wkdir_path.joinpath("2_misexp_qc/misexp_metrics/misexp_genes_tpm0.5_z2.txt")
vep_msc_path = wkdir_path.joinpath("4_vrnt_enrich/sv_vep/msc/SV_vep_hg38_msc_parsed.tsv")
vep_all_path = wkdir_path.joinpath("4_vrnt_enrich/sv_vep/all/SV_vep_hg38_all_parsed.tsv")
gencode_path = wkdir_path.joinpath("reference/gencode/gencode.v31.annotation.sorted.gtf.gz")
# output 
output_dir = wkdir_path.joinpath("5_misexp_vrnts/test_cntrl_sets/")
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# constants 
SV_TYPES = ["DEL", "DUP", "INV", "MEI"]
CHROMOSOMES = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',
               'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',
               'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18',
               'chr19', 'chr20', 'chr21', 'chr22']
# MAF cutoffs 
af_lower = 0
af_upper = 0.01
# misexpression cutoffs 
z_cutoff_misexp = 2 
tpm_cutoff_misexp = 0.5

In [4]:
# load structural variant info
sv_info_df = pd.read_csv(sv_info_path, sep="\t", dtype={"plinkID": str}).rename(columns={"plinkID":"vrnt_id"})
sv_info_id_af_df =  sv_info_df[["vrnt_id", "AF"]]
# misexpressed genes
misexp_genes = pd.read_csv(misexp_genes_path, header=None)[0].unique()
print(f"Number of misexpressed genes: {len(misexp_genes)}")

Number of misexpressed genes: 4437


In [5]:
# combine all variant-gene-sample pairs in 200kb windows 
all_sv_in_window_df_list = []
misexp_sv_gene_list, cntrl_sv_gene_list = [], []
express_info_path = Path(express_info_dir)
for chrom in CHROMOSOMES:     
    print(chrom)
    sv_intersect_express_path = express_info_path.joinpath(f"{chrom}_express_carrier_info.tsv")
    sv_intersect_express_df = pd.read_csv(sv_intersect_express_path, sep="\t")
    sv_intersect_express_genes_df = sv_intersect_express_df[sv_intersect_express_df.gene_id.isin(misexp_genes)]
    # add SV information 
    sv_intersect_express_info_df = pd.merge(sv_intersect_express_genes_df, 
                                            sv_info_id_af_df,
                                            how="inner", 
                                            on=["vrnt_id", "AF"])
    sv_in_window_df = sv_intersect_express_info_df[(sv_intersect_express_info_df["AF"] >= af_lower) &
                                                   (sv_intersect_express_info_df["AF"] < af_upper) & 
                                                   (sv_intersect_express_info_df.genotype.isin(['(0, 1)', '(1, 1)']))].copy()
    # add variant-gene ID  
    sv_in_window_df["vrnt_gene_id"] = sv_in_window_df.vrnt_id + "|" + sv_in_window_df.gene_id 
    # add rare SVs in cis to misexpressed genes 
    all_sv_in_window_df_list.append(sv_in_window_df)

chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22


In [6]:
# combine all chromosomes  
all_sv_in_window_df = pd.concat(all_sv_in_window_df_list)
# all variants in 200 kb windows
vrnt_id_in_windows = set(all_sv_in_window_df.vrnt_id.unique())
print(f"Total number of unique variants in windows: {len(vrnt_id_in_windows)}")

# control variants: all variants with max expression of gene in cis window is TPM = 0 
all_sv_in_window_max_tpm_df = all_sv_in_window_df.groupby("vrnt_id", as_index=False).TPM.max()
vrnt_gene_id_cntrl_df = all_sv_in_window_max_tpm_df[all_sv_in_window_max_tpm_df.TPM == 0]
vrnt_id_cntrl = set(vrnt_gene_id_cntrl_df.vrnt_id.unique())
print(f"Number of control variants: {len(vrnt_id_cntrl)}")
# checks
print(f"Max TPM of gene in cis to controls: {all_sv_in_window_df[all_sv_in_window_df.vrnt_id.isin(vrnt_id_cntrl)].TPM.max()}")
print(f"Max z-score of gene in cis to controls: {all_sv_in_window_df[all_sv_in_window_df.vrnt_id.isin(vrnt_id_cntrl)]['z-score'].max()}")

Total number of unique variants in windows: 23159
Number of control variants: 20150
Max TPM of gene in cis to controls: 0.0
Max z-score of gene in cis to controls: -0.0187738852838987


In [7]:
# misexpression SVs, median TPM > 0.5 and z-score > 2
vrnt_gene_median_tpm_z_df = all_sv_in_window_df.groupby(["vrnt_gene_id"], as_index=False)[["TPM", "z-score"]].quantile(q=0.5, interpolation='linear')
vrnt_gene_median_tpm05_z2 = set(vrnt_gene_median_tpm_z_df[(vrnt_gene_median_tpm_z_df.TPM > tpm_cutoff_misexp) & 
                                                          (vrnt_gene_median_tpm_z_df["z-score"] > z_cutoff_misexp)].vrnt_gene_id.unique())
vrnt_id_pass_median = set([vrnt_gene_id.split("|")[0] for vrnt_gene_id in vrnt_gene_median_tpm05_z2])
print(f"Number of variants passing quantile cutoff: {len(vrnt_id_pass_median)}")
# remove SVs with a carrier with TPM < 0.1 
vrnt_gene_id_median_to_rmv = set(all_sv_in_window_df[(all_sv_in_window_df.vrnt_gene_id.isin(vrnt_gene_median_tpm05_z2)) & 
                                                     (all_sv_in_window_df.TPM < 0.1)].vrnt_gene_id.unique())
vrnt_gene_id_median_pass = vrnt_gene_median_tpm05_z2 - vrnt_gene_id_median_to_rmv
vrnt_id_misexp = set([vrnt_gene_id.split("|")[0] for vrnt_gene_id in vrnt_gene_id_median_pass])
print(f"Number of misexpression variants: {len(vrnt_id_misexp)}")
# check overlap between control and misexpression variants 
print(f"Overlap between control and misexpression SVs: {len(vrnt_id_cntrl.intersection(vrnt_id_misexp))}")

Number of variants passing quantile cutoff: 124
Number of misexpression variants: 105
Overlap between control and misexpression SVs: 0


In [8]:
# total variant carriers pairs
misexp_vrnt_carrier_df = all_sv_in_window_df[all_sv_in_window_df.vrnt_gene_id.isin(vrnt_gene_id_median_pass)]
print(f"Number of misexpression variant carriers: {len(misexp_vrnt_carrier_df)}")

Number of misexpression variant carriers: 150


In [9]:
# write variant carrier information to file 
misexp_vrnt_carrier_info_df = misexp_vrnt_carrier_df[["vrnt_id", "egan_id", "rna_id", "genotype", "gene_id", "TPM", "z-score"]]
misexp_vrnt_carrier_info_path = output_dir.joinpath("misexp_vrnt_gene_smpl.tsv")
misexp_vrnt_carrier_info_df.to_csv(misexp_vrnt_carrier_info_path, sep="\t", index=False)

In [10]:
# misexpression-associated gene-SV pairs 
misexp_sv_gene_pairs_df = misexp_vrnt_carrier_df[["vrnt_id", "gene_id"]].drop_duplicates()

**Count number of SV types in control and misexpression groups**

In [11]:
all_sv_misexp_cntrl = vrnt_id_misexp.union(vrnt_id_cntrl)
print(f"Total number of SVs in controls and misexpression groups: {len(all_sv_misexp_cntrl)}")
sv_info_misexp_cntrl_df = sv_info_df[sv_info_df.vrnt_id.isin(all_sv_misexp_cntrl)].copy()
if sv_info_misexp_cntrl_df.shape[0] != len(all_sv_misexp_cntrl): 
    raise ValueError("Number of variants in SV info dataframe does not match total variants.")
sv_info_misexp_cntrl_df["group"] = np.where(sv_info_misexp_cntrl_df.vrnt_id.isin(vrnt_id_misexp), "Misexpression", "Control")
sv_misexp_cntrl_count_types_df = sv_info_misexp_cntrl_df.groupby(["SVTYPE", "group"], as_index=False).vrnt_id.nunique()
sv_misexp_cntrl_count_types_df = sv_misexp_cntrl_count_types_df.rename(columns={"vrnt_id": "vrnt_count"})

Total number of SVs in controls and misexpression groups: 20255


In [12]:
# add labels for plot
def label_threshold(row): 
    if row["group"] == "Control": 
        if row["vrnt_count"] > 1000: 
            return str(row["vrnt_count"])
        else:
            return np.nan
    else:
        if row["vrnt_count"] > 10: 
            return str(row["vrnt_count"])
        else:
            return np.nan
        
sv_misexp_cntrl_count_types_df["label"] = sv_misexp_cntrl_count_types_df.apply(label_threshold, axis=1)

In [13]:
# write to file 
misexp_sv_in_window_path = output_dir.joinpath("misexp_contrl_sv_type_count.tsv")
sv_misexp_cntrl_count_types_df.to_csv(misexp_sv_in_window_path, sep="\t", index=False)

**Write list of selected variants and bedfiles**

In [14]:
# write all control and misexpression-associated SVs to file 
all_sv_in_window_path = output_dir.joinpath("vrnt_id_in_window_cntrl_misexp_genes.txt")
with open(all_sv_in_window_path, "w") as f_out: 
    for vrnt_id in all_sv_misexp_cntrl: 
        f_out.write(f"{vrnt_id}\n")
        
# write all misexpression-associated variants to file  
misexp_sv_in_window_path = output_dir.joinpath("vrnt_id_misexp_tpm_zscore_median.txt")
with open(misexp_sv_in_window_path, "w") as f_out: 
    for vrnt_id in vrnt_id_misexp: 
        f_out.write(f"{vrnt_id}\n")
        
# write all tested SVs to bed file (chrN format)
vrnt_id_bed_df = sv_info_misexp_cntrl_df[["chr", "pos", "end", "vrnt_id"]]
vrnt_id_bed_path = output_dir.joinpath(f"vrnt_id_in_windows_misexp_genes.bed")
vrnt_id_bed_df.to_csv(vrnt_id_bed_path, sep="\t", header=None, index=False)

# write all tested SVs to bed file (chromosome as number format)
vrnt_id_bed_chr_num_df = vrnt_id_bed_df.copy()
vrnt_id_bed_chr_num_df["chr"] = vrnt_id_bed_chr_num_df.chr.str.split("chr").str[1]
vrnt_id_chr_num_bed_path = output_dir.joinpath(f"vrnt_id_in_windows_chr_num_misexp_genes.bed")
vrnt_id_bed_chr_num_df.to_csv(vrnt_id_chr_num_bed_path, sep="\t", header=None, index=False)

# write all control and misexpression-associated SV information to file 
sv_info_misexp_cntrl_path = output_dir.joinpath(f"vrnt_id_misexp_cntrl_info.tsv")
sv_info_misexp_cntrl_df.to_csv(sv_info_misexp_cntrl_path, sep="\t", index=False)

### VEP consequence by gene 

In [15]:
vep_msc_ranks = [
    "transcript_ablation",
    "splice_acceptor_variant",
    "splice_donor_variant",
    "stop_gained",
    "frameshift_variant",
    "stop_lost",
    "start_lost",
    "transcript_amplification",
    "inframe_insertion",
    "inframe_deletion",
    "missense_variant",
    "protein_altering_variant",
    "splice_donor_5th_base_variant", 
    "splice_region_variant",
    "splice_donor_region_variant", 
    "splice_polypyrimidine_tract_variant"
    "incomplete_terminal_codon_variant",
    "start_retained_variant",
    "stop_retained_variant",
    "synonymous_variant",
    "coding_sequence_variant",
    "mature_miRNA_variant",
    "5_prime_UTR_variant",
    "3_prime_UTR_variant",
    "non_coding_transcript_exon_variant",
    "intron_variant",
    "NMD_transcript_variant",
    "non_coding_transcript_variant",
    "coding_transcript_variant",
    "upstream_gene_variant",
    "downstream_gene_variant",
    "TFBS_ablation",
    "TFBS_amplification",
    "TF_binding_site_variant",
    "regulatory_region_ablation",
    "regulatory_region_amplification",
    "feature_elongation",
    "regulatory_region_variant",
    "feature_truncation",
    "intergenic_variant",
    "sequence_variant",
    "no_predicted_effect"
]

# from all VEP consequences that have no gene annotation
vep_msc_not_linked_to_gene = ['TFBS_ablation', 
                              'TF_binding_site_variant',
                              'regulatory_region_variant', 
                              'TFBS_amplification',
                              'intergenic_variant', 
                              'regulatory_region_ablation',
                              'regulatory_region_amplification']

In [16]:
# load all VEP consequences 
vep_all_df = pd.read_csv(vep_all_path, sep="\t")
sv_gene_pairs_vep_df = pd.merge(misexp_sv_gene_pairs_df, 
                                vep_all_df[["vrnt_id", "gene_id", "Consequence"]], 
                                on=["vrnt_id", "gene_id"], 
                                how="left").fillna("no_predicted_effect")
# group consequences together for each variant-gene pair
sv_gene_pairs_vep_df["gene_consequence"] = sv_gene_pairs_vep_df.groupby(["vrnt_id", "gene_id"])['Consequence'].transform(lambda x: ','.join(x))

In [17]:
# remove duplicates arising from variant have different consequences across gene transcripts 
sv_gene_pairs_vep_collapse_df = sv_gene_pairs_vep_df.drop(columns=["Consequence"]).drop_duplicates()
if sv_gene_pairs_vep_collapse_df.shape[0] != misexp_sv_gene_pairs_df.shape[0]: 
    raise ValueError("Variant gene pair number does not match number of variant gene pairs with consequence.")

In [18]:
# assign each variant-gene pair a unique consequence based on VEP rank  
vrnt_gene_pair_msc_consq = []
for index, row in sv_gene_pairs_vep_collapse_df.iterrows(): 
    gene_consequence = row["gene_consequence"]
    gene_consequence_list = gene_consequence.split(",")
    for consq in vep_msc_ranks: 
        if consq in gene_consequence_list: 
            break 
    vrnt_gene_pair_msc_consq.append(consq)
if sv_gene_pairs_vep_collapse_df.shape[0] != len(vrnt_gene_pair_msc_consq): 
    raise ValueError("Number of MSC per gene does not match number of variant-gene pairs.")
sv_gene_pairs_vep_gene_msc_df = sv_gene_pairs_vep_collapse_df.copy()
sv_gene_pairs_vep_gene_msc_df["consequence"] = vrnt_gene_pair_msc_consq
sv_gene_pairs_vep_gene_msc_df = sv_gene_pairs_vep_gene_msc_df.drop(columns=["gene_consequence"])
# split into variants with annotated gene effect and no predicted gene effect
vrnt_gene_effect_df = sv_gene_pairs_vep_gene_msc_df[sv_gene_pairs_vep_gene_msc_df.consequence != "no_predicted_effect"]
no_predicted_effect_df = sv_gene_pairs_vep_gene_msc_df[sv_gene_pairs_vep_gene_msc_df.consequence == "no_predicted_effect"]
# in cases where variant has no predicted effect add most severe consequence if regulatory 
# load VEP MSC annotations
vep_msc_df = pd.read_csv(vep_msc_path, sep="\t").rename(columns={"Uploaded_variation": "vrnt_id", "Consequence": "msc"})
# merge VEP MSC with no predicted effect 
no_prediced_effect_df = pd.merge(no_predicted_effect_df[["vrnt_id", "gene_id"]], 
                                 vep_msc_df[["vrnt_id", "msc"]], 
                                 on="vrnt_id", 
                                 how="left")
# check for NaNs 
if no_prediced_effect_df.msc.isnull().values.any(): 
    raise ValueError("Variants missing VEP most severe consequence.")
no_prediced_effect_df["consequence"] = np.where(no_prediced_effect_df.msc.isin(vep_msc_not_linked_to_gene), 
                                                no_prediced_effect_df.msc, 
                                                "no_predicted_effect")
no_prediced_effect_trunc_df = no_prediced_effect_df[["vrnt_id", "gene_id", "consequence"]]
# combine variants with annotated gene effect and variants with no predicted effect with updated regulatory consequence
vrnt_gene_pair_consq_msc_added_df = pd.concat([vrnt_gene_effect_df, no_prediced_effect_trunc_df])
if vrnt_gene_pair_consq_msc_added_df[["vrnt_id", "gene_id"]].drop_duplicates().shape[0] != misexp_sv_gene_pairs_df.shape[0]: 
    raise ValueError("Number of gene pairs in expression input does not match number of gene pairs with annotated consequence.")
# add SV type
vrnt_gene_pair_consq_msc_added_df = pd.merge(vrnt_gene_pair_consq_msc_added_df, 
                                             sv_info_df[["vrnt_id", "SVTYPE"]], 
                                             on="vrnt_id", 
                                             how="inner")
# write to file 
vrnt_id_gene_consq_msc_path = output_dir.joinpath("misexp_vrnt_gene_msc_consq.tsv")
vrnt_gene_pair_consq_msc_added_df.to_csv(vrnt_id_gene_consq_msc_path, sep="\t", index=False)

In [19]:
# consequence names 
consequence_names_dict = {
    '3_prime_UTR_variant': "3' UTR variant",
    '5_prime_UTR_variant': "5' UTR variant",
    'TFBS_ablation': "TFBS ablation",
    'TF_binding_site_variant': "TF binding site variant",
    'coding_sequence_variant': "Coding sequence variant",
    'downstream_gene_variant': "Downstream gene variant",
    'frameshift_variant': "Frameshift variant",
    'inframe_deletion': "Inframe deletion",
    'intergenic_variant': "Intergenic variant",
    'intron_variant': "Intron variant",
    'no_predicted_effect': "No predicted effect on gene",
    'non_coding_transcript_exon_variant': "Non-coding transcript exon variant",
    'regulatory_region_ablation': "Regulatory region ablation",
    'regulatory_region_variant': "Regulatory region variant",
    'stop_lost': "Stop lost",
    'transcript_ablation': "Transcript ablation",
    'upstream_gene_variant': "Upstream gene variant",
    'TFBS_amplification': "TFBS amplification",
    'regulatory_region_amplification': "Regulatory region amplification",
    'transcript_amplification': "Transcript amplification"
}

# consequence groups 
conseq_group_dict = {
    '3_prime_UTR_variant': "3' UTR variant",
    '5_prime_UTR_variant': "5' UTR variant",
    'TFBS_ablation': "Regulatory",
    'TF_binding_site_variant': "Regulatory",
    'coding_sequence_variant': "Coding",
    'downstream_gene_variant': "Downstream (5 kb)",
    'frameshift_variant': "Coding",
    'inframe_deletion': "Coding",
    'intergenic_variant': "Non-coding",
    'intron_variant': "Non-coding",
    'no_predicted_effect': "No predicted effect",
    'non_coding_transcript_exon_variant': "Non-coding",
    'regulatory_region_ablation': "Regulatory",
    'regulatory_region_variant': "Regulatory",
    'stop_lost': "Coding",
    'transcript_ablation': "Transcript ablation",
    'upstream_gene_variant': "Upstream (5 kb)",
    'TFBS_amplification': "Regulatory",
    'regulatory_region_amplification': "Regulatory",
    'transcript_amplification': "Transcript amplification"    
}

# count variant consequences
vrnt_id_gene_conseq_count_df = vrnt_gene_pair_consq_msc_added_df.groupby(["SVTYPE", "consequence"], as_index=False).vrnt_id.count()
vrnt_id_gene_conseq_count_df["consq_name"] = vrnt_id_gene_conseq_count_df.consequence.replace(consequence_names_dict)
vrnt_id_gene_conseq_count_df["consq_group"] = vrnt_id_gene_conseq_count_df.consequence.replace(conseq_group_dict)
# check for NaNs 
if not vrnt_id_gene_conseq_count_df[vrnt_id_gene_conseq_count_df["consq_name"].isna()].empty: 
    raise ValueError("Contains NaNs - missing consequence names.")
if not vrnt_id_gene_conseq_count_df[vrnt_id_gene_conseq_count_df["consq_group"].isna()].empty: 
    raise ValueError("Contains NaNs - missing consequence groups.")
vrnt_id_gene_conseq_count_df = vrnt_id_gene_conseq_count_df.rename(columns={"vrnt_id": "vrnt_gene_count"})

# calculate proportion of each variant consequence relative to total in SVTYPE group 
count_vrnt_in_group_df = vrnt_id_gene_conseq_count_df.groupby(["SVTYPE"], as_index=False).vrnt_gene_count.sum()
count_vrnt_in_group_df = count_vrnt_in_group_df.rename(columns={"vrnt_gene_count":"vrnt_gene_in_svtype_group"})
vrnt_id_gene_conseq_count_proport_df = pd.merge(vrnt_id_gene_conseq_count_df, 
                                        count_vrnt_in_group_df, 
                                        on=["SVTYPE"], 
                                        how="inner")
vrnt_id_gene_conseq_count_proport_df["proport"] = vrnt_id_gene_conseq_count_proport_df.vrnt_gene_count / vrnt_id_gene_conseq_count_proport_df.vrnt_gene_in_svtype_group

In [20]:
# write to file 
vrnt_id_gene_conseq_count_path = output_dir.joinpath("vrnt_consq_count.tsv")
vrnt_id_gene_conseq_count_proport_df.to_csv(vrnt_id_gene_conseq_count_path, sep="\t", index=False)

In [21]:
# counts of only deletions and duplications 
del_dup_gene_conseq_count_proport_df = vrnt_id_gene_conseq_count_proport_df[vrnt_id_gene_conseq_count_proport_df.SVTYPE.isin(["DEL", "DUP"])]
del_dup_gene_conseq_count_proport_path = output_dir.joinpath("del_dup_consq_count.tsv")
del_dup_gene_conseq_count_proport_df.to_csv(del_dup_gene_conseq_count_proport_path, sep="\t", index=False)

In [22]:
# count consequences by group 
del_dup_gene_conseq_group_count_proport_df = del_dup_gene_conseq_count_proport_df.groupby(["SVTYPE", "consq_group"], as_index=False).vrnt_gene_count.sum()
del_dup_gene_conseq_group_count_proport_path = output_dir.joinpath("del_dup_consq_group_count.tsv")
del_dup_gene_conseq_group_count_proport_df.to_csv(del_dup_gene_conseq_group_count_proport_path, sep="\t", index=False)

### Positions of misexpression-associated SVs relative to misexpressed gene 

In [23]:
# add SV info 
vrnt_gene_pair_consq_msc_info_df = pd.merge(vrnt_gene_pair_consq_msc_added_df,
                                            sv_info_df[["vrnt_id", "chr", "pos", "end", "SVLEN"]], 
                                            on="vrnt_id", 
                                            how="inner"
                                           )

In [24]:
# add gene position 
gene_ids_misexp_cntrl = vrnt_gene_pair_consq_msc_info_df.gene_id.unique()
overlap_gene_coord_dict, count = {}, 0
for gtf in pysam.TabixFile(str(gencode_path)).fetch(parser = pysam.asGTF()): 
    if gtf.feature == "gene" and gtf.gene_id.split(".")[0] in gene_ids_misexp_cntrl: 
        overlap_gene_coord_dict[count] = [gtf.gene_id.split(".")[0], gtf.contig, gtf.start, gtf.end, gtf.strand]
        count += 1

In [25]:
gene_coord_colnames = ["gene_id", "gene_chrom", "gene_start", "gene_end", "gene_strand"]
gene_coord_df = pd.DataFrame.from_dict(overlap_gene_coord_dict, orient="index", columns=gene_coord_colnames)
vrnt_gene_pair_consq_msc_info_gene_df = pd.merge(vrnt_gene_pair_consq_msc_info_df, 
                                                 gene_coord_df, 
                                                 on=["gene_id"], 
                                                 how="inner")
# check chromosomes match
if not vrnt_gene_pair_consq_msc_info_gene_df[vrnt_gene_pair_consq_msc_info_gene_df.chr != vrnt_gene_pair_consq_msc_info_gene_df.gene_chrom].empty:
    raise ValueError("Chromosomes do not match")

In [26]:
### check whether misexpressed gene is overlapped by variant 
vrnt_gene_pairs_df = vrnt_gene_pair_consq_msc_info_gene_df[["vrnt_id", "gene_id", "chr", "pos", "end", "SVTYPE", "consequence", "gene_start", "gene_end", "gene_strand"]].drop_duplicates()
vrnt_id_pos_to_gene, count = {}, 0
for index, row in vrnt_gene_pairs_df.iterrows(): 
    vrnt_id, gene_id, msc = row["vrnt_id"], row["gene_id"], row["consequence"]
    gene_strand, sv_type = row["gene_strand"], row["SVTYPE"]
    sv_start, sv_end =  row["pos"], row["end"]
    gene_chrom, gene_start, gene_end = row["chr"], row["gene_start"], row["gene_end"]
    if gene_strand == "+":
        # upstream 
        if sv_start < gene_start and sv_end < gene_start: 
            position = "Upstream"
        # downstream
        elif sv_start > gene_end and sv_end > gene_end: 
            position = "Downstream"
        # internal 
        elif sv_start > gene_start and gene_end > sv_end:
            position = "Internal"
        # entire gene 
        elif sv_start < gene_start and gene_end < sv_end: 
            position = "Entire gene"
        # 5 prime overlap
        elif sv_start <= gene_start and sv_end > gene_start and gene_end > sv_end:  
            position = "Partial overlap 5' end"
        # 3 prime overlap 
        elif sv_start < gene_end and sv_start > gene_start and sv_end >= gene_end: 
            position = "Partial overlap 3' end"
        else: 
            raise ValueError(f"{vrnt_id} {gene_id} no position found.")
    if gene_strand == "-": 
        # upstream 
        if sv_start > gene_end and sv_end > gene_end: 
            position = "Upstream"
        # downstream 
        elif sv_start < gene_start and sv_end < gene_start: 
            position = "Downstream"
        # internal 
        elif sv_start > gene_start and gene_end > sv_end:
            position = "Internal"
        # entire gene 
        elif sv_start <= gene_start and gene_end <= sv_end: 
            position = "Entire gene"
        # 5 prime overlap 
        elif sv_start > gene_start and gene_end > sv_start and gene_end <= sv_end: 
            position = "Partial overlap 5' end"
        # 3 prime overlap
        elif sv_start <= gene_start and gene_end > sv_end and gene_start < sv_end:  
            position = "Partial overlap 3' end"
        else: 
            raise ValueError(f"{vrnt_id} {gene_id} no position found.")
    vrnt_id_pos_to_gene[count] = [gene_chrom, vrnt_id, sv_start, sv_end, gene_id, 
                                         gene_start, gene_end, gene_strand, position, msc, sv_type]
    count += 1
    
vrnt_id_pos_cols = ["chrom", "vrnt_id", "sv_start", "sv_end", "gene_id", "gene_start", 
                    "gene_end", "gene_strand", "position", "consequence", "SVTYPE"]
vrnt_id_pos_to_gene_df = pd.DataFrame.from_dict(vrnt_id_pos_to_gene, 
                                                       orient="index", 
                                                       columns=vrnt_id_pos_cols)

In [27]:
vrnt_id_pos_to_gene_path = output_dir.joinpath("misexp_vrnt_gene_position.tsv")
vrnt_id_pos_to_gene_df.to_csv(vrnt_id_pos_to_gene_path, sep="\t", index=False)

In [28]:
# count positions of misexpression-associated variants  
vrnt_id_pos_to_gene_count_df = vrnt_id_pos_to_gene_df.groupby(["SVTYPE", "position"], as_index=False).vrnt_id.count()
vrnt_id_pos_to_gene_count_df = vrnt_id_pos_to_gene_count_df.rename(columns={"vrnt_id": "vrnt_count"})
# add proportion
vrnt_id_pos_total_df = vrnt_id_pos_to_gene_count_df.groupby(["SVTYPE"], as_index=False).vrnt_count.sum()
vrnt_id_pos_total_df = vrnt_id_pos_total_df.rename(columns={"vrnt_count": "total_in_group"})
vrnt_id_pos_to_gene_proport_df = pd.merge(vrnt_id_pos_to_gene_count_df, 
                                          vrnt_id_pos_total_df, 
                                          on=["SVTYPE"], 
                                          how="inner")
vrnt_id_pos_to_gene_proport_df["proport"] = vrnt_id_pos_to_gene_proport_df.vrnt_count / vrnt_id_pos_to_gene_proport_df.total_in_group
# write to file 
vrnt_id_pos_to_gene_proport_path = output_dir.joinpath("vrnt_id_pos_to_gene_proport.tsv")
vrnt_id_pos_to_gene_proport_df.to_csv(vrnt_id_pos_to_gene_proport_path, sep="\t", index=False)

In [29]:
# proportion of DUPs overlapping the misexpressed gene entirely or partially 
partial_entire_overlap = ["Entire gene", "Partial overlap 3' end", "Partial overlap 5' end" , "internal"]
proport_dup_gene = vrnt_id_pos_to_gene_proport_df[(vrnt_id_pos_to_gene_proport_df.position.isin(partial_entire_overlap)) & 
                                                  (vrnt_id_pos_to_gene_proport_df.SVTYPE == "DUP")
                                                 ].proport.sum()
print(f"Percentage of DUPs overlap the misexpressed gene: {proport_dup_gene*100}")
proport_del_not_gene = vrnt_id_pos_to_gene_proport_df[(vrnt_id_pos_to_gene_proport_df.position.isin(partial_entire_overlap)) & 
                                                  (vrnt_id_pos_to_gene_proport_df.SVTYPE == "DEL")
                                                 ].proport.sum()
print(f"Percentage of DELs that do overlap the misexpressed gene: {proport_del_not_gene*100}")

Percentage of DUPs overlap the misexpressed gene: 72.22222222222223
Percentage of DELs that do overlap the misexpressed gene: 7.526881720430108


In [30]:
# write information to file 
vrnt_gene_pair_consq_msc_info_gene_pos_df = pd.merge(vrnt_gene_pair_consq_msc_info_gene_df, 
                                                 vrnt_id_pos_to_gene_df[["vrnt_id", "gene_id", "position"]], 
                                                 on=["vrnt_id", "gene_id"], 
                                                 how="inner")
vrnt_gene_pair_consq_msc_info_gene_pos_path = output_dir.joinpath("vrnt_gene_pair_misexp_cntrl_info.tsv")
vrnt_gene_pair_consq_msc_info_gene_pos_df.to_csv(vrnt_gene_pair_consq_msc_info_gene_pos_path, sep="\t", index=False)

In [31]:
# write deletions and duplications only to file 
dup_del_gene_pair_consq_msc_info_gene_pos_df = vrnt_gene_pair_consq_msc_info_gene_pos_df[vrnt_gene_pair_consq_msc_info_gene_pos_df.SVTYPE.isin(["DEL", "DUP"])]
dup_del_gene_pair_consq_msc_info_gene_pos_path = output_dir.joinpath("dup_del_gene_pair_misexp_cntrl_info.tsv")
dup_del_gene_pair_consq_msc_info_gene_pos_df.to_csv(dup_del_gene_pair_consq_msc_info_gene_pos_path, sep="\t", index=False)

In [32]:
### comparison of position and VEP prediction 

In [33]:
misexp_vrnt_gene_pair_consq_msc_info_gene_pos_df = vrnt_gene_pair_consq_msc_info_gene_pos_df
misexp_vrnt_gene_pair_consq_msc_info_gene_pos_df.groupby(["SVTYPE", "consequence", "position"], as_index=False).vrnt_id.count()

Unnamed: 0,SVTYPE,consequence,position,vrnt_id
0,DEL,TFBS_ablation,Upstream,2
1,DEL,coding_sequence_variant,Partial overlap 5' end,3
2,DEL,downstream_gene_variant,Downstream,1
3,DEL,intergenic_variant,Upstream,9
4,DEL,intron_variant,Internal,5
5,DEL,no_predicted_effect,Downstream,21
6,DEL,no_predicted_effect,Upstream,35
7,DEL,non_coding_transcript_exon_variant,Partial overlap 5' end,4
8,DEL,regulatory_region_ablation,Upstream,3
9,DEL,regulatory_region_variant,Downstream,3


### Comparison of SV lengths (all SVs)

In [34]:
# one-sided Mann-Whitney U-test comparison of control and misexpression length
for sv_type in ["DEL", "DUP"]: 
    sv_type_info_misexp_cntrl_df = sv_info_misexp_cntrl_df[sv_info_misexp_cntrl_df.SVTYPE == sv_type]
    length_controls = sv_type_info_misexp_cntrl_df[sv_type_info_misexp_cntrl_df.group == "Control"].SVLEN.to_numpy()
    length_misexp = sv_type_info_misexp_cntrl_df[sv_type_info_misexp_cntrl_df.group == "Misexpression"].SVLEN.to_numpy()
    # ‘less’: the distribution underlying x is stochastically less than the distribution 
    # underlying y, i.e. F(u) > G(u) for all u.
    stat, pval = mannwhitneyu(length_controls, length_misexp, alternative="less")
    print(f"{sv_type} one-side Mann-whitney length difference p-value: {pval}")
    # fold change in length 
    mean_length_controls = np.average(length_controls)
    print(f"Control {sv_type} mean length: {mean_length_controls}")
    mean_length_misexp = np.average(length_misexp)
    print(f"Misexpression {sv_type} mean length: {mean_length_misexp}")
    print(f"Average fold-change in {sv_type} length: {mean_length_misexp/mean_length_controls}")

DEL one-side Mann-whitney length difference p-value: 6.029879821698881e-07
Control DEL mean length: 9745.010125307675
Misexpression DEL mean length: 24515.011494252874
Average fold-change in DEL length: 2.5156476164747823
DUP one-side Mann-whitney length difference p-value: 3.9317030193917875e-06
Control DUP mean length: 58723.69349112426
Misexpression DUP mean length: 199619.3125
Average fold-change in DUP length: 3.399297636654467


In [35]:
# one-sided Mann-Whitney U-test (restricted to singletons)
for sv_type in ["DEL", "DUP"]: 
    sv_type_info_misexp_cntrl_df = sv_info_misexp_cntrl_df[(sv_info_misexp_cntrl_df.SVTYPE == sv_type) & 
                                                           (sv_info_misexp_cntrl_df.AC == 1)
                                                          ]
    length_controls = sv_type_info_misexp_cntrl_df[sv_type_info_misexp_cntrl_df.group == "Control"].SVLEN.to_numpy()
    length_misexp = sv_type_info_misexp_cntrl_df[sv_type_info_misexp_cntrl_df.group == "Misexpression"].SVLEN.to_numpy()
    # ‘less’: the distribution underlying x is stochastically less than the distribution 
    # underlying y, i.e. F(u) > G(u) for all u.
    stat, pval = mannwhitneyu(length_controls, length_misexp, alternative="less")
    print(f"{sv_type} one-side Mann-whitney length difference p-value: {pval}")
    # fold change in length 
    mean_length_controls = np.average(length_controls)
    print(f"Control {sv_type} mean length: {mean_length_controls}")
    mean_length_misexp = np.average(length_misexp)
    print(f"Misexpression {sv_type} mean length: {mean_length_misexp}")
    print(f"Fold-change in {sv_type} length: {mean_length_misexp/mean_length_controls}")

DEL one-side Mann-whitney length difference p-value: 0.00023181779155355376
Control DEL mean length: 14621.333463110765
Misexpression DEL mean length: 41628.117647058825
Fold-change in DEL length: 2.8470807913714204
DUP one-side Mann-whitney length difference p-value: 0.0022695242582722016
Control DUP mean length: 72434.5397225725
Misexpression DUP mean length: 144444.36363636365
Fold-change in DUP length: 1.9941365568082845
