### Check SNP/indel variant count 

In [1]:
from pathlib import Path
import pandas as pd
from pybedtools import BedTool
from io import StringIO
import pysam

In [2]:
# --- analysis parameters ---
chrom = "chr21"
gene_id = "ENSG00000269950"
# z-score thresholds and TPM floor used to call misexpression events
z_cutoff_list = [2, 10, 20, 30, 40, 50]
tpm_cutoff = 0.5
# flanking-window sweep around each gene: first size, increment and largest size (bp)
window_start = 0
window_step_size = 200000
window_max = 1000000
gene_window_size = 10000
# MAF ranges as [lower, upper] pairs
maf_range_list = [[0, 0.01], [0.01, 0.05], [0.05, 0.1], [0.1, 0.5]]

# --- input locations ---
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)
# phased WGS genotypes for the chromosome (kept as a plain string)
vcf_path = f"/lustre/scratch126/humgen/projects/interval_wgs/final_release_freeze/gt_phased/interval_wgs.{chrom}.gt_phased.vcf.gz"
# expression matrix and WGS/RNA sample pairing table
ge_mtx_path = wkdir_path / "2_misexp_qc/misexp_gene_cov_corr/tpm_zscore_4568smpls_8610genes_flat_misexp_corr_qc.csv"
wgs_rna_paired_smpls_path = wkdir_path / "1_rna_seq_qc/wgs_rna_match/paired_wgs_rna_postqc_prioritise_wgs.tsv"
# per-gene genotype tables plus gene and variant BED files for the chromosome
gt_info_root_path = wkdir_path / "4_vrnt_enrich/snp_indel_count_carriers/vrnts_gts_intersect"
genes_bed_path = wkdir_path / f"4_vrnt_enrich/snp_indel_count_carriers/genes_bed/{chrom}_genes.bed"
vrnts_bed_path = wkdir_path / f"4_vrnt_enrich/snp_indel_count_carriers/vrnts_bed/{chrom}_vrnts.bed"


In [3]:
# carriers 
#vrnt_gt_intersect_path = Path(gt_info_root_path).joinpath(f"{chrom}/{chrom}_{gene_id}_vrnts_gts_intersect.tsv")
#vrnt_gt_intersect_df = pd.read_csv(vrnt_gt_intersect_path, sep="\t")
#vrnt_gt_intersect_df["gene_id"] = gene_id

In [4]:
### label each variant-gene pair with the first (smallest) window it appears in
vrnts_bed = BedTool(vrnts_bed_path)
genes_bed = BedTool(genes_bed_path)

# names for the `bedtools window` output columns: gene BED fields first,
# followed by the variant BED fields
intersect_bed_columns = {
    0: "chrom_gene", 1: "start_gene", 2: "end_gene", 3: "gene_id", 4: "score", 5: "strand",
    6: "chrom_vrnt", 7: "start_vrnt", 8: "end_vrnt", 9: "vrnt_id",
}
vrnt_gene_pairs_in_windows_df_list = []
for direction in ["upstream", "downstream"]:
    # pairs already assigned to a smaller window in this direction
    seen_vrnt_gene_pairs = set()
    for window_size in range(window_start, window_max + 1, window_step_size):
        print(f"Counting variants {direction}, {window_size}bp ...")
        # extend the gene on one side only; sw=True asks bedtools to apply
        # l/r relative to the gene's strand
        left_bp, right_bp = (window_size, 0) if direction == "upstream" else (0, window_size)
        window_hits = genes_bed.window(vrnts_bed, l=left_bp, r=right_bp, sw=True)
        intersect_bed_df = pd.read_csv(StringIO(str(window_hits)), sep="\t", header=None)
        intersect_bed_df = intersect_bed_df.rename(columns=intersect_bed_columns)
        intersect_bed_df = intersect_bed_df[["vrnt_id", "gene_id"]].copy()
        # single string key per variant-gene pair
        intersect_bed_df["vrnt_gene_id"] = (
            intersect_bed_df["vrnt_id"].astype(str) + "," + intersect_bed_df["gene_id"].astype(str)
        )
        # keep only the pairs that no smaller window has reported yet
        new_vrnt_gene_pairs = set(intersect_bed_df["vrnt_gene_id"].unique()) - seen_vrnt_gene_pairs
        new_vrnt_gene_pairs_df = intersect_bed_df[
            intersect_bed_df["vrnt_gene_id"].isin(new_vrnt_gene_pairs)
        ].copy()
        new_vrnt_gene_pairs_df["window"] = f"{direction}_{window_size}"
        vrnt_gene_pairs_in_windows_df_list.append(new_vrnt_gene_pairs_df)
        seen_vrnt_gene_pairs |= new_vrnt_gene_pairs

vrnt_gene_pairs_in_windows_df = pd.concat(vrnt_gene_pairs_in_windows_df_list)
# the 0 bp window of either direction is relabelled "gene_body"; pairs found by
# both directions then collapse to a single row
rename_window = {"upstream_0": "gene_body", "downstream_0": "gene_body"}
vrnt_gene_pairs_in_windows_df["window"] = vrnt_gene_pairs_in_windows_df["window"].replace(rename_window)
vrnt_gene_pairs_in_windows_df = vrnt_gene_pairs_in_windows_df.drop_duplicates()

Counting variants upstream, 0bp ...
Counting variants upstream, 200000bp ...
Counting variants upstream, 400000bp ...
Counting variants upstream, 600000bp ...
Counting variants upstream, 800000bp ...
Counting variants upstream, 1000000bp ...
Counting variants downstream, 0bp ...
Counting variants downstream, 200000bp ...
Counting variants downstream, 400000bp ...
Counting variants downstream, 600000bp ...
Counting variants downstream, 800000bp ...
Counting variants downstream, 1000000bp ...


In [5]:
# all possible SNP/indel carriers
# one row per (window, MAF bin, variant type) combination, keyed 0..n-1 in
# window-major order
windows_maf_vrnt_type = dict(enumerate(
    [window, maf_bin, vrnt_type]
    for window in vrnt_gene_pairs_in_windows_df.window.unique()
    for maf_bin in ["0-1", "1-5", "5-10", "10-50"]
    for vrnt_type in ["snp", "indel"]
))
count = len(windows_maf_vrnt_type)
windows_maf_vrnt_type_df = pd.DataFrame.from_dict(
    windows_maf_vrnt_type,
    orient="index",
    columns=["window", "maf_bin", "vrnt_type"],
)

In [6]:
## gene expression 
# Load the expression table from CSV. Later cells read its gene_id, rna_id,
# "z-score" and "TPM" columns.
ge_mtx_df = pd.read_csv(ge_mtx_path)

In [7]:
### subset gene expression matrix to genes on chromosome
# gene IDs are taken from the 4th column of the chromosome's gene BED file
misexp_genes = pd.read_csv(genes_bed_path, sep="\t", header=None)[3].unique()
num_misexp_genes = len(misexp_genes)
print(f"Number of misexpressed genes on chromosome: {num_misexp_genes}")
ge_mtx_chrom_df = ge_mtx_df[ge_mtx_df.gene_id.isin(misexp_genes)]

### subset gene expression matrix to samples with genotyping data
# get EGAN IDs with RNA data
wgs_rna_paired_smpls_df = pd.read_csv(wgs_rna_paired_smpls_path, sep="\t")
egan_ids_with_rna = wgs_rna_paired_smpls_df.egan_id.unique()
# only the sample names from the VCF header are needed, so the file is opened
# in a context manager and closed again as soon as they have been read
with pysam.VariantFile(vcf_path, mode="r") as vcf:
    vcf_samples = list(vcf.header.samples)
vcf_egan_ids_with_rna = set(egan_ids_with_rna).intersection(vcf_samples)
# subset egan ID and RNA ID links to samples with genotyping data and passing QC
wgs_rna_paired_smpls_in_vcf_df = wgs_rna_paired_smpls_df[wgs_rna_paired_smpls_df.egan_id.isin(vcf_egan_ids_with_rna)]
ge_matrix_flat_chrom_egan_df = pd.merge(ge_mtx_chrom_df, wgs_rna_paired_smpls_in_vcf_df, how="inner", on="rna_id")
# add gene sample pairs ("<gene_id>,<egan_id>")
ge_matrix_flat_chrom_egan_df["gene_smpl_pair"] = ge_matrix_flat_chrom_egan_df.gene_id + "," + ge_matrix_flat_chrom_egan_df.egan_id

rna_ids_pass_gt_rna_qc = ge_matrix_flat_chrom_egan_df.rna_id.unique()
gene_ids_pass_qc = ge_matrix_flat_chrom_egan_df.gene_id.unique()
print(f"Samples with genotyping data and passing RNA-seq QC: {len(rna_ids_pass_gt_rna_qc)}")
# sanity check: the flat matrix must hold exactly one row per gene-sample combination
if ge_matrix_flat_chrom_egan_df.shape[0] != num_misexp_genes * len(rna_ids_pass_gt_rna_qc): 
    raise ValueError("Number of genes and samples passsing QC does not match gene expression matrix size.")

Number of misexpressed genes on chromosome: 82
Samples with genotyping data and passing RNA-seq QC: 2821


In [29]:
# MAF bin edges and labels do not depend on the gene or the z-score cutoff, so
# they are defined once instead of on every loop iteration.
# AF < 0.5: AF is binned directly with left-closed intervals [lower, upper).
maf_bins_lt50 = [0, 0.01, 0.05, 0.1, 0.5]
maf_labels_lt50 = ["0-1", "1-5", "5-10", "10-50"]
# AF >= 0.5: mirrored edges with right-closed intervals (lower, upper].
# NOTE(review): AF == 0.5 lies outside the first interval (0.5, 0.9], so such
# variants get a missing maf_bin and should not contribute to the grouped
# counts below - confirm this matches the pipeline being checked.
maf_bins_gt50 = [1-0.5, 1-0.1, 1-0.05, 1-0.01, 1-0]
maf_labels_gt50 = ["10-50", "5-10", "1-5", "0-1"]


def _count_carriers(vrnt_gt_af_pos_df, gene_smpl_pairs, count_col):
    """Count distinct gene-sample pairs per window, variant type and MAF bin.

    Args:
        vrnt_gt_af_pos_df: genotype rows annotated with "window", "vrnt_type",
            "maf_bin" and "gene_smpl_pair" columns.
        gene_smpl_pairs: gene-sample pair IDs to restrict the count to.
        count_col: name given to the resulting count column.

    Returns:
        DataFrame with columns window, vrnt_type, maf_bin and `count_col`.
    """
    carriers_df = vrnt_gt_af_pos_df[vrnt_gt_af_pos_df.gene_smpl_pair.isin(gene_smpl_pairs)]
    count_df = pd.DataFrame(carriers_df.groupby(["window", "vrnt_type", "maf_bin"]).gene_smpl_pair.nunique())
    return count_df.reset_index().rename(columns={"gene_smpl_pair": count_col})


count_misexp_cntrl_carriers_windows_df_list = []
for z_cutoff in z_cutoff_list:
    # misexpression events: above both the z-score and the TPM cutoff
    print(f"Z-score threshold: {z_cutoff}")
    misexp_df = ge_matrix_flat_chrom_egan_df[(ge_matrix_flat_chrom_egan_df["z-score"] > z_cutoff) & 
                                             (ge_matrix_flat_chrom_egan_df["TPM"] > tpm_cutoff)]
    misexp_genes = misexp_df.gene_id.unique()
    print(f"\tNumber of misexpressed genes: {len(misexp_genes)}")
    for gene_id in misexp_genes:
        print(f"\t{gene_id}")
        gene_misexp_df = misexp_df[misexp_df.gene_id == gene_id]
        # misexpressed gene-sample pairs
        misexp_gene_smpl = gene_misexp_df.gene_smpl_pair.unique()
        # control events: every other sample for the same gene
        cntrl_df = ge_matrix_flat_chrom_egan_df[(ge_matrix_flat_chrom_egan_df.gene_id == gene_id) & 
                                                (~ge_matrix_flat_chrom_egan_df.gene_smpl_pair.isin(misexp_gene_smpl))]
        cntrl_gene_smpl = cntrl_df.gene_smpl_pair.unique() 
        # load genotypes 
        vrnt_gt_intersect_path = Path(gt_info_root_path).joinpath(f"{chrom}/{chrom}_{gene_id}_vrnts_gts_intersect.tsv")
        vrnt_gt_intersect_df = pd.read_csv(vrnt_gt_intersect_path, sep="\t")
        vrnt_gt_intersect_df["gene_id"] = gene_id
        # assign variants MAF bins, handling AF below and at/above 0.5 separately
        vrnt_gt_maf_lt50_df = vrnt_gt_intersect_df[vrnt_gt_intersect_df.AF < 0.5].copy()
        vrnt_gt_maf_lt50_df["maf_bin"] = pd.cut(vrnt_gt_maf_lt50_df.AF, bins=maf_bins_lt50, labels=maf_labels_lt50, right=False)
        vrnt_gt_maf_gt50_df = vrnt_gt_intersect_df[vrnt_gt_intersect_df.AF >= 0.5].copy()
        vrnt_gt_maf_gt50_df["maf_bin"] = pd.cut(vrnt_gt_maf_gt50_df.AF, bins=maf_bins_gt50, labels=maf_labels_gt50, right=True)
        vrnt_gt_maf_df = pd.concat([vrnt_gt_maf_gt50_df, vrnt_gt_maf_lt50_df])
        # add position windows 
        vrnt_gt_af_pos_df = pd.merge(vrnt_gt_maf_df, 
                                     vrnt_gene_pairs_in_windows_df, 
                                     on=["vrnt_id", "gene_id"], 
                                     how="inner")
        # add gene-sample IDs 
        vrnt_gt_af_pos_df["gene_smpl_pair"] = vrnt_gt_af_pos_df.gene_id + "," + vrnt_gt_af_pos_df.egan_id
        # count gene-sample pairs among misexpression events and among controls
        count_misexp_carriers_df = _count_carriers(vrnt_gt_af_pos_df, misexp_gene_smpl, "misexp_carrier_check")
        count_cntrl_carriers_df = _count_carriers(vrnt_gt_af_pos_df, cntrl_gene_smpl, "control_carrier_check")
        
        count_misexp_cntrl_carrier_df = pd.merge(count_misexp_carriers_df, 
                                                 count_cntrl_carriers_df, 
                                                 on=["window", "maf_bin", "vrnt_type"], 
                                                 how="outer"
                                                )
        # left-join onto the full grid so combinations without any count are kept
        count_misexp_cntrl_carrier_all_df = pd.merge(windows_maf_vrnt_type_df, 
                                                     count_misexp_cntrl_carrier_df, 
                                                     on=["window", "maf_bin", "vrnt_type"], 
                                                     how="left")
        count_misexp_cntrl_carrier_all_df["gene_id"] = gene_id
        count_misexp_cntrl_carrier_all_df["misexp_total_check"] = len(misexp_gene_smpl)
        count_misexp_cntrl_carrier_all_df["control_total_check"] = len(cntrl_gene_smpl)
        count_misexp_cntrl_carrier_all_df["z_cutoff"] = z_cutoff
        count_misexp_cntrl_carriers_windows_df_list.append(count_misexp_cntrl_carrier_all_df)
    # NOTE(review): this break stops after the first entry of z_cutoff_list, so
    # only that cutoff is recomputed and compared; remove it to check all cutoffs.
    break 
    

Z-score threshold: 2
	Number of misexpressed genes: 72
	ENSG00000206102
	ENSG00000224100
	ENSG00000229289
	ENSG00000229986
	ENSG00000274248
	ENSG00000237338
	ENSG00000229382
	ENSG00000142182
	ENSG00000232360
	ENSG00000182591
	ENSG00000227702
	ENSG00000236471
	ENSG00000224541
	ENSG00000233393
	ENSG00000275874
	ENSG00000184856
	ENSG00000226956
	ENSG00000225330
	ENSG00000237735
	ENSG00000205439
	ENSG00000241123
	ENSG00000280095
	ENSG00000223400
	ENSG00000274749
	ENSG00000272804
	ENSG00000188694
	ENSG00000224574
	ENSG00000230379
	ENSG00000187766
	ENSG00000231324
	ENSG00000187026
	ENSG00000267857
	ENSG00000231620
	ENSG00000227075
	ENSG00000231986
	ENSG00000231106
	ENSG00000224269
	ENSG00000206105
	ENSG00000223608
	ENSG00000237664
	ENSG00000236545
	ENSG00000259981
	ENSG00000273115
	ENSG00000198390
	ENSG00000230978
	ENSG00000261706
	ENSG00000233480
	ENSG00000277693
	ENSG00000198054
	ENSG00000236332
	ENSG00000183640
	ENSG00000224413
	ENSG00000230794
	ENSG00000231867
	ENSG00000160202
	ENSG00000

In [30]:
# stack the per-gene count tables, cast the MAF bin labels to plain strings and
# replace every remaining missing value (e.g. combinations without counts) by 0
count_misexp_cntrl_carriers_windows_df = (
    pd.concat(count_misexp_cntrl_carriers_windows_df_list)
    .assign(maf_bin=lambda counts_df: counts_df["maf_bin"].astype(str))
    .fillna(0)
)

In [31]:
### load previous results
# tab-separated carrier counts for the chromosome, used as the reference
# against which the recomputed counts are compared
snp_indel_carriers_results_path = (
    wkdir_path / f"4_vrnt_enrich/snp_indel_count_carriers/count_snp_indel_carriers_af50/{chrom}_carrier_count.tsv"
)
snp_indel_carriers_results_df = pd.read_csv(
    snp_indel_carriers_results_path,
    sep="\t",
)

In [32]:
# translate the stored MAF range labels into the bin labels used by the
# recomputed counts, then drop the original column
rename_maf_range = {'0-0.01': "0-1", '0.01-0.05': "1-5", '0.05-0.1': "5-10", '0.1-0.5': "10-50"}
snp_indel_carriers_results_df = (
    snp_indel_carriers_results_df
    .assign(maf_bin=snp_indel_carriers_results_df["maf_range"].replace(rename_maf_range))
    .drop(columns=["maf_range"])
)

In [33]:
# pair each previous result row with its recomputed counterpart; rows that
# exist in only one of the two tables are discarded by the inner join
check_merge_keys = ["z_cutoff", "window", "gene_id", "vrnt_type", "maf_bin"]
snp_indel_count_window_check_df = snp_indel_carriers_results_df.merge(
    count_misexp_cntrl_carriers_windows_df,
    how="inner",
    on=check_merge_keys,
)

In [34]:
# check carrier counts
# An inner merge without any matching keys produces an empty frame, on which
# every comparison below would pass trivially - treat that as a failure
# instead of a successful check.
if snp_indel_count_window_check_df.empty:
    raise ValueError("No rows shared between previous and recomputed carrier counts.")

# (column in the previous results, error raised when it differs from "<column>_check")
carrier_count_checks = [
    ("control_total", "Difference in total control events."),
    ("misexp_total", "Difference in total misexpression events."),
    ("misexp_carrier", "Difference in total misexpression carriers."),
    ("control_carrier", "Difference in total control carriers."),
]
for count_col, error_msg in carrier_count_checks:
    # both sides are cast to int so that float counts (after fillna) compare equal
    previous_counts = snp_indel_count_window_check_df[count_col].astype(int)
    recomputed_counts = snp_indel_count_window_check_df[f"{count_col}_check"].astype(int)
    if not previous_counts.equals(recomputed_counts):
        raise ValueError(error_msg)