### Analysis of rare SV carrier misexpression events

In [1]:
import pandas as pd
from pathlib import Path

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
from matplotlib import pyplot

In [2]:
# Project working directory; every other path in this notebook hangs off it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v2/"
wkdir_path = Path(wkdir)

# Directory holding the per-chromosome "<chrom>_express_carrier_info.tsv" tables
# that the loading cell reads.
express_carrier_dir = (
    wkdir_path
    / "4_vrnt_enrich/sv_count_carriers/gene_body/200kb_window/express_carrier_info"
)

# Output directory for the summary tables written by this notebook.
# Created (with parents) if missing; a pre-existing directory is not an error.
out_dir_path = wkdir_path / "4_vrnt_enrich/sv_carrier_metrics"
out_dir_path.mkdir(parents=True, exist_ok=True)

In [3]:
# chr1..chr22 in numeric order, followed by chrX and chrY (24 entries in total).
CHROMOSOMES = [f"chr{number}" for number in range(1, 23)] + ["chrX", "chrY"]

# Allele-frequency window applied by the filter cell as: af_lower <= AF < af_upper.
af_lower = 0
af_upper = 0.01

# Strict lower bounds used by the filter cell: TPM > tpm_cutoff, z-score > z_score_cutoff.
tpm_cutoff = 0.5
z_score_cutoff = 2

In [4]:
# Load each per-chromosome carrier table and keep only the rows that pass every
# filter below; the filtered tables are collected for concatenation afterwards.
misexp_rare_carriers_df_list = []
express_carrier_path = Path(express_carrier_dir)

# Genotype strings accepted by the filter.
# NOTE(review): presumably heterozygous / homozygous-alt calls stored as stringified
# tuples -- confirm against the file that produces this column.
carrier_genotypes = ["(0, 1)", "(1, 1)"]

# Only the first 22 entries of CHROMOSOMES are processed (chr1..chr22);
# chrX and chrY are skipped.
for chrom in CHROMOSOMES[:22]:
    print(chrom)
    chrom_express_carrier = express_carrier_path / f"{chrom}_express_carrier_info.tsv"
    chrom_express_carrier_df = pd.read_csv(chrom_express_carrier, sep="\t")

    # One boolean mask per criterion, combined with AND.
    af_values = chrom_express_carrier_df["AF"]
    passes_af = (af_values >= af_lower) & (af_values < af_upper)
    passes_tpm = chrom_express_carrier_df["TPM"] > tpm_cutoff
    passes_z_score = chrom_express_carrier_df["z-score"] > z_score_cutoff
    passes_genotype = chrom_express_carrier_df["genotype"].isin(carrier_genotypes)
    passes_all_filters = passes_af & passes_tpm & passes_z_score & passes_genotype

    # .copy() detaches the filtered rows from the full per-chromosome table.
    misexp_rare_carriers_df = chrom_express_carrier_df[passes_all_filters].copy()
    misexp_rare_carriers_df_list.append(misexp_rare_carriers_df)

chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22


In [5]:
# Stack the filtered per-chromosome tables row-wise into a single frame.
# The index is not reset, so index labels from different chromosomes may repeat.
all_chrom_misexp_carriers_df = pd.concat(
    objs=misexp_rare_carriers_df_list,
    axis=0,
    ignore_index=False,
)

In [6]:
# SVs affecting multiple genes
# Reduce to unique (variant, gene) pairs so that a variant seen in several
# samples for the same gene is counted once.
vrnt_misexp_gene_carrier_df = all_chrom_misexp_carriers_df[["vrnt_id", "gene_id"]].drop_duplicates()

# One row per variant; gene_count is the number of (non-null) gene_id values
# paired with that variant after de-duplication.
vrnt_misexp_gene_count_df = (
    vrnt_misexp_gene_carrier_df
    .groupby("vrnt_id", as_index=False)["gene_id"]
    .count()
    .rename(columns={"gene_id": "gene_count"})
)

# This total is the number of distinct vrnt_id values (rows of the count table).
total_vrnt_misexp_carrier = vrnt_misexp_gene_count_df.shape[0]
print(f"Total number of misexpression carriers: {total_vrnt_misexp_carrier}")
total_vrnt_misexp_carrier_one_gene = vrnt_misexp_gene_count_df[vrnt_misexp_gene_count_df.gene_count == 1].shape[0]
print(f"Total number of variants affecting one gene: {total_vrnt_misexp_carrier_one_gene}")

# Guard the ratio so an empty filter result reports nan instead of raising
# ZeroDivisionError, and format it as an actual percentage to match the label.
vrnt_one_gene_fraction = (
    total_vrnt_misexp_carrier_one_gene / total_vrnt_misexp_carrier
    if total_vrnt_misexp_carrier
    else float("nan")
)
print(f"Percentage of variants affecting one gene: {vrnt_one_gene_fraction:.2%}")

Total number of misexpression carriers: 312
Total number of variants affecting one gene: 297
Percentage of variants affecting one gene: 0.9519230769230769


In [7]:
# Save the per-variant gene counts as a tab-separated table (no index column)
# so they can be plotted elsewhere.
vrnt_misexp_gene_count_path = out_dir_path / "vrnt_misexp_gene_count.tsv"
vrnt_misexp_gene_count_df.to_csv(
    path_or_buf=vrnt_misexp_gene_count_path,
    sep="\t",
    index=False,
)

In [8]:
# samples containing many misexpression events
# Reduce to unique (sample, gene) pairs so that a gene linked to several
# variants in the same sample is counted once.
smpl_misexp_gene_carrier_df = all_chrom_misexp_carriers_df[["rna_id", "gene_id"]].drop_duplicates()

# One row per sample; gene_count is the number of (non-null) gene_id values
# paired with that sample after de-duplication.
smpl_misexp_gene_carrier_count_df = (
    smpl_misexp_gene_carrier_df
    .groupby("rna_id", as_index=False)["gene_id"]
    .count()
    .rename(columns={"gene_id": "gene_count"})
)

total_smpls_misexp_carrier = smpl_misexp_gene_carrier_count_df.shape[0]
print(f"Total samples with misexpression carriers: {total_smpls_misexp_carrier}")
total_smpl_misexp_carrier_one_gene = smpl_misexp_gene_carrier_count_df[smpl_misexp_gene_carrier_count_df.gene_count == 1].shape[0]
print(f"Total samples with one misexpression carrier: {total_smpl_misexp_carrier_one_gene}")
total_smpl_misexp_carrier_multi_gene = total_smpls_misexp_carrier - total_smpl_misexp_carrier_one_gene
print(f"Total samples with more than one misexpression carrier: {total_smpl_misexp_carrier_multi_gene}")

# The value reported here is the share of samples with MORE than one gene, so the
# label says so (it previously read "affecting one gene"). The ratio is guarded so
# an empty filter result reports nan instead of raising ZeroDivisionError, and it
# is formatted as an actual percentage.
smpl_multi_gene_fraction = (
    total_smpl_misexp_carrier_multi_gene / total_smpls_misexp_carrier
    if total_smpls_misexp_carrier
    else float("nan")
)
print(f"Percentage of samples with more than one misexpression carrier: {smpl_multi_gene_fraction:.2%}")

Total samples with misexpression carriers: 206
Total samples with one misexpression carrier: 183
Total samples with more than one misexpression carrier: 23
Percentage of samples  affecting one gene: 0.11165048543689321


In [9]:
# Save the per-sample gene counts as a tab-separated table (no index column)
# so they can be plotted elsewhere.
smpl_misexp_gene_carrier_count_path = out_dir_path / "smpl_misexp_gene_carrier_count.tsv"
smpl_misexp_gene_carrier_count_df.to_csv(
    path_or_buf=smpl_misexp_gene_carrier_count_path,
    sep="\t",
    index=False,
)