In [None]:
import logging
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pnet.data_processing import filter_variants, prostate_data_loaders, utils

sys.path.insert(0, '../..') # add project_config to path
import project_config

# wandb is an optional dependency: record whether it could be imported so that
# later cells can check `_wandb_available` before touching W&B.
try:
    import wandb
    _wandb_available = True
except ImportError:
    _wandb_available = False
    print("Warning: wandb is not installed. W&B functionality will be unavailable.")

# Timestamped log lines at INFO level and above.
logging.basicConfig(
            format='%(asctime)s %(levelname)-8s %(message)s',
            level=logging.INFO,
            datefmt='%Y-%m-%d %H:%M:%S')

# Set the level on the root logger explicitly as well.
# NOTE(review): presumably needed because basicConfig does nothing when the
# root logger already has handlers (as can happen under Jupyter) - confirm.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Re-import edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

In [None]:
# # wandb setup
# if _wandb_available:
#     os.environ['WANDB_NOTEBOOK_NAME'] = "pipeline_full_vcf_to_germline_patho_vcfs.ipynb"
#     wandb.login()
#     run = wandb.init(
#         project="prostate_met_status",
#         group="data_prep_germline_tier12_and_somatic",
#     )

In [None]:
# hyperparameters and file locations

# When True, the filtered VCFs produced below are computed but not written to disk.
dry_run = True

# data directory containing input files for the empirical analysis with P1000 data
DATADIR = project_config.GERMLINE_DATA_DIR
FIGDIR = project_config.FIGURE_DIR
# location to save processed germline VCFs
# (alternative: os.path.join(DATADIR, "processed_germline_vcfs"))
PROCESSED_GERMLINE_VCFS_DIR = project_config.PROCESSED_GERMLINE_VCFS_DIR

# full annotated VCF; may only have the compressed .gz version
FULL_VCF_F = os.path.join(
    DATADIR,
    "raw/germline_samples_final_post_ancestry_max_ratio_1330_vt2_VEP_Genotypes.txt",
)
# CSV holding the gene list used to subset the full VCF
GENE_LIST_F = os.path.join(
    DATADIR,
    "pathogenic_germline/germline_tier_12_and_somatic.csv",
)

# variants seen in more than this proportion of the dataset's samples are treated as likely artifacts
PROPORTION_THRESHOLD = 0.05
# column used as the variant identifier when de-duplicating rows
ID_COL = "Uploaded_variation"

# Subset the full VCF by a list of genes
Since the full VCF is quite large, we load it in chunks. This initial filtering step can be time-consuming, so we save the intermediate files to avoid having to re-run this code.

In [None]:
# The gene-list filename doubles as the identifier for this subset.
subset_id = utils.filename(GENE_LIST_F)
# Genes to keep when subsetting the full VCF.
genes = utils.read_gene_list_from_csv(GENE_LIST_F)
# Destination of the gene-subset VCF.
savepath = os.path.join(
    PROCESSED_GERMLINE_VCFS_DIR,
    f"prostate_germline_vcf_subset_to_{subset_id}.txt",
)

# start process of making VCF subset
for message in (
    f"working on file {GENE_LIST_F}, with genes: {genes}",
    f"will save to {savepath}",
):
    logging.info(message)


# The chunked subsetting call is kept commented out (the text above notes it is time consuming):
# utils.filter_annotated_vcf_by_gene_list_chunking(annot_vcf_f = FULL_VCF_F, 
#                                             gene_list=genes, 
#                                             save_filtered_df_path=savepath)

## Apply universal variant quality filters
- variant quality (dp, gq, VAF)
- remove likely artifacts, e.g. the exact same variant appearing in more than 5% of samples

In [None]:
# Gene-subset VCF that the universal filters below are applied to.
VCF_F = os.path.join(
    PROCESSED_GERMLINE_VCFS_DIR,
    "prostate_germline_vcf_subset_to_germline_tier_12_and_somatic.txt",
)

# Base name reused when naming the downstream output files.
save_name = utils.filename(VCF_F)
print(save_name)

In [None]:
# doing universal variant filtering steps:
#   1. per-genotype quality filter (depth, genotype quality, VAF)
#   2. drop benign / likely benign variants using the ClinVar annotation column
#   3. merge near-duplicate rows that share the same ID_COL
#   4. drop variants present in more than PROPORTION_THRESHOLD of the samples
logging.info("\nFilter out low-quality variants (universal variant filtering steps)")

vcf_df = pd.read_csv(VCF_F, sep="\t", low_memory=False)
vcf_df = filter_variants.variant_quality_filter(vcf_df, min_dp=10, min_gq=20, min_vaf=0.25, failed_qc_fill="./.")

logging.info("Filter out benign/likely benign variants using ClinVar annotations")
vcf_df = filter_variants.subset_to_non_benign(vcf_df, clinsig_col = 'ClinVar_updated_2021Jun_CLNSIG')

logging.info("Merge near-duplicate rows")
vcf_df = filter_variants.merge_near_duplicate_rows_in_vcf(vcf_df, ID_COL)

logging.info(f"Filter out likely artifactual variants (in >{PROPORTION_THRESHOLD} proportion of the dataset's samples)")
vcf_df = filter_variants.remove_vars_too_common_in_dataset_from_annotated_vcf(vcf_df, proportion_threshold = PROPORTION_THRESHOLD)

# Derive the output name from VCF_F instead of appending to a `save_name` set in
# another cell: re-running this cell must not stack the suffix a second time.
save_name = f"{utils.filename(VCF_F)}_passed-universal-filters"
save_f = os.path.join(PROCESSED_GERMLINE_VCFS_DIR, f'{save_name}.txt')
passed_universal_filters_vcf = vcf_df.copy()
display(passed_universal_filters_vcf)

# Only claim that the file is being saved when it actually is.
if not dry_run:
    logging.info(f"Saving the VCF after universal filtering steps to {save_f}\n")
    passed_universal_filters_vcf.to_csv(save_f, index=False, sep="\t")
else:
    logging.info(f"dry_run is True: NOT saving the VCF after universal filtering steps (would have saved to {save_f})\n")

In [None]:
# If you've already run and don't want to re-run the filtering steps, you can load the saved file instead.
# The saved file only exists once the filtering cell has been run with dry_run=False,
# so fall back to the in-memory result when there is nothing on disk yet.
passed_universal_filters_f = os.path.join(PROCESSED_GERMLINE_VCFS_DIR, f'{utils.filename(VCF_F)}_passed-universal-filters.txt')
if os.path.exists(passed_universal_filters_f):
    passed_universal_filters_vcf = pd.read_csv(passed_universal_filters_f, sep="\t", low_memory=False)
elif 'passed_universal_filters_vcf' in globals():
    logging.warning(f"{passed_universal_filters_f} not found; keeping the in-memory result of the filtering cell")
else:
    raise FileNotFoundError(
        f"{passed_universal_filters_f} not found and no in-memory result is available; "
        "run the universal filtering cell first (with dry_run=False to also save the file)"
    )
passed_universal_filters_vcf

## Apply pathogenicity filtration workflow

In [None]:
# Run the pathogenicity selection workflow over every gene still present after the universal filters.
genes_still_present = passed_universal_filters_vcf.SYMBOL.unique().tolist()
varQC_passed_and_patho_only_vcf = filter_variants.variant_selection_workflow(
    passed_universal_filters_vcf,
    genes_to_subset=genes_still_present,
)
display(varQC_passed_and_patho_only_vcf)

# Output location; the file is only written when this is not a dry run.
save_name = f'{utils.filename(VCF_F)}_passed-universal-filters_patho-vars-only'
save_f = os.path.join(PROCESSED_GERMLINE_VCFS_DIR, f'{save_name}.txt')
if not dry_run:
    logging.info(f"Saving the VCF after variant QC and pathogenicity filtering steps (shape: {varQC_passed_and_patho_only_vcf.shape}) to: \n{save_f}")
    varQC_passed_and_patho_only_vcf.to_csv(save_f, index=False, sep="\t")


## Subset variants by some combination of prevalence and impact
- rare vs common
- high-impact vs moderate impact (LOF vs missense)

In [None]:
def make_final_vcf(vcf_df, restrict_to_rare, restrict_to_common, keep_high_impact, keep_moderate_impact, base_save_name="", save_dir=""):
    """
    Subset a variant-level VCF by prevalence, then by predicted impact.

    Step 1 keeps the union of the requested prevalence classes (rare and/or
    common). Step 2 is applied to the result of step 1 and keeps the union of
    the requested impact classes (high and/or moderate). After each union the
    rows are de-duplicated on the notebook-level ``ID_COL`` column, and
    ``filter_variants.add_consolidated_consequence`` is applied to the final
    non-empty result.

    Parameters
    ----------
    vcf_df : pd.DataFrame
        Annotated VCF to subset. A copy is taken before any filtering.
    restrict_to_rare : bool
        Include the output of ``filter_variants.subset_to_low_frequency``.
    restrict_to_common : bool
        Include the output of ``filter_variants.subset_to_high_frequency``.
    keep_high_impact : bool
        Include the output of ``filter_variants.subset_to_severe_consequence``.
    keep_moderate_impact : bool
        Include the output of ``filter_variants.subset_to_moderate_consequence``.
    base_save_name : str
        Prefix of the output filename; one suffix per enabled flag is appended
        (``_rare``, ``_common``, ``_high-impact``, ``_moderate-impact``).
    save_dir : str
        Directory to write ``<save_name>.txt`` (tab-separated) into. The
        default ``""`` means nothing is written.

    Returns
    -------
    pd.DataFrame or None
        The filtered VCF, or ``None`` when no variant survives the filters.

    Raises
    ------
    ValueError
        If neither prevalence flag, or neither impact flag, is set. Without
        this check the same situation surfaced as pandas' opaque
        "No objects to concatenate" error.
    """
    # Validate the flag combination before doing any (potentially slow) work.
    if not (restrict_to_rare or restrict_to_common):
        raise ValueError("Nothing to keep: set restrict_to_rare and/or restrict_to_common")
    if not (keep_high_impact or keep_moderate_impact):
        raise ValueError("Nothing to keep: set keep_high_impact and/or keep_moderate_impact")

    final_vcf = vcf_df.copy()
    to_add = []
    save_name = base_save_name
    logging.info("Filtering VCFs by variant MAF")
    if restrict_to_rare:
        logging.info("Filter to rare variants (<1% in gnomad)")
        save_name += "_rare"
        rare_vcf = filter_variants.subset_to_low_frequency(final_vcf)
        logging.debug(f'rare_vcf.shape: {rare_vcf.shape}')
        to_add.append(rare_vcf)

    if restrict_to_common:
        logging.info("Filter to common variants (>=1% in gnomad)")
        save_name += "_common"
        common_vcf = filter_variants.subset_to_high_frequency(final_vcf)
        logging.debug(f'common_vcf.shape: {common_vcf.shape}')
        to_add.append(common_vcf)

    # combine what we have so far - want the impact to be a subset of what we have already
    logging.info(f"Merge {len(to_add)} VCF subsets into one VCF; we will now further subset by variant impact")
    final_vcf = pd.concat(to_add, ignore_index=True)
    logging.debug("shape before drop dups: {}".format(final_vcf.shape))
    final_vcf = final_vcf.drop_duplicates(subset=[ID_COL])
    logging.debug("shape after drop dups: {}".format(final_vcf.shape))
    to_add = []

    logging.info("Filtering VCFs by variant predicted impact (VEP)")
    if keep_high_impact: # keep high impact aka LOF
        logging.info("Get the VEP high-impact variants (mostly LOF)")
        save_name += "_high-impact"
        high_impact_vcf = filter_variants.subset_to_severe_consequence(final_vcf)
        logging.debug(f'high_impact_vcf.shape: {high_impact_vcf.shape}')
        to_add.append(high_impact_vcf)

    if keep_moderate_impact: # keep moderate aka missense
        logging.info("Get the VEP moderate-impact variants (mostly missense)")
        save_name += "_moderate-impact"
        moderate_impact_vcf = filter_variants.subset_to_moderate_consequence(final_vcf)
        logging.debug(f'moderate_impact_vcf.shape: {moderate_impact_vcf.shape}')
        to_add.append(moderate_impact_vcf)

    logging.info(f"Merge {len(to_add)} VCF subsets into one VCF")
    final_vcf = pd.concat(to_add, ignore_index=True)
    logging.debug("shape before drop dups: {}".format(final_vcf.shape))
    final_vcf = final_vcf.drop_duplicates(subset=[ID_COL])
    logging.debug("shape after drop dups: {}".format(final_vcf.shape))

    # Bail out before annotating: there is nothing to annotate or save in an empty frame.
    if final_vcf.shape[0] == 0:
        logging.info("No variants left after filtering by impact")
        return None

    final_vcf = filter_variants.add_consolidated_consequence(final_vcf)

    if save_dir != "":
        save_f = os.path.join(save_dir, f'{save_name}.txt')
        logging.info(f"Saving the filtered VCF to {save_f}\n")
        final_vcf.to_csv(save_f, index=False, sep="\t")
    logging.info("Final VCF shape: {}".format(final_vcf.shape))
    return final_vcf

In [None]:
base_save_name = f'{utils.filename(VCF_F)}_passed-universal-filters_patho-vars-only'
save_dir = PROCESSED_GERMLINE_VCFS_DIR
print(save_dir, '\n', base_save_name)

# NOTE: no common LOF exists in the P1000 dataset. However, leaving this in for completeness in case other datasets have common LOF variants.
# TODO: add check for input DF equivalence to avoid redundant runs? Raise warning, maybe if merging with an empty DF during the creation process 
combo_keys = ('rare', 'common', 'high-impact', 'moderate-impact')
combo_flags = (
    (True, False, True, False),   # rare LOF
    (True, False, False, True),   # rare missense
    (False, True, True, False),   # common LOF
    (False, True, False, True),   # common missense
    (True, True, True, False),    # all LOF (rare + common LOF)
    (True, True, False, True),    # all missense (rare + common missense)
    (True, False, True, True),    # all rare (rare LOF + rare missense)
    (False, True, True, True),    # all common (common LOF + common missense)
    (True, True, True, True),     # all LOF and missense, common and rare
)
all_combos_to_run = [dict(zip(combo_keys, flags)) for flags in combo_flags]

# In a dry run every subset is still computed, but save_dir is left at its ""
# default so make_final_vcf writes nothing.
target_dir = "" if dry_run else save_dir
for combo in all_combos_to_run:
    logging.info("\nWorking on {}".format(combo))
    make_final_vcf(
        varQC_passed_and_patho_only_vcf,
        combo['rare'],
        combo['common'],
        combo['high-impact'],
        combo['moderate-impact'],
        base_save_name=base_save_name,
        save_dir=target_dir,
    )


## Done with generating filtered, pathogenic-only VCFs.


# Downstream exploration and work
## Convert variant-level VCF to gene-level binarized genotype matrix (samples x genes)


In [None]:
# The "everything" subset: rare + common, high- + moderate-impact variants.
FINAL_VCF_F = os.path.join(
    PROCESSED_GERMLINE_VCFS_DIR,
    f'{base_save_name}_rare_common_high-impact_moderate-impact.txt',
)
# Load it through the project's germline data loader and show the result.
germline_df = prostate_data_loaders.get_germline_mutation(FINAL_VCF_F)
display(germline_df)

## Gene-level attrition
Note: initial gene set had 824 genes, but only 335 remain after our quality filtration and pathogenicity filtration.

In [None]:
# Gene-level attrition across the three stages of the pipeline.
#
# The universal-filtering cell reassigns `vcf_df` after every filter, so by the
# time this cell runs `vcf_df` holds the filtered rows, not the input. Reload
# the unfiltered subset under its own name so the "Initial" line really
# describes the starting point. Loading this takes time.
initial_vcf_df = pd.read_csv(VCF_F, sep="\t", low_memory=False)
# passed_universal_filters_vcf = pd.read_csv(os.path.join(PROCESSED_GERMLINE_VCFS_DIR, f'{utils.filename(VCF_F)}_passed-universal-filters.txt'), sep="\t", low_memory=False)
FINAL_VCF_F = os.path.join(PROCESSED_GERMLINE_VCFS_DIR, f'{base_save_name}_rare_common_high-impact_moderate-impact.txt')
# NOTE(review): the previous cell reads the same file with
# prostate_data_loaders.get_germline_mutation; confirm load_germline_mut is the
# intended loader here (it must return a frame that has a SYMBOL column).
v3 = prostate_data_loaders.load_germline_mut(FINAL_VCF_F)

print(f"Initial: {initial_vcf_df.SYMBOL.nunique()} genes, shape: {initial_vcf_df.shape}")
print(f"After quality filtering: {passed_universal_filters_vcf.SYMBOL.nunique()} genes left, shape: {passed_universal_filters_vcf.shape}")
print(f"After quality and pathogenicity filtering: {v3.SYMBOL.nunique()} genes left, shape: {v3.shape}")

In [None]:
# `run` is only created by wandb.init() in the W&B setup cell. That cell is
# commented out, so with wandb installed an unguarded run.finish() raises
# NameError. Only finish a run that actually exists.
if _wandb_available and globals().get("run") is not None:
    run.finish()

## Plot: variants per sample and samples per variant pre and post pathogenicity filtration

In [None]:
vcf_pre_patho_f = os.path.join(PROCESSED_GERMLINE_VCFS_DIR, f'{utils.filename(VCF_F)}_passed-universal-filters.txt')
vcf_post_patho_f = os.path.join(PROCESSED_GERMLINE_VCFS_DIR, f'{utils.filename(VCF_F)}_passed-universal-filters_patho-vars-only.txt') # before filtering to _rare_common_high-impact_moderate-impact


def _load_binarized_genotypes(vcf_f):
    """
    Read a saved tab-separated VCF, pass it through
    utils.get_sample_cols_from_VCF, and apply utils.binarize to every cell.

    Returns the resulting DataFrame.
    """
    sample_df = utils.get_sample_cols_from_VCF(pd.read_csv(vcf_f, sep="\t", low_memory=False))
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated; use whichever one this pandas version provides.
    elementwise = sample_df.map if hasattr(sample_df, "map") else sample_df.applymap
    return elementwise(utils.binarize)


logging.info(f"Loading pre-pathogenicity filtering VCF from {vcf_pre_patho_f}")
vcf = _load_binarized_genotypes(vcf_pre_patho_f)

logging.info(f"Loading post-pathogenicity filtering VCF from {vcf_post_patho_f}")
patho_vcf = _load_binarized_genotypes(vcf_post_patho_f)

logging.info(f"Pre-pathogenicity filtering VCF shape: {vcf.shape}")
logging.info(f"Post-pathogenicity filtering VCF shape: {patho_vcf.shape}")

In [None]:
# Column sums give one count per sample; row sums give one count per variant.
n_variants_per_sample, n_samples_per_variant = vcf.sum(axis=0), vcf.sum(axis=1)
n_variants_per_sample_patho, n_samples_per_variant_patho = patho_vcf.sum(axis=0), patho_vcf.sum(axis=1)

# Report each extreme as "pre-pathogenicity value vs pathogenic-only value".
for stat_name, stat_fn in (("max", np.max), ("min", np.min)):
    logging.info(f"{stat_name} # of variants in a single sample: {stat_fn(n_variants_per_sample)} vs {stat_fn(n_variants_per_sample_patho)}")
    logging.info(f"{stat_name} # of samples with a particular variant: {stat_fn(n_samples_per_variant)} vs {stat_fn(n_samples_per_variant_patho)}")


In [None]:
# Variants whose row sum is 0, i.e. no sample carries them.
# Before patho filtration: these variants didn't have any samples where they passed my universal filters.
n_uncarried_pre = np.sum(n_samples_per_variant == 0)
n_uncarried_post = np.sum(n_samples_per_variant_patho == 0)

logging.info(f"Number of variants with no samples before patho filtration: {n_uncarried_pre}")
logging.info(f"Number of variants with no samples after patho filtration: {n_uncarried_post}")

In [None]:
# Face colors for the two rows of the 2x2 histogram grid built later in this cell
patho_color = '#d62728'  # red for pathogenic-only plots
before_color = '#7f7f7f'  # gray for pre-pathogenicity filtration plots

def add_mean_vline(ax, values, color='#4d4d4d', ls='-', lw=2, offset_frac=0.02):
    """
    Draw a vertical line at the mean of `values` and label it slightly to the right.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    values : array-like of numbers whose mean is marked.
    color, ls, lw : color, line style and line width of the line; `color` is
        also used for the text label.
    offset_frac : fraction of the current x-axis width by which the label is
        shifted to the right of the mean.

    Returns
    -------
    The mean of `values`. For empty input nothing is drawn and NaN is returned.
    """
    # Nothing sensible to draw for empty input (np.mean would warn and give NaN).
    if np.size(values) == 0:
        return float('nan')

    m = np.mean(values)
    ax.axvline(m, color=color, linestyle=ls, linewidth=lw, label=f'Mean = {m:.2f}')

    # Compute the horizontal offset in data units
    x_lo, x_hi = ax.get_xlim()
    offset = offset_frac * (x_hi - x_lo)

    ax.text(
        m + offset,
        0.95,  # 95% up the axes, independent of the y-limits or y-scale
        f'{m:.2f}',
        ha='left', va='top',
        fontsize=10,
        color=color,
        # x is in data coordinates, y is a fraction of the axes height
        transform=ax.get_xaxis_transform(),
    )
    return m

def add_row_labels(fig, axs, labels, pad=0.02, **textkw):
    """
    Write one text label per subplot row, to the right of the grid's last column.

    fig:    figure that owns the subplots.
    axs:    a single Axes, a 1-D sequence, or a 2-D grid of Axes.
    labels: one string per row; rows and labels are paired in order.
    pad:    horizontal gap, in figure coordinates, between the right edge of
            the last column and the labels.
    Any further keyword arguments (e.g. fontsize, bbox) are forwarded to fig.text.
    """
    right_column = np.atleast_2d(axs)[:, -1]
    anchor_ax = right_column[0]

    # Labels start a little to the right of the last column.
    label_x = anchor_ax.get_position().x1 + pad

    # Not enough room inside the figure: widen the right margin, then re-measure.
    if label_x > 1:
        fig.subplots_adjust(right=anchor_ax.get_position().x1 - pad)
        label_x = anchor_ax.get_position().x1 + pad

    for row_ax, row_label in zip(right_column, labels):
        bbox = row_ax.get_position()              # in figure coordinates
        row_center_y = 0.5 * (bbox.y0 + bbox.y1)  # vertical middle of this row
        fig.text(label_x, row_center_y, row_label,
                 ha='left', va='center', transform=fig.transFigure, **textkw)



# Build the 2x2 grid: top row = before pathogenicity filtration, bottom row =
# pathogenic variants only; left column = variants per sample, right column =
# samples per variant.
logging.info("making histograms of variants per sample and samples per variant pre and post pathogenicity filtration")
fig, axs = plt.subplots(2, 2, figsize=(5, 4))

p1 = utils.n_variants_per_sample_from_vcf(vcf, 
                                          plot_title="Variants per sample",
                                          plot_id=f"{subset_id}",
                                          ax=axs[0, 0])
p2 = utils.n_samples_per_variant_from_vcf(vcf, 
                                            plot_title="Samples per variant",
                                            plot_id=f"{subset_id}",
                                            ax=axs[0, 1], 
                                            logscale=True)

logging.info("making plots, now with pathogenic variants only")
patho_p1 = utils.n_variants_per_sample_from_vcf(patho_vcf, 
                                                plot_title="Variants per sample",
                                                plot_id=f"{subset_id}",
                                                ax=axs[1, 0])
patho_p2 = utils.n_samples_per_variant_from_vcf(patho_vcf, 
                                                plot_title="Samples per variant",
                                                plot_id=f"{subset_id}",
                                                ax=axs[1, 1], 
                                            logscale=True)

logging.debug("Add mean num variants per sample line")
add_mean_vline(p1, n_variants_per_sample, offset_frac=0.04)
add_mean_vline(patho_p1, n_variants_per_sample_patho, offset_frac=0.04)

logging.debug("Change the color of pathogenic-only plots to red (and pre-patho filtration to gray)")
# Plain loops: these calls are made only for their side effect.
for row_axes, row_color in ((axs[1, :], patho_color), (axs[0, :], before_color)):
    for ax in row_axes:
        for patch in ax.patches:
            patch.set_fc(row_color)

logging.debug("Add row labels to the right of the second column")
pre_patho_label = f'Pre-pathogenicity filtration\nn={vcf.shape[0]} variants'
patho_label = f'Pathogenic only\nn={patho_vcf.shape[0]} variants'
add_row_labels(
    fig, axs,
    labels=[pre_patho_label, patho_label],
    pad=0.10,
    fontsize=12,
    # bbox=dict(boxstyle='round,pad=0.3', fc='white', ec='black', alpha=0.8)
)


# Clear titles for second row
for ax in axs[1, :]:
    ax.set_title('')

# Clear x-axis labels for first row
for ax in axs[0, :]:
    ax.set_xlabel('')  

# fig.suptitle(f"{subset_id}\n(red is post pathogenic-only filter)")
plt.tight_layout()
plt.subplots_adjust(wspace=0.3, hspace=0.3)

# Save figure
save_path = os.path.join(FIGDIR, f"{subset_id}", f"variant_and_sample_dists_{subset_id}.png")
os.makedirs(os.path.dirname(save_path), exist_ok=True)  # make sure the per-subset folder exists
# NOTE(review): utils.savefig is not visible from this notebook; whatever it
# writes to save_path is overwritten by the high-resolution save just below.
# Confirm whether this call is still needed.
utils.savefig(save_path)

# High-resolution copy, saved from `fig` explicitly rather than from pyplot's
# "current figure". The row labels sit to the right of the axes, at or beyond
# the figure's right edge, so bbox_inches='tight' is needed to keep them from
# being cropped out of the saved file.
fig.savefig(save_path, format='png', dpi=600, bbox_inches='tight')

# Show the figure
plt.show()