In [None]:
from pathlib import Path
import math

import pandas as pd
import numpy as np
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "whitegrid" theme for the figures drawn in this notebook.
sns.set_theme(style="whitegrid")

In [None]:
# Sites with low-confidence AA, sites with unknown AA, and singletons are included.
# Multi-allelic sites and sites with indels are excluded.

In [None]:
# Location and naming scheme of the imputation result files.
base_dir = Path("/Users/szhan/Projects/tsimpute/results/")
prefix = "chr20_p."
suffix = ".imputation.csv.gz"


def _results_csv(run_name):
    """Return the path of the results CSV for the run called `run_name`."""
    return base_dir / (prefix + run_name + suffix)


# tsinfer
# ... without sites with low-confidence or unknown AA
in_ts_csv_file = _results_csv("default")
# ... with sites with low-confidence or unknown AA
in_ts_no_csv_file = _results_csv("default_no")
# ... with genetic map
in_ts_map_csv_file = _results_csv("default_no.genetic_map")

# Sample matching
# ... without genetic map
in_sm_csv_file = _results_csv("sample_matched_no.precision10")
# ... with genetic map
in_sm_map_csv_file = _results_csv("sample_matched_no.genetic_map.precision10")

# BEAGLE
# ... with genetic map
in_beagle_csv_file = _results_csv("beagle_no")


In [None]:
def _read_results(csv_file):
    """Load one results CSV, treating "#" as the comment character."""
    return pd.read_csv(csv_file, comment="#")


# tsinfer runs
df_ts = _read_results(in_ts_csv_file)
df_ts_no = _read_results(in_ts_no_csv_file)
df_ts_map = _read_results(in_ts_map_csv_file)

# Sample-matching runs
df_sm = _read_results(in_sm_csv_file)
df_sm_map = _read_results(in_sm_map_csv_file)

# BEAGLE run
df_beagle = _read_results(in_beagle_csv_file)


In [None]:
# (rows, columns) of every table as loaded, in the order they were read.
for _frame in (df_ts, df_ts_no, df_ts_map, df_sm, df_sm_map, df_beagle):
    print(_frame.shape)


In [None]:
# Preview the first five rows (the default for head()) of the tsinfer results.
df_ts.head()


In [None]:
# Site positions present in both the sample-matching and the BEAGLE tables.
# NOTE: this needs the "position" column, so it has to run before the cell
# that narrows df_sm / df_beagle down to the MAF and IQS columns.
sm_site_pos = set(df_sm.loc[:, "position"].to_numpy())
beagle_site_pos = set(df_beagle.loc[:, "position"].to_numpy())
shared_site_pos = list(sm_site_pos.intersection(beagle_site_pos))
print(f"SHARED SITES: {len(shared_site_pos)}")


In [None]:
def _maf_and_iqs(results):
    """
    Keep only the MAF and IQS columns and drop rows with a missing value.

    reset_index() is called without drop=True, so the previous row labels
    are kept in a new "index" column and the rows are renumbered from 0.
    """
    wanted = ["ref_minor_allele_freq", "iqs"]
    return results[wanted].dropna(axis=0).reset_index()


# The tables are overwritten in place, so all other columns (including
# "position") are no longer available after this cell has run.
df_ts = _maf_and_iqs(df_ts)
df_ts_no = _maf_and_iqs(df_ts_no)
df_ts_map = _maf_and_iqs(df_ts_map)

df_sm = _maf_and_iqs(df_sm)
df_sm_map = _maf_and_iqs(df_sm_map)

df_beagle = _maf_and_iqs(df_beagle)


In [None]:
# (rows, columns) of every table after rows with missing MAF or IQS were dropped.
for _frame in (df_ts, df_ts_no, df_ts_map, df_sm, df_sm_map, df_beagle):
    print(_frame.shape)


In [None]:
def plot_results(df, method, out_png_file=None, max_maf=0.50, dpi=100, subsample_fraction=0.01, seed=None):
    """
    Draw a scatter plot of IQS against MAF, with points coloured by density.

    The point density is a 2-D Gaussian kernel density estimate that is
    fitted on a random subsample of the rows and then evaluated at every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns "ref_minor_allele_freq" and "iqs". Any index
        is accepted; rows are handled by position.
    method : str
        Text used as the plot title.
    out_png_file : path-like, optional
        If given, the figure is also saved to this file.
    max_maf : float
        Upper limit of the x-axis (the lower limit is 0).
    dpi : int
        Resolution used when saving the figure.
    subsample_fraction : float
        Fraction of rows, in [0, 1], used to fit the density estimate.
    seed : int or numpy.random.Generator, optional
        Seed for drawing the subsample. Pass a fixed value to get the same
        colouring on every run; None draws a different subsample each call.

    Raises
    ------
    ValueError
        If the subsample would contain fewer than 3 rows, which is too few
        to fit a 2-D density.
    """
    assert 0.0 <= subsample_fraction <= 1.0

    # Work on plain arrays so that the subsample is taken by position and
    # does not rely on df having a 0..n-1 index.
    maf = df["ref_minor_allele_freq"].to_numpy()
    iqs = df["iqs"].to_numpy()
    num_sites = df.shape[0]

    subsample_size = math.ceil(num_sites * subsample_fraction)
    if subsample_size < 3:
        # With fewer than 3 points the covariance matrix of the 2-D sample
        # is singular; fail here with a message that names the cause.
        raise ValueError(
            f"Density estimation needs at least 3 subsampled sites, got {subsample_size} "
            f"({num_sites} sites, subsample_fraction={subsample_fraction})."
        )

    # Sample without replacement: no row is used twice, and a fraction of
    # 1.0 fits the estimate on the complete table.
    rng = np.random.default_rng(seed)
    subsample = rng.choice(num_sites, size=subsample_size, replace=False)

    kernel = stats.gaussian_kde(np.vstack([maf[subsample], iqs[subsample]]))
    density = kernel(np.vstack([maf, iqs]))

    fig, ax = plt.subplots(figsize=(7, 7))

    ax.set_title(f"{method}", size=20)
    ax.set_xlim([0, max_maf])
    ax.set_ylabel("IQS", size=20)
    ax.set_xlabel("MAF", size=20)
    ax.tick_params(axis='both', which='major', labelsize=20)

    # `density` holds one value per row of df, in row order.
    sns.scatterplot(
        y="iqs",
        x="ref_minor_allele_freq",
        data=df,
        c=density,
        cmap="viridis",
        ax=ax,
    )

    if out_png_file is not None:
        fig.savefig(out_png_file, dpi=dpi)


In [None]:
# Minimum IQS threshold: the cells below report the percentage of sites
# whose IQS is greater than or equal to this value.
min_iqs = 0.90


In [None]:
# Without sites having low-confidence or unknown AA
plot_results(df_ts, "tsinfer (default) without genetic map")

# Percentage of sites whose IQS reaches the min_iqs threshold.
num_sites_min_iqs = (df_ts["iqs"] >= min_iqs).sum()
prop_sites_min_iqs = num_sites_min_iqs / float(len(df_ts))
print(f"% sites with min IQS: {round(prop_sites_min_iqs * 100.0, 2)}")


In [None]:
# With sites having low-confidence or unknown AA
# NOTE(review): the title is identical to the one used for the df_ts plot,
# so the two figures cannot be told apart by title alone - confirm intended.
plot_results(df_ts_no, "tsinfer (default) without genetic map")

# Percentage of sites whose IQS reaches the min_iqs threshold.
num_sites_min_iqs = (df_ts_no["iqs"] >= min_iqs).sum()
prop_sites_min_iqs = num_sites_min_iqs / float(len(df_ts_no))
print(f"% sites with min IQS: {round(prop_sites_min_iqs * 100.0, 2)}")


In [None]:
# tsinfer results obtained with a genetic map.
plot_results(df_ts_map, "tsinfer (default) with genetic map")

# Percentage of sites whose IQS reaches the min_iqs threshold.
num_sites_min_iqs = (df_ts_map["iqs"] >= min_iqs).sum()
prop_sites_min_iqs = num_sites_min_iqs / float(len(df_ts_map))
print(f"% sites with min IQS: {round(prop_sites_min_iqs * 100.0, 2)}")


In [None]:
# BEAGLE results obtained with a genetic map.
plot_results(df_beagle, "BEAGLE with genetic map")

# Percentage of sites whose IQS reaches the min_iqs threshold.
num_sites_min_iqs = (df_beagle["iqs"] >= min_iqs).sum()
prop_sites_min_iqs = num_sites_min_iqs / float(len(df_beagle))
print(f"% sites with min IQS: {round(prop_sites_min_iqs * 100.0, 2)}")


In [None]:
# Sample-matching results obtained without a genetic map.
plot_results(df_sm, "Sample matching without genetic map")

# Percentage of sites whose IQS reaches the min_iqs threshold.
num_sites_min_iqs = (df_sm["iqs"] >= min_iqs).sum()
prop_sites_min_iqs = num_sites_min_iqs / float(len(df_sm))
print(f"% sites with min IQS: {round(prop_sites_min_iqs * 100.0, 2)}")


In [None]:
# Sample-matching results obtained with a genetic map.
plot_results(df_sm_map, "Sample matching with genetic map")

# Percentage of sites whose IQS reaches the min_iqs threshold.
num_sites_min_iqs = (df_sm_map["iqs"] >= min_iqs).sum()
prop_sites_min_iqs = num_sites_min_iqs / float(len(df_sm_map))
print(f"% sites with min IQS: {round(prop_sites_min_iqs * 100.0, 2)}")


### Deep dive into wrongly imputed sites in relatively high MAF bins

In [None]:
# Focus on sites with MAF >= 0.40 and IQS <= 0.80 in the sample-matching
# results obtained without a genetic map.
is_high_maf = df_sm["ref_minor_allele_freq"].ge(0.40)
is_low_iqs = df_sm["iqs"].le(0.80)
subset_sites = df_sm[is_high_maf & is_low_iqs]

In [None]:
# Open questions
# 1. Are these sites near regions with a high density of breakpoints?
# 2. What percentage of the samples in the subtree under the MRCA of all wrongly imputed samples are correctly imputed?