In [None]:
import math
import os
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" theme globally, so it affects every figure drawn below.
sns.set_theme(style="whitegrid")

In [None]:
# Sites with low-confidence AA, sites with unknown AA, and singletons are included.
# Multi-allelic sites and sites with indels are excluded.

In [None]:
# Directory holding the per-method imputation result tables.
# The default is the original author's local path; set the TSIMPUTE_RESULTS_DIR
# environment variable to point the notebook at a different location.
base_dir = Path(
    os.environ.get("TSIMPUTE_RESULTS_DIR")
    or "/Users/szhan/Projects/tsimpute/results/"
)
prefix = "chr20_p."
suffix = ".imputation.csv.gz"

# tsinfer
#in_ts_csv_file = base_dir / f"{prefix}default{suffix}"    # withOUT genetic map
#in_ts_map_csv_file = base_dir / f"{prefix}default.genetic_map{suffix}"    # with genetic map

# Sample-matching
in_sm_csv_file = base_dir / f"{prefix}sample_matched.precision10{suffix}"  # withOUT genetic map
in_sm_map_csv_file = base_dir / f"{prefix}sample_matched.genetic_map.precision10{suffix}"  # with genetic map

# BEAGLE
in_beagle_csv_file = base_dir / f"{prefix}beagle{suffix}"  # with genetic map


In [None]:
#df_ts = pd.read_csv(in_ts_csv_file, comment="#")
#df_ts_map = pd.read_csv(in_ts_map_csv_file, comment="#")

# Read the three result tables in a fixed order; "#" marks comment text
# that pandas ignores while parsing.
df_sm, df_sm_map, df_beagle = (
    pd.read_csv(csv_path, comment="#")
    for csv_path in (in_sm_csv_file, in_sm_map_csv_file, in_beagle_csv_file)
)


In [None]:
#print(df_ts.shape)
#print(df_ts_map.shape)
# (rows, columns) of each table as loaded, one per line.
print(df_sm.shape, df_sm_map.shape, df_beagle.shape, sep="\n")


In [None]:
# Positions present in both the sample-matching and the BEAGLE tables.
# NOTE(review): df_sm_map is not part of this intersection although it is
# filtered by the same list later - confirm it covers the same sites as df_sm.
sm_site_pos, beagle_site_pos = (
    set(frame["position"].to_numpy()) for frame in (df_sm, df_beagle)
)
shared_site_pos = list(sm_site_pos.intersection(beagle_site_pos))
print(f"SHARED SITES: {len(shared_site_pos)}")


In [None]:
#df_ts = df_ts.loc[np.isin(df_ts["position"].to_numpy(), shared_site_pos)]
#df_ts_map = df_ts_map.loc[np.isin(df_ts_map["position"].to_numpy(), shared_site_pos)]

# Keep only the rows whose position is one of the shared sites.
df_sm = df_sm.loc[np.isin(df_sm["position"].to_numpy(), shared_site_pos)]
df_sm_map = df_sm_map.loc[np.isin(df_sm_map["position"].to_numpy(), shared_site_pos)]

df_beagle = df_beagle.loc[np.isin(df_beagle["position"].to_numpy(), shared_site_pos)]


In [None]:
#df_ts = df_ts.loc[:, ["ref_minor_allele_freq", "iqs"]].dropna(axis="index").reset_index()
#df_ts_map = df_ts_map.loc[:, ["ref_minor_allele_freq", "iqs"]].dropna(axis="index").reset_index()

# Reduce each table to the two plotted columns, drop rows where either value is
# missing, and renumber rows from 0 (the previous index is kept as an "index" column).
df_sm = (
    df_sm.loc[:, ["ref_minor_allele_freq", "iqs"]]
    .dropna(axis="index")
    .reset_index()
)
df_sm_map = (
    df_sm_map.loc[:, ["ref_minor_allele_freq", "iqs"]]
    .dropna(axis="index")
    .reset_index()
)

df_beagle = (
    df_beagle.loc[:, ["ref_minor_allele_freq", "iqs"]]
    .dropna(axis="index")
    .reset_index()
)


In [None]:
#print(df_ts.shape)
#print(df_ts_map.shape)

# (rows, columns) of each table after site filtering and NaN removal, one per line.
print(df_sm.shape, df_sm_map.shape, df_beagle.shape, sep="\n")


In [None]:
def plot_results(df, method, out_png_file=None, max_maf=0.50, dpi=100, subsample_fraction=0.01, seed=None):
    """Scatter-plot IQS against MAF, colouring each point by estimated point density.

    A 2-D Gaussian kernel density estimate is fitted on a random subsample of
    the rows and then evaluated at every row to obtain the colour values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "ref_minor_allele_freq" and "iqs".
    method : str
        Used as the figure title.
    out_png_file : path-like, optional
        If given, the figure is also saved to this path.
    max_maf : float
        Upper limit of the x-axis.
    dpi : int
        Resolution used when saving the figure.
    subsample_fraction : float
        Fraction (0 to 1) of rows used to fit the density estimate. At least
        three rows are always used, whatever the fraction.
    seed : int, optional
        Seed for the subsample. When None, NumPy's global random state is used.

    Raises
    ------
    ValueError
        If `df` has fewer than three rows.
    """
    assert 0.0 <= subsample_fraction <= 1.0

    # Work on plain arrays so the subsample is positional and does not depend
    # on the frame carrying a 0..n-1 index.
    maf = df["ref_minor_allele_freq"].to_numpy()
    iqs = df["iqs"].to_numpy()
    num_sites = maf.shape[0]

    # A 2-D covariance estimate is singular with fewer than three points.
    min_kde_points = 3
    if num_sites < min_kde_points:
        raise ValueError(
            f"need at least {min_kde_points} rows to estimate point density, got {num_sites}"
        )
    subsample_size = max(math.ceil(num_sites * subsample_fraction), min_kde_points)

    # Draw distinct rows; honour the global NumPy state unless a seed is given.
    rng = np.random if seed is None else np.random.default_rng(seed)
    subsample = rng.choice(num_sites, subsample_size, replace=False)

    kernel = stats.gaussian_kde(np.vstack([maf[subsample], iqs[subsample]]))
    density = kernel(np.vstack([maf, iqs]))

    fig, ax = plt.subplots(figsize=(7, 7,))

    ax.set_title(f"{method}", size=20)
    ax.set_xlim([0, max_maf])
    ax.set_ylabel("IQS", size=20)
    ax.set_xlabel("MAF", size=20)
    ax.tick_params(axis='both', which='major', labelsize=20)

    sns.scatterplot(
        y="iqs",
        x="ref_minor_allele_freq",
        data=df,
        c=density,
        cmap="viridis",
        #x_jitter=True,
        ax=ax
    )

    if out_png_file is not None:
        fig.savefig(out_png_file, dpi=dpi)


In [None]:
#plot_results(df_ts, "tsinfer (default) without genetic map")

In [None]:
#plot_results(df_ts_map, "tsinfer (default) with genetic map")

In [None]:
# Density-coloured IQS-vs-MAF scatter for sample-matching run without a genetic map.
plot_results(df=df_sm, method="Sample-matching without genetic map")

In [None]:
# Density-coloured IQS-vs-MAF scatter for sample-matching run with a genetic map.
plot_results(df=df_sm_map, method="Sample-matching with genetic map")

In [None]:
# Density-coloured IQS-vs-MAF scatter for BEAGLE run with a genetic map.
plot_results(df=df_beagle, method="BEAGLE with genetic map")

In [None]:
# IQS threshold at or above which a site is counted.
min_iqs = 0.90

#score_ts = df_ts["iqs"].ge(min_iqs).mean()
#score_ts_map = df_ts_map["iqs"].ge(min_iqs).mean()

# Fraction of rows meeting the threshold: the mean of the boolean mask.
score_sm = df_sm["iqs"].ge(min_iqs).mean()
score_sm_map = df_sm_map["iqs"].ge(min_iqs).mean()

score_beagle = df_beagle["iqs"].ge(min_iqs).mean()


In [None]:
# The scores are fractions in [0, 1]; the "%" format type multiplies by 100 and
# appends "%", so the printed values match the "% sites" header.
print(f"% sites with IQS >= {min_iqs}")

#print(f"tsinfer withOUT map         : {score_ts:.2%}")
#print(f"tsinfer with map            : {score_ts_map:.2%}")

print(f"sample-matching withOUT map : {score_sm:.2%}")
print(f"sample-matching with map    : {score_sm_map:.2%}")

print(f"beagle with map             : {score_beagle:.2%}")
