In [35]:
import os
import warnings
import itertools

import pandas as pd
import numpy as np
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "whitegrid" theme globally so every figure created later in
# the notebook picks it up.
sns.set_theme(style="whitegrid")

In [52]:
# --- Run configuration -------------------------------------------------------
# Population labels; each (ref, query) combination selects one results
# directory named "ten_<ref>_<query>_t0_p<perc_mask_sites>".
pop_s_ref = ["CEU"]
pop_s_query = ["YRI"]

# Cartesian product of the two label lists, kept as a list of 2-element lists.
pop_pairs = [list(pair) for pair in itertools.product(pop_s_ref, pop_s_query)]

# Value substituted into the "p<...>" suffix of the results directory name.
perc_mask_sites = 10
# Upper x-axis limit of the MAF/IQS plot; also encoded in the PNG file name.
max_maf = 0.5

# Root directory holding one sub-directory of sim_<i>.csv files per pair.
base_dir = "/Users/szhan/Projects/tsimpute/analysis/genealogy_only/"

# Plotting switch and the resolution used when the figure is saved.
plot_results = False
dpi = 1000

In [53]:
# For every (reference, query) population pair: read the per-replicate CSV
# files, collect per-replicate summary statistics of the "iqs" column, pool
# all (minor_allel_freq, iqs) rows, and optionally draw a density-coloured
# scatter plot of IQS against MAF.
#
# NOTE: the summary lists and `df_concat` are re-initialised for each pair, so
# after the loop they only describe the LAST pair in `pop_pairs`.
for pop_ref, pop_query in pop_pairs:
    in_prefix = f"ten_{pop_ref}_{pop_query}_t0_p{perc_mask_sites}"
    # round() rather than int(): int() truncates products such as
    # 0.29 * 100 == 28.999999999999996 to 28.
    out_png_file = f"{in_prefix}.m{round(max_maf * 100)}.png"

    # Collate results from simulations (one entry per usable replicate).
    num_sites_total = []        # rows in the raw CSV
    num_sites_iqs_notna = []    # rows left after dropping missing MAF/IQS
    perc_sites_iqs_eg_90 = []   # fraction (0-1) of kept rows with iqs >= 0.90
    mean_iqs = []
    median_iqs = []

    # Per-replicate frames are gathered here and concatenated once after the
    # loop; concatenating inside the loop copies the growing frame every time.
    replicate_frames = []

    num_reps = 100
    for i in range(1, num_reps + 1):
        in_file = os.path.join(base_dir, in_prefix, f"sim_{i}.csv")

        print(f"INFO: Parsing {in_file}")
        if not os.path.exists(in_file):
            warnings.warn(f"Cannot find {in_file}")
            continue

        df_raw = pd.read_csv(in_file, comment="#")
        df = df_raw[["minor_allel_freq", "iqs"]].dropna(axis=0)

        if df.empty:
            # Without this guard the fraction below divides by zero. The
            # replicate is skipped entirely so the summary lists stay aligned.
            warnings.warn(
                f"No rows with both minor_allel_freq and iqs in {in_file}; "
                "skipping replicate"
            )
            continue

        num_sites_total.append(df_raw.shape[0])
        num_sites_iqs_notna.append(df.shape[0])

        num_sites_high_iqs = int((df["iqs"] >= 0.90).sum())
        perc_sites_iqs_eg_90.append(num_sites_high_iqs / df.shape[0])
        mean_iqs.append(np.mean(df["iqs"]))
        median_iqs.append(np.median(df["iqs"]))

        replicate_frames.append(df)

    # Stays None when no replicate could be used, as in the original code.
    df_concat = pd.concat(replicate_frames) if replicate_frames else None

    if plot_results:
        if df_concat is None:
            warnings.warn(f"No results found for {in_prefix}; skipping plot")
            continue

        print(f"INFO: Plotting results for {in_prefix} to {out_png_file}")
        title_text = f"{pop_ref} into {pop_query}"

        # Colour each point by a 2-D Gaussian KDE evaluated at the point itself.
        values = np.vstack([df_concat["minor_allel_freq"], df_concat["iqs"]])
        density = stats.gaussian_kde(values)(values)

        fig, ax = plt.subplots(figsize=(10, 10))

        ax.set_title(title_text, size=25)
        ax.set_xlim([0, max_maf])
        ax.set_ylim([0, 1])  # TODO: Show negative values
        ax.set_ylabel("IQS", size=20)
        ax.set_xlabel("MAF", size=20)
        ax.tick_params(axis="both", which="major", labelsize=20)

        sns.scatterplot(
            y="iqs",
            x="minor_allel_freq",
            data=df_concat,
            c=density,
            cmap="viridis",
            ax=ax,
        )

        fig.savefig(out_png_file, dpi=dpi)
        # Render inline, then release the figure so a long list of population
        # pairs does not accumulate open figures in memory.
        plt.show()
        plt.close(fig)

INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_1.csv
INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_2.csv
INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_3.csv
INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_4.csv
INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_5.csv
INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_6.csv
INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_7.csv
INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_8.csv
INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_9.csv
INFO: Parsing /Users/szhan/Projects/tsimpute/analysis/genealogy_only/ten_CEU_CHB_t0_p10/sim_10.csv
INFO: Parsing /User

In [54]:
# Report the smallest and largest per-replicate fraction of sites with
# iqs >= 0.90. The list is empty when no replicate CSV was found; min()/max()
# would then raise an uninformative ValueError, so warn instead.
if perc_sites_iqs_eg_90:
    print(f"MIN: {min(perc_sites_iqs_eg_90)}")
    print(f"MAX: {max(perc_sites_iqs_eg_90)}")
else:
    warnings.warn("No replicates were parsed; cannot report MIN/MAX")

MIN: 0.7349137931034483
MAX: 0.8710359408033826
