In [1]:
import pathlib
# Figure/trace primitives used to build the error-bar plots.
from plotly import graph_objects as go

import pandas as pd
# Route DataFrame/Series .plot() calls through plotly.
pd.set_option('plotting.backend', 'plotly')

import plotly.io as pio
# Make "plotly_white" the default template for every plotly figure.
pio.templates.default = "plotly_white"

# Discrete colour palettes; qualitative.D3 is used to colour one trace per seq_len.
from plotly.colors import qualitative

# One-way ANOVA, Bartlett's equal-variance test, and Pearson correlation.
from scipy.stats import f_oneway, bartlett, pearsonr
# pingouin supplies welch_anova for the unequal-variance case.
import pingouin as pg

In [2]:
# Load the results table stored in all_results.parquet.
hom_sap_results = pd.read_parquet("results/HomSap/all_results.parquet")
# hom_sap_results  # uncomment to display a preview of the frame

In [3]:
# Load the results stored in all_results_population_0.parquet and preview them.
hom_sap_results_pop0 = pd.read_parquet("results/HomSap/all_results_population_0.parquet")
hom_sap_results_pop0

Unnamed: 0,PSV,sample_id,PS,missing,n_samples,n_snps,ld_pruned,population,missing_per_sample,model,exclude_pops,seq_len
0,0.736166,indiv_0,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_YRI,"['YRI', 'CHB']",1e5
1,0.846758,indiv_1,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_YRI,"['YRI', 'CHB']",1e5
2,0.837938,indiv_10,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_YRI,"['YRI', 'CHB']",1e5
3,0.789671,indiv_11,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_YRI,"['YRI', 'CHB']",1e5
4,0.856451,indiv_12,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_YRI,"['YRI', 'CHB']",1e5
...,...,...,...,...,...,...,...,...,...,...,...,...
695,0.636289,indiv_95,0.78,0.5,100,51334,True,Basal/OOA,0.497721,AncientEurope_4A21_OOA,[],1e8
696,0.649130,indiv_96,0.78,0.5,100,51334,True,Basal/OOA,0.500643,AncientEurope_4A21_OOA,[],1e8
697,0.629602,indiv_97,0.78,0.5,100,51334,True,Basal/OOA,0.503312,AncientEurope_4A21_OOA,[],1e8
698,0.630374,indiv_98,0.78,0.5,100,51334,True,Basal/OOA,0.503136,AncientEurope_4A21_OOA,[],1e8


In [13]:
# Load the results stored in all_results_population_1.parquet and preview them.
hom_sap_results_pop1= pd.read_parquet("results/HomSap/all_results_population_1.parquet")
hom_sap_results_pop1

Unnamed: 0,PSV,sample_id,PS,missing,n_samples,n_snps,ld_pruned,population,missing_per_sample,model,exclude_pops,seq_len
0,0.736166,indiv_0,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_CHB,"['YRI', 'CHB']",1e5
1,0.846758,indiv_1,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_CHB,"['YRI', 'CHB']",1e5
2,0.837938,indiv_10,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_CHB,"['YRI', 'CHB']",1e5
3,0.789671,indiv_11,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_CHB,"['YRI', 'CHB']",1e5
4,0.856451,indiv_12,0.89,0.0,100,802,False,1000_Genomes_YRI_(Yoruba),0.000000,AshkSub_7G19_CHB,"['YRI', 'CHB']",1e5
...,...,...,...,...,...,...,...,...,...,...,...,...
695,0.639042,indiv_95,0.77,0.5,100,51334,True,Basal/OOA,0.498695,AncientEurope_4A21_NE,[],1e8
696,0.628666,indiv_96,0.77,0.5,100,51334,True,Basal/OOA,0.500701,AncientEurope_4A21_NE,[],1e8
697,0.629811,indiv_97,0.77,0.5,100,51334,True,Basal/OOA,0.501597,AncientEurope_4A21_NE,[],1e8
698,0.556109,indiv_98,0.77,0.5,100,51334,True,Basal/OOA,0.503779,AncientEurope_4A21_NE,[],1e8


In [44]:
# Sanity check: ld_pruned must be boolean, because anova_and_error_plot uses it
# as a row mask in its default prefilter.
hom_sap_results.ld_pruned.dtype

dtype('bool')

In [47]:
def anova_and_error_plot(df, pandora_value, column, prefilter=lambda x: x.ld_pruned, alpha=0.05):
    """Test whether ``pandora_value`` differs across levels of ``column`` and plot it.

    The frame is first filtered with ``prefilter``. Then, separately for every
    distinct ``seq_len``:

    * if ``column`` is not boolean, the Pearson correlation between the
      missing-data column (``missing`` for "PS", ``missing_per_sample`` for
      "PSV") and ``pandora_value`` is printed;
    * Bartlett's test checks whether the groups defined by ``column`` have
      equal variances; depending on its p-value either Welch's ANOVA
      (unequal variances) or a regular one-way ANOVA is run and printed;
    * one trace is added to the figure showing the per-group mean with the
      per-group standard deviation as error bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``seq_len``, ``column``, ``pandora_value`` and the
        matching missing-data column; with the default ``prefilter`` it must
        also contain a boolean ``ld_pruned`` column.
    pandora_value : str
        Either ``"PS"`` or ``"PSV"``.
    column : str
        Name of the grouping column (x-axis of the plot).
    prefilter : callable, optional
        Passed to ``df.loc``; by default keeps rows where ``ld_pruned`` is True.
    alpha : float, optional
        Significance level for Bartlett's test (default 0.05).

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with one error-bar trace per ``seq_len``.

    Raises
    ------
    ValueError
        If ``pandora_value`` is neither ``"PS"`` nor ``"PSV"``.
    """
    if pandora_value not in ["PS", "PSV"]:
        raise ValueError("pandora_value must be 'PS' or 'PSV'")

    df = df.loc[prefilter]

    fig = go.Figure()
    seq_lens = df.seq_len.unique()
    # Cycle through the palette so more seq_len values than palette entries
    # cannot cause a KeyError on colour lookup.
    palette = qualitative.D3
    colors = {value: palette[i % len(palette)] for i, value in enumerate(seq_lens)}

    # PS is correlated with `missing`, PSV with `missing_per_sample`.
    missing_column = "missing" if pandora_value == "PS" else "missing_per_sample"
    report_pearson = df[column].dtype != "bool"

    for seq_len in seq_lens:
        _data = df.loc[df.seq_len == seq_len]
        # One sample per level of `column`; NaN levels are dropped so the tests
        # see the same groups as the groupby used for plotting below.
        groups = [
            _data.loc[_data[column] == level, pandora_value]
            for level in _data[column].dropna().unique()
        ]

        print("\nseq_len", seq_len)

        if report_pearson:
            p = pearsonr(_data[missing_column], _data[pandora_value])
            print(f"- Pearson missing - {pandora_value}", round(p.statistic, 3), round(p.pvalue, 3))

        if len(groups) < 2:
            # Bartlett/ANOVA need at least two groups (e.g. grouping by the same
            # column the prefilter already fixed to a single value).
            print(f"- fewer than two groups in '{column}' -> skipping Bartlett/ANOVA")
        else:
            # check if variances are equal, if p-value < alpha, variances are not equal -> Welch's ANOVA
            b = bartlett(*groups)
            print("- Bartlett", round(b.statistic, 3), round(b.pvalue, 3))
            if b.pvalue < alpha:
                print("-> H0 can be rejected: Variances are not equal -> Welch's ANOVA")
                # welch anova (if variances are not equal); a small p-value means the group means differ
                w = pg.welch_anova(dv=pandora_value, between=column, data=_data)
                print("- Welch", w.F.mean().round(3), w["p-unc"].mean().round(3))
            else:
                print("-> H0 cannot be rejected -> regular ANOVA")
                # regular anova (only if variances are equal); a small p-value means the group means differ
                a = f_oneway(*groups)
                print("- ANOVA", round(a.statistic, 3), round(a.pvalue, 3))

        # Per-group mean and standard deviation in a single pass.
        _stats = _data.groupby(column)[pandora_value].agg(["mean", "std"])

        fig.add_trace(
            go.Scatter(
                x=_stats.index,
                y=_stats["mean"].values,
                marker_color=colors[seq_len],
                name=seq_len,
                error_y=dict(
                    type='data',  # value of error bar given in data coordinates
                    array=_stats["std"].values,  # error bars show the standard deviation
                    visible=True
                )
            )
        )

    fig.update_xaxes(title=column)
    fig.update_yaxes(title=pandora_value)

    fig.update_layout(template="plotly_white", title=f"{pandora_value} distribution for " + column)
    return fig

In [50]:
# PS by LD-pruning status for the population_0 results. The custom prefilter
# keeps only rows with missing == 0 (replacing the default ld_pruned mask).
anova_and_error_plot(hom_sap_results_pop0, "PS", "ld_pruned", lambda x: x.missing == 0)


seq_len 1e5
- Bartlett 1.835 0.176
-> H0 cannot be rejected -> regular ANOVA
- ANOVA 4474.904 0.0

seq_len 1e6
- Bartlett 303.711 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 9136.697 0.0

seq_len 1e7
- Bartlett 128.268 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 1212.642 0.0

seq_len 1e8
- Bartlett 269.013 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 495.071 0.0


In [40]:
# PS across levels of `missing` for the all_results frame
# (default prefilter: rows where ld_pruned is True).
anova_and_error_plot(hom_sap_results, "PS", "missing")


seq_len 1e5
- Pearson missing - PS -0.597 0.0
- Bartlett 787.195 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 328.332 0.0

seq_len 1e6
- Pearson missing - PS -0.407 0.0
- Bartlett 99.645 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 169.398 0.0

seq_len 1e7
- Pearson missing - PS -0.468 0.0
- Bartlett 629.087 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 176.786 0.0

seq_len 1e8
- Pearson missing - PS -0.532 0.0
- Bartlett 265.958 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 294.655 0.0


In [22]:
# PS across levels of `missing` for the population_0 results
# (default prefilter: rows where ld_pruned is True).
anova_and_error_plot(hom_sap_results_pop0, "PS", "missing")


seq_len 1e5
- Pearson missing - PS -0.596 0.0
- Bartlett 3972.375 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 317.921 0.0

seq_len 1e6
- Pearson missing - PS -0.809 0.0
- Bartlett 113.256 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 2206.098 0.0

seq_len 1e7
- Pearson missing - PS -0.794 0.0
- Bartlett 482.383 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 1677.183 0.0

seq_len 1e8
- Pearson missing - PS -0.671 0.0
- Bartlett 616.815 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 738.106 0.0


In [23]:
# PSV across levels of `missing` for the all_results frame; the printed Pearson
# correlation uses missing_per_sample because pandora_value is "PSV".
anova_and_error_plot(hom_sap_results, "PSV", "missing")


seq_len 1e5
- Pearson missing - PSV -0.365 0.0
- Bartlett 36.905 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 146.22 0.0

seq_len 1e6
- Pearson missing - PSV -0.214 0.0
- Bartlett 13.167 0.022
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 47.752 0.0

seq_len 1e7
- Pearson missing - PSV -0.197 0.0
- Bartlett 70.377 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 36.562 0.0

seq_len 1e8
- Pearson missing - PSV -0.177 0.0
- Bartlett 135.738 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 27.049 0.0


In [24]:
# PSV across levels of `missing` for the population_0 results
# (default prefilter: rows where ld_pruned is True).
anova_and_error_plot(hom_sap_results_pop0, "PSV", "missing")


seq_len 1e5
- Pearson missing - PSV -0.417 0.0
- Bartlett 266.731 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 212.519 0.0

seq_len 1e6
- Pearson missing - PSV -0.377 0.0
- Bartlett 7.01 0.22
-> H0 cannot be rejected -> regular ANOVA
- ANOVA 240.978 0.0

seq_len 1e7
- Pearson missing - PSV -0.424 0.0
- Bartlett 20.268 0.001
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 290.091 0.0

seq_len 1e8
- Pearson missing - PSV -0.403 0.0
- Bartlett 38.103 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 245.519 0.0


In [25]:
# PS across levels of `missing` for the population_1 results
# (default prefilter: rows where ld_pruned is True).
anova_and_error_plot(hom_sap_results_pop1, "PS", "missing")


seq_len 1e5
- Pearson missing - PS -0.614 0.0
- Bartlett 3152.307 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 290.316 0.0

seq_len 1e6
- Pearson missing - PS -0.84 0.0
- Bartlett 65.122 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 3312.262 0.0

seq_len 1e7
- Pearson missing - PS -0.795 0.0
- Bartlett 356.516 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 1469.534 0.0

seq_len 1e8
- Pearson missing - PS -0.706 0.0
- Bartlett 302.653 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 877.333 0.0


In [26]:
# PSV across levels of `missing` for the population_1 results
# (default prefilter: rows where ld_pruned is True).
anova_and_error_plot(hom_sap_results_pop1, "PSV", "missing")


seq_len 1e5
- Pearson missing - PSV -0.444 0.0
- Bartlett 173.756 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 215.475 0.0

seq_len 1e6
- Pearson missing - PSV -0.377 0.0
- Bartlett 2.493 0.777
-> H0 cannot be rejected -> regular ANOVA
- ANOVA 201.593 0.0

seq_len 1e7
- Pearson missing - PSV -0.416 0.0
- Bartlett 24.781 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 227.346 0.0

seq_len 1e8
- Pearson missing - PSV -0.406 0.0
- Bartlett 29.685 0.0
-> H0 can be rejected: Variances are not equal -> Welch's ANOVA
- Welch 215.109 0.0
