# Recreating the NearEastPublic dataset analyses
We used the publicly available data from Lazaridis, I., Patterson, N., Mittnik, A. et al. Ancient human genomes suggest three ancestral populations for present-day Europeans. Nature 513, 409–413 (2014). https://doi.org/10.1038/nature13673

In [1]:
from filter_merge_utils import *

Uncomment and run the following cell to download the dataset from the Reich Lab website.

In [None]:
# ! wget https://reich.hms.harvard.edu/sites/reich.hms.harvard.edu/files/inline-files/NearEastPublic.tar.gz
# ! mkdir NearEastPublic && mv NearEastPublic.tar.gz NearEastPublic
# ! cd NearEastPublic && tar -xf NearEastPublic.tar.gz
# ! cd NearEastPublic && mkdir raw && mv ./* raw
# ! cd NearEastPublic && mkdir global && mkdir westEurasia && mkdir westEurasia_ancient

In [None]:
# Root folder of the downloaded data.
base_dir = pathlib.Path("NearEastPublic")
# Path prefix of the raw dataset; used as `prefix_in` by the filtering steps.
dataset_prefix = base_dir.joinpath("raw", "HumanOriginsPublic2068")

# smartpca options written into every Pandora config created in this notebook.
# NOTE(review): numoutlieriter=0 presumably switches off smartpca's outlier
# removal -- confirm against the EIGENSOFT documentation.
smartpca_settings = dict(numoutlieriter=0)

## Global Dataset
Next, we filter the dataset to reproduce the dataset of global samples.
For this, we use the list of global populations as provided in the supplement of the publication.

In [None]:
# All files of the global subset go into the "global" sub-folder.
global_dir = base_dir / "global"
global_prefix = global_dir / "HumanOriginsPublic2068.global"
# Append to the string form: the prefix itself already contains a dot.
global_population_file = pathlib.Path(str(global_prefix) + ".populations.txt")

# Comma-separated population names of the global subset.
global_set = get_pop_set_from_string(
    "AA, Algonquin, Ami, Atayal, Basque, BedouinB, Biaka, Bougainville, "
    "Brahui, Cabecar, Chipewyan, Chukchi, Damara, Datog, Dinka, Esan, "
    "Eskimo, Georgian, Gui, GujaratiD, Hadza, Han, Itelmen, Ju_hoan_North, "
    "Kalash, Karitiana, Kharia, Korean, Koryak, LaBrana, Lahu, Lodhi, "
    "Loschbour, MA1, Mala, Mandenka, Masai, Mbuti, Mozabite, Naxi, "
    "Nganasan, Onge, Papuan, Pima, Sandawe, Sardinian, She, Somali, "
    "Stuttgart, Surui, Tubalar, Ulchi, Vishwabrahmin, Yoruba"
)
save_pop_set(global_set, global_population_file)

# Restrict the raw dataset to the populations listed in the file written above.
# NOTE(review): redo=False presumably reuses existing output -- confirm in
# filter_merge_utils.
filter_dataset(
    prefix_in=dataset_prefix,
    prefix_out=global_prefix,
    poplistname=global_population_file,
    redo=False,
)

# Write the config file used for the Pandora analysis of the global subset.
configfile = global_dir / "global.pandora.yaml"
save_pandora_config(
    global_prefix,
    global_dir / "results",
    smartpca_settings,
    configfile,
)

## West-Eurasian Dataset
Next, we filter the dataset to reproduce the dataset of West-Eurasian samples.
For this, we use the list of west-eurasian populations as provided in the supplement of the publication.

In [None]:
# All files of the west-Eurasian subset go into the "westEurasia" sub-folder.
westeurasian_dir = base_dir / "westEurasia"
westeurasian_prefix = westeurasian_dir / "HumanOriginsPublic2068.westEurasian"
# Append to the string form: the prefix itself already contains a dot.
westeurasian_population_file = pathlib.Path(str(westeurasian_prefix) + ".populations.txt")

# Comma-separated population names of the west-Eurasian subset.
westeurasian_set = get_pop_set_from_string(
    "Abkhasian, Adygei, Albanian, Armenian, Ashkenazi_Jew, Balkar, Basque, "
    "BedouinA, BedouinB, Belarusian, Bergamo, Bulgarian, Canary_Islanders, "
    "Chechen, Croatian, Cypriot, Czech, Druze, English, Estonian, Finnish, "
    "French, French_South, Georgian, Georgian_Jew, Greek, Hungarian, "
    "Icelandic, Iranian, Iranian_Jew, Iraqi_Jew, Italian_South, Jordanian, "
    "Kumyk, LaBrana, Lebanese, Lezgin, Libyan_Jew, Lithuanian, Loschbour, "
    "Maltese, Mordovian, Moroccan_Jew, Motala12, Motala_merge, "
    "North_Ossetian, Norwegian, Orcadian, Palestinian, Russian, Sardinian, "
    "Saudi, Scottish, Sicilian, Spanish, Spanish_North, Stuttgart, Syrian, "
    "Tunisian_Jew, Turkish, Turkish_Jew, Tuscan, Ukrainian, Yemenite_Jew"
)
save_pop_set(westeurasian_set, westeurasian_population_file)

# Restrict the raw dataset to the populations listed in the file written above.
# NOTE(review): redo=False presumably reuses existing output -- confirm in
# filter_merge_utils.
filter_dataset(
    prefix_in=dataset_prefix,
    prefix_out=westeurasian_prefix,
    poplistname=westeurasian_population_file,
    redo=False,
)

# Write the config file used for the Pandora analysis of the west-Eurasian subset.
configfile = westeurasian_dir / "westEurasia.pandora.yaml"
save_pandora_config(
    westeurasian_prefix,
    westeurasian_dir / "results",
    smartpca_settings,
    configfile,
)

### Merging the west-eurasian samples with ancient samples

In [None]:
# Input prefix of the ancient samples and output prefix of the merged dataset.
ancient_prefix = base_dir / "raw" / "AncientLazaridis2016"
merged_prefix = base_dir / "westEurasia_ancient" / "AncientLazaridis2016_ModernWestEurasia"

# The ancient samples contain some samples we want to exclude prior to PCA
# (e.g. Chimp sequences). The unfiltered merge is only an intermediate result,
# so it is written to a temporary directory that is removed afterwards.
with tempfile.TemporaryDirectory() as tmpdir:
    tmpdir = pathlib.Path(tmpdir)
    tmp_merged_prefix = tmpdir / "merged"
    merge_datasets(
        prefix_ds1=westeurasian_prefix,
        prefix_ds2=ancient_prefix,
        prefix_out=tmp_merged_prefix,
        redo=False,
    )

    # Collect the population names present in the merged dataset.
    ind_file = pathlib.Path(f"{tmp_merged_prefix}.ind")
    ind_df = indfile_to_dataframe(ind_file)

    # Despite its file name, this list holds the populations to KEEP.
    keep_populations = tmpdir / "exclude.poplist.txt"
    exclude = ["Mota", "Denisovan", "Chimp", "Mbuti.DG", "Altai",
               "Vi_merge", "Clovis", "Kennewick", "Chuvash", "Ust_Ishim",
               "AG2", "MA1", "MezE", "hg19ref", "Kostenki14"]
    keep = [p for p in ind_df.population.unique() if p not in exclude]
    # write_text() closes the file before returning, so the list is guaranteed
    # to be on disk when filter_dataset reads it (the previous
    # open("w").write(...) never closed the handle explicitly).
    keep_populations.write_text("\n".join(keep))

    filter_dataset(
        prefix_in=tmp_merged_prefix,
        prefix_out=merged_prefix,
        poplistname=keep_populations,
        redo=False,
    )

# Finally, save the population names of the modern samples in a separate file
# so that it can later be used for the PCA projection.
ancient_populations = [
    "Anatolia_ChL", "Anatolia_N", "Armenia_ChL", "Armenia_EBA", "Armenia_MLBA",
    "CHG", "EHG", "Europe_EN", "Europe_LNBA", "Europe_MNChL", "Iberia_BA",
    "Iran_ChL", "Iran_HotuIIIb", "Iran_LN", "Iran_N", "Levant_BA", "Levant_N",
    "Natufian", "SHG", "Steppe_EMBA", "Steppe_Eneolithic", "Steppe_IA",
    "Steppe_MLBA", "Switzerland_HG", "WHG",
]

ind_file = pathlib.Path(f"{merged_prefix}.ind")
ind_df = indfile_to_dataframe(ind_file)

# Every population of the merged dataset that is not listed as ancient is
# treated as modern.
modern = [p for p in ind_df.population.unique() if p not in ancient_populations]
modern_populations = base_dir / "westEurasia_ancient" / "modern.poplist.txt"
# write_text() opens, writes and closes the file in one call; the previous
# open("w").write(...) left the handle to be closed by garbage collection.
modern_populations.write_text("\n".join(modern))

# Save a config file used for the Pandora analysis; the modern population list
# is passed on as `pca_populations`.
configfile = base_dir / "westEurasia_ancient" / "westEurasia_ancient.pandora.yaml"
save_pandora_config(
    merged_prefix,
    base_dir / "westEurasia_ancient" / "results",
    smartpca_settings,
    configfile,
    pca_populations=str(modern_populations),
)

# Result analysis
## West-eurasian set

In [1]:
from pandora.pca import *
from pandora.pca_comparison import PCAComparison

# Folder with the Pandora results of the west-Eurasian subset.
result_dir = pathlib.Path("NearEastPublic", "westEurasia", "results")

# Read the support values file, naming its two columns explicitly.
support_values = pd.read_table(
    result_dir / "pandora.supportValues.txt",
    names=["sample_id", "support"],
)
# support_values.describe()

In [9]:
# Load the PCA from the .evec/.eval file pair in the result folder.
evec_file = result_dir / "HumanOriginsPublic2068.westEurasian.evec"
eval_file = result_dir / "HumanOriginsPublic2068.westEurasian.eval"
empirical_pca = from_smartpca(evec_file, eval_file)

# Last expression of the cell: the notebook displays the explained variances.
empirical_pca.explained_variances

array([0.00910824, 0.0043691 , 0.00332266, 0.00307179, 0.00275337,
       0.00268761, 0.00264226, 0.00262669, 0.00261782, 0.00259369,
       0.0025751 , 0.00255008, 0.00250403, 0.00246429, 0.00240428,
       0.00237218, 0.0023042 , 0.00228075, 0.00227246, 0.00225154])

In [29]:
from plotly import graph_objects as go

# Read one eigenvalue per line. The context manager closes the file, and blank
# lines (e.g. a trailing newline) are skipped instead of crashing float().
eval_path = result_dir / "HumanOriginsPublic2068.westEurasian.eval"
with open(eval_path) as eval_handle:
    eigenvalues = [float(line) for line in eval_handle if line.strip()]

# Sum the eigenvalues once instead of once per element, then normalise.
total_variance = sum(eigenvalues)
explained_variances = np.asarray(eigenvalues) / total_variance
# Cumulative explained variance in percent.
explained_variances_cumsum = explained_variances.cumsum() * 100

# Plot the cumulative explained variance against the number of PCs (1-based).
fig = go.Figure(
    go.Scatter(
        x=list(range(1, len(explained_variances_cumsum) + 1)),
        y=explained_variances_cumsum,
    )
)
fig.update_yaxes(title="Cumulative sum of explained variance", ticksuffix="%")
fig.update_xaxes(title="Number of PCs")
# Horizontal reference line at 80 %.
fig.add_hline(y=80)
fig.update_layout(template="plotly_white", title="Cumulative Sum of explained variance for <br>west-eurasian subset of Human Origins Dataset")
# Written once; the original wrote the same file twice.
fig.write_image("/tmp/npcs.png")