# Southeast European Sheep 

Ciani, E., Mastrangelo, S., Da Silva, A. et al. On the origin of European sheep as revealed by the diversity of the Balkan breeds and by optimizing population-genetic analysis tools. Genet Sel Evol 52, 25 (2020). https://doi.org/10.1186/s12711-020-00545-7

In [126]:
# ! mkdir southeast_european_sheep && cd southeast_european_sheep && wget https://figshare.com/ndownloader/articles/8947346/versions/1
# ! cd southeast_european_sheep && unzip 1 && rm 1 

In [124]:
import pathlib
import shutil

import numpy as np
import pandas as pd

from pandora.converter import run_convertf
from pandora.custom_types import FileFormat
from pandora.dataset import numpy_dataset_from_eigenfiles


# Directory the downloaded dataset archive is unpacked into
BASE_DIR = pathlib.Path("southeast_european_sheep")

# Prefix of the LD pruned dataset (the publication uses the LD pruned data as well)
FULL_DATASET_PREFIX = BASE_DIR.joinpath("OaSNP1477x21960-1807")
# Prefix used for the files of the down-sampled dataset
SAMPLED_DATASET_PREFIX = BASE_DIR.joinpath("sheep")

In [18]:
# Convert the dataset from PACKEDPED to EIGENSTRAT format. Input and output share the same
# file prefix, so the converted files end up next to the original ones.
run_convertf(
    in_prefix=FULL_DATASET_PREFIX,
    in_format=FileFormat.PACKEDPED,
    out_prefix=FULL_DATASET_PREFIX,
    out_format=FileFormat.EIGENSTRAT,
    convertf="convertf"
)

# The convertf run fails to retrieve the correct populations, so we manually fix the populations in the .ind file
fam_file = FULL_DATASET_PREFIX.with_suffix(".fam")
ind_file = FULL_DATASET_PREFIX.with_suffix(".ind")

# convert the .fam data to a pandas dataframe
fam_data = {
    "sample_id": [],
    "population": []
}
# use a context manager so the file handle is closed deterministically
with fam_file.open() as f:
    for fam in f:
        # OldNorwegianSpael NSO234 0 0 1 -9
        population, sample_id, *_ = fam.split()
        fam_data["population"].append(population)
        fam_data["sample_id"].append(sample_id)

fam_data = pd.DataFrame(fam_data)

# convert the .ind data to a pandas dataframe
ind_data = {
    "sample_id": [],
    "sex": []
}
# the .ind file is rewritten below, so make sure the read handle is closed before that
with ind_file.open() as f:
    for ind in f:
        # FIN76 U  ???
        sample_id, sex, _ = ind.split()
        ind_data["sample_id"].append(sample_id)
        ind_data["sex"].append(sex)
ind_data = pd.DataFrame(ind_data)

# attach the sex information to each (sample ID, population) pair of the .fam file;
# the result has the columns sample_id, population, sex
ind_data = fam_data.merge(ind_data, on="sample_id")

# overwrite the .ind file: one tab-separated line per sample with ID, sex, and population
with ind_file.open("w") as f:
    for row in ind_data.itertuples(index=False):
        f.write(f"{row.sample_id}\t{row.sex}\t{row.population}\n")

In [28]:
# In their publication, Ciani et al. only use up to 6 individuals per population for the PCA analyses.
# Since we don't know which individuals they used, we randomly sample 6 individuals for each population.
# As the basis for this sampling, load the full dataset from the files stored under FULL_DATASET_PREFIX.
full_dataset = numpy_dataset_from_eigenfiles(FULL_DATASET_PREFIX)

(1477,) (1477,) (1477, 21960)


In [120]:
# STEP 1: SAMPLE THE IDs WE WANT TO USE: max 6 samples per population
samples = full_dataset.sample_ids
samples.name = "sample_id"
populations = full_dataset.populations
populations.name = "population"
# concat the sample IDs and the populations into one dataframe with one row per sample
samples_and_populations = pd.concat([samples, populations], axis=1)
# count the samples per population and find all populations with less than or exactly 6 samples
# as we don't need to sample from them
population_sizes = samples_and_populations.groupby("population").sample_id.count()
small_populations = population_sizes[population_sizes <= 6].index
is_small = samples_and_populations.population.isin(small_populations)
# keep the actual (sample ID, population) rows of the small populations
# NOTE: the aggregated groupby counts must not be used for this, as their sample_id column holds
# the group sizes instead of the sample IDs and all samples of small populations would be lost
s_and_p_no_sampling = samples_and_populations.loc[is_small]
# all remaining samples belong to populations we do need to sample from
s_and_p_to_sample = samples_and_populations.loc[~is_small]
# draw six random samples, set the seed to obtain the same results each time we run this notebook
sampled = s_and_p_to_sample.groupby("population").sample(n=6, random_state=42)
# finally collect all samples we want to use for the PCA analyses
sampled_samples = pd.concat([sampled, s_and_p_no_sampling])


# STEP 2: FILTER THE GENOTYPE DATA
# build the ID lookup once: a set gives constant-time membership tests instead of scanning
# the array of selected IDs for every sample of the full dataset
sampled_sample_ids = set(sampled_samples.sample_id)
# keep the genotype rows of the selected samples, preserving the order of the full dataset
sampled_genotypes = [
    genotype
    for sample_id, genotype in zip(full_dataset.sample_ids, full_dataset.input_data)
    if sample_id in sampled_sample_ids
]

# STEP 3: WRITE THE DATA IN EIGENFORMAT TO FILE
# we only modified the .ind and .geno data, so we can simply copy the .snp file
shutil.copy(FULL_DATASET_PREFIX.with_suffix(".snp"), SAMPLED_DATASET_PREFIX.with_suffix(".snp"))

# write the .ind file
# for this we also need the sex information, so we base this on the FULL_DATASET ind file
# build the ID lookup once instead of scanning the array of selected IDs for every line
ids_to_keep = set(sampled_samples.sample_id)
with FULL_DATASET_PREFIX.with_suffix(".ind").open() as full_ind, SAMPLED_DATASET_PREFIX.with_suffix(".ind").open("w") as sampled_ind:
    for ind in full_ind:
        # the sample ID is the first whitespace-separated field of each line
        sid, _ = ind.split(maxsplit=1)
        if sid in ids_to_keep:
            # copy the line unchanged, this keeps the order of the full .ind file
            sampled_ind.write(ind)

# write the .geno file
# transpose the collected per-sample rows so that each written line holds one value per selected sample
sampled_genotypes = np.asarray(sampled_genotypes).T
# represent the missing value correctly in EIGEN format as a 9
sampled_genotypes[sampled_genotypes == full_dataset._missing_value] = 9
# collects the distinct line lengths of the written file
# (presumably as a sanity check that all lines are equally long)
lengths = set()
with SAMPLED_DATASET_PREFIX.with_suffix(".geno").open("w") as f:
    for line in sampled_genotypes:
        # concatenate the values of this line without any separator
        str_to_write = "".join(map(str, line)).strip()
        lengths.add(len(str_to_write))
        f.write(f"{str_to_write}\n")