# Sudanese Goats
Rahmatalla, S.A., Arends, D., Reissmann, M. et al. Whole genome population genetics analysis of Sudanese goats identifies regions harboring genes associated with major traits. BMC Genet 18, 92 (2017). https://doi.org/10.1186/s12863-017-0553-z

In [46]:
import pathlib
import pickle
import numpy as np
import pandas as pd
import tqdm.contrib.concurrent

# Working directory for every input and output of this notebook; created on
# first run so the cached pickles and converted files have somewhere to go.
GOAT_DATA_DIR = pathlib.Path("sudanese_goats")
GOAT_DATA_DIR.mkdir(exist_ok=True)

# Pickle caches written by _get_data_from_csv() and read back by get_data().
SAMPLE_ID_FILE = GOAT_DATA_DIR.joinpath("sample_ids.pickle")
POPULATION_FILE = GOAT_DATA_DIR.joinpath("populations.pickle")
GENO_FILE = GOAT_DATA_DIR.joinpath("geno.pickle")

# Raw semicolon-delimited CSV inputs (genotype calls, SNP info, individuals).
GENO_CSV = GOAT_DATA_DIR.joinpath("goats.geno.csv")
SNP_CSV = GOAT_DATA_DIR.joinpath("goats.snp.csv")
IND_CSV = GOAT_DATA_DIR.joinpath("goats.ind.csv")

# Output files produced by data_to_eigen().
EIG_GENO = GOAT_DATA_DIR.joinpath("goats.geno")
EIG_SNP = GOAT_DATA_DIR.joinpath("goats.snp")
EIG_IND = GOAT_DATA_DIR.joinpath("goats.ind")

In [47]:
def _get_data_from_csv():
    """Build the genotype matrix from the raw CSV files and cache it on disk.

    Reads the genotype, SNP and individual CSVs (semicolon-delimited, first
    column as index), encodes every two-character genotype call as a count of
    the SNP's reference allele, and writes three pickle caches
    (``SAMPLE_ID_FILE``, ``POPULATION_FILE``, ``GENO_FILE``).

    Returns:
        A tuple ``(genotype_data, sample_ids, populations)`` where
        ``genotype_data`` is the transposed call matrix (one row per genotype
        CSV column, i.e. per sample), ``sample_ids`` is the genotype CSV's
        column labels as a Series, and ``populations`` is the ``Breed`` of
        each sample looked up in the individuals CSV.
    """
    def _convert_row(args):
        """Encode one genotype-CSV row (one SNP) as reference-allele counts."""
        idx, row = args
        reference_allel = snps.loc[idx]["Reference Allele"]
        snp_calls = []
        for vals in row.values:
            if isinstance(vals, float):
                # Non-string cells are treated as missing calls
                # (presumably NaN produced by pandas for an empty CSV cell).
                snp_calls.append(np.nan)
                continue
            c1, c2 = list(vals)
            if c1 != c2:
                # one copy of the reference allel
                snp_calls.append(1)
                continue
            try:
                is_reference = bool(c1 == reference_allel)
            except Exception:
                # Best-effort diagnostics kept from the original code; narrowed
                # from a bare ``except`` so KeyboardInterrupt/SystemExit still
                # propagate.
                # NOTE(review): nothing is appended for this call, so the row
                # ends up shorter than its neighbours - confirm whether such
                # SNPs should instead be rejected or recorded as missing.
                print(c1, c2, reference_allel)
                continue
            # two copies of the reference allel -> 2, zero copies -> 0
            snp_calls.append(2 if is_reference else 0)
        return snp_calls

    geno = pd.read_csv(GENO_CSV, delimiter=";", index_col=0)
    snps = pd.read_csv(SNP_CSV, delimiter=";", index_col=0)
    # SNPs without a known reference allele cannot be encoded.
    snps = snps.dropna(subset="Reference Allele")
    inds = pd.read_csv(IND_CSV, delimiter=";", index_col=0)

    sample_ids = geno.columns.to_series()
    sample_ids.to_pickle(SAMPLE_ID_FILE)
    populations = pd.Series([inds.loc[s].Breed for s in sample_ids])
    populations.to_pickle(POPULATION_FILE)

    genotype_data = tqdm.contrib.concurrent.thread_map(_convert_row, geno.iterrows(), total=geno.shape[0])
    # Drop rows for which no call at all was produced.
    genotype_data = [g for g in genotype_data if g]
    # Transpose so that samples become rows and SNPs become columns.
    genotype_data = np.asarray(genotype_data).T
    # Context manager guarantees the cache is flushed and closed before return.
    with GENO_FILE.open("wb") as f:
        pickle.dump(genotype_data, f)
    return genotype_data, sample_ids, populations

def get_data(redo=False):
    """Return the genotype matrix, sample ids and populations.

    Uses the pickle caches when all three exist; otherwise (or when ``redo``
    is true) rebuilds everything from the CSV files via
    ``_get_data_from_csv()``.

    Args:
        redo: Force regeneration from the CSV files even if caches exist.

    Returns:
        A tuple ``(genotype_data, sample_ids, populations)``.
    """
    files_exist = SAMPLE_ID_FILE.exists() and POPULATION_FILE.exists() and GENO_FILE.exists()
    if redo or not files_exist:
        return _get_data_from_csv()

    def _load(path):
        """Unpickle ``path``, closing the file handle afterwards."""
        with path.open("rb") as f:
            # NOTE: pickle must only be used on trusted files; these caches
            # are the ones written locally by _get_data_from_csv().
            return pickle.load(f)

    genotype_data = _load(GENO_FILE)
    sample_ids = _load(SAMPLE_ID_FILE)
    populations = _load(POPULATION_FILE)
    return genotype_data, sample_ids, populations

def _geno_to_eigen():
    """Write the genotype matrix to ``EIG_GENO``, one text line per SNP.

    Each line is the concatenation of the integer calls of all samples for
    that SNP; NaN (missing) calls are written as ``9``.
    """
    genotype_data, _, _ = get_data()
    # get_data() returns samples as rows; the output wants one line per SNP.
    per_snp = genotype_data.T
    encoded = np.where(np.isnan(per_snp), 9, per_snp).astype(int)

    with EIG_GENO.open("w") as out:
        out.writelines("".join(map(str, snp_row)) + "\n" for snp_row in encoded)
            

def _ind_to_eigen():
    """Write one tab-separated ``<sample id> F <breed>`` line per individual.

    Sample ids come from the index of the individuals CSV with spaces replaced
    by underscores; the sex column is always ``F``.
    """
    individuals = pd.read_csv(IND_CSV, delimiter=";", index_col=0)

    def _ind_line(label, record):
        sample_id = label.replace(" ", "_")
        # female goat according to the publication
        return f"{sample_id}\tF\t{record.Breed}\n"

    # NOTE(review): lines follow the row order of the individuals CSV, which
    # presumably matches the column order of the genotype CSV - confirm.
    with EIG_IND.open("w") as out:
        out.writelines(_ind_line(label, record) for label, record in individuals.iterrows())
        
def _snp_to_eigen():
    """Write the SNP description file ``EIG_SNP``.

    One tab-separated line per SNP that has a reference allele:
    rsID, chromosome, the constant ``0.0``, integer position, reference
    allele and the other allele listed in the ``SNP alleles`` column.
    """
    snp_table = pd.read_csv(SNP_CSV, delimiter=";", index_col=0)
    snp_table = snp_table.dropna(subset="Reference Allele")

    def _snp_line(row):
        # The "X" chromosome label is written as the number 30.
        chrom = 30 if row.Chromosome == "X" else row.Chromosome
        ref = row["Reference Allele"]
        first, second = row["SNP alleles"].split("/")
        # The alternative allele is whichever listed allele is not the reference.
        alt = second if first == ref else first
        return f"{row.rsID}\t{chrom}\t0.0\t{int(row.Position)}\t{ref}\t{alt}\n"

    with EIG_SNP.open("w") as out:
        out.writelines(_snp_line(row) for _, row in snp_table.iterrows())
            
            
def data_to_eigen(redo=False):
    """Create the ``EIG_GENO``, ``EIG_IND`` and ``EIG_SNP`` files if needed.

    Nothing is written when all three files already exist, unless ``redo`` is
    true, in which case all three are regenerated.
    """
    targets = (EIG_SNP, EIG_IND, EIG_GENO)
    if not redo and all(target.exists() for target in targets):
        return

    for write in (_geno_to_eigen, _ind_to_eigen, _snp_to_eigen):
        write()

In [48]:
# Generate the converted .geno/.ind/.snp files; with the default redo=False
# this is a no-op when all three already exist.
data_to_eigen()