In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import copy

In [None]:
# Notebook display tuning: show up to 500 characters per cell, at most 7 columns.
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = 7

In [None]:
# Load the per-species metadata table from the working directory; later cells
# read its 'species', 'total_len', 'A' and 'C' columns.
df_meta = pd.read_csv('metadata_plants.csv')

In [None]:
# Test genomes: these species are removed so that only the remaining ones are
# assigned to the splits built below.
test_species = ['Alyrata', 'Csativus', 'Macuminata', 'Ppatens', 'Taestivum', 'Vcarteri']

# reset_index gives a clean 0..n-1 index on the filtered frame.
df_plants = df_meta[~df_meta['species'].isin(test_species)].reset_index(drop=True)

# Preview of the unfiltered metadata. A notebook only renders the last
# expression of a cell, so this has to come after the assignment above.
df_meta.head()



In [None]:
# Random split labels: six split ids, nine copies of each, shuffled in place.
# The fixed seed keeps the permutation reproducible across kernel restarts.
np.random.seed(11345)
x = np.arange(6).repeat(9)
np.random.shuffle(x)
x

In [None]:
# Attach the shuffled labels as the random split; x must have one entry per row.
df_plants["random_split"] = x

In [None]:
# Placeholder value; -1 marks rows that have not been given a length split yet.
df_plants["length_split"] = -1

In [None]:
# Quick look at the first rows, including the split columns added so far.
df_plants.head(n=5)

In [None]:
# Length-stratified split: rank the rows by 'total_len' (ascending) and give
# each consecutive group of 9 the next split id, so split 0 holds the smallest
# total_len values and split 5 the largest.
#
# np.argsort yields *positions*, so the labels are scattered positionally
# instead of being written through label-based .loc (which is only correct
# while the index is exactly 0..n-1). The array assignment also raises if the
# number of rows is not 6 * 9, rather than silently leaving rows at -1 the way
# zip() truncation would.
length_order = np.argsort(df_plants["total_len"].to_numpy())
length_split = np.empty(len(df_plants), dtype=np.int64)
length_split[length_order] = np.repeat(range(6), 9)
df_plants["length_split"] = length_split

In [None]:
# Placeholder value; -1 marks rows that have not been given a GC split yet.
df_plants["gc_split"] = -1

In [None]:
# Per-row ratio C / (A + C), the value the gc_split ranking is based on.
# NOTE(review): presumably 'A' and 'C' are base counts, making this a
# GC-content proxy - confirm against the metadata schema.
x2 = df_plants["C"] / (df_plants["A"] + df_plants["C"])

In [None]:
# GC-stratified split: rank the rows by x2 (ascending) and give each
# consecutive group of 9 the next split id, so split 0 holds the lowest ratios
# and split 5 the highest.
#
# np.argsort yields *positions*, so the labels are scattered positionally
# instead of being written through label-based .loc (which is only correct
# while the index is exactly 0..n-1). The array assignment also raises if the
# number of rows is not 6 * 9, rather than silently leaving rows at -1 the way
# zip() truncation would.
gc_order = np.argsort(np.asarray(x2))
gc_split = np.empty(len(df_plants), dtype=np.int64)
gc_split[gc_order] = np.repeat(range(6), 9)
df_plants["gc_split"] = gc_split

In [None]:
# Spot check: which species landed in random split 1.
df_plants.loc[df_plants["random_split"].eq(1), "species"]

In [None]:
# Spot check: which species landed in length split 1.
df_plants.loc[df_plants["length_split"].eq(1), "species"]

In [None]:
# and now for the phylogenetic split
remaining = list(df_plants['species'])
green_algae = ['Dsalina', 'Creinhardtii', 'Czofingiensis', 'MpusillaCCMP1545', 'MpusillaRCC299',
               'Olucimarinus', 'CsubellipsoideaC169']
remaining = [x for x in remaining if x not in green_algae]
monocots = ['Hvulgare', 'Bdistachyon', 'Osativa', 'Sbicolor', 'Zmays', 'Sitalica', 'Othomaeum', 'Acomosus',
            'Aofficinalis', 'Zmarina', 'Spolyrhiza']
remaining = [x for x in remaining if x not in monocots]

asterids = ['Hannuus', 'Lsativa', 'Dcarota', 'Mguttatus', 'Oeuropaea', 'Stuberosum',
            'Slycopersicum']
remaining = [x for x in remaining if x not in asterids]
fabids = ['Mesculenta', 'Rcommunis', 'Lusitatissimum', 'Ptrichocarpa', 'Mdomestica', 'Ppersica',
          'Fvesca', 'Mtruncatula', 'Carietinum', 'Gmax']
remaining = [x for x in remaining if x not in fabids]
malvids = ['Athaliana', 'Crubella', 'Cgrandiflora', 'Esalsugineum', 'Cpapaya', 'Graimondii', 'Csinensis',
           'Cclementina', 'Egrandis', 'Tcacao', 'Boleraceacapitata']
remaining = [x for x in remaining if x not in malvids]

remaining
df_plants.loc[:, 'phylo_split'] = 5

In [None]:
# Label each clade with its split id. The tuple order fixes the ids
# (green algae = 0 ... malvids = 4); rows matching no clade keep their value.
clades_in_order = (green_algae, monocots, asterids, fabids, malvids)
for split_id, members in enumerate(clades_in_order):
    in_clade = df_plants["species"].isin(members)
    df_plants.loc[in_clade, 'phylo_split'] = split_id




In [None]:
def mk_bash_denbi(outdir, species):
    """Render the bash snippet that assembles one split's training set.

    The generated script creates ``outdir``, calls ``merge-files.py`` on the
    ``test_data.h5`` file of every given species (written relative to the
    ``$pfx`` shell variable) to produce ``$outdir/training_data.h5``, and
    symlinks the shared validation file into ``outdir``.

    Parameters
    ----------
    outdir : str
        Directory name substituted into the ``outdir=`` line of the script.
    species : iterable of str
        Species names; each contributes one ``$pfx/<name>/test_data.h5`` path.

    Returns
    -------
    str
        The script text. Nothing is executed or written here.
    """
    # Space-separated list of per-species input files, in the order given.
    h5_paths = ' '.join("$pfx/{}/test_data.h5".format(name) for name in species)

    script = """
pfx=/mnt/share/ubuntu/data/plants/single_genomes/
outdir={}

mkdir -p $outdir
python /mnt/share/ubuntu/repos/github/weberlab-hhu/helixer_scratch/data_scripts/merge-files.py \\
        --input-files {} \\
        --output-file $outdir/training_data.h5
cd $outdir
ln -s ../../eight_genomes_nosplit_phase/validation_data.h5
cd ..
"""
    return script.format(outdir, h5_paths)

In [None]:
# Emit the merge script for the species in random split 5.
print(mk_bash_denbi('random5', df_plants.loc[df_plants["random_split"].eq(5), "species"]))

In [None]:
# Emit the merge script for the species in length split 5.
print(mk_bash_denbi('length5', df_plants.loc[df_plants["length_split"].eq(5), "species"]))

In [None]:
# Emit the merge script for the species in phylogenetic split 5 (the catch-all group).
print(mk_bash_denbi('phylo5', df_plants.loc[df_plants["phylo_split"].eq(5), "species"]))