# Initial main simulation analysis

- Performs the precursory analysis for the first set of main simulations

In [1]:

import os
from collections import OrderedDict
from random import sample
from tqdm import tqdm_notebook
import logging
import pandas as pd
from math import floor
import shutil
import pathlib
import sqlite3


In [2]:
# For dev use only - auto-reloading of modules
%load_ext autoreload
%aimport pycoalescence.coalescence_tree
from pycoalescence.coalescence_tree import check_sql_table_exist
from pycoalescence.helper import update_parameter_names
from pycoalescence import CoalescenceTree
from pycoalescence.sqlite_connection import check_sql_table_exist, SQLiteConnection
from compress_times import compress_times, double_backup, double_restore_backup, get_unique_times

%autoreload 1

In [3]:

intervals = {'artinskian',
 'asselian',
 'bashkirian',
 'gzhelian',
 'kasimovian',
 'kungurian',
 'moscovian',
 'sakmarian'}

tetrapod_groups = {'amniote', 'amphibian'}

In [4]:
def percent_cover_detection(fine_map_file):
	"""
	Detects the percentage cover from the fine map path
	:param fine_map_file: path to check for
	:return: the simulation percentage cover
	"""
	for pc in [0.1, 0.2, 0.4, 0.8]:
		if "_{}_".format(pc) in fine_map_file:
			return pc
	raise ValueError("No percentage cover detected!")

In [5]:
def sim_type_detection(fine_map_file):
	"""
	Detects the simulation type from the fine map path (because I didn't save it anywhere sensible!
	:param fine_map_file: the path to check for
	:return the sim type
	"""
	for interval in intervals:
		for tet_group in tetrapod_groups:
			if interval in fine_map_file and tet_group in fine_map_file:
				return (interval, tet_group)
	raise ValueError("No type detected! Filename: {}.".format(fine_map_file))

In [6]:
def sim_scenario_detection(file):
    """
    Detects the scenario from the file name
    :param the file name to check
    :return the scenario
    """
    for scenario in ["fragmented", "clustered", "pristine"]:
        if scenario in file:
            return scenario
    raise ValueError("Scenario not detected")

In [7]:

# Set the import directories and variables - paths relative to the jupyter notebook directory
# fragmented="fragmented"
fragmented = "all"  # put back if want to analyse normal sims
local_dir = "/home/sam/Documents/PhD/PaleoSampling"
ext_dir = "/run/media/sam/Media/Paleo"
# local_dir = "/Users/samthompson/Documents/PhD/PaleoSampling/"
# ext_dir = "/Volumes/Seagate 3TB/Paleo/"
results_dir = os.path.join(ext_dir, "Results", "PaleoMainOcc2b")
dst_folder = os.path.join(local_dir, "Results", "PaleoMainOcc2")
dst_folder2 = os.path.join(
    local_dir, "Code", "MainSimulationR", "results", "PaleoMainOcc2"
)
data_dir = os.path.join(ext_dir, "Data")
speciation_rates = [
    0.000_000_01,
    0.000_000_001,
    0.000_000_002,
    0.000_000_004,
    0.000_000_006,
    0.000_000_008,
    0.000_000_05,
    0.000_000_1,
    0.000_001,
]
sim_int = 2
directory_descriptions = [
    ("Main{}".format(sim_int), "pristine"),
    ("Fragmented{}".format(sim_int), "fragmented"),
    ("Clustered{}".format(sim_int), "clustered"),
]


In [8]:
# Extract the files into a single directory
logging.getLogger().setLevel(20)
for directory, description in directory_descriptions:
    parent_dir = pathlib.Path(results_dir, directory)
    if parent_dir.exists():
        for folder in parent_dir.iterdir():
            if folder.is_dir():
                for file in folder.iterdir():
                    if file.is_file():
                        try:
                            c = CoalescenceTree(file)
                            seed = c.get_simulation_parameters()["seed"]
                            job_type = c.get_simulation_parameters()["job_type"]
                        except IOError as ioe:
                            if description == "main":
                                raise IOError(f"File {file.name} is not a completed simulation: {description}, {job_type}, {seed}")
                            else:
                                logging.info(f"Deleting {file.name}...")
                                file.unlink()
                                continue
                        except Exception as e:
                            logging.info("Skipping {}: {}...".format(file, e))
                            file.unlink()
                            continue
                        dest = pathlib.Path(results_dir, "{}_{}_{}.db".format(description, job_type, seed))
                        shutil.move(file, dest)
                    else:
                        logging.info("Skipping {}...".format(file))
            else:
                logging.info("Skipping {} - not a directory...".format(folder))

INFO:root:Deleting data_18_23.db...
INFO:root:Deleting data_18_13.db...
INFO:root:Deleting data_18_4.db...
INFO:root:Deleting data_18_20.db...
INFO:root:Deleting data_18_16.db...
INFO:root:Deleting data_18_21.db...
INFO:root:Deleting data_18_5.db...
INFO:root:Deleting data_18_14.db...
INFO:root:Deleting data_18_10.db...
INFO:root:Deleting data_18_19.db...
INFO:root:Deleting data_18_2.db...
INFO:root:Deleting data_18_9.db...
INFO:root:Deleting data_18_18.db...
INFO:root:Deleting data_18_11.db...
INFO:root:Deleting data_8_5.db...
INFO:root:Deleting data_8_23.db...
INFO:root:Deleting data_8_19.db...
INFO:root:Deleting data_8_20.db...
INFO:root:Deleting data_10_19.db...
INFO:root:Deleting data_10_5.db...
INFO:root:Deleting data_12_23.db...
INFO:root:Deleting data_12_20.db...
INFO:root:Deleting data_12_9.db...
INFO:root:Deleting data_12_16.db...
INFO:root:Deleting data_12_22.db...
INFO:root:Deleting data_12_14.db...
INFO:root:Deleting data_12_10.db...
INFO:root:Deleting data_12_4.db...
INFO

In [10]:
for directory, _ in directory_descriptions:
    parent_dir = pathlib.Path(results_dir, directory)
    if parent_dir.exists():
        for folder in parent_dir.iterdir():
            if folder.is_dir():
                try:
                    folder.rmdir()
                except IOError as ioe:
                    logging.info("Cannot remove {}: {}".format(folder, ioe))
                    continue
        try:
            parent_dir.rmdir()
        except IOError as ioe:
            logging.info("Cannot remove {}: {}.".format(parent_dir, ioe))

In [11]:
# Delete all simulations which haven't completed yet.
for file in os.listdir(results_dir):
	if ".db" in file:
		try:
			t = CoalescenceTree(os.path.join(results_dir, file))
		except IOError:
			print("Removing incomplete simulation {}.".format(file))
			os.remove(os.path.join(results_dir, file))

In [12]:
max_density = None
# max_density = OrderedDict([(('artinskian', 'amniote'), 225),
#                            (('artinskian', 'amphibian'), 225),
#                            (('asselian', 'amniote'), 36),
#                            (('asselian', 'amphibian'), 36),
#                            (('bashkirian', 'amniote'), 1),
#                            (('bashkirian', 'amphibian'), 529),
#                            (('gzhelian', 'amniote'), 49),
#                            (('gzhelian', 'amphibian'), 49),
#                            (('kasimovian', 'amniote'), 49),
#                            (('kasimovian', 'amphibian'), 16),
#                            (('kungurian', 'amniote'), 225),
#                            (('kungurian', 'amphibian'), 289),
#                            (('moscovian', 'amniote'), 16),
#                            (('moscovian', 'amphibian'), 441),
#                            (('sakmarian', 'amniote'), 36),
#                            (('sakmarian', 'amphibian'), 225)])

In [13]:
# Calculate the biodiversity metrics -  can take a bit of time
for file in tqdm_notebook(os.listdir(results_dir), desc="Files"):
    file_path = os.path.join(results_dir, file)
    f, ext = os.path.splitext(file_path)
    if ".db" == ext:
#         print(file)
        t = CoalescenceTree(file_path)
        double_restore_backup(t)
        if check_sql_table_exist(t.database, "SPECIES_LIST_ORIGINAL"):
            t.revert_downsample()
        
        t.wipe_data()
        try:
            t._restore_backup_species_list()
        except sqlite3.OperationalError:
            pass
        sim_params = t.get_simulation_parameters()
        (interval, tet_group) = sim_type_detection(sim_params["sample_file"])
        deme = sim_params["deme"]
        sample_size = sim_params["sample_size"]
        fragment_csv = os.path.join(data_dir, "configs",
                                    "fragments_occ_{}_{}.csv".format(interval, tet_group))
        compress_times(t)
        double_backup(t)
        t.downsample_at_locations(fragment_csv=fragment_csv, ignore_errors=True)
#         downsample_rate = max_density[(interval, tet_group)] /( deme * sample_size)
#         try:
#             t.downsample(downsample_rate)
#         except ValueError:
#             logging.warning("Skipping file {} for {}, {} because downsample rate ({}) > 1.0".format(file, interval, tet_group, downsample_rate))
        t.set_speciation_parameters(record_spatial=True,
                                record_fragments=fragment_csv,
                                speciation_rates=speciation_rates)
        t.clear_calculations()
        t.apply()
        t.import_comparison_data(os.path.join(data_dir, "databases", "{}_{}_occ_sq.db".format(interval, tet_group)))
#         t.adjust_data()
        t._clear_goodness_of_fit()
        t.calculate_fragment_richness()
        # t.calculate_alpha_diversity()
        t.calculate_beta_diversity()
        # break
        t.calculate_goodness_of_fit()

HBox(children=(IntProgress(value=0, description='Files', max=1170, style=ProgressStyle(description_width='init…


	Lineage at 112 has not speciated and parent is 0. Integer overflow possible. Correcting by setting gens_alive to min value necessary for speciation.

	Lineage at 17 has not speciated and parent is 0. Integer overflow possible. Correcting by setting gens_alive to min value necessary for speciation.

	Lineage at 103 has not speciated and parent is 0. Integer overflow possible. Correcting by setting gens_alive to min value necessary for speciation.

	Lineage at 114 has not speciated and parent is 0. Integer overflow possible. Correcting by setting gens_alive to min value necessary for speciation.

	Lineage at 17 has not speciated and parent is 0. Integer overflow possible. Correcting by setting gens_alive to min value necessary for speciation.

	Lineage at 101 has not speciated and parent is 0. Integer overflow possible. Correcting by setting gens_alive to min value necessary for speciation.





In [14]:
# Sample from the simulations
tmp = []
distance_sim_tmp = []
fragment_abundances = []
for file in tqdm_notebook(os.listdir(results_dir), desc="Files"):
    file_path = os.path.join(results_dir, file)
    _, ext = os.path.splitext(file_path)
    if ".db" == ext:
        # print(file)
        t = CoalescenceTree(file_path)
        if check_sql_table_exist(t.database, "SPECIES_DISTANCE_SIMILARITY"):
            t.cursor.execute("DROP TABLE IF EXISTS SPECIES_DISTANCE_SIMILARITY")
        t.calculate_species_distance_similarity()
        if not check_sql_table_exist(t.database, "BIODIVERSITY_METRICS"):
            logging.warning("Skipping {} as BIODIVERSITY_METRICS table doesn't exist.".format(file))
            continue
        for sr in speciation_rates:
                ref = t.get_community_reference(speciation_rate=sr, time=0.0, fragments=True)
                spec_r = t.cursor.execute("SELECT actual FROM BIODIVERSITY_METRICS WHERE"
                                     " community_reference==? AND fragment=='whole' AND "
                                     "metric=='fragment_richness'", 
                                     (ref,)).fetchall()[0][0]
                b = t.cursor.execute("SELECT actual FROM BIODIVERSITY_METRICS WHERE"
                                     " community_reference==? AND fragment=='whole' AND "
                                     "metric=='beta_diversity'", 
                                     (ref,)).fetchall()[0][0]
                a = t.cursor.execute("SELECT actual FROM BIODIVERSITY_METRICS WHERE"
                                     " community_reference==? AND fragment=='whole' AND "
                                     "metric=='alpha_diversity'", 
                                     (ref,)).fetchall()[0][0]
                sim_params = t.get_simulation_parameters()
                species_richness = t.get_species_richness(ref)
                beta = t.get_beta_diversity(ref)
                alpha = t.get_alpha_diversity(ref)
                goodness_fit = t.get_goodness_of_fit(reference=ref)
                total_ind = t.get_number_individuals(community_reference=ref)
                (interval, tet_group) = sim_type_detection(sim_params["sample_file"])
                scenario = sim_scenario_detection(file)
                if scenario == "fragmented":
                    pc = percent_cover_detection(sim_params["fine_map_file"])
                else:
                    pc = 1.0
                tmp.append({"interval": interval, "tetrapod_group" : tet_group,
                            "sigma" : sim_params["sigma"], "speciation_rate" : sr,
                            "deme" : sim_params["deme"], "richness" : species_richness,
                            "beta_diversity" : beta, "alpha_diversity" : alpha,
                            "gof" : goodness_fit, "actual_richness" : spec_r, "actual_beta" : b,
                            "actual_alpha" : a, "percent_cover": pc, "simulated_individuals" : total_ind, "scenario" : scenario})
                # try:
                distance_sim = t.get_species_distance_similarity(ref)
                for distance, no_ind in distance_sim:
                    distance_sim_tmp.append({"interval": interval, "tetrapod_group" : tet_group,
                                             "sigma" : sim_params["sigma"], "speciation_rate" : sr,
                            "deme" : sim_params["deme"], "richness" : species_richness,
                            "beta_diversity" : beta, "alpha_diversity" : alpha,
                            "gof" : goodness_fit, "actual_richness" : spec_r, "actual_beta" : b,
                            "actual_alpha" : a, "distance" : distance, "no_individuals" : no_ind,"percent_cover": pc, "scenario" : scenario})
                for fragment in t.get_fragment_list(ref):
                    r = t.get_fragment_richness(fragment, ref)
                    fragment_abundances.append({"interval": interval, "tetrapod_group" : tet_group,
                                                "sigma" : sim_params["sigma"], "speciation_rate" : sr,
                                                "deme" : sim_params["deme"], "richness" : r,
                                                "fragment" : fragment,
                                                "percent_cover": pc, "scenario" : scenario})
				
df = pd.DataFrame(tmp)
df_distance_sim = pd.DataFrame(distance_sim_tmp)
df_fragment_abundances = pd.DataFrame(fragment_abundances)

HBox(children=(IntProgress(value=0, description='Files', max=1170, style=ProgressStyle(description_width='init…




In [15]:
# Save the output to csv
src_csv = os.path.join(results_dir, "results_{}_occ_sq.csv".format(fragmented))
df.to_csv(src_csv, index=False)
src_csv2 = os.path.join(results_dir, "results_distance_sim_{}_occ_sq.csv".format(fragmented))
df_distance_sim.to_csv(src_csv2)
src_csv3 = os.path.join(results_dir, "results_fragment_abundances_{}_occ_sq.csv".format(fragmented))
df_fragment_abundances.to_csv(src_csv3)

In [16]:
# Move the output csvs - change fragmented variable as appropriate

if not os.path.exists(dst_folder):
	os.makedirs(dst_folder)
dst_csv = os.path.join(dst_folder, "results_{}_occ_sq.csv".format(fragmented))
dst_csv2 = os.path.join(dst_folder, "results_distance_sim_{}_occ_sq.csv".format(fragmented))
dst_csv3 = os.path.join(dst_folder, "results_fragment_abundances_{}_occ_sq.csv".format(fragmented))
shutil.copy2(src_csv, dst_csv)
shutil.copy2(src_csv2, dst_csv2)
shutil.copy2(src_csv3, dst_csv3)

'/home/sam/Documents/PhD/PaleoSampling/Results/PaleoMainOcc2/results_fragment_abundances_all_occ_sq.csv'

In [17]:
# Move the output csvs - change fragmented variable as appropriate
if not os.path.exists(dst_folder2):
	os.makedirs(dst_folder2)
dst_csv = os.path.join(dst_folder2, "results_{}_occ_sq.csv".format(fragmented))
dst_csv2 = os.path.join(dst_folder2, "results_distance_sim_{}_occ_sq.csv".format(fragmented))
dst_csv3 = os.path.join(dst_folder2, "results_fragment_abundances_{}_occ_sq.csv".format(fragmented))
shutil.copy2(src_csv, dst_csv)
shutil.copy2(src_csv2, dst_csv2)
shutil.copy2(src_csv3, dst_csv3)

'/home/sam/Documents/PhD/PaleoSampling/Code/MainSimulationR/results/PaleoMainOcc2/results_fragment_abundances_all_occ_sq.csv'

## Verify that the data is sampling the correct number of individuals

In [117]:
fragment_df = pd.DataFrame(columns=["fragment", "xmin", "ymin", "xmax", "ymax", "total"])
for interval in intervals:
    for tet_group in tetrapod_groups:
        fragment_csv = pathlib.Path(data_dir, "configs",
                                            "fragments_occ_{}_{}.csv".format(interval, tet_group))
        tmp = pd.read_csv(fragment_csv, header=None, names=["fragment", "xmin", "ymin", "xmax", "ymax", "total"]).assign(interval=interval, tetrapod_group=tet_group)
        fragment_df = fragment_df.append(tmp, sort=False)
        
fragment_df = fragment_df.groupby(["interval", "tetrapod_group"])["total"].sum().reset_index()

In [118]:
for file in tqdm_notebook(os.listdir(results_dir), desc="Files"):
    file_path = os.path.join(results_dir, file)
    _, ext = os.path.splitext(file_path)
    if ".db" == ext:
        # print(file)
        t = CoalescenceTree(file_path)
        with SQLiteConnection(t.database) as c:
            max_n = c.execute("SELECT COUNT(*) FROM SPECIES_LIST_ORIGINAL_0 WHERE tip==1").fetchone()[0]
        sim_params = t.get_simulation_parameters()
        species_richness = t.get_species_richness(ref)
        beta = t.get_beta_diversity(ref)
        alpha = t.get_alpha_diversity(ref)
        scenario = sim_scenario_detection(file)
        goodness_fit = t.get_goodness_of_fit(reference=ref)
        total_ind = t.get_number_individuals(community_reference=ref)
        (interval, tet_group) = sim_type_detection(sim_params["sample_file"])
        n = t.get_number_individuals()
        expected_n = int(fragment_df[(fragment_df.interval == interval) & (fragment_df.tetrapod_group == tet_group)]["total"])
        if n != expected_n:
            logging.info(f"{n} != {expected_n} in {interval}, {tet_group} and max_ind = {max_n} in scenario {scenario}")
        

HBox(children=(IntProgress(value=0, description='Files', max=1112, style=ProgressStyle(description_width='init…

INFO:root:151 != 1026 in artinskian, amphibian and max_ind = 396 in scenario clustered
INFO:root:151 != 1026 in artinskian, amphibian and max_ind = 297 in scenario clustered
INFO:root:151 != 1026 in artinskian, amphibian and max_ind = 306 in scenario clustered
INFO:root:93 != 1026 in artinskian, amphibian and max_ind = 99 in scenario clustered
INFO:root:151 != 1026 in artinskian, amphibian and max_ind = 600 in scenario clustered
INFO:root:151 != 1026 in artinskian, amphibian and max_ind = 306 in scenario clustered
INFO:root:151 != 1026 in artinskian, amphibian and max_ind = 306 in scenario clustered
INFO:root:147 != 1026 in artinskian, amphibian and max_ind = 273 in scenario clustered
INFO:root:149 != 1026 in artinskian, amphibian and max_ind = 288 in scenario clustered
INFO:root:148 != 1026 in artinskian, amphibian and max_ind = 272 in scenario clustered
INFO:root:151 != 1026 in artinskian, amphibian and max_ind = 330 in scenario clustered
INFO:root:151 != 1026 in artinskian, amphibia




In [20]:
interval, tet_group

('artinskian', 'amphibian')

In [21]:
int(fragment_df[(fragment_df.interval == interval) & (fragment_df.tetrapod_group == tet_group)]["total"])

1026

In [18]:
df.head()

Unnamed: 0,actual_alpha,actual_beta,actual_richness,alpha_diversity,beta_diversity,deme,gof,interval,percent_cover,richness,scenario,sigma,simulated_individuals,speciation_rate,tetrapod_group
0,2.307692,32.933333,76.0,2.446154,64.591195,342.571843,0.692738,artinskian,0.8,158,fragmented,0.690367,728,1e-08,amniote
1,2.307692,32.933333,76.0,2.461538,30.46875,342.571843,0.927043,artinskian,0.8,75,fragmented,0.690367,728,1e-09,amniote
2,2.307692,32.933333,76.0,2.369231,40.519481,342.571843,0.855374,artinskian,0.8,96,fragmented,0.690367,728,2e-09,amniote
3,2.307692,32.933333,76.0,2.523077,44.390244,342.571843,0.80411,artinskian,0.8,112,fragmented,0.690367,728,3e-09,amniote
4,2.307692,32.933333,76.0,2.476923,50.869565,342.571843,0.75914,artinskian,0.8,126,fragmented,0.690367,728,4e-09,amniote


In [33]:
(df[df.tetrapod_group == "amphibian"][["deme", "interval", "scenario", "tetrapod_group"]]
 .groupby(["interval", "scenario", "tetrapod_group"]).size())

interval    scenario    tetrapod_group
artinskian  clustered   amphibian         240
            fragmented  amphibian         160
            pristine    amphibian          50
asselian    clustered   amphibian         240
            fragmented  amphibian         450
            pristine    amphibian         150
bashkirian  pristine    amphibian          80
gzhelian    clustered   amphibian         240
            fragmented  amphibian         580
            pristine    amphibian         200
kasimovian  fragmented  amphibian         710
            pristine    amphibian         230
kungurian   clustered   amphibian         240
            fragmented  amphibian         130
            pristine    amphibian          20
moscovian   clustered   amphibian         240
            pristine    amphibian         130
sakmarian   clustered   amphibian         240
            fragmented  amphibian         130
            pristine    amphibian          40
dtype: int64

In [41]:
tmp = []
distance_sim_tmp = []
fragment_abundances = []
for file in tqdm_notebook(os.listdir(results_dir), desc="Files"):
    file_path = os.path.join(results_dir, file)
    _, ext = os.path.splitext(file_path)
    if ".db" == ext:
        # print(file)
        t = CoalescenceTree(file_path)
#         if check_sql_table_exist(t.database, "SPECIES_DISTANCE_SIMILARITY"):
#             t.cursor.execute("DROP TABLE IF EXISTS SPECIES_DISTANCE_SIMILARITY")
#         t.calculate_species_distance_similarity()
#         if not check_sql_table_exist(t.database, "BIODIVERSITY_METRICS"):
#             logging.warning("Skipping {} as BIODIVERSITY_METRICS table doesn't exist.".format(file))
#             continue
        for sr in speciation_rates:
                sim_params = t.get_simulation_parameters()
#                 species_richness = t.get_species_richness(ref)
#                 beta = t.get_beta_diversity(ref)
#                 alpha = t.get_alpha_diversity(ref)
#                 goodness_fit = t.get_goodness_of_fit(reference=ref)
                total_ind = t.get_number_individuals(community_reference=ref)
                (interval, tet_group) = sim_type_detection(sim_params["sample_file"])
                scenario = None
                
                for i in ["fragmented", "clustered", "pristine"]:
                    if i in file:
                        scenario = i
                        break
                if scenario == "fragmented":
                    pc = percent_cover_detection(sim_params["fine_map_file"])
                else:
                    pc = 1.0
                tmp.append({"interval": interval, "tetrapod_group" : tet_group,
                            "sigma" : sim_params["sigma"], "speciation_rate" : sr,
                            "deme" : sim_params["deme"], "job_type" : sim_params["job_type"],
                            "seed" : sim_params["seed"],
                            "percent_cover": pc, "simulated_individuals" : total_ind, "scenario" : scenario})
#                 # try:
#                 distance_sim = t.get_species_distance_similarity(ref)
#                 for distance, no_ind in distance_sim:
#                     distance_sim_tmp.append({"interval": interval, "tetrapod_group" : tet_group,
#                                              "sigma" : sim_params["sigma"], "speciation_rate" : sr,
#                             "deme" : sim_params["deme"], "richness" : species_richness,
#                             "beta_diversity" : beta, "alpha_diversity" : alpha,
#                             "gof" : goodness_fit, "actual_richness" : spec_r, "actual_beta" : b,
#                             "actual_alpha" : a, "distance" : distance, "no_individuals" : no_ind,"percent_cover": pc, "scenario" : scenario})
#                 for fragment in t.get_fragment_list(ref):
#                     r = t.get_fragment_richness(fragment, ref)
#                     fragment_abundances.append({"interval": interval, "tetrapod_group" : tet_group,
#                                                 "sigma" : sim_params["sigma"], "speciation_rate" : sr,
#                                                 "deme" : sim_params["deme"], "richness" : r,
#                                                 "fragment" : fragment,
#                                                 "percent_cover": pc, "scenario" : scenario})
				
tmp_df = pd.DataFrame(tmp)

HBox(children=(IntProgress(value=0, description='Files', max=1112, style=ProgressStyle(description_width='init…




In [42]:
tmp_df[(tmp_df.interval == "artinksian") & (tmp_df.scenario == "pristine") & (tmp_df.tetrapod_group == "amphibian")]

Unnamed: 0,deme,interval,job_type,percent_cover,scenario,seed,sigma,simulated_individuals,speciation_rate,tetrapod_group


In [43]:
max_density = OrderedDict([(('artinskian', 'amniote'), 225),
                           (('artinskian', 'amphibian'), 225),
                           (('asselian', 'amniote'), 36),
                           (('asselian', 'amphibian'), 36),
                           (('bashkirian', 'amniote'), 1),
                           (('bashkirian', 'amphibian'), 529),
                           (('gzhelian', 'amniote'), 49),
                           (('gzhelian', 'amphibian'), 49),
                           (('kasimovian', 'amniote'), 49),
                           (('kasimovian', 'amphibian'), 16),
                           (('kungurian', 'amniote'), 225),
                           (('kungurian', 'amphibian'), 289),
                           (('moscovian', 'amniote'), 16),
                           (('moscovian', 'amphibian'), 441),
                           (('sakmarian', 'amniote'), 36),
                           (('sakmarian', 'amphibian'), 225)])

intervals = ["artinskian", "asselian", "bashkirian", "gzhelian", "kasimovian", "kungurian", "moscovian", "sakmarian"]
import lhsmdu

tetrapod_groups = ["amniote", "amphibian"]
def get_sim_parameters(job_num, save_dir, sim_type="pristine", param_list=None):
    """
	Gets the sim parameters, returned as a dictionary.

	:param job_num: the value passed in sys.argv
	:param save_dir: the directory to save the sim output to
	:return: dictionary containing the simulation parameters
	"""
    sim_list = []
    job_type = 1
    proportion_covers = [20, 40, 80] if sim_type == "fragmented" else [100]
    for proportion_cover in proportion_covers:
        for interval in intervals:
            for tetra_group in tetrapod_groups:
                job_type += 1
                for seed in range(1, 26, 1):
                    sim_list.append([interval, tetra_group, seed, job_type, proportion_cover])
    this_sim = sim_list[job_num]
    _, _, sigma, density_per_km = choose_sim_variables(this_sim[2], dimensions=2, param_list=param_list)
    density_per_km = 10 ** density_per_km
    return {
        "interval": this_sim[0],
        "tetrapod_group": this_sim[1],
        "sigma": sigma,
        "seed": this_sim[2],
        "job_type": this_sim[3],
        "density_per_km": density_per_km,
        "proportion_cover" : this_sim[4],
        "output_directory": os.path.join(save_dir, str(this_sim[3])),
    }

def choose_sim_variables(job_num, dimensions=2, param_list=None):
    """
	Choose the relevant simulation variables given the command-line argument
	:param job_num: the command-line argument for setting the task
	:return: a list containing the important dispersal parameters
	"""
    # job_type = int(job_num) % 5 # For latin hypercube sampling every job type is different
    job_type = job_num
    if param_list is None:
        if dimensions == 2:
            param_list = latin_two_dimensional_sampling(0.1, 20, 1.4, 3, 25)
        elif dimensions == 3:
            param_list = latin_three_dimensional_sampling(0.1, 8.0, 3.0, 10.0, 10, 6400, 100)
    if dimensions == 2:
        return [job_num, job_type, param_list[0, job_num], param_list[1, job_num]]
    elif dimensions == 3:
        return [job_num, job_type, param_list[0, job_num], param_list[1, job_num], param_list[2, job_num]]
    raise ValueError("Dimensions must be 2 or 3 currently.")
    

def latin_two_dimensional_sampling(dim_1_min, dim_1_max, dim_2_min, dim_2_max, number):
    """
	Samples efficiently and completely from the parameter space specified. Returns a list of pairs of parameters which
	simulations should run with.

	:param dim_1_min: the minimum value of the first dimension
	:param dim_1_max: the maximum value of the first dimension
	:param dim_2_min: the minimum value of the second dimension
	:param dim_2_max: the maximum value of the second dimension
	:param number: the number of samples to draw in each dimension

	:return: a list of parameter pairs generated using the lhsmdu module.
	:rtype list
	"""
    samples = lhsmdu.sample(2, number, randomSeed=1001)
    assert dim_1_min < dim_1_max
    assert dim_2_min < dim_2_max
    samples[0] = (dim_1_max - dim_1_min) * samples[0] + dim_1_min
    samples[1] = (dim_2_max - dim_2_min) * samples[1] + dim_2_min
    return samples

In [93]:
params = []
param_list = latin_two_dimensional_sampling(0.1, 20, 1.4, 3, 25)
for i in range(1, 1400):
    
    try:
        p = get_sim_parameters(i, "none", param_list = param_list, sim_type="fragmented")
        p["pbs"] = i
        params.append(p)
    except:
        continue
tmp_df = pd.DataFrame(params)

In [94]:
tmp_df["filename"] = tmp_df.apply(lambda x: "pristine_{}_{}.db".format(x["job_type"], x["seed"]), axis=1)
tmp_df

Unnamed: 0,density_per_km,interval,job_type,output_directory,pbs,proportion_cover,seed,sigma,tetrapod_group,filename
0,121.998706,artinskian,2,none/2,1,20,2,16.119937,amniote,pristine_2_2.db
1,89.141170,artinskian,2,none/2,2,20,3,2.766267,amniote,pristine_2_3.db
2,476.047838,artinskian,2,none/2,3,20,4,8.508750,amniote,pristine_2_4.db
3,944.345234,artinskian,2,none/2,4,20,5,18.225811,amniote,pristine_2_5.db
4,134.240159,artinskian,2,none/2,5,20,6,19.340737,amniote,pristine_2_6.db
5,26.429407,artinskian,2,none/2,6,20,7,16.824312,amniote,pristine_2_7.db
6,342.571843,artinskian,2,none/2,7,20,8,0.690367,amniote,pristine_2_8.db
7,258.848859,artinskian,2,none/2,8,20,9,7.681397,amniote,pristine_2_9.db
8,53.290374,artinskian,2,none/2,9,20,10,6.710675,amniote,pristine_2_10.db
9,706.745869,artinskian,2,none/2,10,20,11,1.337784,amniote,pristine_2_11.db


In [95]:
tmp_df["exists"] = tmp_df.apply(lambda x : pathlib.Path(results_dir, x["filename"]).exists(), axis=1)

In [96]:
tmp_df[tmp_df.exists == False]

Unnamed: 0,density_per_km,interval,job_type,output_directory,pbs,proportion_cover,seed,sigma,tetrapod_group,filename,exists
95,45.863370,bashkirian,6,none/6,100,20,1,2.131084,amniote,pristine_6_1.db,False
97,89.141170,bashkirian,6,none/6,102,20,3,2.766267,amniote,pristine_6_3.db,False
98,476.047838,bashkirian,6,none/6,103,20,4,8.508750,amniote,pristine_6_4.db,False
99,944.345234,bashkirian,6,none/6,104,20,5,18.225811,amniote,pristine_6_5.db,False
100,134.240159,bashkirian,6,none/6,105,20,6,19.340737,amniote,pristine_6_6.db,False
101,26.429407,bashkirian,6,none/6,106,20,7,16.824312,amniote,pristine_6_7.db,False
102,342.571843,bashkirian,6,none/6,107,20,8,0.690367,amniote,pristine_6_8.db,False
103,258.848859,bashkirian,6,none/6,108,20,9,7.681397,amniote,pristine_6_9.db,False
104,53.290374,bashkirian,6,none/6,109,20,10,6.710675,amniote,pristine_6_10.db,False
105,706.745869,bashkirian,6,none/6,110,20,11,1.337784,amniote,pristine_6_11.db,False


In [97]:
tmp_df.query("interval == 'sakmarian' and tetrapod_group == 'amphibian'")

Unnamed: 0,density_per_km,interval,job_type,output_directory,pbs,proportion_cover,seed,sigma,tetrapod_group,filename,exists
359,45.863370,sakmarian,17,none/17,375,20,1,2.131084,amphibian,pristine_17_1.db,True
360,121.998706,sakmarian,17,none/17,376,20,2,16.119937,amphibian,pristine_17_2.db,True
361,89.141170,sakmarian,17,none/17,377,20,3,2.766267,amphibian,pristine_17_3.db,True
362,476.047838,sakmarian,17,none/17,378,20,4,8.508750,amphibian,pristine_17_4.db,True
363,944.345234,sakmarian,17,none/17,379,20,5,18.225811,amphibian,pristine_17_5.db,True
364,134.240159,sakmarian,17,none/17,380,20,6,19.340737,amphibian,pristine_17_6.db,True
365,26.429407,sakmarian,17,none/17,381,20,7,16.824312,amphibian,pristine_17_7.db,True
366,342.571843,sakmarian,17,none/17,382,20,8,0.690367,amphibian,pristine_17_8.db,True
367,258.848859,sakmarian,17,none/17,383,20,9,7.681397,amphibian,pristine_17_9.db,True
368,53.290374,sakmarian,17,none/17,384,20,10,6.710675,amphibian,pristine_17_10.db,True


In [89]:
tmp_output = []
for file in tqdm_notebook(os.listdir(results_dir), desc="Files"):
    file_path = os.path.join(results_dir, file)
    _, ext = os.path.splitext(file_path)
    if ".db" == ext:
        # print(file)
        t = CoalescenceTree(file_path)
        sim_params = t.get_simulation_parameters()
        (interval, tet_group) = sim_type_detection(sim_params["sample_file"])
        scenario = sim_scenario_detection(file)
        if scenario == "clustered": 
            tmp_output.append({"interval" : interval, "tetrapod_group": tet_group, "file" : pathlib.Path(file).name, "seed" : sim_params["seed"],
                              "job_type": sim_params["job_type"]})

HBox(children=(IntProgress(value=0, description='Files', max=1112, style=ProgressStyle(description_width='init…




In [91]:
pd.DataFrame(tmp_output).groupby(["interval", "tetrapod_group"]).agg(["count"])

Unnamed: 0_level_0,Unnamed: 1_level_0,file,job_type,seed
Unnamed: 0_level_1,Unnamed: 1_level_1,count,count,count
interval,tetrapod_group,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
artinskian,amniote,24,24,24
artinskian,amphibian,24,24,24
asselian,amphibian,24,24,24
gzhelian,amniote,24,24,24
gzhelian,amphibian,24,24,24
kasimovian,amniote,15,15,15
kungurian,amniote,24,24,24
kungurian,amphibian,24,24,24
moscovian,amniote,3,3,3
moscovian,amphibian,24,24,24
