# Fragmented simulation analysis

- Performs the preliminary analysis for the first set of main simulations

In [1]:
import os
from collections import OrderedDict
from tqdm import tqdm_notebook
from math import floor
import pandas as pd
import shutil
import sqlite3
from pycoalescence import CoalescenceTree

In [2]:
# For dev use only - auto-reloading of modules
%load_ext autoreload
%aimport pycoalescence.coalescence_tree
from pycoalescence.coalescence_tree import check_sql_table_exist
from pycoalescence.helper import update_parameter_names
%autoreload 1

In [3]:

# Interval names that sim_type_detection searches for inside file paths.
intervals = {
    "artinskian",
    "asselian",
    "bashkirian",
    "gzhelian",
    "kasimovian",
    "kungurian",
    "moscovian",
    "sakmarian",
}

# Tetrapod group names, matched against file paths in the same way.
tetrapod_groups = {"amniote", "amphibian"}

In [4]:
def percent_cover_detection(fine_map_file):
	"""
	Works out the percentage cover of a simulation from the path of its fine map.

	The candidate values 0.1, 0.2 and 0.5 are tried in that order; a value counts as present
	when it occurs in the path wrapped in underscores (for example "_0.2_").

	:param fine_map_file: the fine map path to search
	:return: the first candidate percentage cover found in the path
	:raises ValueError: if none of the candidate values occurs in the path
	"""
	candidates = (0.1, 0.2, 0.5)
	# next() stops at the first hit, so the candidate order decides ties.
	detected = next(
		(cover for cover in candidates if "_{}_".format(cover) in fine_map_file),
		None,
	)
	if detected is None:
		raise ValueError("No percentage cover detected!")
	return detected

In [5]:
def sim_type_detection(fine_map_file):
	"""
	Infers the simulation type from a file path, because the type is not stored anywhere else.

	Every (interval, tetrapod group) combination from the module-level ``intervals`` and
	``tetrapod_groups`` sets is tried; a combination matches when both names occur in the path.

	:param fine_map_file: the path to search
	:return: a tuple of (interval, tetrapod group) for the first matching combination
	:raises ValueError: if no combination matches the path
	"""
	matches = (
		(interval, tet_group)
		for interval in intervals
		for tet_group in tetrapod_groups
		if interval in fine_map_file and tet_group in fine_map_file
	)
	detected = next(matches, None)
	if detected is None:
		raise ValueError("No type detected! Filename: {}.".format(fine_map_file))
	return detected

In [6]:
# Import directories and analysis constants. The directory roots are absolute, machine-specific
# paths - edit local_dir and ext_dir to suit the machine running the notebook.
fragmented = "fragmented"
local_dir = "/home/sam/Documents/PhD/PaleoSampling"
ext_dir = "/run/media/sam/Media/Paleo"
# Alternative roots, kept for reference:
# local_dir = "/Users/samthompson/Documents/PhD/PaleoSampling/"
# ext_dir = "/Volumes/Seagate 3TB/Paleo/"
# ext_dir = "/Users/samthompson/Documents/PhD/PaleoSampling/"

# Simulation databases and input data live under ext_dir; the two dst folders are under local_dir.
results_dir = os.path.join(ext_dir, "Results", "PaleoMain", "Sim8")
data_dir = os.path.join(ext_dir, "Data")
dst_folder = os.path.join(local_dir, "Results", "Sim8")
dst_folder2 = os.path.join(local_dir, "Code", "MainSimulationR", "results", "Sim8")

# Speciation rates applied to every simulation.
speciation_rates = [1e-4, 1e-5, 1e-6, 1e-7, 1e-8]

# Maximum density keyed by (interval, tetrapod group).
# NOTE(review): units are not stated anywhere visible - presumably individuals per cell; confirm.
max_density = OrderedDict(
    [
        (("artinskian", "amniote"), 15),
        (("artinskian", "amphibian"), 15),
        (("asselian", "amniote"), 6),
        (("asselian", "amphibian"), 6),
        (("bashkirian", "amniote"), 1),
        (("bashkirian", "amphibian"), 23),
        (("gzhelian", "amniote"), 7),
        (("gzhelian", "amphibian"), 7),
        (("kasimovian", "amniote"), 7),
        (("kasimovian", "amphibian"), 4),
        (("kungurian", "amniote"), 15),
        (("kungurian", "amphibian"), 17),
        (("moscovian", "amniote"), 4),
        (("moscovian", "amphibian"), 21),
        (("sakmarian", "amniote"), 6),
        (("sakmarian", "amphibian"), 15),
    ]
)

In [8]:
# Delete all simulations which haven't completed yet.
# A database is treated as incomplete when constructing a CoalescenceTree from it raises IOError;
# each such database is reported and then removed from disk. This cell is destructive.
for pc in tqdm_notebook(["20", "40", "80"], desc="Percent cover"):
    for file in tqdm_notebook(os.listdir(os.path.join(results_dir, pc)), desc="Files"):
        # NOTE(review): substring test, so a name like "x.db-journal" also matches - confirm intended.
        if ".db" in file:
            try:
                t = CoalescenceTree(os.path.join(results_dir, pc, file))
            except IOError:
                # Attach the database by hand so the sample file can still be read for the message.
                t = CoalescenceTree()
                t.file = os.path.join(results_dir, pc, file)
                t.database = sqlite3.connect(t.file)
                interval, tet_group = sim_type_detection(os.path.basename(t.get_simulation_parameters()["sample_file"]))
                print("Removing incomplete simulation {} for {} and {}.".format(file, interval, tet_group))
                # NOTE(review): the sqlite connection opened above is not closed before the delete.
                os.remove(os.path.join(results_dir, pc, file))

HBox(children=(IntProgress(value=0, description='Percent cover', max=3, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Files', max=284, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Files', max=267, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Files', max=261, style=ProgressStyle(description_width='initi…




In [10]:
# Calculate the biodiversity metrics -  can take a bit of time
# For every simulation database: reset earlier results, downsample at the fragment locations,
# apply the speciation rates, import the comparison data, then compute the metrics and the
# goodness of fit.
for pc in tqdm_notebook(["20", "40", "80"], desc="Percent cover"):
    for file in tqdm_notebook(os.listdir(os.path.join(results_dir, pc)), desc="Files"):
        if ".db" in file:
            t = CoalescenceTree(os.path.join(results_dir, pc, file))
            # Presumably this table only exists after a previous downsample - TODO confirm.
            if check_sql_table_exist(t.database, "SPECIES_LIST_ORIGINAL"):
                t.revert_downsample()
            t.wipe_data()
            sim_params = t.get_simulation_parameters()
            # The simulation type is detected from the sample file path; one call is enough.
            (interval, tet_group) = sim_type_detection(sim_params["sample_file"])
            fragment_csv = os.path.join(data_dir, "configs", "fragments_{}_{}.csv".format(interval, tet_group))
            t.downsample_at_locations(fragment_csv=fragment_csv, ignore_errors=True)
            t.set_speciation_parameters(record_spatial=True,
                                        record_fragments=fragment_csv,
                                        speciation_rates=speciation_rates)
            t.clear_calculations()
            t.apply()
            t.import_comparison_data(os.path.join(data_dir, "databases", "{}_{}.db".format(interval, tet_group)))
            # NOTE(review): this calls a private method of CoalescenceTree.
            t._clear_goodness_of_fit()
            t.calculate_fragment_richness()
            # NOTE(review): alpha diversity is not calculated here, yet the sampling cell reads an
            # 'alpha_diversity' metric - presumably it comes from import_comparison_data; confirm.
            t.calculate_beta_diversity()
            t.calculate_goodness_of_fit()

HBox(children=(IntProgress(value=0, description='Percent cover', max=3, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Files', max=284, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Files', max=267, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Files', max=261, style=ProgressStyle(description_width='initi…

In [9]:
# Debug aid: shows the (interval, tet_group) pair left in the namespace by the most recently run loop.
(interval, tet_group)

('kasimovian', 'amphibian')

In [13]:
# For each (interval, tetrapod group) pair, record the largest value returned by
# get_number_individuals() across all simulation databases.
tmp_dict = {}
for pc in tqdm_notebook(["20", "40", "80"], desc="Percent cover"):
    pc_dir = os.path.join(results_dir, pc)
    for file in tqdm_notebook(os.listdir(pc_dir), desc="Files"):
        if ".db" not in file:
            continue
        t = CoalescenceTree(os.path.join(pc_dir, file))
        sim_params = t.get_simulation_parameters()
        (interval, tet_group) = sim_type_detection(sim_params["sample_file"])
        sim_key = (interval, tet_group)
        previous_max = tmp_dict.get(sim_key, 0)
        tmp_dict[sim_key] = max(previous_max, t.get_number_individuals())

HBox(children=(IntProgress(value=0, description='Percent cover', max=3, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Files', max=284, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Files', max=267, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Files', max=261, style=ProgressStyle(description_width='initi…

In [15]:
# Display the maximum number of individuals recorded for each (interval, tetrapod group) pair.
tmp_dict

{('gzhelian', 'amphibian'): 51,
 ('gzhelian', 'amniote'): 27,
 ('asselian', 'amphibian'): 135,
 ('asselian', 'amniote'): 25,
 ('artinskian', 'amphibian'): 182,
 ('artinskian', 'amniote'): 160,
 ('sakmarian', 'amphibian'): 130,
 ('sakmarian', 'amniote'): 67,
 ('kungurian', 'amphibian'): 238,
 ('kungurian', 'amniote'): 226,
 ('kasimovian', 'amphibian'): 26,
 ('kasimovian', 'amniote'): 15}

In [16]:
# Sample from the simulations
# Builds three tables, one row per (database, speciation rate) combination unless noted:
#   df                      - landscape-level metrics
#   df_distance_sim         - one row per (distance, number of individuals) pair
#   df_fragment_abundances  - one row per fragment
def _actual_metric(tree, community_reference, metric):
	"""
	Reads the "actual" column of BIODIVERSITY_METRICS for rows whose fragment is 'whole'.

	:param tree: the CoalescenceTree whose cursor runs the query
	:param community_reference: the community reference to select
	:param metric: the metric name to select, e.g. "beta_diversity"
	:return: the first column of the first matching row
	:raises IndexError: if no row matches
	"""
	query = (
		"SELECT actual FROM BIODIVERSITY_METRICS WHERE"
		" community_reference==? AND fragment=='whole' AND "
		"metric==?"
	)
	return tree.cursor.execute(query, (community_reference, metric)).fetchall()[0][0]


tmp = []
distance_sim_tmp = []
fragment_abundances = []
for pc in tqdm_notebook(["20", "40", "80"], desc="Percent cover"):
	for file in tqdm_notebook(os.listdir(os.path.join(results_dir, pc)), desc="Files"):
		if ".db" in file:
			t = CoalescenceTree(os.path.join(results_dir, pc, file), logging_level=30)
			# Drop any existing table so the distance similarity is recalculated on every run.
			if check_sql_table_exist(t.database, "SPECIES_DISTANCE_SIMILARITY"):
				t.cursor.execute("DROP TABLE IF EXISTS SPECIES_DISTANCE_SIMILARITY")
			t.calculate_species_distance_similarity()
			# These depend on the database only, not on the speciation rate, so read them once.
			sim_params = t.get_simulation_parameters()
			(interval, tet_group) = sim_type_detection(sim_params["sample_file"])
			for sr in speciation_rates:
				ref = t.get_community_reference(speciation_rate=sr, time=0.0, fragments=True)
				spec_r = _actual_metric(t, ref, "fragment_richness")
				b = _actual_metric(t, ref, "beta_diversity")
				a = _actual_metric(t, ref, "alpha_diversity")
				species_richness = t.get_species_richness(ref)
				beta = t.get_beta_diversity(ref)
				alpha = t.get_alpha_diversity(ref)
				goodness_fit = t.get_goodness_of_fit(reference=ref)
				total_ind = t.get_number_individuals(community_reference=ref)
				# Fields common to the landscape and distance rows; key order sets column order.
				shared_fields = {"interval": interval, "tetrapod_group": tet_group,
								 "sigma": sim_params["sigma"], "speciation_rate": sr,
								 "deme": sim_params["deme"], "richness": species_richness,
								 "beta_diversity": beta, "alpha_diversity": alpha,
								 "gof": goodness_fit, "actual_richness": spec_r, "actual_beta": b,
								 "actual_alpha": a}
				tmp.append(dict(shared_fields, percent_cover=pc, simulated_individuals=total_ind))
				distance_sim = t.get_species_distance_similarity(ref)
				for distance, no_ind in distance_sim:
					distance_sim_tmp.append(dict(shared_fields, distance=distance,
												 no_individuals=no_ind, percent_cover=pc))
				for fragment in t.get_fragment_list(ref):
					r = t.get_fragment_richness(fragment, ref)
					fragment_abundances.append({"interval": interval, "tetrapod_group": tet_group,
												"sigma": sim_params["sigma"], "speciation_rate": sr,
												"deme": sim_params["deme"], "richness": r,
												"fragment": fragment,
												"percent_cover": pc})

df = pd.DataFrame(tmp)
df_distance_sim = pd.DataFrame(distance_sim_tmp)
df_fragment_abundances = pd.DataFrame(fragment_abundances)

HBox(children=(IntProgress(value=0, description='Percent cover', max=3, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Files', max=284, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Files', max=267, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Files', max=261, style=ProgressStyle(description_width='initi…

In [20]:
# Write the three result tables to csv files in results_dir; the file names carry the
# value of the fragmented variable.
src_csv = os.path.join(results_dir, "results_{}.csv".format(fragmented))
src_csv2 = os.path.join(results_dir, "results_distance_sim_{}.csv".format(fragmented))
src_csv3 = os.path.join(results_dir, "results_fragment_abundances_{}.csv".format(fragmented))

# NOTE(review): only the first table is written without its index column - confirm that the
# consumers of the other two files expect the extra index column.
df.to_csv(src_csv, index=False)
df_distance_sim.to_csv(src_csv2)
df_fragment_abundances.to_csv(src_csv3)

In [21]:
# Copy the output csvs into dst_folder - change fragmented variable as appropriate.
# The files are copied, not moved: the originals in results_dir are left in place.
# exist_ok=True creates the folder when missing and is a no-op when it already exists.
os.makedirs(dst_folder, exist_ok=True)
dst_csv = os.path.join(dst_folder, "results_{}.csv".format(fragmented))
dst_csv2 = os.path.join(dst_folder, "results_distance_sim_{}.csv".format(fragmented))
dst_csv3 = os.path.join(dst_folder, "results_fragment_abundances_{}.csv".format(fragmented))
shutil.copy2(src_csv, dst_csv)
shutil.copy2(src_csv2, dst_csv2)
shutil.copy2(src_csv3, dst_csv3)

'/home/sam/Documents/PhD/PaleoSampling/Results/Sim8/results_fragment_abundances_fragmented.csv'

In [22]:
# Copy the output csvs into dst_folder2 - change fragmented variable as appropriate.
# The files are copied, not moved: the originals in results_dir are left in place.
# exist_ok=True creates the folder when missing and is a no-op when it already exists.
os.makedirs(dst_folder2, exist_ok=True)
dst_csv = os.path.join(dst_folder2, "results_{}.csv".format(fragmented))
dst_csv2 = os.path.join(dst_folder2, "results_distance_sim_{}.csv".format(fragmented))
dst_csv3 = os.path.join(dst_folder2, "results_fragment_abundances_{}.csv".format(fragmented))
shutil.copy2(src_csv, dst_csv)
shutil.copy2(src_csv2, dst_csv2)
shutil.copy2(src_csv3, dst_csv3)

'/home/sam/Documents/PhD/PaleoSampling/Code/MainSimulationR/results/Sim8/results_fragment_abundances_fragmented.csv'