# DATA PREPARATION FOR SPATIALLY-EXPLICIT NEUTRAL MODELS 

# This file requires
- Scotese shapefiles of the continental boundaries at different times (280, 320 and 340 Mya)
- info_per_pcoord.csv, containing the location data for each fragment (output of the R script)

# This file outputs
- Raster files for each interval for continental extents
- Raster files for sampling regimes across the world
- Shapefiles for all sites at each interval

In [1]:
import math
import numpy as np
import os
import pandas as pd

from data_preparation import rasterize, extract_from_shapefile, \
	create_shapefile_from_points, find_distance_lat_long, randomly_clear_landscape, add_masks
from pycoalescence import Map

In [6]:
# Data set up
# Root of the data directory; every other path in this notebook is built from it.
# Set the PALEO_DATA_DIR environment variable to use a different location
# (e.g. "/Volumes/Seagate 3TB/Paleo/Data") without editing this cell; the default
# below is used when the variable is not set.
data_directory = os.environ.get("PALEO_DATA_DIR", "/run/media/sam/Media/Paleo/Data/")
# Directory that info_per_pcoord.csv is read from (relative to the working directory)
data_directory_local = "../MainSimulationR/input/"
# the list of folders containing the shape files to convert to raster
folder_list = ["340m", "320m", "280m"]
# Interval names used in the output file names (e.g. paleomask_<interval>.tif).
# NOTE: this list is NOT in the same order as folder_list - the rasters are renamed
# 340m -> early_carboniferous, 320m -> late_carboniferous, 280m -> permian - so the
# two lists must not be paired up by position.
interval_list = ["permian", "early_carboniferous", "late_carboniferous"]

In [4]:
# For every Scotese time slice: write a filtered copy of the continental shapefile
# (selected on TYPE == "CM" via extract_from_shapefile), then rasterise that copy
# into the paleo_maps directory as <folder>.tif.
for folder in folder_list:
	scotese_dir = os.path.join(data_directory, "scotese", folder)
	file_path = os.path.join(scotese_dir, "wcnt.shp")
	file_path_altered = os.path.join(scotese_dir, "wcnt_altered.shp")
	output_path = os.path.join(data_directory, "paleo_maps", "{}.tif".format(folder))
	if os.path.exists(file_path):
		# Filter first, so that only the selected features end up in the raster
		extract_from_shapefile(file_path, file_path_altered, field="TYPE", field_value="CM")
		rasterize(file_path_altered, output_path, pixel_size=0.01)
	else:
		# Stop immediately: a missing source shapefile means the data directory is wrong
		raise IOError("file {} does not exist".format(file_path))

In [5]:
# Give each rasterised time slice the name of its geological interval.
# The pairs are (current file name, new file name) and are processed in this order.
paleo_maps_dir = os.path.join(data_directory, "paleo_maps")
raster_renames = (
	("320m.tif", "late_carboniferous.tif"),
	("340m.tif", "early_carboniferous.tif"),
	("280m.tif", "permian.tif"),
)
for old_name, new_name in raster_renames:
	os.rename(os.path.join(paleo_maps_dir, old_name), os.path.join(paleo_maps_dir, new_name))

In [7]:
# Import the fragment coordinates from csv.
info_per_pcoord = pd.read_csv(os.path.join(data_directory_local, "info_per_pcoord.csv"))
# Split each "pcoords" entry ("<lat>,<long>") into its two parts for the shapefile.
# Unpacking into exactly two names makes a malformed entry fail loudly rather than
# being silently truncated.
lat_values = []
long_values = []
for pcoord in info_per_pcoord["pcoords"]:
	lat, long = pcoord.split(",")
	lat_values.append(lat)
	long_values.append(long)
# Convert whole columns at once instead of writing cell by cell inside iterrows().
# This also removes the need to pre-fill the columns with np.NaN, an alias that was
# removed in NumPy 2.0. The columns are forced to float to match the previous dtype.
info_per_pcoord['lat'] = pd.to_numeric(
	pd.Series(lat_values, index=info_per_pcoord.index, dtype=object)).astype(float)
info_per_pcoord['long'] = pd.to_numeric(
	pd.Series(long_values, index=info_per_pcoord.index, dtype=object)).astype(float)

In [7]:
# For each interval present in the data, write a point shapefile of its fossil sites
# and rasterise it into a sampling mask (intended to be 1 at the fossil sites and 0
# everywhere else). The mask is binary at this stage; it is updated later on to hold
# the sampling proportion at each site.
paleo_maps_dir = os.path.join(data_directory, "paleo_maps")
for interval in info_per_pcoord['interval'].unique():
	# File names use the lower-case, underscore-separated form of the interval name
	interval_slug = interval.replace(" ", "_").lower()
	mask_shp = os.path.join(paleo_maps_dir, "pointsmask_{}.shp".format(interval_slug))
	mask_raster = os.path.join(paleo_maps_dir, "paleomask_{}.tif".format(interval_slug))
	tmp_df = info_per_pcoord[info_per_pcoord["interval"] == interval]
	create_shapefile_from_points(tmp_df, mask_shp)
	# The point shapefile is the source for the raster mask
	rasterize(mask_shp, mask_raster, pixel_size=0.01)

In [8]:
# Record the origin and resolution of the sampling mask for each interval.
# Every entry is stored as [uly_mask, ulx_mask, y_res, x_res]
# (presumably the upper-left corner coordinates and pixel sizes - confirm against
# Map.get_dimensions()).
mask_origins = {}
for interval in interval_list:
	mask_raster = os.path.join(data_directory, "paleo_maps", "paleomask_{}.tif".format(interval))
	paleo_mask = Map(mask_raster)
	dimensions = paleo_mask.get_dimensions()
	# Eight values are returned; only the last four are used here
	_, _, _, _, x_res, y_res, ulx_mask, uly_mask = dimensions
	mask_origins[interval] = [uly_mask, ulx_mask, y_res, x_res]

OSError: File /run/media/sam/Media/Paleo/Data/paleo_maps/paleomask_permian.tif does not exist or is not accessible. Check read/write access.

In [9]:
# Build the fragmented landscapes by random removal of points, once per interval
# and per value in pc_list. The binary sampling mask is then added to each habitat
# map so that there is habitat at every cell we sample.
# Outputs that already exist on disk are not regenerated.
pc_list = [0.1, 0.2, 0.5]
paleo_maps_dir = os.path.join(data_directory, "paleo_maps")
for interval in interval_list:
	# These two paths depend only on the interval, not on pc
	input_map = os.path.join(paleo_maps_dir, "{}.tif".format(interval))
	mask_raster = os.path.join(paleo_maps_dir, "paleomask_{}.tif".format(interval))
	for pc in pc_list:
		print(pc)
		output_map = os.path.join(paleo_maps_dir, "{}_fragmented_{}.tif".format(interval, pc))
		final_map = os.path.join(paleo_maps_dir, "{}_fragmented_{}_masked.tif".format(interval, pc))
		if not os.path.exists(output_map):
			randomly_clear_landscape(input_map, output_map, pc)
		if not os.path.exists(final_map):
			add_masks(output_map, mask_raster, final_map)

0.1
0.2
0.5
0.1
0.2
0.5
0.1
0.2
0.5


In [5]:
# Largest "individuals" value recorded in each interval
interval_max_ind = info_per_pcoord.groupby(["interval"])["individuals"].max()
# Re-label the index with the lower-case, underscore-separated interval names so it
# can be looked up with the entries of interval_list
interval_max_ind = interval_max_ind.rename(lambda label: label.replace(" ", "_").lower())

In [13]:
# Total number of individuals recorded for the Early Carboniferous (displayed as the cell output)
early_carboniferous_rows = info_per_pcoord[info_per_pcoord["interval"] == "Early Carboniferous"]
sum(early_carboniferous_rows["individuals"])

32

In [14]:
# Create the fragment csv for defining fragments within the simulation.
# This creates one fragment csv (no header, no index) for each interval, with the columns
# fragment, x_min, y_min, x_max, y_max, no_individuals.
# The mask raster of each interval is also rewritten here, replacing the value at each
# fragment's cell. This seems a bit stupid, but because fragment coordinates can't be
# calculated until the mask file origin is known, this is the best way.
if not os.path.exists(os.path.join(data_directory, "configs")):
	os.mkdir(os.path.join(data_directory, "configs"))
columns = ['fragment', 'x_min', 'y_min', 'x_max', 'y_max', 'no_individuals']
for interval in interval_list:
	fragment_map = Map(file=os.path.join(data_directory, "paleo_maps", "paleomask_{}.tif".format(interval)))
	fragment_map.open()
	# mask_origins entries are laid out as [uly, ulx, y_res, x_res]
	origin_y, origin_x, _, x_res = mask_origins[interval]
	# Rows are collected as plain records and turned into a DataFrame once at the end.
	# The previous chained assignment (fragment_locations.x_min[index] = ...) is not
	# guaranteed to write through and does nothing under pandas Copy-on-Write.
	fragment_records = []
	for index, row in info_per_pcoord.iterrows():
		# Only handle the fragments that belong to the current interval
		if row.interval.replace(" ", "_").lower() != interval:
			continue
		# Manual correction applied to y_max. Historically set to -1 for
		# collection_ref "174420", which was placed above instead of below its
		# geo-coordinate; currently no correction is applied.
		modifier = 0
		offsets = find_distance_lat_long(origin_y, origin_x, row.lat, row.long, res=x_res)
		x_min = int(math.floor(offsets[1]))
		y_min = int(math.floor(offsets[0]))
		fragment_records.append({
			'fragment': row.collection_ref_2,
			'x_min': x_min,
			'y_min': y_min,
			'x_max': int(math.floor(offsets[1] + 1)),
			'y_max': int(math.floor(offsets[0] + 1)) + modifier,
			'no_individuals': row.individuals,
		})
		# Replace the binary mask value at this cell with a sampling value scaled by
		# the largest number of individuals in the interval
		fragment_map.data[y_min, x_min] = (1 + row.individuals)/interval_max_ind[interval]
	fragment_locations = pd.DataFrame(fragment_records, columns=columns)
	fragment_locations.to_csv(os.path.join(data_directory, "configs", "fragments_{}.csv".format(interval)),
							  index=False, header=False)
	fragment_map.write()

In [13]:
# Now double check that these all make sense: for every fragment written to the
# fragment csvs, read the mask raster at (x_min, y_min) and compare it with the
# expected sampling value. Every mismatch is printed, and success is only reported
# when no mismatch was found.
# NOTE(review): the cell that writes the masks stores (1 + individuals)/max, whereas
# the expected value here is no_individuals/max - confirm which formula is intended.
mismatch_count = 0
for interval in interval_list:
	df = pd.read_csv(os.path.join(data_directory, "configs", "fragments_{}.csv".format(interval)),
					 header=None)
	# The csv is written without a header, so the column names are restored here
	df.columns = ['fragment', 'x_min', 'y_min', 'x_max', 'y_max', 'no_individuals']
	mask_raster = Map(os.path.join(data_directory, "paleo_maps",
								   "paleomask_{}.tif".format(interval.replace(" ", "_").lower())))
	# Initial read at the origin; the value itself is not used
	# (presumably it primes the cached read - TODO confirm whether it is still needed)
	mask_value = mask_raster.get_cached_subset(0, 0, 1, 1)
	for index, row in df.iterrows():
		mask_value = mask_raster.get_cached_subset(row.x_min, row.y_min, 1, 1)
		expected_value = row.no_individuals/interval_max_ind[interval]
		if not math.isclose(mask_value[0][0], expected_value, rel_tol=1e-2):
			mismatch_count += 1
			print("Mask value not correct: {} != {}"
				  " for location {}, {} - fragment {}".format(mask_value[0][0],
															  expected_value,
															  row.x_min,
															  row.y_min,
															  row.fragment))
if mismatch_count == 0:
	print("All maps verified")
else:
	print("{} mask value(s) failed verification".format(mismatch_count))

Mask value not correct: 0.016949152573943138 != 0.02702702702702703 for location 607, 3545 - fragment 11152
Mask value not correct: 0.016949152573943138 != 0.02702702702702703 for location 5805, 2695 - fragment 121601
Mask value not correct: 0.016949152573943138 != 0.02702702702702703 for location 2179, 3689 - fragment 126543
Mask value not correct: 0.016949152573943138 != 0.02702702702702703 for location 2203, 3616 - fragment 127110
Mask value not correct: 0.23728813230991364 != 0.24324324324324326 for location 711, 3245 - fragment 12976-177014-177015
Mask value not correct: 0.033898305147886276 != 0.05405405405405406 for location 607, 3548 - fragment 13044
Mask value not correct: 0.016949152573943138 != 0.02702702702702703 for location 745, 3253 - fragment 13046
Mask value not correct: 0.016949152573943138 != 0.02702702702702703 for location 676, 3561 - fragment 13048
Mask value not correct: 0.22033898532390594 != 0.10810810810810811 for location 613, 3551 - fragment 13080
Mask value