In [1]:
import pandas as pd
from pathlib import Path
import json
import re

from coffeaphylogeo.definitions import Definitions

In [2]:
# Instantiate the project's Definitions helper (imported from
# coffeaphylogeo.definitions) and display the project root it reports.
defs = Definitions()
defs.root_dir

PosixPath('/home/local/USHERBROOKE/lals2906/programming/python_projects/coffeaPhyloGeo')

In [16]:
# Display the mapping of geoclim data categories to their sub-directory names.
defs.geoclim_dirs

{'main': 'geoclim',
 'gps_data': 'geospatial',
 'climate_data': 'climate',
 'environment_data': 'enviro'}

In [17]:
# Resolve the climate and environment data directories through the Definitions
# helper, using the category keys listed in defs.geoclim_dirs.
climate_dir = defs.get_geoclim_path("climate_data")
enviro_dir = defs.get_geoclim_path("environment_data")

In [18]:
# Show both resolved data directories, one per line.
print(climate_dir, enviro_dir, sep="\n")

/home/local/USHERBROOKE/lals2906/programming/python_projects/coffeaPhyloGeo/data/geoclim/climate
/home/local/USHERBROOKE/lals2906/programming/python_projects/coffeaPhyloGeo/data/geoclim/enviro


In [19]:
# Display the mapping of logical file keys to file names; the keys are used
# below to locate the metadata / data-format JSON files.
defs.geoclim_files

{'raw_table': 'coff_madag_species_summary.xlsx',
 'gps_all': 'species_locations_all_positions.csv',
 'gbs_filtered': 'species_locations_gbs_only.csv',
 'madaclim_current': 'madaclim_current.tif',
 'madaclim_enviro': 'madaclim_enviro.tif',
 'clim_metadata': 'clim_metadata.json',
 'env_metadata': 'enviro_metadata.json',
 'clim_data_format': 'clim_data_format.json',
 'env_data_format': 'env_data_format.json'}

In [21]:
# Load the climate data-format description.
# The payload contains non-ASCII text (e.g. "°C"), so decode it explicitly as
# UTF-8 instead of relying on the platform's locale default encoding.
with open(climate_dir / defs.geoclim_files["clim_data_format"], "r", encoding="utf-8") as f:
    data_format = json.load(f)


In [22]:
# Parse the first table of the data-format description into a DataFrame and
# display it (one row per layer range and its climate variable).
# NOTE(review): data_format["table_0"] is presumably a JSON string; recent
# pandas versions deprecate passing literal JSON to read_json and recommend
# wrapping it in io.StringIO - confirm and update if a FutureWarning appears.
df = pd.read_json(data_format["table_0"])
df

Unnamed: 0,Layers,Climate variable
0,1-12,Monthly minimum temperature (°C x 10)
1,13-24,Monthly maximum temperature (°C x 10)
2,25-36,Monthly total precipitation (mm.month-1)
3,37-55,Bioclimatic variables (bioclim)
4,56-67,Monthly potential evapotranspiration (mm.month-1)
5,68,Annual potential evapotranspiration (mm)
6,69,Annual climatic water deficit (mm)
7,70,Number of dry months in the year


In [23]:
# Split 'Layers' column and create new rows
def split_layers(row):
    """Expand one row of the layer table into a Series keyed by layer number.

    Reads ``row['Layers']``, which is either a single layer number ("68") or an
    inclusive range ("1-12"), and returns a Series whose index holds every
    layer number covered and whose values all equal ``row['Climate variable']``.
    """
    # Single layer: one-element Series indexed by that layer number.
    if '-' not in row['Layers']:
        return pd.Series(row['Climate variable'], index=[int(row['Layers'])])

    # Inclusive "first-last" range: repeat the variable for each layer in it.
    first, last = (int(bound) for bound in row['Layers'].split('-'))
    return pd.Series(row['Climate variable'], index=range(first, last + 1))

# Expand every layer range into one row per layer number.
# NOTE: this rebinds `df` to a frame without a 'Layers' column, so re-running
# the cell without first re-running the read_json cell fails in split_layers.
df = (
    df.apply(split_layers, axis=1)  # wide frame: one column per layer number
    .stack()                        # long form: (source row, layer number) -> variable
    .reset_index()
    .drop(columns="level_0")        # the source-row position is not needed
)
df.columns = ['layer_number', 'climate_variable']

In [24]:
# Tag every row with the constant data type "clim" (presumably to tell these
# layers apart from the environment layers later - confirm).
df["data_type"] = "clim"

In [25]:
# Add an empty placeholder column for the layer names.
df["layer_name"] = ""

In [26]:
# Inspect the per-layer table with the two new columns.
df

Unnamed: 0,layer_number,climate_variable,data_type,layer_name
0,1,Monthly minimum temperature (°C x 10),clim,
1,2,Monthly minimum temperature (°C x 10),clim,
2,3,Monthly minimum temperature (°C x 10),clim,
3,4,Monthly minimum temperature (°C x 10),clim,
4,5,Monthly minimum temperature (°C x 10),clim,
...,...,...,...,...
65,66,Monthly potential evapotranspiration (mm.month-1),clim,
66,67,Monthly potential evapotranspiration (mm.month-1),clim,
67,68,Annual potential evapotranspiration (mm),clim,
68,69,Annual climatic water deficit (mm),clim,


In [27]:
# Load the climate metadata description.
# The payload contains non-ASCII text (e.g. "°C"), so decode it explicitly as
# UTF-8 instead of relying on the platform's locale default encoding.
with open(climate_dir / defs.geoclim_files["clim_metadata"], "r", encoding="utf-8") as f:
    data_metadata = json.load(f)

In [28]:
# Parse the first metadata table (the monthly variables) into a DataFrame.
# NOTE(review): presumably a literal JSON string; recent pandas versions
# deprecate that input form for read_json in favour of io.StringIO - confirm.
monthly_vars = pd.read_json(data_metadata["table_0"])

In [29]:
# Give the two metadata columns explicit names used by the cells below.
monthly_vars.columns = ["layer_name", "description"]

In [30]:
# Inspect the monthly-variable metadata (ranged names such as "tmin1-12").
monthly_vars

Unnamed: 0,layer_name,description
0,tmin1-12,Monthly minimum temperature (°C x 10)
1,tmax1-12,Monthly maximum temperature (°C x 10)
2,prec1-12,Monthly total precipitation (mm.month-1)


In [31]:
# Expand ranged layer names (e.g. 'tmin1-12') into one row per numbered layer name
def split_repeating_vars(row, col_to_split, col_to_keep):
    """Expand a ranged name such as ``"tmin1-12"`` into one row per number.

    Parameters
    ----------
    row : pandas.Series
        A single row; indexed by ``col_to_split`` and ``col_to_keep``.
    col_to_split : str
        Column holding either a plain name or a ranged name of the form
        ``<lowercase prefix><start>-<end>``.
    col_to_keep : str
        Column whose value is repeated on every generated row.

    Returns
    -------
    pandas.DataFrame
        Two columns (``col_to_split``, ``col_to_keep``). A ranged name yields
        one row per number from ``start`` to ``end`` inclusive, named
        ``<prefix><number>``; any other value yields a single unchanged row.
    """
    value = row[col_to_split]

    # No range marker: return the row unchanged as a one-row frame.
    if "-" not in value:
        return pd.DataFrame({col_to_split: [value], col_to_keep: [row[col_to_keep]]})

    parts = value.split("-")
    # Raw-string patterns: "\d" in a plain string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    start = int(re.search(r"\d+", parts[0]).group())
    end = int(parts[1])
    # Leading run of lowercase letters, e.g. "tmin" in "tmin1-12".
    name = re.search(r"[a-z]*", value).group()

    # The scalar description is broadcast across all generated names.
    return pd.DataFrame({
        col_to_split: [f"{name}{month}" for month in range(start, end + 1)],
        col_to_keep: row[col_to_keep],
    })


In [32]:
# Expand every ranged monthly variable into its individual layer names and
# stack the per-row frames into a single table with a fresh index.
new_monthly = pd.concat(
    list(
        monthly_vars.apply(
            split_repeating_vars,
            axis=1,
            col_to_split="layer_name",
            col_to_keep="description",
        )
    ),
    ignore_index=True,
)


In [33]:
# Join layer numbers to layer names on the shared variable description and
# show the first 12 rows of each group.
# NOTE(review): the key is repeated on both sides (one row per month in `df`
# and in `new_monthly`), so this is a many-to-many merge: the output below pairs
# layer_number 1 with tmin1, tmin2, ... rather than matching layer 1 to tmin1
# only. A per-group position key (or validate="one_to_one") is probably needed
# - confirm the intended result.
pd.merge(df, new_monthly, left_on='climate_variable', right_on='description').groupby(['climate_variable', 'description']).head(12)

Unnamed: 0,layer_number,climate_variable,data_type,layer_name_x,layer_name_y,description
0,1,Monthly minimum temperature (°C x 10),clim,,tmin1,Monthly minimum temperature (°C x 10)
1,1,Monthly minimum temperature (°C x 10),clim,,tmin2,Monthly minimum temperature (°C x 10)
2,1,Monthly minimum temperature (°C x 10),clim,,tmin3,Monthly minimum temperature (°C x 10)
3,1,Monthly minimum temperature (°C x 10),clim,,tmin4,Monthly minimum temperature (°C x 10)
4,1,Monthly minimum temperature (°C x 10),clim,,tmin5,Monthly minimum temperature (°C x 10)
5,1,Monthly minimum temperature (°C x 10),clim,,tmin6,Monthly minimum temperature (°C x 10)
6,1,Monthly minimum temperature (°C x 10),clim,,tmin7,Monthly minimum temperature (°C x 10)
7,1,Monthly minimum temperature (°C x 10),clim,,tmin8,Monthly minimum temperature (°C x 10)
8,1,Monthly minimum temperature (°C x 10),clim,,tmin9,Monthly minimum temperature (°C x 10)
9,1,Monthly minimum temperature (°C x 10),clim,,tmin10,Monthly minimum temperature (°C x 10)


In [34]:
# Same join with the inner-join mode spelled out; preview the first rows.
# NOTE(review): still a many-to-many merge on the description text - the output
# below repeats layer_number 1 for tmin1..tmin5.
pd.merge(df, new_monthly, how="inner", left_on="climate_variable", right_on="description").head()

Unnamed: 0,layer_number,climate_variable,data_type,layer_name_x,layer_name_y,description
0,1,Monthly minimum temperature (°C x 10),clim,,tmin1,Monthly minimum temperature (°C x 10)
1,1,Monthly minimum temperature (°C x 10),clim,,tmin2,Monthly minimum temperature (°C x 10)
2,1,Monthly minimum temperature (°C x 10),clim,,tmin3,Monthly minimum temperature (°C x 10)
3,1,Monthly minimum temperature (°C x 10),clim,,tmin4,Monthly minimum temperature (°C x 10)
4,1,Monthly minimum temperature (°C x 10),clim,,tmin5,Monthly minimum temperature (°C x 10)


In [35]:
# Select layers 37-55, the range the data-format table labels as bioclimatic
# variables; these rows still have an empty layer_name.
df[df["layer_number"].between(37, 55)]

Unnamed: 0,layer_number,climate_variable,data_type,layer_name
36,37,Bioclimatic variables (bioclim),clim,
37,38,Bioclimatic variables (bioclim),clim,
38,39,Bioclimatic variables (bioclim),clim,
39,40,Bioclimatic variables (bioclim),clim,
40,41,Bioclimatic variables (bioclim),clim,
41,42,Bioclimatic variables (bioclim),clim,
42,43,Bioclimatic variables (bioclim),clim,
43,44,Bioclimatic variables (bioclim),clim,
44,45,Bioclimatic variables (bioclim),clim,
45,46,Bioclimatic variables (bioclim),clim,
