In [1]:
import json
import re
from io import StringIO
from pathlib import Path

import pandas as pd

from coffeaphylogeo.definitions import Definitions

In [2]:
defs = Definitions()
defs.root_dir

PosixPath('/home/local/USHERBROOKE/lals2906/programming/python_projects/coffeaPhyloGeo')

In [3]:
defs.geoclim_dirs

{'main': 'geoclim',
 'gps_data': 'geospatial',
 'climate_data': 'climate',
 'environment_data': 'enviro'}

In [4]:
defs.get_geoclim_path("climate_data")

PosixPath('/home/local/USHERBROOKE/lals2906/programming/python_projects/coffeaPhyloGeo/data/geoclim/climate')

In [5]:
climate_dir = defs.get_geoclim_path("climate_data")
enviro_dir = defs.get_geoclim_path("environment_data")

In [6]:
print(climate_dir)
print(enviro_dir)

/home/local/USHERBROOKE/lals2906/programming/python_projects/coffeaPhyloGeo/data/geoclim/climate
/home/local/USHERBROOKE/lals2906/programming/python_projects/coffeaPhyloGeo/data/geoclim/enviro


In [8]:
defs.geoclim_files

{'raw_table': 'coff_madag_species_summary.xlsx',
 'gps_all': 'species_locations_all_positions.csv',
 'gbs_filtered': 'species_locations_gbs_only.csv',
 'madaclim_current': 'madaclim_current.tif',
 'madaclim_enviro': 'madaclim_enviro.tif',
 'clim_metadata': 'clim_metadata.json',
 'env_metadata': 'enviro_metadata.json',
 'clim_data_format': 'clim_data_format.json',
 'env_data_format': 'env_data_format.json'}

In [9]:
# Load the climate data-format JSON. The encoding is stated explicitly because
# the tables contain non-ASCII text (e.g. "°C") and the platform default
# encoding is not guaranteed to be UTF-8.
with open(climate_dir / defs.geoclim_files["clim_data_format"], "r", encoding="utf-8") as f:
    data_format = json.load(f)


In [10]:
data_format.keys()

dict_keys(['table_0'])

In [11]:
# `table_0` is a JSON-encoded table held as a string. Wrap it in StringIO:
# passing a literal JSON string to `pd.read_json` is deprecated (pandas >= 2.1).
df = pd.read_json(StringIO(data_format["table_0"]))
df

Unnamed: 0,Layers,Climate variable
0,1-12,Monthly minimum temperature (°C x 10)
1,13-24,Monthly maximum temperature (°C x 10)
2,25-36,Monthly total precipitation (mm.month-1)
3,37-55,Bioclimatic variables (bioclim)
4,56-67,Monthly potential evapotranspiration (mm.month-1)
5,68,Annual potential evapotranspiration (mm)
6,69,Annual climatic water deficit (mm)
7,70,Number of dry months in the year


In [12]:
# Split 'Layers' column and create new rows
def split_layers(row):
    """Expand one row of the layer-format table into one entry per layer.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Layers'`` (a single layer number such as ``68`` or an
        inclusive range such as ``"1-12"``) and ``'Climate variable'``.

    Returns
    -------
    pandas.Series
        The climate variable repeated once per layer, indexed by the integer
        layer numbers covered by ``row['Layers']``.

    Raises
    ------
    ValueError
        If ``row['Layers']`` is neither an integer nor a ``start-end`` pair.
    """
    # Coerce to str so that purely numeric cells (parsed as int) work as well
    # as "start-end" strings.
    layers = str(row['Layers']).strip()
    if '-' in layers:
        start, end = map(int, layers.split('-'))
        layer_numbers = range(start, end + 1)  # range end is inclusive in the table
    else:
        layer_numbers = [int(layers)]
    return pd.Series(row['Climate variable'], index=layer_numbers)

# Expand every layer range into individual rows, flatten the wide result into
# long form, then label the two remaining columns.
df = (
    df.apply(split_layers, axis=1)
    .stack()
    .reset_index()
    .drop(columns="level_0")
    .set_axis(['layer_number', 'climate_variable'], axis=1)
)

In [13]:
df["data_type"] = "clim"

In [14]:
df

Unnamed: 0,layer_number,climate_variable,data_type
0,1,Monthly minimum temperature (°C x 10),clim
1,2,Monthly minimum temperature (°C x 10),clim
2,3,Monthly minimum temperature (°C x 10),clim
3,4,Monthly minimum temperature (°C x 10),clim
4,5,Monthly minimum temperature (°C x 10),clim
...,...,...,...
65,66,Monthly potential evapotranspiration (mm.month-1),clim
66,67,Monthly potential evapotranspiration (mm.month-1),clim
67,68,Annual potential evapotranspiration (mm),clim
68,69,Annual climatic water deficit (mm),clim


In [15]:
# Load the climate metadata JSON. The encoding is stated explicitly because the
# descriptions contain non-ASCII text (e.g. "°C") and the platform default
# encoding is not guaranteed to be UTF-8.
with open(climate_dir / defs.geoclim_files["clim_metadata"], "r", encoding="utf-8") as f:
    data_metadata = json.load(f)

In [16]:
# Wrap the JSON string in StringIO: passing a literal JSON string to
# `pd.read_json` is deprecated (pandas >= 2.1).
monthly_vars = pd.read_json(StringIO(data_metadata["table_0"]))

In [17]:
monthly_vars.columns = ["layer_name", "description"]

In [18]:
monthly_vars

Unnamed: 0,layer_name,description
0,tmin1-12,Monthly minimum temperature (°C x 10)
1,tmax1-12,Monthly maximum temperature (°C x 10)
2,prec1-12,Monthly total precipitation (mm.month-1)


In [19]:
pd.merge(df, monthly_vars, left_on="climate_variable", right_on="description", how="outer")

Unnamed: 0,layer_number,climate_variable,data_type,layer_name,description
0,1,Monthly minimum temperature (°C x 10),clim,tmin1-12,Monthly minimum temperature (°C x 10)
1,2,Monthly minimum temperature (°C x 10),clim,tmin1-12,Monthly minimum temperature (°C x 10)
2,3,Monthly minimum temperature (°C x 10),clim,tmin1-12,Monthly minimum temperature (°C x 10)
3,4,Monthly minimum temperature (°C x 10),clim,tmin1-12,Monthly minimum temperature (°C x 10)
4,5,Monthly minimum temperature (°C x 10),clim,tmin1-12,Monthly minimum temperature (°C x 10)
...,...,...,...,...,...
65,66,Monthly potential evapotranspiration (mm.month-1),clim,,
66,67,Monthly potential evapotranspiration (mm.month-1),clim,,
67,68,Annual potential evapotranspiration (mm),clim,,
68,69,Annual climatic water deficit (mm),clim,,


In [20]:
df["climate_variable"].unique()

array(['Monthly minimum temperature (°C x 10)',
       'Monthly maximum temperature (°C x 10)',
       'Monthly total precipitation (mm.month-1)',
       'Bioclimatic variables (bioclim)',
       'Monthly potential evapotranspiration (mm.month-1)',
       'Annual potential evapotranspiration (mm)',
       'Annual climatic water deficit (mm)',
       'Number of dry months in the year'], dtype=object)

In [21]:
# Preview `table_2`. StringIO is used because passing a literal JSON string to
# `pd.read_json` is deprecated (pandas >= 2.1).
pd.read_json(StringIO(data_metadata["table_2"]))

Unnamed: 0,Bioclimatic variable,Description
0,pet1-12,Monthly potential evapotranspiration from the ...
1,pet,Annual potential evapotranspiration from the T...


In [22]:
# Expand ranged layer names (e.g. 'tmin1-12') into one row per layer
def split_repeating_vars(row, col_to_split, col_to_keep):
    """Expand a ranged layer name into one row per individual layer.

    A value such as ``"tmin1-12"`` in ``row[col_to_split]`` becomes the names
    ``tmin1`` ... ``tmin12``, each paired with ``row[col_to_keep]``. A value
    that is not of the form ``<prefix><start>-<end>`` is returned unchanged as
    a one-row frame.

    Parameters
    ----------
    row : mapping or pandas.Series
        Source row; must provide both ``col_to_split`` and ``col_to_keep``.
    col_to_split : str
        Name of the column holding the (possibly ranged) layer name.
    col_to_keep : str
        Name of the column whose value is repeated for every expanded name.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``col_to_split`` and ``col_to_keep``, with one row per
        expanded layer name.
    """
    value = row[col_to_split]

    # Raw-string pattern (avoids the invalid "\d" escape): an optional
    # non-digit prefix followed by "<start>-<end>". The prefix is kept as is,
    # so it is not limited to lower-case letters.
    ranged = re.fullmatch(r"(?P<prefix>\D*)(?P<start>\d+)-(?P<end>\d+)", str(value))

    # Not a range (including hyphenated names without numbers): keep the row.
    if ranged is None:
        return pd.DataFrame({col_to_split: [value], col_to_keep: [row[col_to_keep]]})

    start, end = int(ranged["start"]), int(ranged["end"])
    names = [f"{ranged['prefix']}{number}" for number in range(start, end + 1)]
    return pd.DataFrame({col_to_split: names,
                         col_to_keep: [row[col_to_keep]] * len(names)})


In [23]:
monthly_vars

Unnamed: 0,layer_name,description
0,tmin1-12,Monthly minimum temperature (°C x 10)
1,tmax1-12,Monthly maximum temperature (°C x 10)
2,prec1-12,Monthly total precipitation (mm.month-1)


In [24]:
# Expand each ranged monthly variable into per-month rows and stack the
# resulting frames into a single table with a fresh index.
new_monthly = pd.concat(
    [split_repeating_vars(row, "layer_name", "description") for _, row in monthly_vars.iterrows()],
    ignore_index=True,
)
new_monthly


Unnamed: 0,layer_name,description
0,tmin1,Monthly minimum temperature (°C x 10)
1,tmin2,Monthly minimum temperature (°C x 10)
2,tmin3,Monthly minimum temperature (°C x 10)
3,tmin4,Monthly minimum temperature (°C x 10)
4,tmin5,Monthly minimum temperature (°C x 10)
5,tmin6,Monthly minimum temperature (°C x 10)
6,tmin7,Monthly minimum temperature (°C x 10)
7,tmin8,Monthly minimum temperature (°C x 10)
8,tmin9,Monthly minimum temperature (°C x 10)
9,tmin10,Monthly minimum temperature (°C x 10)


In [25]:
list(range(1,13))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [26]:
monthly_vars.loc[monthly_vars["layer_name"] == "tmin1-12", "description"].values[0]

'Monthly minimum temperature (°C x 10)'

In [27]:
monthly_vars["description"].unique()[0]

'Monthly minimum temperature (°C x 10)'

In [28]:
new_monthly["layer_number"] = 0

In [29]:
df.head(37)

Unnamed: 0,layer_number,climate_variable,data_type
0,1,Monthly minimum temperature (°C x 10),clim
1,2,Monthly minimum temperature (°C x 10),clim
2,3,Monthly minimum temperature (°C x 10),clim
3,4,Monthly minimum temperature (°C x 10),clim
4,5,Monthly minimum temperature (°C x 10),clim
5,6,Monthly minimum temperature (°C x 10),clim
6,7,Monthly minimum temperature (°C x 10),clim
7,8,Monthly minimum temperature (°C x 10),clim
8,9,Monthly minimum temperature (°C x 10),clim
9,10,Monthly minimum temperature (°C x 10),clim


In [30]:
# Rows sharing the description of the "tmin1-12" entry get layer numbers 1-12.
new_monthly.loc[
    new_monthly["description"].eq(
        monthly_vars.loc[monthly_vars["layer_name"].eq("tmin1-12"), "description"].iloc[0]
    ),
    "layer_number",
] = range(1, 13)

In [31]:
# Rows sharing the description of the "tmax1-12" entry get layer numbers 13-24.
new_monthly.loc[
    new_monthly["description"].eq(
        monthly_vars.loc[monthly_vars["layer_name"].eq("tmax1-12"), "description"].iloc[0]
    ),
    "layer_number",
] = range(13, 25)

In [32]:
# Rows sharing the description of the "prec1-12" entry get layer numbers 25-36.
new_monthly.loc[
    new_monthly["description"].eq(
        monthly_vars.loc[monthly_vars["layer_name"].eq("prec1-12"), "description"].iloc[0]
    ),
    "layer_number",
] = range(25, 37)

In [33]:
pd.merge(df, new_monthly, on="layer_number", how="outer").head(40)

Unnamed: 0,layer_number,climate_variable,data_type,layer_name,description
0,1,Monthly minimum temperature (°C x 10),clim,tmin1,Monthly minimum temperature (°C x 10)
1,2,Monthly minimum temperature (°C x 10),clim,tmin2,Monthly minimum temperature (°C x 10)
2,3,Monthly minimum temperature (°C x 10),clim,tmin3,Monthly minimum temperature (°C x 10)
3,4,Monthly minimum temperature (°C x 10),clim,tmin4,Monthly minimum temperature (°C x 10)
4,5,Monthly minimum temperature (°C x 10),clim,tmin5,Monthly minimum temperature (°C x 10)
5,6,Monthly minimum temperature (°C x 10),clim,tmin6,Monthly minimum temperature (°C x 10)
6,7,Monthly minimum temperature (°C x 10),clim,tmin7,Monthly minimum temperature (°C x 10)
7,8,Monthly minimum temperature (°C x 10),clim,tmin8,Monthly minimum temperature (°C x 10)
8,9,Monthly minimum temperature (°C x 10),clim,tmin9,Monthly minimum temperature (°C x 10)
9,10,Monthly minimum temperature (°C x 10),clim,tmin10,Monthly minimum temperature (°C x 10)


In [34]:
# Wrap the JSON string in StringIO: passing a literal JSON string to
# `pd.read_json` is deprecated (pandas >= 2.1).
bio_vars = pd.read_json(StringIO(data_metadata["table_1"]))


In [35]:
bio_vars = bio_vars.rename(columns={"Bioclimatic variable": "layer_name"})

In [36]:
bio_vars["layer_number"] = range(37, 56)

In [65]:
bio_vars

Unnamed: 0,layer_name,description,layer_number
0,bio1,Annual mean temperature,37
1,bio2,Mean diurnal range (mean of monthly (max temp ...,38
2,bio3,Isothermality (BIO2/BIO7) (x 100),39
3,bio4,Temperature seasonality (standard deviation x ...,40
4,bio5,Max temperature of warmest month,41
5,bio6,Min temperature of coldest month,42
6,bio7,Temperature annual range (BIO5-BIO6),43
7,bio8,Mean temperature of wettest quarter,44
8,bio9,Mean temperature of driest quarter,45
9,bio10,Mean temperature of warmest quarter,46


In [38]:
# Wrap the JSON string in StringIO: passing a literal JSON string to
# `pd.read_json` is deprecated (pandas >= 2.1).
evap_df = pd.read_json(StringIO(data_metadata["table_2"]))

In [39]:
evap_df.columns = ["layer_name", "description"]

In [40]:
evap_df

Unnamed: 0,layer_name,description
0,pet1-12,Monthly potential evapotranspiration from the ...
1,pet,Annual potential evapotranspiration from the T...


In [41]:
# Expand the ranged evapotranspiration entry into per-month rows; entries
# without a range pass through as single rows. Stack with a fresh index.
new_evap_df = pd.concat(
    [split_repeating_vars(row, "layer_name", "description") for _, row in evap_df.iterrows()],
    ignore_index=True,
)

In [42]:
new_evap_df

Unnamed: 0,layer_name,description
0,pet1,Monthly potential evapotranspiration from the ...
1,pet2,Monthly potential evapotranspiration from the ...
2,pet3,Monthly potential evapotranspiration from the ...
3,pet4,Monthly potential evapotranspiration from the ...
4,pet5,Monthly potential evapotranspiration from the ...
5,pet6,Monthly potential evapotranspiration from the ...
6,pet7,Monthly potential evapotranspiration from the ...
7,pet8,Monthly potential evapotranspiration from the ...
8,pet9,Monthly potential evapotranspiration from the ...
9,pet10,Monthly potential evapotranspiration from the ...


In [43]:
new_evap_df["layer_number"] = range(56, 69)

In [44]:
new_evap_df

Unnamed: 0,layer_name,description,layer_number
0,pet1,Monthly potential evapotranspiration from the ...,56
1,pet2,Monthly potential evapotranspiration from the ...,57
2,pet3,Monthly potential evapotranspiration from the ...,58
3,pet4,Monthly potential evapotranspiration from the ...,59
4,pet5,Monthly potential evapotranspiration from the ...,60
5,pet6,Monthly potential evapotranspiration from the ...,61
6,pet7,Monthly potential evapotranspiration from the ...,62
7,pet8,Monthly potential evapotranspiration from the ...,63
8,pet9,Monthly potential evapotranspiration from the ...,64
9,pet10,Monthly potential evapotranspiration from the ...,65


In [45]:
# Wrap the JSON string in StringIO: passing a literal JSON string to
# `pd.read_json` is deprecated (pandas >= 2.1).
biowater_df = pd.read_json(StringIO(data_metadata["table_3"]))

In [46]:
biowater_df.columns = ["layer_name", "description"]

In [47]:
biowater_df["layer_number"] = range(69, 71)

In [48]:
biowater_df

Unnamed: 0,layer_name,description,layer_number
0,cwd,Annual climatic water deficit (mm),69
1,ndm,Number of dry months in the year,70


Env Vars

In [56]:
defs.geoclim_files

{'raw_table': 'coff_madag_species_summary.xlsx',
 'gps_all': 'species_locations_all_positions.csv',
 'gbs_filtered': 'species_locations_gbs_only.csv',
 'madaclim_current': 'madaclim_current.tif',
 'madaclim_enviro': 'madaclim_enviro.tif',
 'clim_metadata': 'clim_metadata.json',
 'env_metadata': 'enviro_metadata.json',
 'clim_data_format': 'clim_data_format.json',
 'env_data_format': 'env_data_format.json'}

In [62]:
df

Unnamed: 0,layer_number,climate_variable,data_type
0,1,Monthly minimum temperature (°C x 10),clim
1,2,Monthly minimum temperature (°C x 10),clim
2,3,Monthly minimum temperature (°C x 10),clim
3,4,Monthly minimum temperature (°C x 10),clim
4,5,Monthly minimum temperature (°C x 10),clim
...,...,...,...
65,66,Monthly potential evapotranspiration (mm.month-1),clim
66,67,Monthly potential evapotranspiration (mm.month-1),clim
67,68,Annual potential evapotranspiration (mm),clim
68,69,Annual climatic water deficit (mm),clim


In [59]:
# Load the environment data-format JSON. The encoding is stated explicitly so
# the read does not depend on the platform default encoding.
with open(enviro_dir / defs.geoclim_files["env_data_format"], "r", encoding="utf-8") as jsfile:
    env_format = json.load(jsfile)

In [64]:
# Wrap the JSON string in StringIO: passing a literal JSON string to
# `pd.read_json` is deprecated (pandas >= 2.1).
df_env = pd.read_json(StringIO(env_format["table_0"]))
df_env.columns = ["layer_number", "environment_variable"]
df_env.head()

Unnamed: 0,layer_number,environment_variable
0,1,Altitude (m)
1,2,Slope (in degree)
2,3,"Aspect (clockwise from North, in degree)"
3,4,Solar radiation (Wh.m-2.day-1)
4,5,"Geology (Kew Botanical Garden, 1997)"


In [None]:
# `range()` with no arguments raises TypeError, so the cell could never run.
# NOTE(review): assumes the environment layers are meant to be numbered right
# after the last climate layer in `df` -- confirm the intended offset.
first_env_layer = int(df["layer_number"].max()) + 1
df_env["layer_number"] = range(first_env_layer, first_env_layer + len(df_env))