# Testing a simple Decision Tree Regressor on the NC data

Trying to predict **leaf area index, high vegetation** (lai_hv) 

In [1]:
import os
import sys
import cdsapi
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import netCDF4 as nc
import numpy as np
import xarray as xa
import pandas as pd

module_path = os.path.abspath(os.path.join('./get_dataset'))
if module_path not in sys.path:
    sys.path.append(module_path)

import dataset_api as api

In [2]:
lat = [44.4, 44.8]
lon = [3.6, 4.5]
area_france = [lat[1], lon[0], lat[0], lon[1]]

folder = '.'
path = folder + '/ERA5_land.nc'
#api.get_era5_land(path)


In [21]:
with xa.open_mfdataset(path) as ds:
    df_cds = ds.to_dataframe() 

### Averaging on time values
df_cds = df_cds.groupby(['latitude', 'longitude']).mean()
print(df_cds.shape)
df_cds.head()

(50, 22)


Unnamed: 0_level_0,Unnamed: 1_level_0,d2m,t2m,fal,lai_hv,lai_lv,src,skt,stl1,stl2,stl3,...,ssr,str,sp,ssrd,strd,tp,swvl1,swvl2,swvl3,swvl4
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
44.400002,3.6,277.496429,282.621643,0.164273,2.267111,2.342778,9.9e-05,282.336792,282.78891,282.752533,282.685852,...,12423977.0,-5355694.0,90546.03125,14748765.0,25769978.0,0.003235,0.337979,0.340152,0.33981,0.383521
44.400002,3.7,276.819,282.129486,0.159346,2.166003,0.20147,8.6e-05,281.9758,282.352112,282.322357,282.262207,...,12497523.0,-5225019.0,88918.429688,14803388.0,25742658.0,0.003461,0.349543,0.348219,0.340912,0.386739
44.400002,3.8,276.792114,282.434509,0.148068,2.275795,1.400902,8.7e-05,282.317017,282.718201,282.685242,282.616516,...,12682524.0,-5258744.5,89169.421875,14865689.0,25856780.0,0.003594,0.336781,0.334866,0.327907,0.376442
44.400002,3.9,277.34259,283.359924,0.133727,2.355388,2.371155,8.8e-05,283.230286,283.653351,283.614807,283.530273,...,12922853.0,-5481798.5,90912.835938,14932132.0,26032944.0,0.003697,0.328119,0.326292,0.321483,0.373128
44.400002,4.0,278.408569,284.801941,0.118431,2.329728,2.962471,8.7e-05,284.62558,285.081451,285.035248,284.925598,...,13186053.0,-5891225.5,93966.617188,15000342.0,26236940.0,0.003784,0.316781,0.315108,0.313654,0.368169


## Land Dataset

In [4]:
df_land = pd.read_csv('./france_land.csv')
df_land = df_land.drop(columns=['Unnamed: 0'])

df_land = df_land.loc[(df_land['latitude'] >= lat[0]) & (df_land['latitude'] <= lat[1])  
                        & (df_land['longitude'] >= lon[0]) & (df_land['longitude'] <= lon[1])]
df_land = df_land.set_index(['latitude', 'longitude'])
df_land

Unnamed: 0_level_0,Unnamed: 1_level_0,NDVI,ALBH-AL-BH-NI,ALBH-LMK,ALBH-AL-BH-VI,ALBH-NMOD,ALBH-AL-BH-BB,ALBH-AL-BH-QFLAG,ALBH-AL-BH-BB-ERR,ALBH-AL-BH-VI-ERR,ALBH-AL-BH-NI-ERR,...,TOCR-REF-NOR-RED-ERR,TOCR-REF-NOR-BLUE,TOCR-REF-NOR-BLUE-ERR,TOCR-REF-NOR-NIR,TOCR-SZN,TOCR-REF-NOR-NIR-ERR,TOCR-REF-NOR-SWIR,TOCR-REF-NOR-RED,VCI_x,VCI_y
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
44.40,3.6,6.211765,1208.0,9.0,965.0,213.0,2276.0,3527.0,4.0,845.0,470.0,...,59.0,624.0,187.0,575.0,105.0,27.0,16.0,8.0,,182.0
44.41,3.6,5.976471,1221.0,5.5,973.5,214.5,1995.0,3118.5,2.0,861.5,353.0,...,38.5,578.0,156.5,557.0,105.0,25.5,25.5,7.0,,203.5
44.42,3.6,5.400000,1232.0,4.0,981.0,202.0,1959.0,3059.0,0.0,881.0,359.0,...,39.0,542.0,150.0,522.0,105.0,26.0,6.0,8.0,,202.0
44.43,3.6,5.682353,1273.0,3.0,1011.0,204.0,1407.0,2185.0,0.0,955.0,285.0,...,12.0,322.0,80.0,391.0,105.0,22.0,21.0,3.0,,217.0
44.44,3.6,6.429412,1268.0,3.0,1007.0,204.0,1415.0,2177.0,0.0,947.0,302.0,...,9.0,292.0,75.0,398.0,105.0,23.0,9.0,3.0,,182.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44.76,4.5,6.638235,1209.5,4.0,967.5,204.0,1686.5,2622.5,0.0,853.0,337.0,...,13.0,285.0,55.0,458.0,105.0,25.0,9.0,3.0,,250.0
44.77,4.5,6.357353,1217.0,4.0,971.5,202.0,1739.5,2675.0,0.0,861.5,382.0,...,18.5,367.5,66.0,499.0,105.0,30.5,13.0,3.0,,250.0
44.78,4.5,6.226471,1219.0,4.0,973.0,202.0,1855.0,2860.5,0.0,863.0,403.5,...,22.5,440.5,88.0,605.5,105.0,33.0,16.0,2.5,,154.0
44.79,4.5,5.955882,1211.5,4.5,968.0,202.0,1815.0,2802.5,0.0,855.0,399.5,...,21.5,364.0,82.5,501.5,105.0,28.0,7.0,4.5,,152.0


### Handling null values

By counting the null values for each column we can see that some of them are null in almost all of our dataset.

In [5]:
df_land_null_cols = df_land.isnull().sum()
print("Unique number of null values per columns: ", df_land_null_cols.unique())
print("Number of columns with null count >= 3000: ", len(df_land_null_cols[df_land_null_cols >= 15000].index))

null_cols = df_land_null_cols[df_land_null_cols >= 3000].index

df_land = df_land.drop(columns=null_cols)

Unique number of null values per columns:  [   0 3542 3431]
Number of columns with null count >= 3000:  0


As for the columns, some row have most of their values null

In [6]:
df_land_null_rows = df_land.isnull().sum(axis=1)
print("Unique number of null values per rows: ", df_land_null_rows.unique())
print("Number of rows with null count >= 60: ", df_land_null_rows[df_land_null_rows >= 60].count())
null_rows = df_land_null_rows[df_land_null_rows >= 60].index
df_land = df_land.drop(null_rows)

Unique number of null values per rows:  [0]
Number of rows with null count >= 60:  0


In [7]:
df_land

Unnamed: 0_level_0,Unnamed: 1_level_0,NDVI,ALBH-AL-BH-NI,ALBH-LMK,ALBH-AL-BH-VI,ALBH-NMOD,ALBH-AL-BH-BB,ALBH-AL-BH-QFLAG,ALBH-AL-BH-BB-ERR,ALBH-AL-BH-VI-ERR,ALBH-AL-BH-NI-ERR,...,TOCR-REF-NOR-SWIR-ERR,TOCR-REF-NOR-RED-ERR,TOCR-REF-NOR-BLUE,TOCR-REF-NOR-BLUE-ERR,TOCR-REF-NOR-NIR,TOCR-SZN,TOCR-REF-NOR-NIR-ERR,TOCR-REF-NOR-SWIR,TOCR-REF-NOR-RED,VCI_y
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
44.40,3.6,6.211765,1208.0,9.0,965.0,213.0,2276.0,3527.0,4.0,845.0,470.0,...,4.0,59.0,624.0,187.0,575.0,105.0,27.0,16.0,8.0,182.0
44.41,3.6,5.976471,1221.0,5.5,973.5,214.5,1995.0,3118.5,2.0,861.5,353.0,...,4.0,38.5,578.0,156.5,557.0,105.0,25.5,25.5,7.0,203.5
44.42,3.6,5.400000,1232.0,4.0,981.0,202.0,1959.0,3059.0,0.0,881.0,359.0,...,0.0,39.0,542.0,150.0,522.0,105.0,26.0,6.0,8.0,202.0
44.43,3.6,5.682353,1273.0,3.0,1011.0,204.0,1407.0,2185.0,0.0,955.0,285.0,...,0.0,12.0,322.0,80.0,391.0,105.0,22.0,21.0,3.0,217.0
44.44,3.6,6.429412,1268.0,3.0,1007.0,204.0,1415.0,2177.0,0.0,947.0,302.0,...,0.0,9.0,292.0,75.0,398.0,105.0,23.0,9.0,3.0,182.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44.76,4.5,6.638235,1209.5,4.0,967.5,204.0,1686.5,2622.5,0.0,853.0,337.0,...,2.0,13.0,285.0,55.0,458.0,105.0,25.0,9.0,3.0,250.0
44.77,4.5,6.357353,1217.0,4.0,971.5,202.0,1739.5,2675.0,0.0,861.5,382.0,...,4.0,18.5,367.5,66.0,499.0,105.0,30.5,13.0,3.0,250.0
44.78,4.5,6.226471,1219.0,4.0,973.0,202.0,1855.0,2860.5,0.0,863.0,403.5,...,2.0,22.5,440.5,88.0,605.5,105.0,33.0,16.0,2.5,154.0
44.79,4.5,5.955882,1211.5,4.5,968.0,202.0,1815.0,2802.5,0.0,855.0,399.5,...,2.0,21.5,364.0,82.5,501.5,105.0,28.0,7.0,4.5,152.0


## Merge of Climate and Land dataset

In [24]:
# The combined index is composed as (latitude*100)(longitude*100)
# x*yyyyy 
# where x* is a variable length number representing latitude*100
# and yyyyy is a 5 digits longitude*100 with zero padding

combine_index = (lambda x: int(str(int(x[0]*100))+str(int(x[1]*100)).zfill(5)))
df1 = df_cds.copy()
df2 = df_land.copy()

df1.index = df1.index.map(combine_index)
df1.index.name = "latitude-longitude"
df2.index = df2.index.map(combine_index)
df2.index.name = "latitude-longitude"

pd.merge_asof(df2.sort_values(['latitude_', df1, on='latitude-longitude', allow_exact_matches=False)

ValueError: left keys must be sorted