# Testing a simple Decision Tree Regressor on the NC data

Trying to predict **leaf area index, high vegetation** (lai_hv) 

In [1]:
import os
import sys
import cdsapi
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import netCDF4 as nc
import numpy as np
import xarray as xa
import pandas as pd
import glob

module_path = os.path.abspath(os.path.join('./get_dataset'))
if module_path not in sys.path:
    sys.path.append(module_path)

import dataset_api as api

In [2]:
lat = [44.4, 44.8]
lon = [3.6, 4.5]
area_france = [lat[1], lon[0], lat[0], lon[1]]

folder = '.'
path = folder + '/ERA5_land.nc'
#api.get_era5_land(path)


In [3]:
with xa.open_mfdataset(path) as ds:
    df_cds = ds.to_dataframe() 

### Averaging on time values
df_cds = df_cds.groupby(['latitude', 'longitude']).mean()
print(df_cds.shape)
df_cds.head()

(50, 22)


Unnamed: 0_level_0,Unnamed: 1_level_0,d2m,t2m,fal,lai_hv,lai_lv,src,skt,stl1,stl2,stl3,...,ssr,str,sp,ssrd,strd,tp,swvl1,swvl2,swvl3,swvl4
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
44.400002,3.6,277.496429,282.621643,0.164273,2.267111,2.342778,9.9e-05,282.336792,282.78891,282.752533,282.685852,...,12423977.0,-5355694.0,90546.03125,14748765.0,25769978.0,0.003235,0.337979,0.340152,0.33981,0.383521
44.400002,3.7,276.819,282.129486,0.159346,2.166003,0.20147,8.6e-05,281.9758,282.352112,282.322357,282.262207,...,12497523.0,-5225019.0,88918.429688,14803388.0,25742658.0,0.003461,0.349543,0.348219,0.340912,0.386739
44.400002,3.8,276.792114,282.434509,0.148068,2.275795,1.400902,8.7e-05,282.317017,282.718201,282.685242,282.616516,...,12682524.0,-5258744.5,89169.421875,14865689.0,25856780.0,0.003594,0.336781,0.334866,0.327907,0.376442
44.400002,3.9,277.34259,283.359924,0.133727,2.355388,2.371155,8.8e-05,283.230286,283.653351,283.614807,283.530273,...,12922853.0,-5481798.5,90912.835938,14932132.0,26032944.0,0.003697,0.328119,0.326292,0.321483,0.373128
44.400002,4.0,278.408569,284.801941,0.118431,2.329728,2.962471,8.7e-05,284.62558,285.081451,285.035248,284.925598,...,13186053.0,-5891225.5,93966.617188,15000342.0,26236940.0,0.003784,0.316781,0.315108,0.313654,0.368169


## Land Dataset

In [4]:
df_land = pd.read_csv('./france_land.csv')
df_land = df_land.drop(columns=['Unnamed: 0'])
df_land.rename(columns = {'latitude':'not_latitude', 'longitude':'not_longitude'}, inplace = True)
df_land.rename(columns = {'not_latitude':'longitude', 'not_longitude':'latitude'}, inplace = True) 

df_land = df_land.loc[(df_land['latitude'] >= lat[0]) & (df_land['latitude'] <= lat[1]) 
                        & (df_land['longitude'] >= lon[0]) & (df_land['longitude'] <= lon[1])]

df_land = df_land.set_index(['latitude', 'longitude'])
df_land = df_land.drop(columns=['DMP300-RT0-QFLAG', 'GDMP300-RT0-QFLAG', 'FAPAR300-RT0-NOBS', 
                                'FAPAR300-RT0-LENGTH-AFTER','FAPAR300-RT0-QFLAG', 'FAPAR300-RT0-RMSE',
                               'FAPAR300-RT0-LENGTH-BEFORE', 'FCOVER300-RT0-QFLAG','FCOVER300-RT0-LENGTH-AFTER',
                                'FCOVER300-RT0-NOBS', 'FCOVER300-RT0-LENGTH-BEFORE','FCOVER300-RT0-RMSE', 
                               'LAI300-RT0-LENGTH-BEFORE','LAI300-RT0-LENGTH-AFTER', 'LAI300-RT0-NOBS',
                                'LAI300-RT0-RMSE', 'LAI300-RT0-QFLAG','SWI1km-QFLAG-015', 'SWI1km-QFLAG-020',
                                'SWI1km-QFLAG-060', 'SWI1km-QFLAG-010', 'SWI1km-SSF',
                                'SWI1km-QFLAG-005', 'SWI1km-QFLAG-100', 'SWI1km-QFLAG-040',
                                'SWI1km-QFLAG-002', 'SSM1km-ssm-noise', 'LST-Q-FLAGS', 
                                'LST-PERCENT-PROC-PIXELS', 'LST-ERRORBAR-LST','LST-TIME-DELTA',
                               'ALDH-AL-DH-QFLAG', 'ALDH-NMOD', 'ALDH-LMK', 'ALDH-AL-DH-BB-ERR',
                                'ALDH-AL-DH-VI-ERR', 'ALDH-AL-DH-NI-ERR', 'ALBH-LMK',
                                'ALBH-NMOD', 'ALBH-AL-BH-QFLAG','ALBH-AL-BH-BB-ERR', 
                                'ALBH-AL-BH-VI-ERR', 'ALBH-AL-BH-NI-ERR', 
                               'TOCR-TOCR-QFLAG', 'TOCR-NMOD','TOCR-REF-NOR-SWIR-ERR', 
                                'TOCR-REF-NOR-RED-ERR','TOCR-REF-NOR-BLUE-ERR', 'TOCR-SZN',
                                'TOCR-REF-NOR-NIR-ERR', 'BA300-FDOB-DEKAD', 'BA300-BA-DEKAD',
                                'BA300-FDOB-SEASON', 'BA300-CP-DEKAD'])
df_land.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,NDVI,ALBH-AL-BH-NI,ALBH-AL-BH-VI,ALBH-AL-BH-BB,ALDH-AL-DH-BB,ALDH-AL-DH-VI,ALDH-AL-DH-NI,DMP300-RT0-DMP,FAPAR300-RT0-FAPAR,FCOVER300-RT0-FCOVER,...,SWI1km-SWI-010,SWI1km-SWI-060,SWI1km-SWI-015,SWI1km-SWI-020,TOCR-REF-NOR-BLUE,TOCR-REF-NOR-NIR,TOCR-REF-NOR-SWIR,TOCR-REF-NOR-RED,VCI_x,VCI_y
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
44.4,3.6,6.211765,1208.0,965.0,2276.0,3080.0,4.0,953.0,0.0,43.666667,0.0,...,130.0,122.0,124.0,198.0,624.0,575.0,16.0,8.0,,182.0
44.41,3.6,5.976471,1221.0,973.5,1995.0,2854.0,2.0,957.0,0.0,9.0,0.0,...,128.0,119.0,122.0,198.0,578.0,557.0,25.5,7.0,,203.5
44.42,3.6,5.4,1232.0,981.0,1959.0,2687.0,0.0,961.0,0.0,1.0,0.0,...,129.0,120.0,123.0,198.0,542.0,522.0,6.0,8.0,,202.0
44.43,3.6,5.682353,1273.0,1011.0,1407.0,1821.0,0.0,980.0,0.0,33.0,0.0,...,129.0,121.0,123.0,198.0,322.0,391.0,21.0,3.0,,217.0
44.44,3.6,6.429412,1268.0,1007.0,1415.0,1799.0,0.0,978.0,0.0,33.0,0.0,...,128.0,122.0,124.0,198.0,292.0,398.0,9.0,3.0,,182.0


### Handling null values

By counting the null values for each column we can see that some of them are null in almost all of our dataset.

In [5]:
df_land_null_cols = df_land.isnull().sum()
print("Unique number of null values per columns: ", df_land_null_cols.unique())
print("Number of columns with null count >= 3000: ", len(df_land_null_cols[df_land_null_cols >= 15000].index))

null_cols = df_land_null_cols[df_land_null_cols >= 3000].index

df_land = df_land.drop(columns=null_cols)

Unique number of null values per columns:  [   0 3542 3431]
Number of columns with null count >= 3000:  0


As for the columns, some row have most of their values null

In [6]:
df_land_null_rows = df_land.isnull().sum(axis=1)
print("Unique number of null values per rows: ", df_land_null_rows.unique())
print("Number of rows with null count >= 60: ", df_land_null_rows[df_land_null_rows >= 60].count())
null_rows = df_land_null_rows[df_land_null_rows >= 60].index
df_land = df_land.drop(null_rows)

Unique number of null values per rows:  [0]
Number of rows with null count >= 60:  0


In [7]:
df_land

Unnamed: 0_level_0,Unnamed: 1_level_0,NDVI,ALBH-AL-BH-NI,ALBH-AL-BH-VI,ALBH-AL-BH-BB,ALDH-AL-DH-BB,ALDH-AL-DH-VI,ALDH-AL-DH-NI,DMP300-RT0-DMP,FAPAR300-RT0-FAPAR,FCOVER300-RT0-FCOVER,...,SWI1km-SWI-005,SWI1km-SWI-010,SWI1km-SWI-060,SWI1km-SWI-015,SWI1km-SWI-020,TOCR-REF-NOR-BLUE,TOCR-REF-NOR-NIR,TOCR-REF-NOR-SWIR,TOCR-REF-NOR-RED,VCI_y
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
44.40,3.6,6.211765,1208.0,965.0,2276.0,3080.0,4.0,953.0,0.0,43.666667,0.0,...,175.0,130.0,122.0,124.0,198.0,624.0,575.0,16.0,8.0,182.0
44.41,3.6,5.976471,1221.0,973.5,1995.0,2854.0,2.0,957.0,0.0,9.000000,0.0,...,175.0,128.0,119.0,122.0,198.0,578.0,557.0,25.5,7.0,203.5
44.42,3.6,5.400000,1232.0,981.0,1959.0,2687.0,0.0,961.0,0.0,1.000000,0.0,...,175.0,129.0,120.0,123.0,198.0,542.0,522.0,6.0,8.0,202.0
44.43,3.6,5.682353,1273.0,1011.0,1407.0,1821.0,0.0,980.0,0.0,33.000000,0.0,...,175.0,129.0,121.0,123.0,198.0,322.0,391.0,21.0,3.0,217.0
44.44,3.6,6.429412,1268.0,1007.0,1415.0,1799.0,0.0,978.0,0.0,33.000000,0.0,...,175.0,128.0,122.0,124.0,198.0,292.0,398.0,9.0,3.0,182.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44.76,4.5,6.638235,1209.5,967.5,1686.5,2231.0,0.0,954.5,0.0,97.000000,0.0,...,252.0,252.0,252.0,252.0,252.0,285.0,458.0,9.0,3.0,250.0
44.77,4.5,6.357353,1217.0,971.5,1739.5,2359.0,0.0,956.5,0.0,73.000000,0.0,...,252.0,252.0,252.0,252.0,252.0,367.5,499.0,13.0,3.0,250.0
44.78,4.5,6.226471,1219.0,973.0,1855.0,2568.0,0.0,957.0,0.0,1.000000,0.0,...,252.0,252.0,252.0,252.0,252.0,440.5,605.5,16.0,2.5,154.0
44.79,4.5,5.955882,1211.5,968.0,1815.0,2431.5,0.0,955.0,0.0,81.000000,0.0,...,213.5,174.0,172.0,172.5,225.0,364.0,501.5,7.0,4.5,152.0


## Merge of Climate and Land dataset

In [8]:
# The combined index is composed as (latitude*100)(longitude*100)
# x*yyyyy 
# where x* is a variable length number representing latitude*100
# and yyyyy is a 5 digits longitude*100 with zero padding

combine_index = (lambda x: int(str(int(x[0]*100))+str(int(x[1]*100)).zfill(5)))
df1 = df_cds.copy()
df2 = df_land.copy()

df1.index = df1.index.map(combine_index)
df1.index.name = "latitude-longitude"
df2.index = df2.index.map(combine_index)
df2.index.name = "latitude-longitude"

df = pd.merge_asof(df2.sort_values(['latitude-longitude']), df1, on='latitude-longitude', allow_exact_matches=False)


## Occurrences

In [9]:
df_efa = pd.read_csv("./EEA_latLon_france.csv", index_col=['latitude', 'longitude'])
df_efa = df_efa.drop(columns=['Unnamed: 0'])
df_efa

Unnamed: 0_level_0,Unnamed: 1_level_0,endemic_percentage
latitude,longitude,Unnamed: 2_level_1
44.92,3.43,0.433788
44.92,3.44,0.431838
44.92,3.45,0.280900
44.92,3.47,0.538315
44.92,3.48,0.435600
...,...,...
44.36,4.60,0.236203
44.36,4.61,0.161611
44.36,4.62,0.031687
44.37,4.64,0.026899


In [10]:
# Concatenate all data into one DataFrame
df3 = df_efa.copy()

df3.index = df3.index.map(combine_index)
df3.index.name = "latitude-longitude"

df = pd.merge_asof(df.sort_values(['latitude-longitude']), df3.sort_values(['latitude-longitude']), on='latitude-longitude', allow_exact_matches=False)

df

Unnamed: 0,latitude-longitude,NDVI,ALBH-AL-BH-NI,ALBH-AL-BH-VI,ALBH-AL-BH-BB,ALDH-AL-DH-BB,ALDH-AL-DH-VI,ALDH-AL-DH-NI,DMP300-RT0-DMP,FAPAR300-RT0-FAPAR,...,str,sp,ssrd,strd,tp,swvl1,swvl2,swvl3,swvl4,endemic_percentage
0,444000360,6.211765,1208.0,965.0,2276.0,3080.0,4.0,953.0,0.0,43.666667,...,-5355694.0,90546.031250,14748765.0,25769978.0,0.003235,0.337979,0.340152,0.339810,0.383521,0.058482
1,444000361,5.905882,1208.0,965.0,2249.0,3037.0,0.0,953.0,0.0,25.000000,...,-5355694.0,90546.031250,14748765.0,25769978.0,0.003235,0.337979,0.340152,0.339810,0.383521,0.022500
2,444000362,5.364706,1207.0,965.0,2341.0,3058.0,4.0,953.0,0.0,11.666667,...,-5355694.0,90546.031250,14748765.0,25769978.0,0.003235,0.337979,0.340152,0.339810,0.383521,0.022500
3,444000363,8.061765,1215.0,970.0,2243.0,2922.0,4.0,955.0,0.0,1.000000,...,-5355694.0,90546.031250,14748765.0,25769978.0,0.003235,0.337979,0.340152,0.339810,0.383521,0.022500
4,444000364,8.039216,1238.0,985.0,2150.0,2922.0,4.0,963.0,0.0,11.666667,...,-5355694.0,90546.031250,14748765.0,25769978.0,0.003235,0.337979,0.340152,0.339810,0.383521,0.022500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3726,448000446,6.214706,1195.0,958.0,1984.0,2672.0,0.0,950.0,0.0,81.000000,...,-5346575.0,93770.703125,14911973.0,26671668.0,0.003713,0.334163,0.334872,0.330649,0.373588,0.159947
3727,448000447,7.054412,1194.0,958.0,1769.0,2327.0,4.0,951.0,0.0,55.000000,...,-5346575.0,93770.703125,14911973.0,26671668.0,0.003713,0.334163,0.334872,0.330649,0.373588,0.062683
3728,448000448,6.297059,1197.0,960.0,1860.0,2482.0,4.0,951.0,0.0,81.000000,...,-5346575.0,93770.703125,14911973.0,26671668.0,0.003713,0.334163,0.334872,0.330649,0.373588,0.054025
3729,448000449,6.232353,1201.0,962.0,2114.0,2865.0,4.0,953.0,0.0,89.000000,...,-5346575.0,93770.703125,14911973.0,26671668.0,0.003713,0.334163,0.334872,0.330649,0.373588,0.042361


In [30]:
# Data preprocessing 
import numpy as np

xx = df.drop(columns=['latitude-longitude'])
Nr, Nf = xx.shape
print("Number of rows in the original dataset: ", Nr)
print("Number of features in the original dataset: ", Nf)
#normalize the dataset
mm = xx.mean()
ss = xx.std()
xx = (xx-mm)/ss

xx=xx.dropna(thresh=20)
xx.head()

Number of rows in the original dataset:  3731
Number of features in the original dataset:  49


Unnamed: 0,NDVI,ALBH-AL-BH-NI,ALBH-AL-BH-VI,ALBH-AL-BH-BB,ALDH-AL-DH-BB,ALDH-AL-DH-VI,ALDH-AL-DH-NI,DMP300-RT0-DMP,FAPAR300-RT0-FAPAR,FCOVER300-RT0-FCOVER,...,str,sp,ssrd,strd,tp,swvl1,swvl2,swvl3,swvl4,endemic_percentage
0,-0.082276,-0.126732,-0.126791,-0.071622,-0.014174,-0.065112,-0.108296,-0.044915,0.151631,,...,1.586772,-2.443986,-2.110462,-3.063802,-0.649752,1.585411,1.672264,1.784841,1.652932,-0.739814
1,-0.568761,-0.126732,-0.126791,-0.075234,-0.020744,-0.146225,-0.108296,-0.044915,-0.386755,,...,1.586772,-2.443986,-2.110462,-3.063802,-0.649752,1.585411,1.672264,1.784841,1.652932,-0.993416
2,-1.429466,-0.126865,-0.126791,-0.062925,-0.017536,-0.065112,-0.108296,-0.044915,-0.771317,,...,1.586772,-2.443986,-2.110462,-3.063802,-0.649752,1.585411,1.672264,1.784841,1.652932,-0.993416
3,2.860027,-0.125805,-0.126131,-0.076037,-0.038314,-0.065112,-0.107997,-0.044915,-1.078966,,...,1.586772,-2.443986,-2.110462,-3.063802,-0.649752,1.585411,1.672264,1.784841,1.652932,-0.993416
4,2.824164,-0.122758,-0.124151,-0.08848,-0.038314,-0.065112,-0.106803,-0.044915,-0.771317,,...,1.586772,-2.443986,-2.110462,-3.063802,-0.649752,1.585411,1.672264,1.784841,1.652932,-0.993416


## Running basic ml models to test dataset

In [26]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

regressor_index = 'FAPAR300-RT0-FAPAR'

#X = pd.DataFrame(x_scaled)
y = df[regressor_index]
X = df.drop(columns=[regressor_index, 'endemic_percentage'])
X = min_max_scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(2984, 48) (2984,)
(747, 48) (747,)


In [27]:
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 

svm_model = svm.SVR().fit(X_train, y_train)
tree_model = DecisionTreeRegressor().fit(X_train, y_train)
linear_model = LinearRegression(fit_intercept = True, normalize = True, copy_X = True, n_jobs = 2).fit(X_train, y_train)
rand_forest_model = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)

print("SVM score: ", svm_model.score(X_test,y_test))
print("Tree score: " , tree_model.score(X_test,y_test))
print("Linear Regression score: " , linear_model.score(X_test,y_test))
print("Random forest score: ", rand_forest_model.score(X_test,y_test))
print("-"*50)
print("SVM cross validation: ", np.mean(cross_val_score(svm_model, X, y, cv=10)))
print("Tree cross validation: ", np.mean(cross_val_score(tree_model, X, y, cv=10)))
print("Linear Regression cross validation: ", np.mean(cross_val_score(linear_model, X, y, cv=10)))
print("Random forest cross validation: ", np.mean(cross_val_score(rand_forest_model, X, y, cv=10)))

SVM score:  0.9652012155386002
Tree score:  0.9997194788205606
Linear Regression score:  0.999854647074278
Random forest score:  0.9997298536522425
--------------------------------------------------
SVM cross validation:  0.9710681620465709
Tree cross validation:  0.9997268345187431
Linear Regression cross validation:  0.9990963034639219
Random forest cross validation:  0.9997537985386108


In [118]:
X[:, 1]

array([0.00041955, 0.00041955, 0.00040402, ..., 0.00024862, 0.00031078,
       0.00056718])