# Testing a simple Decision Tree Regressor on the ERA5 NetCDF data

The goal is to predict the **leaf area index of high vegetation** (`lai_hv`) from the other variables in the dataset.

In [1]:
import os
import sys
import cdsapi
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import netCDF4 as nc
import numpy as np
import xarray as xa

# Put the local ``get_dataset`` folder on the import path (only once) so the
# project module imported right after this can be found.
module_path = os.path.abspath('./get_dataset')
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

import dataset_api as api

#api.get_era5_data()

In [2]:
# Directory holding the ERA5 NetCDF files. The loading cells build paths by
# plain string concatenation (folder + '<file>.nc'), so keep the trailing slash.
folder = './'

In [3]:
# Load the ERA5-Land file and convert the whole dataset to a DataFrame.
nc_path = folder + 'ERA5_land.nc'

with xa.open_mfdataset(nc_path) as ds:
    print(ds.coords)
    df_land = ds.to_dataframe()

# Diagnostics are kept live so this cell matches its recorded output and the
# other loading cells: frame shape first, then the NaN count per column.
print(df_land.shape)
df_land.isnull().sum()





Coordinates:
  * longitude  (longitude) float32 10.0 10.1 10.2 10.3 ... 29.7 29.8 29.9 30.0
  * latitude   (latitude) float32 70.0 69.9 69.8 69.7 ... 35.3 35.2 35.1 35.0
  * time       (time) datetime64[ns] 2019-01-01 2019-02-01 ... 2019-12-01
(846612, 25)


u10       244944
v10       244944
d2m       244944
t2m       244944
fal       244944
lai_hv    244944
lai_lv    244944
src       244944
skt       244944
stl1      244944
stl2      244944
stl3      244944
stl4      244944
slhf      244944
ssr       244944
str       244944
sp        244944
sshf      244944
ssrd      244944
strd      244944
tp        244944
swvl1     244944
swvl2     244944
swvl3     244944
swvl4     244944
dtype: int64

In [4]:
# ERA5 pressure-level data: open the file, show its coordinates, and
# convert the dataset to a DataFrame.
nc_path = f'{folder}ERA5_pressure.nc'

with xa.open_mfdataset(nc_path) as ds:
    print(ds.coords)
    df_pressure = ds.to_dataframe()

# Shape first, then the number of missing values in each column.
print(df_pressure.shape)
df_pressure.isna().sum()

Coordinates:
  * longitude  (longitude) float32 10.0 10.25 10.5 10.75 ... 29.5 29.75 30.0
  * latitude   (latitude) float32 70.0 69.75 69.5 69.25 ... 35.5 35.25 35.0
  * level      (level) int32 1 2 3 5 7 10 20 30 ... 850 875 900 925 950 975 1000
  * time       (time) datetime64[ns] 2019-01-01 2019-02-01 ... 2019-12-01
(5070924, 3)


q       0
crwc    0
t       0
dtype: int64

In [6]:
# Main ERA5 file used for modelling: open it, print the coordinate axes,
# and turn the dataset into a DataFrame.
nc_path = f'{folder}ERA5_data.nc'

with xa.open_mfdataset(nc_path) as ds:
    print(ds.coords)
    df_data = ds.to_dataframe()

# Report the frame's dimensions, then count missing values per column.
print(df_data.shape)
df_data.isna().sum()

Coordinates:
  * longitude  (longitude) float32 10.0 10.25 10.5 10.75 ... 29.5 29.75 30.0
  * latitude   (latitude) float32 70.0 69.75 69.5 69.25 ... 35.5 35.25 35.0
  * time       (time) datetime64[ns] 2019-01-01 2019-02-01 ... 2019-12-01
(137052, 25)


d2m       0
t2m       0
cvh       0
lai_hv    0
lai_lv    0
cvl       0
mer       0
mslhf     0
mtpr      0
skt       0
stl1      0
stl2      0
stl3      0
stl4      0
slt       0
ssr       0
sp        0
ssrd      0
tcc       0
tvh       0
tvl       0
swvl1     0
swvl2     0
swvl3     0
swvl4     0
dtype: int64

In [7]:
# Working with ERA5_data

# Drop incomplete rows. The null counts recorded for this file are all zero,
# so this is currently a no-op, but it keeps the cell safe if the input changes.
dataset = df_data.dropna()

# Target: leaf area index, high vegetation. Features: every other column.
y = dataset['lai_hv']
X = dataset.drop(columns=['lai_hv'])

# Pin the seed so the 80/20 split -- and therefore the scores computed from
# it -- are identical on every Restart & Run All.
# NOTE(review): rows are shuffled individually, so samples from the same
# location or month can end up on both sides of the split; confirm that this
# is the evaluation you want.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [8]:
# Fit a regression tree with default (unconstrained) settings. random_state
# is pinned because scikit-learn permutes the features at every split, so
# equally good splits could otherwise be chosen differently between runs.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# For regressors, .score() is the coefficient of determination (R^2).
print("Prediction score on train: ", model.score(X_train, y_train))
print("Prediction score on test: ", model.score(X_test, y_test))

Prediction score on train:  0.9999999973453197
Prediction score on test:  0.9606260359420854


In [9]:
# Summarise the modelling frame: index type, column dtypes and non-null counts.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 137052 entries, (70.0, 10.0, Timestamp('2019-01-01 00:00:00')) to (35.0, 30.0, Timestamp('2019-12-01 00:00:00'))
Data columns (total 25 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   d2m     137052 non-null  float32
 1   t2m     137052 non-null  float32
 2   cvh     137052 non-null  float32
 3   lai_hv  137052 non-null  float32
 4   lai_lv  137052 non-null  float32
 5   cvl     137052 non-null  float32
 6   mer     137052 non-null  float32
 7   mslhf   137052 non-null  float32
 8   mtpr    137052 non-null  float32
 9   skt     137052 non-null  float32
 10  stl1    137052 non-null  float32
 11  stl2    137052 non-null  float32
 12  stl3    137052 non-null  float32
 13  stl4    137052 non-null  float32
 14  slt     137052 non-null  float32
 15  ssr     137052 non-null  float32
 16  sp      137052 non-null  float32
 17  ssrd    137052 non-null  float32
 18  tcc     137052 non-null  float32
 19  t