# Exploratory Data Analysis

In [36]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [37]:
# Define settings
import os

# Render floats with three decimals in displayed frames.
pd.set_option("display.precision", 3)

# Folder holding the .rds inputs, resolved against the current working
# directory (trailing slash kept so file names can be appended directly).
data_dir_path = f"{os.getcwd()}/dataverse_files/"

In [38]:
# Importing data
import pyreadr

# read_r returns a dict-like result; popitem()[1] takes the value of the entry
# it pops, which is the frame stored in the .rds file.
croissance_rds = pyreadr.read_r(data_dir_path + "croissance_et_climat_decadaires.rds")
croissance_et_climat_decadaires = croissance_rds.popitem()[1]

valorisation_rds = pyreadr.read_r(data_dir_path + "valorisation_annuelle.rds")
valorisation_annuelle = valorisation_rds.popitem()[1]

## Analysis of growths and decadal climates

In [109]:
# Work on the decadal growth/climate frame under the short name `df`.
# This binds a second name to the same object; no copy is made here.
df = croissance_et_climat_decadaires

In [110]:
# Preview the first rows of the decadal frame.
df.head()

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,789,2131,330343,tp3,20,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573,0.032
1,789,2131,330343,tp3,20,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188,0.0
2,789,2131,330343,tp3,20,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772,0.046
3,789,2131,330343,tp3,20,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195,0.075
4,789,2131,330343,tp3,20,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562,0.015


In [111]:
# Size, column labels and per-column dtypes of the decadal frame.
print(df.shape)
print(df.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() only appended a stray "None" line.
df.info()

(18693829, 14)
Index(['ucs', 'safran', 'sol', 'type_de_prairie', 'gestion', 'annee', 'decade',
       'Tmin', 'Tmax', 'Tmoy', 'Rain', 'RG', 'im', 'croissance'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18693829 entries, 0 to 18693828
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   ucs              int32  
 1   safran           int32  
 2   sol              int32  
 3   type_de_prairie  object 
 4   gestion          int32  
 5   annee            float64
 6   decade           float64
 7   Tmin             float64
 8   Tmax             float64
 9   Tmoy             float64
 10  Rain             float64
 11  RG               float64
 12  im               float64
 13  croissance       float64
dtypes: float64(9), int32(4), object(1)
memory usage: 1.7+ GB
None


In [112]:
# Summary statistics for every column; include="all" also reports the
# non-numeric column (count / unique / top / freq).
df.describe(include= "all")

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
count,18690000.0,18690000.0,18690000.0,18693829,18690000.0,18670000.0,18670000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18670000.0
unique,,,,3,,,,,,,,,,
top,,,,tp4,,,,,,,,,,
freq,,,,10563388,,,,,,,,,,
mean,1054.0,2523.0,330800.0,,13.62,1999.0,19.0,8.249,14.64,11.45,25.78,108.5,47.98,37.1
std,222.9,550.8,653.1,,9.407,8.626,10.68,4.365,5.607,4.893,24.99,65.81,48.97,30.66
min,640.0,1309.0,330300.0,,1.0,1984.0,1.0,-8.76,-5.38,-7.02,0.0,7.17,0.0,0.0
25%,888.0,2120.0,330600.0,,5.0,1991.0,10.0,5.04,10.43,7.855,6.8,46.84,11.44,10.9
50%,1080.0,2497.0,330600.0,,15.0,1999.0,19.0,8.29,14.25,11.18,18.9,104.3,32.97,28.12
75%,1222.0,2895.0,330600.0,,21.0,2006.0,28.0,11.91,19.1,15.54,37.2,161.5,68.76,59.97


In [113]:
# Pairwise correlation between the three identifier-like columns.
id_columns = ["ucs", "safran", "sol"]
df.loc[:, id_columns].corr()

Unnamed: 0,ucs,safran,sol
ucs,1.0,0.937,0.085
safran,0.937,1.0,0.032
sol,0.085,0.032,1.0


### Feature extraction

Pedo-Climatic Units (PCU) result from crossing the climatic information (SAFRAN grid point) with the soil information (UCS soil mapping units).

In [114]:
# Build the PCU identifier: one integer per distinct (ucs, safran) pair,
# numbered from 0 in order of first appearance in `df`.
pcu_keys = ["ucs", "safran"]

# Guard for re-running the cell: if "pcu" is already present, merging again
# would yield "pcu_x"/"pcu_y" and the pop("pcu") below would raise KeyError.
if "pcu" in df.columns:
    df = df.drop(columns="pcu")

# .assign() returns a new frame, so the id column is never written onto an
# intermediate selection (which can trigger SettingWithCopyWarning).
df_unique_pcu = (
    df[pcu_keys]
    .drop_duplicates(subset=pcu_keys)
    .assign(pcu=lambda pairs: range(len(pairs)))
)

# Attach the id to every row, then move it to the first column.
df = pd.merge(df, df_unique_pcu, on=pcu_keys)
df.insert(0, "pcu", df.pop("pcu"))
df.head()

Unnamed: 0,pcu,ucs,safran,sol,type_de_prairie,gestion,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,0,789,2131,330343,tp3,20,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573,0.032
1,0,789,2131,330343,tp3,20,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188,0.0
2,0,789,2131,330343,tp3,20,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772,0.046
3,0,789,2131,330343,tp3,20,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195,0.075
4,0,789,2131,330343,tp3,20,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562,0.015


In [115]:
# Same correlation check as before, now including the derived "pcu" id.
id_columns = ["ucs", "safran", "pcu", "sol"]
df.loc[:, id_columns].corr()

Unnamed: 0,ucs,safran,pcu,sol
ucs,1.0,0.937,0.432,0.085
safran,0.937,1.0,0.444,0.032
pcu,0.432,0.444,1.0,-0.085
sol,0.085,0.032,-0.085,1.0


### Feature selection

We initially thought that "sol" could be the equivalent of "pcu", i.e. the grassland identifier. Since it is not correlated with any other variable, we conclude that it is not an identifier.

"sol", "type_de_prairie" and "gestion" are not used as inputs of the model. Making the model learn the different inputs they have generated, without knowing them, will allow it to generalise.

In [116]:
# Open question kept from the original cell: could "sol" play the role of PCU?
# Columns retained for the rest of the analysis; "sol", "type_de_prairie" and
# "gestion" are left out.
valuable_columns = [
    "pcu", "ucs", "safran",
    "annee", "decade",
    "Tmin", "Tmax", "Tmoy", "Rain", "RG", "im",
    "croissance",
]
df = df.loc[:, valuable_columns]
df.head()

Unnamed: 0,pcu,ucs,safran,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,0,789,2131,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573,0.032
1,0,789,2131,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188,0.0
2,0,789,2131,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772,0.046
3,0,789,2131,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195,0.075
4,0,789,2131,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562,0.015


In [117]:
# Non-null row count per (pcu, annee, decade) group, averaged over all groups.
time_unit_keys = ["pcu", "annee", "decade"]
rows_per_time_unit = df.groupby(time_unit_keys).count()
rows_per_time_unit.mean()

ucs           14.129
safran        14.129
Tmin          14.049
Tmax          14.049
Tmoy          14.049
Rain          14.049
RG            14.049
im            14.049
croissance    14.129
dtype: float64

In [118]:
# Missing values per column. isnull() is an alias of isna(), so OR-ing the two
# only built a second full-size boolean frame without changing the result.
df.isna().sum()

pcu                0
ucs                0
safran             0
annee          20262
decade         20262
Tmin          126008
Tmax          126008
Tmoy          126008
Rain          126008
RG            126008
im            126008
croissance     20262
dtype: int64

In [121]:
# Drop every row containing a missing value. Rebinding the name instead of
# using inplace=True avoids mutating a frame that was obtained by column
# selection (a SettingWithCopyWarning source).
df = df.dropna()

In [122]:
# Confirm that no missing values remain. isnull() is an alias of isna(), so
# a single call is enough.
df.isna().sum()

pcu           0
ucs           0
safran        0
annee         0
decade        0
Tmin          0
Tmax          0
Tmoy          0
Rain          0
RG            0
im            0
croissance    0
dtype: int64

Now that we have removed the NaN and Null values, we need to check whether any decades are missing within the years. We can do this by checking that the number of rows per (year, pcu) pair is a multiple of 37, the number of decades in a year.

In [159]:
# Rows per (annee, pcu) pair, reduced modulo 37; a total of 0 means every
# pair holds a whole multiple of 37 rows.
decade_rows_per_year_pcu = df.groupby(["annee", "pcu"])[["decade"]].count()
decade_rows_per_year_pcu.mod(37).sum()

decade    0
dtype: int64

In [160]:
# Summary statistics of the frame after the rows with missing values were dropped.
df.describe()

Unnamed: 0,pcu,ucs,safran,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
count,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0
mean,635.0,1054.0,2523.0,1999.0,19.0,8.249,14.64,11.45,25.78,108.5,47.98,37.1
std,362.9,222.6,549.8,8.626,10.68,4.365,5.607,4.893,24.99,65.81,48.97,30.66
min,0.0,640.0,1309.0,1984.0,1.0,-8.76,-5.38,-7.02,0.0,7.17,0.0,0.0
25%,327.0,888.0,2120.0,1991.0,10.0,5.04,10.43,7.855,6.8,46.84,11.44,10.91
50%,630.0,1080.0,2497.0,1999.0,19.0,8.29,14.25,11.18,18.9,104.3,32.97,28.13
75%,955.0,1222.0,2895.0,2006.0,28.0,11.91,19.1,15.54,37.2,161.5,68.76,59.97
max,1240.0,1610.0,3886.0,2013.0,37.0,20.69,31.17,25.3,240.4,299.0,500.7,142.4


In [161]:
# Average number of rows sharing the same (pcu, annee, decade) key.
time_unit_keys = ["pcu", "annee", "decade"]
rows_per_time_unit = df.groupby(time_unit_keys).count()
rows_per_time_unit.mean()

ucs           14.121
safran        14.121
Tmin          14.121
Tmax          14.121
Tmoy          14.121
Rain          14.121
RG            14.121
im            14.121
croissance    14.121
dtype: float64

We have about 14 simulations per PCU and time unit.

### Reshape the dataframe

In [162]:
# Store the year and decade as integers.
# NOTE(review): the recorded run of this cell raised MemoryError while
# re-assigning the columns one by one. A single astype with copy=False casts
# both columns in one pass and asks pandas not to copy the untouched columns;
# confirm on the full dataset that this fits in memory.
df = df.astype({"annee": "int64", "decade": "int64"}, copy=False)

MemoryError: Unable to allocate 1.11 GiB for an array with shape (8, 18567821) and data type float64

In [18]:
# NOTE(review): in the recorded outputs `df` at this point has a "sol" column
# and 26,640 rows, which no cell above produces (the previous cell failed and
# "sol" was dropped earlier). The step that built this frame appears to be
# missing from the notebook; restore it so Restart & Run All works.
# `dfgroup` is a second name for the same object, not a copy.
dfgroup = df
dfgroup.head()

Unnamed: 0,sol,annee,decade,ucs,safran,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,330343.0,1984,1,1087.248,2764.314,4.381,8.732,6.557,29.082,29.056,65.077,2.332
1,330343.0,1984,2,1087.248,2764.314,4.122,9.221,6.671,45.877,25.793,101.702,2.153
2,330343.0,1984,3,1087.248,2764.314,3.227,8.322,5.774,50.489,41.205,118.454,2.057
3,330343.0,1984,4,1087.248,2764.314,6.725,10.603,8.664,28.634,38.003,56.752,5.781
4,330343.0,1984,5,1087.248,2764.314,-0.182,7.1,3.459,0.093,96.482,0.27,4.308


In [19]:
# Non-null count of each remaining column per (sol, annee) pair.
sol_year_keys = ["sol", "annee"]
dfgroup = dfgroup.groupby(by=sol_year_keys).count()
dfgroup.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,decade,ucs,safran,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
sol,annee,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
330343.0,1984,37,37,37,37,37,37,37,37,37,37
330343.0,1985,37,37,37,37,37,37,37,37,37,37
330343.0,1986,37,37,37,37,37,37,37,37,37,37
330343.0,1987,37,37,37,37,37,37,37,37,37,37
330343.0,1988,37,37,37,37,37,37,37,37,37,37


In [20]:
# Summary statistics of the reshaped decadal frame.
df.describe()

Unnamed: 0,sol,annee,decade,ucs,safran,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
count,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0
mean,331246.125,1998.5,19.0,1063.87,2483.716,8.327,14.68,11.503,25.194,108.521,46.673,36.23
std,965.872,8.656,10.677,163.982,420.579,4.288,5.509,4.816,22.844,65.111,44.659,28.119
min,330343.0,1984.0,1.0,685.734,1624.963,-7.09,-3.347,-5.218,0.0,8.594,0.0,0.0
25%,330506.5,1991.0,10.0,990.063,2228.658,5.137,10.493,7.938,7.571,47.539,12.721,12.119
50%,330598.5,1998.5,19.0,1047.695,2498.312,8.346,14.317,11.242,19.3,105.046,33.606,28.564
75%,332378.25,2006.0,28.0,1179.985,2747.613,11.97,19.156,15.583,36.414,160.547,67.378,56.632
max,332668.0,2013.0,37.0,1473.0,3565.5,19.325,30.33,24.566,186.776,297.139,376.817,129.817


In [21]:
# Number of distinct values in each column.
df.nunique()

sol              24
annee            30
decade           37
ucs             129
safran          138
Tmin          18793
Tmax          18958
Tmoy          19350
Rain          17683
RG            20035
im            19913
croissance    26566
dtype: int64

In [22]:
# Missing values per column. isnull() is an alias of isna(), so the former
# OR of the two was redundant.
df.isna().sum()

sol           0
annee         0
decade        0
ucs           0
safran        0
Tmin          0
Tmax          0
Tmoy          0
Rain          0
RG            0
im            0
croissance    0
dtype: int64

In [23]:
# Keep a named reference to the processed decadal frame before `df` is reused
# for the annual valuations (same object, no copy).
croissance_et_climat_decadaires_preprocessed = df

## Feature selection of annual valuations

In [24]:
# From here on `df` refers to the annual valuation frame (same object, no copy).
df = valorisation_annuelle

In [25]:
# Preview the first rows of the annual valuation frame.
df.head()

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,valorisation,RU,cumul_croissance
0,1005,2245,330562,tp3,15,1985.0,4.964,97.42,8.757
1,1005,2245,330562,tp3,15,1986.0,8.173,97.42,14.62
2,1005,2245,330562,tp3,15,1987.0,12.666,97.42,16.287
3,1005,2245,330562,tp3,15,1989.0,5.825,97.42,9.063
4,1005,2245,330562,tp3,15,1990.0,7.212,97.42,11.758


In [26]:
# (rows, columns) of the annual valuation frame.
df.shape

(535406, 9)

In [27]:
# Keep only the join keys ("sol", "annee") and the cumulated growth.
# Note that this rebinds `valuable_columns`, which held the decadal list before.
valuable_columns = ["sol", "annee", "cumul_croissance"]
df = valorisation_annuelle.loc[:, valuable_columns]

In [28]:
# Missing values per column of the annual frame. isnull() is an alias of
# isna(), so a single call is enough.
df.isna().sum()

sol                 0
annee               0
cumul_croissance    0
dtype: int64

### Reshape the dataframe

In [29]:
# Renumber the rows from 0 and store the year as an integer. The cast goes
# through astype() on a new frame rather than a list-key assignment
# (df[["annee"]] = ...), so the resulting dtype is explicit and the cell
# gives the same result when re-run.
df = df.reset_index(drop=True).astype({"annee": "int64"})

In [30]:
# Preview the annual frame after the index reset and the integer cast.
df.head()

Unnamed: 0,sol,annee,cumul_croissance
0,330562,1985,8.757
1,330562,1986,14.62
2,330562,1987,16.287
3,330562,1989,9.063
4,330562,1990,11.758


In [31]:
# Summary statistics of the three retained annual columns.
df.describe()

Unnamed: 0,sol,annee,cumul_croissance
count,535406.0,535406.0,535406.0
mean,330824.012,1998.691,13.771
std,652.279,8.625,3.777
min,330343.0,1984.0,0.754
25%,330556.0,1991.0,11.155
50%,330563.0,1999.0,13.813
75%,330638.0,2006.0,16.441
max,332668.0,2013.0,26.845


In [32]:
# Keep a named reference to the processed annual frame (same object, no copy).
valorisation_annuelle_preprocessed = df

## Concatenation

In [33]:
# Right join on (sol, annee): every row of the annual valuation frame is kept
# and paired with each decadal row sharing the same soil and year, so the
# annual "cumul_croissance" value is repeated on all of those decadal rows.
merge_keys = ["sol", "annee"]
X = pd.merge(
    croissance_et_climat_decadaires_preprocessed,
    valorisation_annuelle_preprocessed,
    how="right",
    on=merge_keys,
)
X

Unnamed: 0,sol,annee,decade,ucs,safran,Tmin,Tmax,Tmoy,Rain,RG,im,croissance,cumul_croissance
0,330562.0,1985,1,1027.266,2509.783,-2.535,2.440,-0.047,9.819,36.384,35.623,1.486,8.757
1,330562.0,1985,2,1027.266,2509.783,-5.712,-0.420,-3.066,20.326,31.640,108.544,0.086,8.757
2,330562.0,1985,3,1027.266,2509.783,3.558,8.643,6.101,51.607,24.505,118.252,5.053,8.757
3,330562.0,1985,4,1027.266,2509.783,8.009,11.367,9.688,15.251,35.830,28.764,17.910,8.757
4,330562.0,1985,5,1027.266,2509.783,-0.880,2.595,0.858,30.177,49.275,101.987,6.563,8.757
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19810017,330563.0,2013,33,1027.266,2509.783,4.060,8.044,6.052,22.763,31.289,52.634,8.367,11.723
19810018,330563.0,2013,34,1027.266,2509.783,3.614,8.284,5.949,2.572,27.643,5.931,7.046,11.723
19810019,330563.0,2013,35,1027.266,2509.783,2.212,8.318,5.265,6.883,38.302,16.212,5.730,11.723
19810020,330563.0,2013,36,1027.266,2509.783,6.436,10.526,8.481,95.587,21.938,192.008,15.121,11.723


In [34]:
# Summary statistics of the merged frame.
X.describe()

Unnamed: 0,sol,annee,decade,ucs,safran,Tmin,Tmax,Tmoy,Rain,RG,im,croissance,cumul_croissance
count,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0
mean,330800.0,1999.0,19.0,1053.0,2519.0,8.252,14.64,11.45,25.84,108.5,48.11,37.17,13.77
std,652.3,8.625,10.68,97.87,222.2,4.302,5.537,4.845,23.19,64.77,45.58,28.1,3.777
min,330300.0,1984.0,1.0,685.7,1625.0,-7.09,-3.347,-5.218,0.0,8.594,0.0,0.0,0.7538
25%,330600.0,1991.0,10.0,1010.0,2432.0,5.016,10.43,7.873,7.968,47.63,13.31,12.64,11.16
50%,330600.0,1999.0,19.0,1027.0,2510.0,8.281,14.26,11.13,19.86,105.5,34.39,30.13,13.81
75%,330600.0,2006.0,28.0,1122.0,2645.0,11.93,19.16,15.56,37.59,160.5,69.61,58.39,16.44
max,332700.0,2013.0,37.0,1473.0,3566.0,19.32,30.33,24.57,186.8,297.1,376.8,129.8,26.84
