# Exploratory Data Analysis

In [1]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Notebook-wide settings
import os

# Render floats with three decimal places in displayed frames
pd.options.display.precision = 3
# The data folder is resolved inside the current working directory
data_dir_path = os.getcwd() + '/dataverse_files/'

In [3]:
# Load the two .rds tables from the data directory
import pyreadr


def _load_rds_frame(file_name):
    """Read ``file_name`` from ``data_dir_path`` and return the frame it holds.

    The result of ``pyreadr.read_r`` is treated as a dict-like container;
    ``popitem()`` yields a (key, frame) pair and only the frame is returned.
    """
    frames = pyreadr.read_r(data_dir_path + file_name)
    _, frame = frames.popitem()
    return frame


croissance_et_climat_decadaires = _load_rds_frame("croissance_et_climat_decadaires.rds")
valorisation_annuelle = _load_rds_frame("valorisation_annuelle.rds")

## Analysis of growth and climate per décade (10-day period)

In [339]:
# Short alias for the decadal growth/climate table (same object, not a copy)
df = croissance_et_climat_decadaires

In [340]:
# Preview the first rows of the raw decadal table
df.head()

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,789,2131,330343,tp3,20,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573,0.032
1,789,2131,330343,tp3,20,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188,0.0
2,789,2131,330343,tp3,20,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772,0.046
3,789,2131,330343,tp3,20,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195,0.075
4,789,2131,330343,tp3,20,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562,0.015


In [341]:
# Structural overview of the raw table: dimensions, column names, dtypes.
print(df.shape)
print(df.columns)
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

(18693829, 14)
Index(['ucs', 'safran', 'sol', 'type_de_prairie', 'gestion', 'annee', 'decade',
       'Tmin', 'Tmax', 'Tmoy', 'Rain', 'RG', 'im', 'croissance'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18693829 entries, 0 to 18693828
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   ucs              int32  
 1   safran           int32  
 2   sol              int32  
 3   type_de_prairie  object 
 4   gestion          int32  
 5   annee            float64
 6   decade           float64
 7   Tmin             float64
 8   Tmax             float64
 9   Tmoy             float64
 10  Rain             float64
 11  RG               float64
 12  im               float64
 13  croissance       float64
dtypes: float64(9), int32(4), object(1)
memory usage: 1.7+ GB
None


In [342]:
# Summary statistics for every column, including the non-numeric one
df.describe(include="all")

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
count,18690000.0,18690000.0,18690000.0,18693829,18690000.0,18670000.0,18670000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18670000.0
unique,,,,3,,,,,,,,,,
top,,,,tp4,,,,,,,,,,
freq,,,,10563388,,,,,,,,,,
mean,1054.0,2523.0,330800.0,,13.62,1999.0,19.0,8.249,14.64,11.45,25.78,108.5,47.98,37.1
std,222.9,550.8,653.1,,9.407,8.626,10.68,4.365,5.607,4.893,24.99,65.81,48.97,30.66
min,640.0,1309.0,330300.0,,1.0,1984.0,1.0,-8.76,-5.38,-7.02,0.0,7.17,0.0,0.0
25%,888.0,2120.0,330600.0,,5.0,1991.0,10.0,5.04,10.43,7.855,6.8,46.84,11.44,10.9
50%,1080.0,2497.0,330600.0,,15.0,1999.0,19.0,8.29,14.25,11.18,18.9,104.3,32.97,28.12
75%,1222.0,2895.0,330600.0,,21.0,2006.0,28.0,11.91,19.1,15.54,37.2,161.5,68.76,59.97


In [343]:
# Pairwise correlation between the three identifier columns
id_columns = ["ucs", "safran", "sol"]
df[id_columns].corr()

Unnamed: 0,ucs,safran,sol
ucs,1.0,0.937,0.085
safran,0.937,1.0,0.032
sol,0.085,0.032,1.0


### Feature selection

In [362]:
# sol as PCU ?
# Keep only the columns used downstream: sol, the time keys (annee, decade),
# the climate variables and croissance.
valuable_columns = [
    "sol", "annee", "decade",
    "Tmin", "Tmax", "Tmoy", "Rain", "RG", "im",
    "croissance",
]
df = croissance_et_climat_decadaires.loc[:, valuable_columns]

In [363]:
# Preview the table after column selection
df.head()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,330343,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573,0.032
1,330343,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188,0.0
2,330343,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772,0.046
3,330343,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195,0.075
4,330343,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562,0.015


In [364]:
# (rows, columns) after column selection
df.shape

(18693829, 10)

In [365]:
# Missing values per column. isnull() is an alias of isna(), so OR-ing the two
# masks only built a second identical boolean frame; one call gives the same counts.
df.isna().sum()

sol                0
annee          20262
decade         20262
Tmin          126008
Tmax          126008
Tmoy          126008
Rain          126008
RG            126008
im            126008
croissance     20262
dtype: int64

### Average of 10 sampled rows per (sol, annee, decade)

In [366]:
# Collapse the raw rows to one row per (sol, annee, decade): draw 10 rows per
# group with replacement, then average the draws.
# Fix: the draw was unseeded, so every run produced different averages; a fixed
# random_state makes the aggregated table reproducible.
# By default groupby leaves out rows whose key is NaN and mean() skips NaN values.
SAMPLE_SEED = 42  # arbitrary fixed seed for the random draw
group_keys = ["sol", "annee", "decade"]
df_10y = df.groupby(group_keys).sample(n=10, replace=True, random_state=SAMPLE_SEED)
df_10y = df_10y.groupby(group_keys).mean()
# Turn the group keys back into ordinary columns
df = df_10y.reset_index()

In [367]:
df.head()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,330343,1984.0,1.0,4.36,8.74,6.55,27.663,29.479,61.847,2.433
1,330343,1984.0,2.0,4.141,9.165,6.653,47.712,25.632,106.34,2.007
2,330343,1984.0,3.0,3.219,8.353,5.786,48.57,41.975,113.687,2.087
3,330343,1984.0,4.0,6.646,10.605,8.625,29.62,36.488,58.877,5.457
4,330343,1984.0,5.0,-0.412,6.97,3.279,0.41,97.111,1.195,4.105


In [368]:
# Summary statistics of the aggregated table
df.describe()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
count,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0
mean,331246.125,1998.5,19.0,8.326,14.679,11.502,25.166,108.532,46.621,36.24
std,965.872,8.656,10.677,4.294,5.513,4.82,22.901,65.171,44.761,28.361
min,330343.0,1984.0,1.0,-7.133,-3.274,-5.064,0.0,8.488,0.0,0.0
25%,330506.5,1991.0,10.0,5.14,10.494,7.929,7.49,47.459,12.575,11.987
50%,330598.5,1998.5,19.0,8.37,14.319,11.24,19.315,104.861,33.657,28.627
75%,332378.25,2006.0,28.0,11.971,19.145,15.578,36.44,160.483,67.221,56.786
max,332668.0,2013.0,37.0,19.287,30.25,24.554,199.57,297.139,405.879,132.326


### Reshape the dataframe

In [369]:
# annee and decade come out of the aggregation as floats; store both as integers
df = df.astype({"annee": "int64", "decade": "int64"})

In [370]:
# Second handle on the aggregated frame (an alias, not a copy), then preview it
dfgroup = df
dfgroup.head()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,330343,1984,1,4.36,8.74,6.55,27.663,29.479,61.847,2.433
1,330343,1984,2,4.141,9.165,6.653,47.712,25.632,106.34,2.007
2,330343,1984,3,3.219,8.353,5.786,48.57,41.975,113.687,2.087
3,330343,1984,4,6.646,10.605,8.625,29.62,36.488,58.877,5.457
4,330343,1984,5,-0.412,6.97,3.279,0.41,97.111,1.195,4.105


In [371]:
# Sanity check: number of non-null values per (sol, annee) after aggregation.
# Built from `df` rather than from `dfgroup` itself, so re-running the cell does
# not count an already-counted table.
dfgroup = df.groupby(["sol", "annee"]).count()
dfgroup.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
sol,annee,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
330343,1984,37,37,37,37,37,37,37,37
330343,1985,37,37,37,37,37,37,37,37
330343,1986,37,37,37,37,37,37,37,37
330343,1987,37,37,37,37,37,37,37,37
330343,1988,37,37,37,37,37,37,37,37


In [372]:
# Summary statistics again, now that annee and decade are stored as integers
df.describe()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
count,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0
mean,331246.125,1998.5,19.0,8.326,14.679,11.502,25.166,108.532,46.621,36.24
std,965.872,8.656,10.677,4.294,5.513,4.82,22.901,65.171,44.761,28.361
min,330343.0,1984.0,1.0,-7.133,-3.274,-5.064,0.0,8.488,0.0,0.0
25%,330506.5,1991.0,10.0,5.14,10.494,7.929,7.49,47.459,12.575,11.987
50%,330598.5,1998.5,19.0,8.37,14.319,11.24,19.315,104.861,33.657,28.627
75%,332378.25,2006.0,28.0,11.971,19.145,15.578,36.44,160.483,67.221,56.786
max,332668.0,2013.0,37.0,19.287,30.25,24.554,199.57,297.139,405.879,132.326


In [373]:
# Number of distinct values in each column
df.nunique()

sol              24
annee            30
decade           37
Tmin          15361
Tmax          16945
Tmoy          20016
Rain           9431
RG            26418
im            26393
croissance    26521
dtype: int64

In [374]:
# Missing values per column after aggregation. isnull() is an alias of isna(),
# so a single call yields the same counts as OR-ing both masks.
df.isna().sum()

sol           0
annee         0
decade        0
Tmin          0
Tmax          0
Tmoy          0
Rain          0
RG            0
im            0
croissance    0
dtype: int64

In [375]:
# Keep the aggregated decadal frame under its own name before `df` is reused
croissance_et_climat_decadaires_preprocessed = df

## Feature selection of annual valuations

In [382]:
# Point the short alias `df` at the annual valuation table (same object, not a copy)
df = valorisation_annuelle

In [383]:
# Preview the first rows of the raw annual table
df.head()

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,valorisation,RU,cumul_croissance
0,1005,2245,330562,tp3,15,1985.0,4.964,97.42,8.757
1,1005,2245,330562,tp3,15,1986.0,8.173,97.42,14.62
2,1005,2245,330562,tp3,15,1987.0,12.666,97.42,16.287
3,1005,2245,330562,tp3,15,1989.0,5.825,97.42,9.063
4,1005,2245,330562,tp3,15,1990.0,7.212,97.42,11.758


In [398]:
# Keep only the two join keys and the annual cumulative growth column
valuable_columns = [
    "sol",
    "annee",
    "cumul_croissance",
]
df = valorisation_annuelle.loc[:, valuable_columns]

In [399]:
# Missing values per column of the annual table. isnull() is an alias of isna(),
# so a single call yields the same counts as OR-ing both masks.
df.isna().sum()

sol                 0
annee               0
cumul_croissance    0
dtype: int64

### Reshape the dataframe

In [400]:
# Renumber the rows from 0 and store the year as an integer
df = df.reset_index(drop=True).astype({"annee": "int64"})

In [402]:
# Preview the annual table after the integer cast
df.head()

Unnamed: 0,sol,annee,cumul_croissance
0,330562,1985,8.757
1,330562,1986,14.62
2,330562,1987,16.287
3,330562,1989,9.063
4,330562,1990,11.758


In [403]:
# Summary statistics of the annual table
df.describe()

Unnamed: 0,sol,annee,cumul_croissance
count,535406.0,535406.0,535406.0
mean,330824.012,1998.691,13.771
std,652.279,8.625,3.777
min,330343.0,1984.0,0.754
25%,330556.0,1991.0,11.155
50%,330563.0,1999.0,13.813
75%,330638.0,2006.0,16.441
max,332668.0,2013.0,26.845


In [404]:
# Keep the prepared annual frame under its own name for the merge step
valorisation_annuelle_preprocessed = df

## Merging decadal features with annual valuations

In [412]:
# Attach the annual cumulative growth to the aggregated decade rows.
# how='right' keeps every annual row; each one is repeated for every decade row
# sharing its (sol, annee) pair.
# NOTE(review): (sol, annee) is not unique on either side, so this is a
# many-to-many join — confirm that repeating the same climate rows for every
# annual record is intended.
# TODO: derive the target y from the cumul_croissance column.
decade_features = croissance_et_climat_decadaires_preprocessed
annual_values = valorisation_annuelle_preprocessed
X = decade_features.merge(annual_values, on=["sol", "annee"], how='right')
X

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance,cumul_croissance
0,330562,1985,1,-2.469,2.649,0.090,11.370,34.941,40.947,1.626,8.757
1,330562,1985,2,-5.949,-0.489,-3.219,20.730,31.961,112.320,0.100,8.757
2,330562,1985,3,3.502,8.492,5.997,55.533,24.470,127.971,5.773,8.757
3,330562,1985,4,7.921,11.343,9.632,14.890,36.340,28.279,21.971,8.757
4,330562,1985,5,-0.337,2.970,1.317,33.590,46.650,108.607,6.008,8.757
...,...,...,...,...,...,...,...,...,...,...,...
19810017,330563,2013,33,4.050,8.046,6.048,24.200,31.805,56.089,7.619,11.723
19810018,330563,2013,34,3.246,8.283,5.764,2.450,30.156,5.661,5.094,11.723
19810019,330563,2013,35,2.180,8.217,5.199,5.090,39.567,12.497,6.870,11.723
19810020,330563,2013,36,6.543,10.685,8.614,99.760,22.958,197.960,13.085,11.723


In [413]:
X.describe()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance,cumul_croissance
count,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0,19810000.0
mean,330800.0,1999.0,19.0,8.25,14.64,11.44,25.81,108.5,48.05,37.19,13.77
std,652.3,8.625,10.68,4.311,5.544,4.851,23.28,64.88,45.73,28.44,3.777
min,330300.0,1984.0,1.0,-7.133,-3.274,-5.064,0.0,8.488,0.0,0.0,0.7538
25%,330600.0,1991.0,10.0,5.042,10.43,7.854,7.85,47.54,13.16,12.55,11.16
50%,330600.0,1999.0,19.0,8.289,14.27,11.15,19.88,105.7,34.5,30.24,13.81
75%,330600.0,2006.0,28.0,11.93,19.14,15.54,37.49,160.9,69.65,58.49,16.44
max,332700.0,2013.0,37.0,19.29,30.25,24.55,199.6,297.1,405.9,132.3,26.84
