# Exploratory Data Analysis

In [1]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Define settings
import os

pd.set_option("display.precision", 3)
data_dir_path = os.getcwd() + '/dataverse_files/'

In [3]:
# Importing data
import pyreadr

croissance_et_climat_decadaires = pyreadr.read_r(data_dir_path + "croissance_et_climat_decadaires.rds" ).popitem()[1]
valorisation_annuelle = pyreadr.read_r(data_dir_path + "valorisation_annuelle.rds" ).popitem()[1]

## Analysis of growths and decadal climates

In [339]:
df = croissance_et_climat_decadaires

In [340]:
df.head()

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,789,2131,330343,tp3,20,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573,0.032
1,789,2131,330343,tp3,20,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188,0.0
2,789,2131,330343,tp3,20,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772,0.046
3,789,2131,330343,tp3,20,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195,0.075
4,789,2131,330343,tp3,20,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562,0.015


In [341]:
print(df.shape)
print(df.columns)
print(df.info())

(18693829, 14)
Index(['ucs', 'safran', 'sol', 'type_de_prairie', 'gestion', 'annee', 'decade',
       'Tmin', 'Tmax', 'Tmoy', 'Rain', 'RG', 'im', 'croissance'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18693829 entries, 0 to 18693828
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   ucs              int32  
 1   safran           int32  
 2   sol              int32  
 3   type_de_prairie  object 
 4   gestion          int32  
 5   annee            float64
 6   decade           float64
 7   Tmin             float64
 8   Tmax             float64
 9   Tmoy             float64
 10  Rain             float64
 11  RG               float64
 12  im               float64
 13  croissance       float64
dtypes: float64(9), int32(4), object(1)
memory usage: 1.7+ GB
None


In [342]:
df.describe(include= "all")

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
count,18690000.0,18690000.0,18690000.0,18693829,18690000.0,18670000.0,18670000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18670000.0
unique,,,,3,,,,,,,,,,
top,,,,tp4,,,,,,,,,,
freq,,,,10563388,,,,,,,,,,
mean,1054.0,2523.0,330800.0,,13.62,1999.0,19.0,8.249,14.64,11.45,25.78,108.5,47.98,37.1
std,222.9,550.8,653.1,,9.407,8.626,10.68,4.365,5.607,4.893,24.99,65.81,48.97,30.66
min,640.0,1309.0,330300.0,,1.0,1984.0,1.0,-8.76,-5.38,-7.02,0.0,7.17,0.0,0.0
25%,888.0,2120.0,330600.0,,5.0,1991.0,10.0,5.04,10.43,7.855,6.8,46.84,11.44,10.9
50%,1080.0,2497.0,330600.0,,15.0,1999.0,19.0,8.29,14.25,11.18,18.9,104.3,32.97,28.12
75%,1222.0,2895.0,330600.0,,21.0,2006.0,28.0,11.91,19.1,15.54,37.2,161.5,68.76,59.97


In [343]:
df[["ucs", "safran", "sol"]].corr()

Unnamed: 0,ucs,safran,sol
ucs,1.0,0.937,0.085
safran,0.937,1.0,0.032
sol,0.085,0.032,1.0


#### Define the columns used in the paper

In [344]:
# sol as PCU ?
valuable_columns = ["sol", "annee", "decade", "Tmin", "Tmax", "Tmoy", "Rain", "RG", "im", "croissance"]
df = croissance_et_climat_decadaires[valuable_columns]

In [345]:
df.head()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,330343,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573,0.032
1,330343,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188,0.0
2,330343,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772,0.046
3,330343,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195,0.075
4,330343,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562,0.015


In [346]:
df.shape

(18693829, 10)

In [347]:
(df.isna() | df.isnull()).sum()

sol                0
annee          20262
decade         20262
Tmin          126008
Tmax          126008
Tmoy          126008
Rain          126008
RG            126008
im            126008
croissance     20262
dtype: int64

### Mean 10 years per PUD

In [348]:
df_10y = df.groupby(["sol", "annee", "decade"]).sample(n=10, replace=True)
df_10y = df_10y.groupby(["sol", "annee", "decade"]).mean()
df = df_10y.reset_index()

In [349]:
df.head()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,330343,1984.0,1.0,4.319,8.684,6.501,28.333,28.023,63.585,2.26
1,330343,1984.0,2.0,4.054,9.178,6.616,44.8,25.059,99.597,2.095
2,330343,1984.0,3.0,3.101,8.089,5.595,52.089,40.774,123.923,1.863
3,330343,1984.0,4.0,6.569,10.477,8.523,30.73,37.455,61.77,5.387
4,330343,1984.0,5.0,-0.275,7.285,3.505,0.0,96.539,0.0,4.565


In [350]:
df.describe()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
count,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0
mean,331246.125,1998.5,19.0,8.325,14.679,11.502,25.188,108.53,46.667,36.185
std,965.872,8.656,10.677,4.296,5.513,4.821,22.997,65.176,44.969,28.308
min,330343.0,1984.0,1.0,-7.271,-3.352,-5.194,0.0,8.641,0.0,0.0
25%,330506.5,1991.0,10.0,5.137,10.488,7.942,7.47,47.399,12.461,11.981
50%,330598.5,1998.5,19.0,8.356,14.319,11.25,19.345,105.313,33.619,28.48
75%,332378.25,2006.0,28.0,11.96,19.153,15.588,36.47,160.627,67.194,56.757
max,332668.0,2013.0,37.0,19.272,30.474,24.582,190.74,297.139,386.31,129.955


### Preprocess and sort the dataframe

In [351]:
df["annee"] = df["annee"].astype("int64")
df["decade"] = df["decade"].astype("int64")

In [352]:
dfgroup = df
dfgroup.head()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,330343,1984,1,4.319,8.684,6.501,28.333,28.023,63.585,2.26
1,330343,1984,2,4.054,9.178,6.616,44.8,25.059,99.597,2.095
2,330343,1984,3,3.101,8.089,5.595,52.089,40.774,123.923,1.863
3,330343,1984,4,6.569,10.477,8.523,30.73,37.455,61.77,5.387
4,330343,1984,5,-0.275,7.285,3.505,0.0,96.539,0.0,4.565


In [353]:
dfgroup = dfgroup.groupby(["sol", "annee"]).count()
dfgroup.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
sol,annee,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
330343,1984,37,37,37,37,37,37,37,37
330343,1985,37,37,37,37,37,37,37,37
330343,1986,37,37,37,37,37,37,37,37
330343,1987,37,37,37,37,37,37,37,37
330343,1988,37,37,37,37,37,37,37,37


In [354]:
df.describe()

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
count,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0,26640.0
mean,331246.125,1998.5,19.0,8.325,14.679,11.502,25.188,108.53,46.667,36.185
std,965.872,8.656,10.677,4.296,5.513,4.821,22.997,65.176,44.969,28.308
min,330343.0,1984.0,1.0,-7.271,-3.352,-5.194,0.0,8.641,0.0,0.0
25%,330506.5,1991.0,10.0,5.137,10.488,7.942,7.47,47.399,12.461,11.981
50%,330598.5,1998.5,19.0,8.356,14.319,11.25,19.345,105.313,33.619,28.48
75%,332378.25,2006.0,28.0,11.96,19.153,15.588,36.47,160.627,67.194,56.757
max,332668.0,2013.0,37.0,19.272,30.474,24.582,190.74,297.139,386.31,129.955


In [355]:
df.nunique()

sol              24
annee            30
decade           37
Tmin          15354
Tmax          16909
Tmoy          20085
Rain           9355
RG            26418
im            26371
croissance    26510
dtype: int64

In [356]:
(df.isna() | df.isnull()).sum()

sol           0
annee         0
decade        0
Tmin          0
Tmax          0
Tmoy          0
Rain          0
RG            0
im            0
croissance    0
dtype: int64

## Analysis of annual valuations

In [86]:
df = valorisation_annuelle

In [87]:
valorisation_annuelle.head()

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,valorisation,RU,cumul_croissance
0,1005,2245,330562,tp3,15,1985.0,4.964,97.42,8.757
1,1005,2245,330562,tp3,15,1986.0,8.173,97.42,14.62
2,1005,2245,330562,tp3,15,1987.0,12.666,97.42,16.287
3,1005,2245,330562,tp3,15,1989.0,5.825,97.42,9.063
4,1005,2245,330562,tp3,15,1990.0,7.212,97.42,11.758


In [88]:
valorisation_annuelle.describe()

Unnamed: 0,ucs,safran,sol,gestion,annee,valorisation,RU,cumul_croissance
count,535406.0,535406.0,535406.0,535406.0,535406.0,535406.0,535406.0,535406.0
mean,1050.229,2509.312,330824.012,13.939,1998.691,9.799,102.657,13.771
std,221.389,548.252,652.279,9.559,8.625,3.713,41.15,3.777
min,640.0,1309.0,330343.0,1.0,1984.0,0.0,21.139,0.754
25%,875.0,2115.0,330556.0,5.0,1991.0,7.131,89.18,11.155
50%,1080.0,2493.0,330563.0,15.0,1999.0,9.824,97.42,13.813
75%,1218.0,2889.0,330638.0,22.0,2006.0,12.452,148.68,16.441
max,1610.0,3886.0,332668.0,30.0,2013.0,23.039,351.24,26.845


In [89]:
print(valorisation_annuelle.shape)
print(valorisation_annuelle.columns)
print(valorisation_annuelle.info())

(535406, 9)
Index(['ucs', 'safran', 'sol', 'type_de_prairie', 'gestion', 'annee',
       'valorisation', 'RU', 'cumul_croissance'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535406 entries, 0 to 535405
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ucs               535406 non-null  int32  
 1   safran            535406 non-null  int32  
 2   sol               535406 non-null  int32  
 3   type_de_prairie   535406 non-null  object 
 4   gestion           535406 non-null  int32  
 5   annee             535406 non-null  float64
 6   valorisation      535406 non-null  float64
 7   RU                535406 non-null  float64
 8   cumul_croissance  535406 non-null  float64
dtypes: float64(4), int32(4), object(1)
memory usage: 28.6+ MB
None


In [90]:
for column in valorisation_annuelle.columns:
    if column in croissance_et_climat_decadaires.columns:
        print(column)

ucs
safran
sol
type_de_prairie
gestion
annee


In [91]:
valuable_columns = ["sol", "annee", "cumul_croissance"]
df = df[valuable_columns]

In [92]:
(df.isna() | df.isnull()).sum()

sol                 0
annee               0
cumul_croissance    0
dtype: int64

In [93]:
df.groupby("annee").nunique()

Unnamed: 0_level_0,sol,cumul_croissance
annee,Unnamed: 1_level_1,Unnamed: 2_level_1
1984.0,24,7786
1985.0,24,16089
1986.0,24,15765
1987.0,24,10789
1988.0,24,13187
1989.0,24,16097
1990.0,24,10595
1991.0,24,16558
1992.0,24,13431
1993.0,24,10739


### Mean 10 years per PUD

In [94]:
df_10y = df.groupby(["sol", "annee"]).sample(n=10, replace=True)
df = df_10y.groupby(["sol", "annee"]).mean()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cumul_croissance
sol,annee,Unnamed: 2_level_1
330343,1984.0,7.237
330343,1985.0,8.309
330343,1986.0,13.034
330343,1987.0,14.227
330343,1988.0,13.353


### Preprocess and sort the dataframe

In [95]:
df.reset_index()
df["annee"] = df["annee"].astype("int64")
df = df.sort_values(by=["sol", "annee"])
df.head()

KeyError: 'annee'

In [None]:
df.describe(include= "all")

## Preprocessing

In [None]:
X = croissance_et_climat_decadaires.loc[:, ["annee","decade","Tmin","Tmax","Tmoy","Rain","RG","im"]].values.astype(float)
y = valorisation_annuelle.loc[:, ["cumul_croissance"]].values.astype(float)