# Exploratory Data Analysis

In [1]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Notebook-wide settings: data directory and float display precision.
import os

# Folder the data files are read from, located under the kernel's current
# working directory. Kept as a plain string with a trailing slash.
working_dir = os.getcwd()
data_dir_path = working_dir + '/dataverse_files/'

# Render floats with 3 decimal places in DataFrame output.
pd.set_option("display.precision", 3)

In [3]:
# Load the two R data files (.rds) into pandas DataFrames.
import pyreadr

# read_r returns a mapping of object name -> DataFrame; popitem() removes and
# returns its last (key, DataFrame) pair, and [1] keeps only the DataFrame.
decadal_rds_path = data_dir_path + "croissance_et_climat_decadaires.rds"
croissance_et_climat_decadaires = pyreadr.read_r(decadal_rds_path).popitem()[1]

annual_rds_path = data_dir_path + "valorisation_annuelle.rds"
valorisation_annuelle = pyreadr.read_r(annual_rds_path).popitem()[1]

## Analysis of growth and decadal climate data

In [121]:
# Work on the decadal growth/climate table under the short name `df`.
# This binds a second name to the same DataFrame object; no copy is made.
df = croissance_et_climat_decadaires

In [122]:
# Preview the first five rows of the decadal table.
df.head(n=5)

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,789,2131,330343,tp3,20,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573,0.032
1,789,2131,330343,tp3,20,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188,0.0
2,789,2131,330343,tp3,20,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772,0.046
3,789,2131,330343,tp3,20,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195,0.075
4,789,2131,330343,tp3,20,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562,0.015


In [123]:
# Pairwise correlation (pandas default: Pearson) between the three int32
# columns ucs, safran and sol.
id_columns = ["ucs", "safran", "sol"]
df[id_columns].corr()

Unnamed: 0,ucs,safran,sol
ucs,1.0,0.937,0.085
safran,0.937,1.0,0.032
sol,0.085,0.032,1.0


In [124]:
# Structural overview of the decadal table: dimensions, column labels, dtypes.
print(df.shape)
print(df.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

(18693829, 14)
Index(['ucs', 'safran', 'sol', 'type_de_prairie', 'gestion', 'annee', 'decade',
       'Tmin', 'Tmax', 'Tmoy', 'Rain', 'RG', 'im', 'croissance'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18693829 entries, 0 to 18693828
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   ucs              int32  
 1   safran           int32  
 2   sol              int32  
 3   type_de_prairie  object 
 4   gestion          int32  
 5   annee            float64
 6   decade           float64
 7   Tmin             float64
 8   Tmax             float64
 9   Tmoy             float64
 10  Rain             float64
 11  RG               float64
 12  im               float64
 13  croissance       float64
dtypes: float64(9), int32(4), object(1)
memory usage: 1.7+ GB
None


#### Define the columns used in the paper

In [125]:
# Restrict the working frame to the columns listed here; every other column
# is left out of `df` from this point on.
valuable_columns = [
    "sol",
    "annee", "decade",
    "Tmin", "Tmax", "Tmoy",
    "Rain", "RG", "im",
]
df = df.loc[:, valuable_columns]

In [126]:
# Preview the first five rows after the column selection.
df.head(n=5)

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im
0,330343,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573
1,330343,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188
2,330343,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772
3,330343,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195
4,330343,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562


In [127]:
# (number of rows, number of columns) of the working frame.
len(df.index), len(df.columns)

(18693829, 9)

In [128]:
# Number of missing values per column.
# DataFrame.isnull is an alias of DataFrame.isna, so OR-ing the two masks built
# the same boolean frame twice over the whole table without changing the
# result; a single isna() call gives identical counts.
df.isna().sum()

sol            0
annee      20262
decade     20262
Tmin      126008
Tmax      126008
Tmoy      126008
Rain      126008
RG        126008
im        126008
dtype: int64

### Mean of 10 sampled rows per `sol` and `annee`

In [129]:
# For every (sol, annee) group, draw 10 rows with replacement and average each
# remaining column over that draw.
#
# The draw is random, so it is seeded: without a fixed random_state every
# kernel run produced a different table and nothing downstream was reproducible.
#
# Things to keep in mind when reading the result:
#  - replace=True means a group with fewer than 10 rows still yields 10 draws
#    (some rows repeated).
#  - groupby drops rows whose key is NaN by default, so rows with a missing
#    `annee` do not contribute to the result.
#  - `decade` is averaged like every other column, so the value shown is the
#    mean of the sampled decade numbers, not a decade identifier.
SAMPLE_SEED = 42
df_10y = df.groupby(["sol", "annee"]).sample(
    n=10, replace=True, random_state=SAMPLE_SEED
)
df = df_10y.groupby(["sol", "annee"]).mean()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,decade,Tmin,Tmax,Tmoy,Rain,RG,im
sol,annee,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
330343,1984.0,19.1,8.129,15.779,11.954,12.37,129.508,21.14
330343,1985.0,21.7,8.623,14.116,11.369,22.21,94.708,40.904
330343,1986.0,12.5,4.724,11.072,7.898,14.75,95.614,33.283
330343,1987.0,11.5,4.307,10.641,7.474,16.61,104.955,34.164
330343,1988.0,23.5,9.671,16.8,13.236,16.19,123.197,24.19


### Preprocess and sort the dataframe

In [131]:
# Move the index levels back into ordinary columns, then cast the two time
# fields to 64-bit integers in one step.
# NOTE(review): `decade` holds per-group means at this point, so the integer
# cast drops the fractional part (e.g. 19.1 -> 19) — confirm this is intended.
df = df.reset_index().astype({"annee": "int64", "decade": "int64"})

In [132]:
# Preview the first five rows after the index reset and integer casts.
df.head(n=5)

Unnamed: 0,sol,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im
0,330343,1984,19,8.129,15.779,11.954,12.37,129.508,21.14
1,330343,1985,21,8.623,14.116,11.369,22.21,94.708,40.904
2,330343,1986,12,4.724,11.072,7.898,14.75,95.614,33.283
3,330343,1987,11,4.307,10.641,7.474,16.61,104.955,34.164
4,330343,1988,23,9.671,16.8,13.236,16.19,123.197,24.19


In [None]:
# Summary statistics for every column; include="all" asks pandas to describe
# columns of all dtypes rather than only the numeric ones.
df.describe(include="all")

In [None]:
# First five rows when ordered by year, then by decade (both ascending).
# The sorted frame is only displayed; `df` itself is not reordered.
df.sort_values(["annee", "decade"], ascending=True).head(n=5)

## Analysis of annual valuations

In [None]:
# Rebind the short name `df` to the annual valuation table (same object, no
# copy). From here on `df` no longer refers to the decadal data.
df = valorisation_annuelle

In [None]:
# Preview the first five rows of the annual valuation table.
valorisation_annuelle.head(n=5)

In [None]:
# Summary statistics of the annual valuation table, using pandas' default
# column selection for describe().
valorisation_annuelle.describe()

In [None]:
# Structural overview of the annual table: dimensions, column labels, dtypes.
print(valorisation_annuelle.shape)
print(valorisation_annuelle.columns)
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(...info()) would emit.
valorisation_annuelle.info()

In [None]:
# Print every column label that the annual table has in common with the
# decadal table, following the annual table's column order.
decadal_columns = croissance_et_climat_decadaires.columns
for column in (name for name in valorisation_annuelle.columns if name in decadal_columns):
    print(column)

## Preprocessing

In [None]:
# Build float NumPy arrays: X from the decadal table's time and climate
# columns, y from the annual table's `cumul_croissance` column (kept 2-D
# because a one-element list of columns is selected).
# NOTE(review): X is taken row-for-row from the decadal table and y from the
# annual table, with no join between them — confirm the two arrays line up
# (same rows, same order) before using them together.
feature_columns = ["annee", "decade", "Tmin", "Tmax", "Tmoy", "Rain", "RG", "im"]
target_columns = ["cumul_croissance"]

X = croissance_et_climat_decadaires[feature_columns].values.astype(float)
y = valorisation_annuelle[target_columns].values.astype(float)