# Exploratory Data Analysis

In [1]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Notebook-wide settings
import os

# Render floats in DataFrame output with a precision of 3.
pd.set_option("display.precision", 3)
# Folder that holds the .rds input files, built from the current working
# directory. The trailing slash lets later cells append a file name directly.
data_dir_path = f"{os.getcwd()}/dataverse_files/"

In [3]:
# Load the two R datasets
import pyreadr


def _read_single_rds(file_name):
    """Read `file_name` from `data_dir_path` and return the object stored in it.

    `pyreadr.read_r` hands back a dict-like result; `popitem()` removes the
    last (key, value) pair from it and the value of that pair is returned.
    """
    loaded = pyreadr.read_r(data_dir_path + file_name)
    _, stored_object = loaded.popitem()
    return stored_object


croissance_et_climat_decadaires = _read_single_rds("croissance_et_climat_decadaires.rds")
valorisation_annuelle = _read_single_rds("valorisation_annuelle.rds")

## Analysis of 10-day (decadal) growth and climate data

In [48]:
# Work on the decadal growth/climate table under the short name `df`.
# This is a plain assignment: `df` refers to the same object, no copy is made.
df = croissance_et_climat_decadaires

In [49]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ucs,safran,sol,type_de_prairie,gestion,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im,croissance
0,789,2131,330343,tp3,20,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573,0.032
1,789,2131,330343,tp3,20,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188,0.0
2,789,2131,330343,tp3,20,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772,0.046
3,789,2131,330343,tp3,20,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195,0.075
4,789,2131,330343,tp3,20,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562,0.015


In [50]:
# Dimensions, column labels and per-column dtypes of the frame.
print(df.shape)
print(df.columns)
# `DataFrame.info()` writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

(18693829, 14)
Index(['ucs', 'safran', 'sol', 'type_de_prairie', 'gestion', 'annee', 'decade',
       'Tmin', 'Tmax', 'Tmoy', 'Rain', 'RG', 'im', 'croissance'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18693829 entries, 0 to 18693828
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   ucs              int32  
 1   safran           int32  
 2   sol              int32  
 3   type_de_prairie  object 
 4   gestion          int32  
 5   annee            float64
 6   decade           float64
 7   Tmin             float64
 8   Tmax             float64
 9   Tmoy             float64
 10  Rain             float64
 11  RG               float64
 12  im               float64
 13  croissance       float64
dtypes: float64(9), int32(4), object(1)
memory usage: 1.7+ GB
None


In [51]:
# Count the missing values in each column
df.isnull().sum()

ucs                     0
safran                  0
sol                     0
type_de_prairie         0
gestion                 0
annee               20262
decade              20262
Tmin               126008
Tmax               126008
Tmoy               126008
Rain               126008
RG                 126008
im                 126008
croissance          20262
dtype: int64

#### Define the columns used in the paper

In [52]:
# Restrict the frame to the columns listed here.
valuable_columns = [
    "ucs",
    "annee",
    "decade",
    "Tmin",
    "Tmax",
    "Tmoy",
    "Rain",
    "RG",
    "im",
]
df = df[valuable_columns]

In [53]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ucs,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im
0,789,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573
1,789,1985.0,2.0,-5.74,-0.41,-3.075,19.5,33.793,104.188
2,789,1985.0,3.0,3.77,9.13,6.45,36.8,24.751,82.772
3,789,1985.0,4.0,8.23,12.45,10.34,14.4,32.975,26.195
4,789,1985.0,5.0,-1.38,2.52,0.57,25.3,48.912,88.562


In [54]:
# (number of rows, number of columns) of the current frame
df.shape

(18693829, 9)

In [55]:
# Count the missing values in each column.
df.isna().sum()

ucs            0
annee      20262
decade     20262
Tmin      126008
Tmax      126008
Tmoy      126008
Rain      126008
RG        126008
im        126008
dtype: int64

### Mean 10 years per ucs

In [68]:
# Show the first five rows of every `ucs` group.
# NOTE(review): this does not compute a mean; an earlier
# `.sample(n=10, random_state=1).mean()` attempt was left disabled here.
# TODO: confirm which aggregation this section is meant to show.
df.groupby("ucs").head(n=5)

Unnamed: 0,ucs,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im
0,789,1985.0,1.0,-2.289,3.178,0.444,12.3,28.912,43.573
1,789,1985.0,2.0,-5.740,-0.410,-3.075,19.5,33.793,104.188
2,789,1985.0,3.0,3.770,9.130,6.450,36.8,24.751,82.772
3,789,1985.0,4.0,8.230,12.450,10.340,14.4,32.975,26.195
4,789,1985.0,5.0,-1.380,2.520,0.570,25.3,48.912,88.562
...,...,...,...,...,...,...,...,...,...
18429805,998,1985.0,1.0,-4.389,1.033,-1.678,7.3,39.908,32.455
18429806,998,1985.0,2.0,-7.950,-1.160,-4.555,17.6,37.915,119.596
18429807,998,1985.0,3.0,2.750,8.140,5.445,42.7,26.083,102.292
18429808,998,1985.0,4.0,7.810,11.320,9.565,13.4,42.318,25.341


### Drop rows with a missing `annee` or `decade`

In [56]:
# Discard every row in which the year or the decade is missing. Only these
# two columns are checked, so other columns may still contain NaN afterwards.
df = df.dropna(axis=0, how="any", subset=["annee", "decade"])

In [57]:
# Count the missing values that remain in each column.
df.isna().sum()

ucs            0
annee          0
decade         0
Tmin      105746
Tmax      105746
Tmoy      105746
Rain      105746
RG        105746
im        105746
dtype: int64

### Preprocess and sort the dataframe

In [46]:
# Store the year and decade columns as 64-bit integers.
# A single `astype` call with a column -> dtype mapping returns a new frame
# instead of assigning into `df` one column at a time. That avoids pandas'
# SettingWithCopyWarning when `df` was derived from another frame, and the
# cell can be re-run safely.
# The cast raises if either column still contains NaN, so this cell has to run
# after the rows with a missing `annee`/`decade` have been dropped.
df = df.astype({"annee": "int64", "decade": "int64"})

In [59]:
# Summary statistics for every column of the frame.
df.describe(include="all")

Unnamed: 0,ucs,annee,decade,Tmin,Tmax,Tmoy,Rain,RG,im
count,18670000.0,18670000.0,18670000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0,18570000.0
mean,1054.0,1999.0,19.0,8.249,14.64,11.45,25.78,108.5,47.98
std,222.9,8.626,10.68,4.365,5.607,4.893,24.99,65.81,48.97
min,640.0,1984.0,1.0,-8.76,-5.38,-7.02,0.0,7.17,0.0
25%,888.0,1991.0,10.0,5.04,10.43,7.855,6.8,46.84,11.44
50%,1080.0,1999.0,19.0,8.29,14.25,11.18,18.9,104.3,32.97
75%,1222.0,2006.0,28.0,11.91,19.1,15.54,37.2,161.5,68.76
max,1610.0,2013.0,37.0,20.69,31.17,25.3,240.4,299.0,500.7


In [None]:
# Preview the first five rows when ordered by year, then by decade.
# NOTE(review): the sorted result is only displayed, not assigned back, so
# `df` itself keeps its current row order.
(
    df
    .sort_values(by=["annee", "decade"])
    .head(n=5)
)

## Analysis of annual valuations

In [None]:
# Rebind `df` to the annual valuation table (same object, no copy).
# From here on `df` no longer refers to the decadal frame prepared above.
df = valorisation_annuelle

In [None]:
# Preview the first five rows of the annual valuation table.
valorisation_annuelle.head(n=5)

In [None]:
# Summary statistics of the annual valuation table (describe() defaults)
valorisation_annuelle.describe()

In [None]:
# Dimensions, column labels and per-column dtypes of the annual table.
print(valorisation_annuelle.shape)
print(valorisation_annuelle.columns)
# `DataFrame.info()` prints its own report and returns None; calling it
# directly avoids the extra "None" line that print(...) would add.
valorisation_annuelle.info()

In [None]:
# Print every column label of the annual table that also exists in the
# decadal table, in the annual table's column order.
shared_columns = [
    label
    for label in valorisation_annuelle.columns
    if label in croissance_et_climat_decadaires.columns
]
for label in shared_columns:
    print(label)

## Preprocessing

In [None]:
# Build the feature matrix X and the target y as float NumPy arrays.
# Both selections use a list of columns, so X and y are 2-D arrays.
# NOTE(review): X is taken from the raw decadal table, not from the cleaned
# `df` prepared earlier, so rows with missing values are still included.
# NOTE(review): y comes from the annual table and nothing here aligns its rows
# with those of X. TODO: confirm how the two tables are meant to be matched.
feature_columns = ["annee", "decade", "Tmin", "Tmax", "Tmoy", "Rain", "RG", "im"]
target_columns = ["cumul_croissance"]

X = croissance_et_climat_decadaires.loc[:, feature_columns].values.astype(float)
y = valorisation_annuelle.loc[:, target_columns].values.astype(float)