In [2]:
# Import the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [8]:
# Load the two raw R data sets (.rds files) stored under ./dataverse_files

if __name__ == "__main__":
    import os
    import pyreadr

    pd.set_option("display.precision", 3)
    data_dir_path = os.getcwd() + '/dataverse_files/'

    # read_r(...) hands back a mapping; popitem() removes one (key, value)
    # pair from it and [1] keeps the value of that pair.
    growth_climate_rds = pyreadr.read_r(data_dir_path + "croissance_et_climat_decadaires.rds")
    croissance_et_climat_decadaires = growth_climate_rds.popitem()[1]

    annual_valuation_rds = pyreadr.read_r(data_dir_path + "valorisation_annuelle.rds")
    valorisation_annuelle = annual_valuation_rds.popitem()[1]

### Global variables

In [3]:
# Number of most recent years kept when the data are filtered on "annee"
# (rows with annee > max(annee) - nb_annees). The sliding-window cell also
# uses it to size each group of rows (37 * nb_annees).
nb_annees = 1

## Data Preparation

### Decadal growth and climate

In [9]:
# Columns that identify one series, followed by the measurements kept for the model.
identifier_columns = ["ucs", "safran", "sol", "type_de_prairie", "gestion"]
measurement_columns = ["annee", "decade", "Tmin", "Tmax", "Tmoy", "Rain", "RG", "im", "croissance"]
valuable_columns = identifier_columns + measurement_columns

# Restrict the raw decadal table to those columns only.
df = croissance_et_climat_decadaires[valuable_columns]

In [10]:
# Keep only the nb_annees most recent years: rows with annee > max(annee) - nb_annees.
annee_inf = df["annee"].max() - nb_annees

# Build the cleaned frame as one non-mutating chain. The previous version ran
# inplace operations and a column assignment on the result of a boolean .loc
# filter, which is the pattern pandas reports as SettingWithCopyWarning.
sort_keys = identifier_columns + ["annee", "decade"]
df = (
    df.loc[df["annee"] > annee_inf, :]
    .sort_values(by=sort_keys)
    # errors="ignore" keeps the cell re-runnable once the column is already gone;
    # on a first run a missing column still fails in sort_values above.
    .drop(columns="type_de_prairie", errors="ignore")
    .dropna()
    .astype({"annee": "int64", "decade": "int64"})
)

# Later cells use identifier_columns as join/group keys without "type_de_prairie".
# Rebuilding the list (instead of list.remove) does not raise when the cell is run again.
identifier_columns = [column for column in identifier_columns if column != "type_de_prairie"]

croissance_et_climat_decadaires_preprocessed = df

### Annual valuations

In [11]:
# Annual table: identifier columns, the year and the cumulative growth,
# re-indexed from 0 and with the year stored as int64.
valuable_columns = identifier_columns + ["annee", "cumul_croissance"]
df = (
    valorisation_annuelle[valuable_columns]
    .reset_index(drop=True)
    .astype({"annee": "int64"})
)

# Same year filter as for the decadal table: keep annee > max(annee) - nb_annees.
annee_inf = df["annee"].max() - nb_annees
recent_years = df["annee"] > annee_inf
df = df.loc[recent_years, :]

valorisation_annuelle_preprocessed = df

### Merge (left join of annual valuations onto the decadal data)

In [12]:
# Attach the annual cumulative growth to every decadal row of the same
# identifier/year; decadal rows without a match are kept (left join).
merge_keys = identifier_columns + ["annee"]
data_set = pd.merge(
    croissance_et_climat_decadaires_preprocessed,
    valorisation_annuelle_preprocessed,
    how='left',
    on=merge_keys,
)

In [13]:
# Drop every identifier/year group whose decadal growth does not add up to the
# annual cumulative growth (sum of "croissance" / 100 versus "cumul_croissance").
group_keys = identifier_columns + ["annee"]
grouped = data_set.groupby(group_keys)

summed_growth = grouped["croissance"].sum() / 100
reported_cumul = grouped["cumul_croissance"].last()

# Compare with a tolerance: both sides are floats, and an exact != would flag
# groups that differ only by rounding noise. A missing cumul_croissance (NaN)
# is never "close", so such groups are still flagged, as they were with !=.
WrongCumul = pd.Series(
    ~np.isclose(summed_growth.to_numpy(dtype=float), reported_cumul.to_numpy(dtype=float)),
    index=summed_growth.index,
    name="wrong_cumul",  # explicit name instead of relying on the implicit column label 0
)

# Broadcast the per-group flag back onto the rows, then remove the flagged rows
# and the helper column. Rows whose flag is missing after the left join are kept.
data_set = data_set.merge(WrongCumul.reset_index(), how='left', on=group_keys)
WrongIndexes = data_set.index[data_set["wrong_cumul"].eq(True)]
data_set = data_set.drop(index=WrongIndexes).drop(columns="wrong_cumul")

# Number of groups that were flagged; last expression so the notebook displays it.
WrongCumul.sum()

In [14]:
# Keep only the model columns, then split them into target and features.
data_columns = ["Tmin", "Tmax", "Tmoy", "Rain", "RG", "im", "croissance", "cumul_croissance"]
data_set = data_set[data_columns]

# pop() removes "croissance" from data_set, so data_set_X (the same object as
# data_set) holds every remaining column and data_set_Y holds the target alone.
data_set_Y = data_set.pop("croissance").to_frame()
data_set_X = data_set

## Data Preprocessing

In [None]:
# Build supervised samples with a sliding window over consecutive, fixed-size
# groups of rows. Each sample x is the flattened feature rows of one window
# followed by the first (window_size - 1) growth values of that window; the
# target y is the growth value read at position index + window_size.
X, Y = [], []

last_progression = 0
window_size = 4
# Rows per group: 37 decades per year times nb_annees. A group is not a training batch.
# NOTE(review): assumes every identifier/year group contributes exactly this many
# consecutive rows; the identifier columns are no longer in data_set to verify it.
data_id_size = 37 * nb_annees

# Convert once to NumPy so the inner loop slices arrays instead of calling .iloc per window.
feature_values = data_set_X.values
growth_values = data_set_Y.iloc[:, 0].values

# Iterate over every complete group. The previous bound,
# range(0, len(data_set) - data_id_size, data_id_size), skipped the last complete
# group whenever the row count was an exact multiple of data_id_size (and produced
# no sample at all for a single group). A trailing partial group is still ignored.
nb_groups = len(data_set) // data_id_size

for group_number in range(nb_groups):
    data_id_index = group_number * data_id_size
    batch_end_index = data_id_index + data_id_size
    # slide the window inside the group; the target row must stay inside the group
    for index in range(data_id_index, batch_end_index - window_size):
        window_end = index + window_size
        x_c = feature_values[index:window_end].reshape(-1)     # features of all rows in the window
        x_g = growth_values[index:window_end - 1].reshape(-1)  # growth of the first window_size - 1 rows
        x = list(x_c) + list(x_g)

        # NOTE(review): the target is taken one row after the window, so the growth
        # at window_end - 1 is neither a feature nor the target. Confirm this
        # one-step gap is intended.
        y = growth_values[window_end]
        X.append(x)
        Y.append(y)
    # progress report, printed at multiples of 5 %
    progression = (group_number + 1) / nb_groups * 100
    if progression - last_progression > 1 and not int(progression) % 5:
        print(int(progression), '%')
        last_progression = progression

In [None]:
import pickle

# Persist the samples (X) and the targets (Y), one pickle file each.
for pickle_path, payload in (("x.pickle", X), ("y.pickle", Y)):
    with open(pickle_path, "wb") as outfile:
        pickle.dump(payload, outfile)