In [130]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.preprocessing import MinMaxScaler

In [131]:
# def regression_results(y_true, y_pred):
#     # Regression metrics
#     explained_variance = metrics.explained_variance_score(y_true, y_pred)
#     mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred) 
#     mse = metrics.mean_squared_error(y_true, y_pred) 
#     median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
#     r2 = metrics.r2_score(y_true, y_pred)
    
#     print('explained_variance: ', round(explained_variance, 4))    
#     print('r2: ', r2)
#     print('MAE: ', round(mean_absolute_error, 4))
#     print('MSE: ', round(mse, 4))
#     print('RMSE: ', round(np.sqrt(mse), 4))

In [132]:
df_atributes = pd.read_csv('data/atributos_soja.csv', delimiter = ';').T # read with transpose dataset
df_price = pd.read_csv('data/soja_cepea_preco.csv', delimiter = ';')
df_price

Unnamed: 0,data,preco_br,preco_us
0,29/07/1997,1804,1666
1,30/07/1997,1797,1659
2,31/07/1997,1817,1678
3,01/08/1997,181,1671
4,04/08/1997,1823,1683
...,...,...,...
5861,11/02/2021,15973,2962
5862,12/02/2021,16006,2976
5863,17/02/2021,15895,2933
5864,18/02/2021,15796,2904


### Tratamento dataset de preços da soja (target)

In [133]:
data = df_price['data']
ano = [a.split('/')[-1] for a in data]
df_price['ano'] = ano
df_price = df_price.drop(['data'], axis = 1)
df_price

Unnamed: 0,preco_br,preco_us,ano
0,1804,1666,1997
1,1797,1659,1997
2,1817,1678,1997
3,181,1671,1997
4,1823,1683,1997
...,...,...,...
5861,15973,2962,2021
5862,16006,2976,2021
5863,15895,2933,2021
5864,15796,2904,2021


### Tratamento dataset de atributos da soja

In [134]:
header = df_atributes.iloc[0] #grab the first row for the header
df_atributes = df_atributes[1:] #take the data less the header row
df_atributes.columns = header #set the header row as the df header
df_atributes['ano'] = df_atributes.index

### Unindo os 2 datasets

In [135]:
df = pd.merge(df_atributes, df_price, on = 'ano')
df

Unnamed: 0,area_plantada,area_colhida,qtd_produzida,valor_producao,rend_med_producao,pib,pib_per_capita,ano,preco_br,preco_us
0,11508120,11486478,26392636,6438004,2297,952089,572902,1997,1804,1666
1,11508120,11486478,26392636,6438004,2297,952089,572902,1997,1797,1659
2,11508120,11486478,26392636,6438004,2297,952089,572902,1997,1817,1678
3,11508120,11486478,26392636,6438004,2297,952089,572902,1997,181,1671
4,11508120,11486478,26392636,6438004,2297,952089,572902,1997,1823,1683
...,...,...,...,...,...,...,...,...,...,...
5579,3.59303e+07,3.58814e+07,1.14269e+08,1.25591e+08,3185,,,2019,8295,2025
5580,3.59303e+07,3.58814e+07,1.14269e+08,1.25591e+08,3185,,,2019,8279,2032
5581,3.59303e+07,3.58814e+07,1.14269e+08,1.25591e+08,3185,,,2019,8307,2045
5582,3.59303e+07,3.58814e+07,1.14269e+08,1.25591e+08,3185,,,2019,8271,2043


### Higienização

In [136]:
df = df.dropna()
df = df.astype(str).stack().str.replace(',','.').unstack().astype(float)
df['ano'] = df['ano'].astype(int)

In [137]:
year_list = df['ano'].unique().tolist()
splits = {'train': [], 'test': []}

for idx, yr in enumerate(year_list[:-1]):
    train_yr = year_list[:idx+1]
    test_yr = [year_list[idx+1]]
    print('TRAIN: ', train_yr, 'TEST: ',test_yr)
    
    splits['train'].append(df.loc[df.ano.isin(train_yr), :])
    splits['test'].append(df.loc[df.ano.isin(test_yr), :])
    

TRAIN:  [1997] TEST:  [1998]
TRAIN:  [1997, 1998] TEST:  [1999]
TRAIN:  [1997, 1998, 1999] TEST:  [2000]
TRAIN:  [1997, 1998, 1999, 2000] TEST:  [2001]
TRAIN:  [1997, 1998, 1999, 2000, 2001] TEST:  [2002]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002] TEST:  [2003]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003] TEST:  [2004]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004] TEST:  [2005]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005] TEST:  [2006]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006] TEST:  [2007]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007] TEST:  [2008]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008] TEST:  [2009]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009] TEST:  [2010]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010] TEST:  [2011]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 200

In [138]:
model = LinearRegression()

for i, yr in enumerate(year_list[:-1]):
    X_train = splits['train'][i].drop(['preco_us'], axis = 1).values # X
    y_train = splits['train'][i]['preco_us'].values # Y
    
    X_test =  splits['test'][i].drop(['preco_us'], axis = 1).values
    y_test = splits['test'][i]['preco_us'].values
    
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    
    r2 = metrics.r2_score(y_test, pred)
    print(r2)

-0.3623321395828898
-5.4568350633694435e+25
-2.3910033480080166e+23
-3440.281599438019
-2.82659699625952e+23
-23.109729445761428
-70.72313925348759
-4419.286620866772
-56574.32723575937
-15.235677691049599
0.1979600039565521
-2.329948345488427
-2.298411946947467
-0.15760420461854419
0.07176265366410384
-2.4584739186822993
-1.9455981964277087
-19.375687661550245
-5.139787393470917
-60.01458058589338
-3.4829182339876867
