In [104]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.preprocessing import MinMaxScaler

In [105]:
# NOTE: this metrics helper is disabled -- the whole cell is commented out and
# none of the visible cells call it. Uncomment every line below to enable it.
# When enabled it prints a short report of regression metrics for the true
# targets `y_true` against the predictions `y_pred`: explained variance, R^2,
# MAE, MSE and RMSE (the square root of the MSE).
# `median_absolute_error` is computed but never printed.
# def regression_results(y_true, y_pred):
#     # Regression metrics
#     explained_variance = metrics.explained_variance_score(y_true, y_pred)
#     mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred) 
#     mse = metrics.mean_squared_error(y_true, y_pred) 
#     median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
#     r2 = metrics.r2_score(y_true, y_pred)
    
#     print('explained_variance: ', round(explained_variance, 4))    
#     print('r2: ', r2)
#     print('MAE: ', round(mean_absolute_error, 4))
#     print('MSE: ', round(mse, 4))
#     print('RMSE: ', round(np.sqrt(mse), 4))

In [106]:
# Load the two raw, semicolon-separated CSV files.
# The attributes table is transposed immediately after reading, so the rows of
# the file become columns and the file's column labels become the index.
df_atributes = pd.read_csv('data/atributos_soja.csv', sep=';').transpose()
df_price = pd.read_csv('data/soja_cepea_preco.csv', sep=';')
df_price

Unnamed: 0,data,preco_br,preco_us
0,29/07/1997,1804,1666
1,30/07/1997,1797,1659
2,31/07/1997,1817,1678
3,01/08/1997,181,1671
4,04/08/1997,1823,1683
...,...,...,...
5861,11/02/2021,15973,2962
5862,12/02/2021,16006,2976
5863,17/02/2021,15895,2933
5864,18/02/2021,15796,2904


### Tratamento do dataset de preços da soja (target)

In [107]:
# Derive the year of each quote from its date string: the text after the last
# '/' is kept (dates are shown as dd/mm/yyyy in the table above).
data = df_price['data']
ano = data.map(lambda date_str: date_str.rsplit('/', 1)[-1]).tolist()

# Add the year (still a string) as a new column and discard the full date.
df_price = df_price.assign(ano=ano).drop(columns=['data'])
df_price

Unnamed: 0,preco_br,preco_us,ano
0,1804,1666,1997
1,1797,1659,1997
2,1817,1678,1997
3,181,1671,1997
4,1823,1683,1997
...,...,...,...
5861,15973,2962,2021
5862,16006,2976,2021
5863,15895,2933,2021
5864,15796,2904,2021


### Tratamento do dataset de atributos da soja

In [108]:
# Promote the first row of the transposed attributes table to column names.
header = df_atributes.iloc[0]  # first row holds the column labels

# Keep every row after the header row. The explicit .copy() makes this an
# independent frame, so adding the 'ano' column below does not assign into a
# slice of the original (which raises SettingWithCopyWarning in pandas
# versions without Copy-on-Write).
df_atributes = df_atributes[1:].copy()
df_atributes.columns = header

# Expose the index as a regular column so it can be used as the merge key.
# NOTE(review): the index is presumably the year labels from the CSV header,
# as strings -- confirm against the raw file.
df_atributes['ano'] = df_atributes.index

### Unindo os 2 datasets

In [109]:
# Join the attributes onto the price rows using the shared 'ano' key.
# how='inner' is the pandas default: only keys present in both frames survive.
df = df_atributes.merge(df_price, how='inner', on='ano')
df

Unnamed: 0,area_plantada,area_colhida,qtd_produzida,valor_producao,rend_med_producao,pib,pib_per_capita,ano,preco_br,preco_us
0,11508120,11486478,26392636,6438004,2297,952089,572902,1997,1804,1666
1,11508120,11486478,26392636,6438004,2297,952089,572902,1997,1797,1659
2,11508120,11486478,26392636,6438004,2297,952089,572902,1997,1817,1678
3,11508120,11486478,26392636,6438004,2297,952089,572902,1997,181,1671
4,11508120,11486478,26392636,6438004,2297,952089,572902,1997,1823,1683
...,...,...,...,...,...,...,...,...,...,...
5579,3.59303e+07,3.58814e+07,1.14269e+08,1.25591e+08,3185,,,2019,8295,2025
5580,3.59303e+07,3.58814e+07,1.14269e+08,1.25591e+08,3185,,,2019,8279,2032
5581,3.59303e+07,3.58814e+07,1.14269e+08,1.25591e+08,3185,,,2019,8307,2045
5582,3.59303e+07,3.58814e+07,1.14269e+08,1.25591e+08,3185,,,2019,8271,2043


### Higienização

In [110]:
# Clean-up pipeline:
#   1. drop every row that has at least one missing value;
#   2. turn every cell into text and swap the decimal comma for a dot;
#   3. parse everything as float, then store the year column as integers.
df = (
    df.dropna()
    .astype(str)
    .apply(lambda column: column.str.replace(',', '.'))
    .astype(float)
    .astype({'ano': int})
)
df

Unnamed: 0,area_plantada,area_colhida,qtd_produzida,valor_producao,rend_med_producao,pib,pib_per_capita,ano,preco_br,preco_us
0,11508120.0,11486478.0,26392636.0,6438004.0,2297.0,952089.0,5729.02,1997,18.04,16.66
1,11508120.0,11486478.0,26392636.0,6438004.0,2297.0,952089.0,5729.02,1997,17.97,16.59
2,11508120.0,11486478.0,26392636.0,6438004.0,2297.0,952089.0,5729.02,1997,18.17,16.78
3,11508120.0,11486478.0,26392636.0,6438004.0,2297.0,952089.0,5729.02,1997,18.10,16.71
4,11508120.0,11486478.0,26392636.0,6438004.0,2297.0,952089.0,5729.02,1997,18.23,16.83
...,...,...,...,...,...,...,...,...,...,...
5328,34838351.0,34777936.0,117912450.0,127895812.0,3390.0,7004141.0,33593.82,2018,74.90,19.45
5329,34838351.0,34777936.0,117912450.0,127895812.0,3390.0,7004141.0,33593.82,2018,74.54,19.16
5330,34838351.0,34777936.0,117912450.0,127895812.0,3390.0,7004141.0,33593.82,2018,74.15,18.92
5331,34838351.0,34777936.0,117912450.0,127895812.0,3390.0,7004141.0,33593.82,2018,74.12,19.01


In [112]:
# Build walk-forward folds: each year is the training set for the year that
# immediately follows it, so the last year is only ever used for testing.
year_list = df['ano'].unique().tolist()
splits = {'train': [], 'test': []}

for idx, yr in enumerate(year_list[:-1]):
    train_yr = [yr]                       # one-year training window
    test_yr = year_list[idx + 1:idx + 2]  # the single following year
    print('TRAIN: ', train_yr, 'TEST: ',test_yr)

    # Store the rows belonging to each side of the fold.
    for fold_name, fold_years in (('train', train_yr), ('test', test_yr)):
        splits[fold_name].append(df[df['ano'].isin(fold_years)])
    

TRAIN:  [1997] TEST:  [1998]
TRAIN:  [1998] TEST:  [1999]
TRAIN:  [1999] TEST:  [2000]
TRAIN:  [2000] TEST:  [2001]
TRAIN:  [2001] TEST:  [2002]
TRAIN:  [2002] TEST:  [2003]
TRAIN:  [2003] TEST:  [2004]
TRAIN:  [2004] TEST:  [2005]
TRAIN:  [2005] TEST:  [2006]
TRAIN:  [2006] TEST:  [2007]
TRAIN:  [2007] TEST:  [2008]
TRAIN:  [2008] TEST:  [2009]
TRAIN:  [2009] TEST:  [2010]
TRAIN:  [2010] TEST:  [2011]
TRAIN:  [2011] TEST:  [2012]
TRAIN:  [2012] TEST:  [2013]
TRAIN:  [2013] TEST:  [2014]
TRAIN:  [2014] TEST:  [2015]
TRAIN:  [2015] TEST:  [2016]
TRAIN:  [2016] TEST:  [2017]
TRAIN:  [2017] TEST:  [2018]


In [115]:
# Fit one linear regression per fold and print its R^2 on the following year.
# The target is 'preco_us'; every other column of the fold is used as a feature.
model = LinearRegression()

for i, yr in enumerate(year_list[:-1]):
    train_fold = splits['train'][i]
    test_fold = splits['test'][i]

    X_train = train_fold.drop(columns=['preco_us']).values
    y_train = train_fold['preco_us'].values

    X_test = test_fold.drop(columns=['preco_us']).values
    y_test = test_fold['preco_us'].values

    # fit() returns the estimator itself, so the prediction can be chained.
    pred = model.fit(X_train, y_train).predict(X_test)

    r2 = metrics.r2_score(y_test, pred)
    print(r2)

R²: 
-0.3623321395828898
R²: 
-62.28065201560994
R²: 
0.26258647757541165
R²: 
-1.0620372540918765
R²: 
0.24129884830970272
R²: 
-0.03166867574919041
R²: 
0.8622079789836681
R²: 
-7.187150532353359
R²: 
0.258524983453693
R²: 
0.4496975936044435
R²: 
-1.9830948267365285
R²: 
-3.2018565639910346
R²: 
-0.03759898438227016
R²: 
-0.4331512132506701
R²: 
0.673069788052617
R²: 
-3.8124907517297135
R²: 
-0.5825482218953555
R²: 
-175.00426098686114
R²: 
-1.530274955066393
R²: 
-3.5834256316476383
R²: 
-10.875763256441694
