In [178]:
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [179]:
# Load the price series from data/data_soja_cepea.csv and index it by the
# parsed `data` (date) column.
df = (
    pd.read_csv('data/data_soja_cepea.csv')
    .assign(data=lambda frame: pd.to_datetime(frame['data']))
    .set_index('data')
    # The lag features built later use shift()/diff(), which operate on row
    # position; sorting guarantees the rows are in chronological order even if
    # the CSV is not.
    .sort_index()
)
df

Unnamed: 0_level_0,preco_br
data,Unnamed: 1_level_1
1997-07-31,18.04
1997-08-01,17.97
1997-08-04,18.17
1997-08-05,18.10
1997-08-06,18.23
...,...
2021-03-08,164.85
2021-03-09,166.40
2021-03-10,169.58
2021-03-11,171.42


### Feature Engineering

In [180]:
# Build lagged-price features on an explicit copy of the price column, so the
# column assignments below never write into a frame that pandas tracks as a
# slice of `df` (the original pattern can raise SettingWithCopyWarning).
df2 = df[['preco_br']].copy()

# All lags are counted in rows of the frame, not in calendar days.
df2['yesterday'] = df2['preco_br'].shift()      # price from the previous row
df2['yesterday_dif'] = df2['yesterday'].diff()  # change between the two previous rows
df2['yesterday_1'] = df2['yesterday'].shift()   # price from two rows back

# NOTE(review): this is `yesterday` shifted by 7 more rows, i.e. the price from
# 8 rows back -- not necessarily one calendar week. Confirm the name matches
# the intent.
df2['last_week'] = df2['yesterday'].shift(7)

# Longer-horizon lags (shift(30), shift(365)) and their diffs are currently disabled.

# The shifts leave NaN in the leading rows; keep only fully populated rows.
df2 = df2.dropna()
df2

Unnamed: 0_level_0,preco_br,yesterday,yesterday_dif,yesterday_1,last_week
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1997-08-12,18.05,18.07,-0.08,18.15,18.04
1997-08-13,17.85,18.05,-0.02,18.07,17.97
1997-08-14,17.99,17.85,-0.20,18.05,18.17
1997-08-15,18.00,17.99,0.14,17.85,18.10
1997-08-18,18.07,18.00,0.01,17.99,18.23
...,...,...,...,...,...
2021-03-08,164.85,165.47,1.92,163.55,158.02
2021-03-09,166.40,164.85,-0.62,165.47,159.32
2021-03-10,169.58,166.40,1.55,164.85,159.54
2021-03-11,171.42,169.58,3.18,166.40,160.30


In [181]:
# Persist the engineered feature table. Create the output folder first so a
# fresh checkout does not fail when data/final/ is missing.
output_path = Path('data/final/soja_cepea_final.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df2.to_csv(output_path)

In [182]:
# Inspect the rows for January 2020. Partial-string date selection of rows has
# to go through .loc: doing it with df['2020-01'] is deprecated in pandas 1.2
# and removed in pandas 2.0.
df2.loc['2020-01']

Unnamed: 0_level_0,preco_br,yesterday,yesterday_dif,yesterday_1,last_week
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,82.71,83.07,0.28,82.79,83.11
2020-01-03,82.8,82.71,-0.36,83.07,82.76
2020-01-06,82.92,82.8,0.09,82.71,82.75
2020-01-07,83.01,82.92,0.12,82.8,82.85
2020-01-08,82.83,83.01,0.09,82.92,82.6
2020-01-09,82.81,82.83,-0.18,83.01,82.95
2020-01-10,82.95,82.81,-0.02,82.83,82.79
2020-01-13,82.83,82.95,0.14,82.81,83.07
2020-01-14,82.83,82.83,-0.12,82.95,82.71
2020-01-15,83.59,82.83,0.0,82.83,82.8


### Cross-Validation on Time Series 

In [183]:
# Expanding-window splits by calendar year: each fold trains on the first
# `n_train` years present in the index and tests on the year that follows.
year_list = df2.index.year.unique().tolist()
splits = {'train': [], 'test': []}

row_years = df2.index.year  # year of every row, reused for both masks
for n_train in range(1, len(year_list)):
    train_yr = year_list[:n_train]
    test_yr = [year_list[n_train]]
    print('TRAIN: ', train_yr, 'TEST: ',test_yr)

    in_train = row_years.isin(train_yr)
    in_test = row_years.isin(test_yr)
    splits['train'].append(df2.loc[in_train, :])
    splits['test'].append(df2.loc[in_test, :])
    

TRAIN:  [1997] TEST:  [1998]
TRAIN:  [1997, 1998] TEST:  [1999]
TRAIN:  [1997, 1998, 1999] TEST:  [2000]
TRAIN:  [1997, 1998, 1999, 2000] TEST:  [2001]
TRAIN:  [1997, 1998, 1999, 2000, 2001] TEST:  [2002]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002] TEST:  [2003]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003] TEST:  [2004]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004] TEST:  [2005]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005] TEST:  [2006]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006] TEST:  [2007]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007] TEST:  [2008]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008] TEST:  [2009]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009] TEST:  [2010]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010] TEST:  [2011]
TRAIN:  [1997, 1998, 1999, 2000, 2001, 2002, 200

In [184]:
model = LinearRegression()

# One fit/score pass per fold stored in `splits`; the same estimator object is
# refit on each fold's training frame and scored on the matching test frame.
for train_fold, test_fold in zip(splits['train'], splits['test']):
    # Features are every column except the target `preco_br`.
    X_train = train_fold.drop(columns=['preco_br']).values
    y_train = train_fold['preco_br'].values

    X_test = test_fold.drop(columns=['preco_br']).values
    y_test = test_fold['preco_br'].values

    pred = model.fit(X_train, y_train).predict(X_test)

    r2 = metrics.r2_score(y_test, pred)
    print(r2)

0.9647473595766295
0.9838644280205713
0.9723161258115381
0.9957975600068381
0.9969287318781354
0.9888294953266141
0.9923628064054468
0.9555945941032941
0.9883423648532476
0.9966011791987118
0.9446223642808493
0.9706159526654996
0.9957312877410435
0.9623741701532316
0.9984030618147888
0.9886500981660539
0.9745588400655343
0.9928081746998386
0.9813450890464588
0.9683720171210196
0.9893617168437778
0.9890124847562364
0.9985888275663115
0.9092292483610024


### Testando a regressão com os dados aleatorizados

In [185]:
# Feature matrix: every column except the target, as a NumPy array.
X = df2.drop(columns=['preco_br']).values
# Target: the `preco_br` column, kept as a pandas Series.
y = df2['preco_br']

In [186]:
# 70/30 hold-out split with a fixed seed for reproducibility. train_test_split
# shuffles rows by default, so chronological order is not preserved here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [187]:
# Fit a linear regression on the training partition and report R^2 on the
# held-out partition (the last expression is the cell's displayed output).
model_v1 = LinearRegression().fit(X_train, y_train)
predict_v1 = model_v1.predict(X_test)
metrics.r2_score(y_true=y_test, y_pred=predict_v1)

0.9996253312086565