In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.metrics as metrics
import numpy as np

In [55]:
df = pd.read_csv('data/soja_cepea_brasil.csv', delimiter = ';')
df['data'] = pd.to_datetime(df['data'])
df = df.set_index('data')
df = df.sort_index()

In [56]:
df = df.astype(str).stack().str.replace(',','.').unstack().astype(float)
# df = df.drop(['preco_us'], axis = 1)

### Feature Engineering

In [57]:
df2 = df[['preco_br']]

df2.loc[:, 'yesterday'] = df2.loc[:,'preco_br'].shift()
df2.loc[:, 'yesterday_dif'] = df2.loc[:, 'yesterday'].diff()

df2.loc[:, 'yesterday_1'] = df2.loc[:,'yesterday'].shift()
# df2.loc[:, 'yesterday_1_dif'] = df2.loc[:, 'yesterday_1'].diff()

df2.loc[:, 'last_week'] = df2.loc[:, 'preco_br'].shift(7)
# df2.loc[:, 'last_week_diff'] = df2.loc[:, 'last_week'].diff()

df2.loc[:, 'last_year'] = df2.loc[:, 'preco_br'].shift(365)
# df2.loc[:, 'last_year_diff'] = df2.loc[:, 'last_year'].diff()

df2.loc[:, 'last_month'] = df2.loc[:, 'preco_br'].shift(30)
# df2.loc[:, 'last_month_diff'] = df2.loc[:, 'last_month'].diff()

df2 = df2.dropna()
df2

Unnamed: 0_level_0,preco_br,yesterday,yesterday_dif,yesterday_1,last_week,last_year,last_month
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1999-01-18,14.39,13.91,0.02,13.89,19.83,18.10,14.88
1999-01-19,15.13,14.39,0.48,13.91,21.10,19.09,17.86
1999-01-20,15.62,15.13,0.74,14.39,21.46,19.24,15.83
1999-01-21,16.97,15.62,0.49,15.13,20.56,20.85,14.54
1999-01-22,17.16,16.97,1.35,15.62,13.86,19.21,14.58
...,...,...,...,...,...,...,...
2021-10-02,161.66,164.64,0.84,163.80,155.72,82.36,144.73
2021-11-01,164.28,161.66,-2.98,164.64,165.29,82.10,144.89
2021-11-02,159.73,164.28,2.62,161.66,158.46,82.15,166.63
2021-12-01,165.92,159.73,-4.55,164.28,159.16,82.37,166.53


In [58]:
df2.to_csv('data/final/soja_cepea_final.csv')

In [59]:
year_list = df2.index.year.unique().tolist()
splits = {'train': [], 'test': []}

for idx, yr in enumerate(year_list[:-1]):
    train_yr = year_list[:idx+1]
    test_yr = [year_list[idx+1]]
    print('TRAIN: ', train_yr, 'TEST: ',test_yr)
    
    splits['train'].append(df2.loc[df2.index.year.isin(train_yr), :])
    splits['test'].append(df2.loc[df2.index.year.isin(test_yr), :])
    

TRAIN:  [1999] TEST:  [2000]
TRAIN:  [1999, 2000] TEST:  [2001]
TRAIN:  [1999, 2000, 2001] TEST:  [2002]
TRAIN:  [1999, 2000, 2001, 2002] TEST:  [2003]
TRAIN:  [1999, 2000, 2001, 2002, 2003] TEST:  [2004]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004] TEST:  [2005]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004, 2005] TEST:  [2006]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006] TEST:  [2007]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007] TEST:  [2008]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008] TEST:  [2009]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009] TEST:  [2010]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010] TEST:  [2011]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011] TEST:  [2012]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012] TEST:  [2013]
TRAIN:  [1999, 2000, 2001, 2002, 2003, 2004, 200

In [60]:
model = LinearRegression()

for i, yr in enumerate(year_list[:-1]):
    X_train = splits['train'][i].drop(['preco_br'], axis = 1).values # X
    y_train = splits['train'][i]['preco_br'].values # Y
    
    X_test =  splits['test'][i].drop(['preco_br'], axis = 1).values
    y_test = splits['test'][i]['preco_br'].values
    
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    
    r2 = metrics.r2_score(y_test, pred)
    print(r2)

0.25431118321534985
0.5827414871539407
0.6930488759548136
0.47182083611742565
0.620964184288983
0.3324776460326927
0.47423478486311554
0.6768805612848392
-0.29192037115075786
0.41353111455006264
0.5871780919555272
0.22624987571448563
0.6592714138770502
0.4934352999133126
0.4641384222185645
0.6938690208674296
0.33525362329588493
0.23926312798972416
0.49341107866990586
0.6069006429992245
0.6443945586410968
-2.031430223394808


In [61]:
# X_train = df ['2010':'2016'].drop(['price_br'], axis = 1)
# y_train = df.loc ['2010':'2016', 'price_br']
# X_test = df ['2017':].drop(['price_br'], axis = 1)
# y_test = df.loc ['2017':, 'price_br']

In [7]:
X = df2.drop(['preco_br'], axis = 1).values
y = df2.loc[:, 'preco_br']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [9]:
model_v1 = LinearRegression()
model_v1.fit(X_train, y_train)
predict_v1 = model_v1.predict(X_test)
metrics.r2_score(y_test, predict_v1)

0.9732448177420547