In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics

In [73]:
def regression_results(y_true, y_pred):
    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred) 
    mse = metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    
    print('explained_variance: ', round(explained_variance, 4))    
    print('r2: ', r2)
    print('MAE: ', round(mean_absolute_error, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))

In [74]:
df = pd.read_csv('data/final.csv')
df

Unnamed: 0,data,prec_total_diario,pressao_atm_med_dia,temp_ponto_orvalho_med,temp_max_dia,temp_med_dia,temp_min_dia,umi_rel_ar_med,umi_rel_ar_min,vento_raj_max_dia,vento_vel_med_dia,ano,area_plantada,area_colhida,qtd_produzida
0,2003-01-17,0.0,950.507143,21.254167,30.3,24.883333,22.0,81.458333,54.0,7.2,1.935714,2003,320186.0,320186.0,902924.0
1,2003-01-18,26.6,951.679167,20.950000,25.0,22.454167,21.9,91.250000,82.0,10.6,2.062500,2003,320186.0,320186.0,902924.0
2,2003-01-19,49.2,951.700000,20.483333,25.6,22.445833,21.2,88.833333,73.0,9.5,2.379167,2003,320186.0,320186.0,902924.0
3,2003-01-20,5.6,951.050000,20.483333,28.6,23.491667,21.2,83.833333,63.0,9.5,2.675000,2003,320186.0,320186.0,902924.0
4,2003-01-21,0.4,949.629167,19.716667,28.5,23.829167,21.1,79.041667,57.0,9.7,2.487500,2003,320186.0,320186.0,902924.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4151,2019-03-28,10.4,952.745833,22.754167,28.0,23.495833,22.3,95.833333,82.0,6.5,1.450000,2019,380000.0,380000.0,1276800.0
4152,2019-03-29,3.0,953.150000,22.504167,31.2,24.895833,21.1,87.916667,62.0,9.1,1.333333,2019,380000.0,380000.0,1276800.0
4153,2019-03-30,1.8,953.454167,22.433333,30.8,24.612500,21.2,88.875000,63.0,6.9,1.587500,2019,380000.0,380000.0,1276800.0
4154,2019-03-31,1.4,952.016667,22.966667,31.5,25.708333,21.9,85.916667,60.0,7.2,1.162500,2019,380000.0,380000.0,1276800.0


In [75]:
tscv = TimeSeriesSplit()

In [76]:
year_list = df['ano'].unique().tolist()
splits = {'train': [], 'test': []}

for idx, yr in enumerate(year_list[:-1]):
    train_yr = year_list[:idx+1]
    test_yr = [year_list[idx+1]]
    print('TRAIN: ', train_yr, 'TEST: ',test_yr)
    
    splits['train'].append(df.loc[df.ano.isin(train_yr), :])
    splits['test'].append(df.loc[df.ano.isin(test_yr), :])
    

TRAIN:  [2003] TEST:  [2004]
TRAIN:  [2003, 2004] TEST:  [2005]
TRAIN:  [2003, 2004, 2005] TEST:  [2006]
TRAIN:  [2003, 2004, 2005, 2006] TEST:  [2007]
TRAIN:  [2003, 2004, 2005, 2006, 2007] TEST:  [2008]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008] TEST:  [2009]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008, 2009] TEST:  [2010]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010] TEST:  [2011]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011] TEST:  [2012]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012] TEST:  [2013]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013] TEST:  [2014]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014] TEST:  [2015]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015] TEST:  [2016]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016] TEST:  [2017]
TRAIN:  [2003, 2004, 2005, 2006, 2007, 2008, 200

In [115]:
model = LinearRegression()

for i, yr in enumerate(year_list[:-1]):
    print('\n\n ----------- Roud '+str(i)+ ' ------------\n')
    X_train = splits['train'][i].drop(['umi_rel_ar_med', 'data', 'ano'], axis = 1).values # X
    y_train = splits['train'][i]['umi_rel_ar_med'].values # Y
    
    X_test =  splits['test'][i].drop(['umi_rel_ar_med', 'data', 'ano'], axis = 1).values
    y_test = splits['test'][i]['umi_rel_ar_med'].values
    
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    
    r2 = metrics.r2_score(y_test, pred)
    print(r2)



 ----------- Roud 0 ------------

0.9909638005005712


 ----------- Roud 1 ------------

-1.077590064205714e+25


 ----------- Roud 2 ------------

0.9882310978729555


 ----------- Roud 3 ------------

0.9844234539552928


 ----------- Roud 4 ------------

0.9909564360625638


 ----------- Roud 5 ------------

0.9736917024371932


 ----------- Roud 6 ------------

0.9844533397974531


 ----------- Roud 7 ------------

0.9853234273196528


 ----------- Roud 8 ------------

0.9930497160503019


 ----------- Roud 9 ------------

0.9891806463390536


 ----------- Roud 10 ------------

0.9850348675093781


 ----------- Roud 11 ------------

0.990965083101736


 ----------- Roud 12 ------------

0.9863426427091341


 ----------- Roud 13 ------------

0.9789817813483972


 ----------- Roud 14 ------------

0.9901337390193646


 ----------- Roud 15 ------------

0.9653136557536459
