In [1]:
import numpy as np
import pandas as pd

import scipy

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

import matplotlib.pyplot as plt

import math
import time
from sklearn.model_selection import KFold

In [2]:
alltime = time.time()

combined = pd.read_csv('../data/all-agg.csv')
feats = ['YEAR', 'GSP', 'GDD', 'GSTmax', 'GSTmin', 'frost', 'summer' ,'HWI', 'CWI', 'dry' ,'wet', 'PRCP95P']
X = combined[feats]
# a subset of features that performs better
X = combined[['YEAR', 'GSP', 'GSTmax', 'frost', 'summer', 'PRCP95P']]

y = {
    'corn': combined['corn'],
    'beans': combined['beans']
}

N=500

results = {
    'corn': {},
    'beans': {}
}

training = False

In [3]:
def run_cv(model, y, N=100):
    lrmse = []
    lr2 = []
    cv = KFold(n_splits=5, shuffle=True)
    scoring = {
        'neg_mean_squared_error': 'neg_mean_squared_error',
        'r2': 'r2'
    }
    for _ in range(N):
        sc = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)
        lrmse.append(math.sqrt(-sc['test_neg_mean_squared_error'].mean()))
        lr2.append(sc['test_r2'].mean())
        
    return (lrmse, lr2)

In [4]:
def resdict(lrmse, lr2):
    return {
        'rmse': np.mean(lrmse),
        'r2': np.mean(lr2),
        'rmse_std': np.std(lrmse),
        'r2_std': np.std(lr2),
    }

In [5]:
training = False
ab_model = {}
if training:
    base_est = DecisionTreeRegressor(max_depth=4)

    for _ in range(10):
        for crop in y.keys():
            parameters = {
                'n_estimators': scipy.stats.randint(low=100, high=400),
                'learning_rate': scipy.stats.uniform(loc=0.1, scale=0.9)
            }

            t = time.time()
            cv = KFold(n_splits=5, shuffle=True)
            gs = RandomizedSearchCV(AdaBoostRegressor(base_est), parameters, n_iter=1,
                                    cv=cv, n_jobs=-1, scoring='r2')
            gs.fit(X, y[crop])
            print('grid search %f sec' % (time.time() - t))

            ab_model[crop] = gs.best_estimator_
            print(ab_model)

            lrmse, lr2 = run_cv(ab_model[crop], y[crop], N=N)
            print(resdict(lrmse, lr2))
else:
    ab_model['corn'] = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=3),
                  learning_rate=0.85, n_estimators=275)
    ab_model['beans'] = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=3),
                  learning_rate=0.85, n_estimators=275)

In [6]:
print(ab_model)
for crop in y.keys():
    lrmse, lr2 = run_cv(ab_model[crop], y[crop], N=N)
    results[crop]['ab'] = resdict(lrmse, lr2)
    
print(results['corn']['ab'])
print(results['beans']['ab'])

{'corn': AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=3),
                  learning_rate=0.85, n_estimators=275), 'beans': AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=3),
                  learning_rate=0.85, n_estimators=275)}
{'rmse': 0.37689036888155053, 'r2': 0.8373271508150764, 'rmse_std': 0.025443526321151967, 'r2_std': 0.02799078698306834}
{'rmse': 0.46274867679527004, 'r2': 0.7564484748927284, 'rmse_std': 0.019616936795223576, 'r2_std': 0.035252617002168425}


In [17]:
training = False
if training:
    parameters = {
        'n_estimators': scipy.stats.randint(low=100, high=400),
        'learning_rate': scipy.stats.uniform(loc=0.1, scale=0.9)
    }

    gb_model = {}
    for crop in y.keys():
        t = time.time()
        cv = KFold(n_splits=5, shuffle=True)
        gs = RandomizedSearchCV(GradientBoostingRegressor(), parameters, n_iter=200,
                                cv=cv, n_jobs=-1, scoring='r2')
        gs.fit(X, y[crop])
        print('grid search %f sec' % (time.time() - t))

        gb_model[crop] = gs.best_estimator_
else:
    gb_model={'corn': GradientBoostingRegressor(learning_rate=0.55, n_estimators=125), 'beans': GradientBoostingRegressor(learning_rate=0.55, n_estimators=125)}

print(gb_model)

{'corn': GradientBoostingRegressor(learning_rate=0.55, n_estimators=125), 'beans': GradientBoostingRegressor(learning_rate=0.55, n_estimators=125)}


In [18]:
for crop in y.keys():
    lrmse, lr2 = run_cv(gb_model[crop], y[crop], N=N)
    results[crop]['gb'] = resdict(lrmse, lr2)

print(results['corn']['gb'])
print(results['beans']['gb'])

{'rmse': 0.38460701247894824, 'r2': 0.830640963416758, 'rmse_std': 0.028215080438080194, 'r2_std': 0.0309159727496158}
{'rmse': 0.5029265280966241, 'r2': 0.7129133438204611, 'rmse_std': 0.030693070882966984, 'r2_std': 0.04384260099669492}


In [9]:
for crop in y.keys():
    svr_model = SVR(kernel='linear', C=0.1, epsilon=0.15)
    lrmse, lr2 = run_cv(svr_model, y[crop], N=N)
    results[crop]['svr'] = resdict(lrmse, lr2)

print(results['corn']['svr'])
print(results['beans']['svr'])

{'rmse': 0.3525606165495622, 'r2': 0.8599002013417177, 'rmse_std': 0.009475514889185205, 'r2_std': 0.015585704760093091}
{'rmse': 0.3894697639566626, 'r2': 0.8260083185354391, 'rmse_std': 0.009602022382671164, 'r2_std': 0.021109948855027472}


In [15]:
for crop in y.keys():
    svrq_model = SVR(kernel='poly', coef0=9.44, C=0.087, epsilon=0.24, degree=2)
    lrmse, lr2 = run_cv(svrq_model, y[crop], N=N)
    results[crop]['svrq'] = resdict(lrmse, lr2)

print(results['corn']['svrq'])
print(results['beans']['svrq'])

{'rmse': 0.3179872760469577, 'r2': 0.8850082767805576, 'rmse_std': 0.011489370882797664, 'r2_std': 0.014828877080184223}
{'rmse': 0.37595657264279325, 'r2': 0.8358372525837884, 'rmse_std': 0.012394556401895001, 'r2_std': 0.021111304579894807}


In [11]:
for crop in y.keys():
    pca_model = make_pipeline(PCA(n_components=5), LinearRegression())
    lrmse, lr2 = run_cv(pca_model, y[crop], N=N)
    results[crop]['pca'] = resdict(lrmse, lr2)

print(results['corn']['pca'])
print(results['beans']['pca'])

{'rmse': 0.3606375114228954, 'r2': 0.8516465204096294, 'rmse_std': 0.012169350932533482, 'r2_std': 0.020996309459964797}
{'rmse': 0.39094703544519954, 'r2': 0.8247037686944088, 'rmse_std': 0.01120265274685137, 'r2_std': 0.020598225399405812}


In [12]:
for crop in y.keys():
    ridge_model = Ridge(2.7)
    lrmse, lr2 = run_cv(ridge_model, y[crop], N=N)
    results[crop]['ridge'] = resdict(lrmse, lr2)

print(results['corn']['ridge'])
print(results['beans']['ridge'])

{'rmse': 0.36148862876396526, 'r2': 0.8514189631439746, 'rmse_std': 0.01070522374940561, 'r2_std': 0.018528688440966445}
{'rmse': 0.3882292893777043, 'r2': 0.8281620531946978, 'rmse_std': 0.010971501569235988, 'r2_std': 0.021317291881433234}


In [13]:
for crop in y.keys():
    rf_model = RandomForestRegressor(min_samples_leaf=3, n_estimators=400)
    lrmse, lr2 = run_cv(ridge_model, y[crop], N=N)
    results[crop]['rf'] = resdict(lrmse, lr2)

print(results['corn']['rf'])
print(results['beans']['rf'])

{'rmse': 0.3618571168383701, 'r2': 0.8520272550004429, 'rmse_std': 0.011068148037869771, 'r2_std': 0.01970798859135039}
{'rmse': 0.38938435949123684, 'r2': 0.8278231181417143, 'rmse_std': 0.01035006547824853, 'r2_std': 0.01855764236033004}


In [16]:
for crop in y.keys():
    df = pd.DataFrame(results[crop])
    print()
    print(crop)
    print(df.T.sort_values('r2', ascending=False))


corn
           rmse        r2  rmse_std    r2_std
svrq   0.317987  0.885008  0.011489  0.014829
svr    0.352561  0.859900  0.009476  0.015586
rf     0.361857  0.852027  0.011068  0.019708
pca    0.360638  0.851647  0.012169  0.020996
ridge  0.361489  0.851419  0.010705  0.018529
ab     0.376890  0.837327  0.025444  0.027991
gb     0.387371  0.828982  0.028114  0.029253

beans
           rmse        r2  rmse_std    r2_std
svrq   0.375957  0.835837  0.012395  0.021111
ridge  0.388229  0.828162  0.010972  0.021317
rf     0.389384  0.827823  0.010350  0.018558
svr    0.389470  0.826008  0.009602  0.021110
pca    0.390947  0.824704  0.011203  0.020598
ab     0.462749  0.756448  0.019617  0.035253
gb     0.486159  0.731827  0.024370  0.035773
