In [1]:
import numpy as np
import pandas as pd

import scipy

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

import matplotlib.pyplot as plt

import math
import time
from sklearn.model_selection import KFold

In [2]:
alltime = time.time()

combined = pd.read_csv('../data/all-agg.csv')
feats = ['YEAR', 'GSP', 'GDD', 'GSTmax', 'GSTmin', 'frost', 'summer' ,'HWI', 'CWI', 'dry' ,'wet', 'PRCP95P']
X = combined[feats]
# a subset of features that performs better
X = combined[['YEAR', 'GSP', 'GSTmax', 'frost', 'summer', 'PRCP95P']]

y = {
    'corn': combined['corn'],
    'beans': combined['beans']
}

N=500

results = {
    'corn': {},
    'beans': {}
}

In [3]:
def run_cv(model, y, N=100):
    lrmse = []
    lr2 = []
    cv = KFold(n_splits=5, shuffle=True)
    scoring = {
        'neg_mean_squared_error': 'neg_mean_squared_error',
        'r2': 'r2'
    }
    for _ in range(N):
        sc = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)
        lrmse.append(math.sqrt(-sc['test_neg_mean_squared_error'].mean()))
        lr2.append(sc['test_r2'].mean())
        
    return (lrmse, lr2)

In [4]:
def resdict(lrmse, lr2):
    return {
        'rmse': np.mean(lrmse),
        'r2': np.mean(lr2),
        'rmse_std': np.std(lrmse),
        'r2_std': np.std(lr2),
    }

In [5]:
base_est = DecisionTreeRegressor(max_depth=3)

ab_model = {}
for crop in y.keys():
    parameters = {
        'n_estimators': [200,300,500],
        'learning_rate': [0.2, 0.4, 0.6, 0.8, 1.0]
    }

    t = time.time()
    cv = KFold(n_splits=5, shuffle=True)
    gs = GridSearchCV(AdaBoostRegressor(base_est), parameters, cv=cv, n_jobs=-1, scoring='r2')
    gs.fit(X, y[crop])
    print('grid search %f sec' % (time.time() - t))

    ab_model[crop] = gs.best_estimator_

grid search 3.830197 sec
grid search 3.274939 sec


In [6]:
for crop in y.keys():
    lrmse, lr2 = run_cv(ab_model[crop], y[crop], N=N)
    results[crop]['ab'] = resdict(lrmse, lr2)
    
print(results['corn']['ab'])
print(results['beans']['ab'])

{'rmse': 0.3759118045843643, 'r2': 0.8373339708886751, 'rmse_std': 0.026133847885898284, 'r2_std': 0.028540964019921865}
{'rmse': 0.4629232844889582, 'r2': 0.7559436017449961, 'rmse_std': 0.019671289930085687, 'r2_std': 0.03110317495153947}


In [7]:
parameters = {
    'n_estimators': [200,300,500],
    'learning_rate': [0.2, 0.4, 0.6, 0.8, 1.0]
}

gb_model = {}
for crop in y.keys():
    t = time.time()
    cv = KFold(n_splits=5, shuffle=True)
    gs = GridSearchCV(GradientBoostingRegressor(), parameters, cv=cv, n_jobs=-1, scoring='r2')
    gs.fit(X, y[crop])
    print('grid search %f sec' % (time.time() - t))

    gb_model[crop] = gs.best_estimator_


grid search 1.050233 sec
grid search 1.091724 sec


In [8]:
for crop in y.keys():
    lrmse, lr2 = run_cv(gb_model[crop], y[crop], N=N)
    results[crop]['gb'] = resdict(lrmse, lr2)

print(results['corn']['gb'])
print(results['beans']['gb'])

{'rmse': 0.3788636321960838, 'r2': 0.8352624657852562, 'rmse_std': 0.02637442989973804, 'r2_std': 0.028480159711414835}
{'rmse': 0.49108928330468743, 'r2': 0.7245995144002588, 'rmse_std': 0.026052846560337736, 'r2_std': 0.0386444216525506}


In [9]:
for crop in y.keys():
    svr_model = SVR(kernel='linear', C=0.1, epsilon=0.15)
    lrmse, lr2 = run_cv(svr_model, y[crop], N=N)
    results[crop]['svr'] = resdict(lrmse, lr2)

print(results['corn']['svr'])
print(results['beans']['svr'])

{'rmse': 0.35302147401764283, 'r2': 0.8583237465407576, 'rmse_std': 0.00961424233452572, 'r2_std': 0.017648942428249062}
{'rmse': 0.38852794167539356, 'r2': 0.8260493807294674, 'rmse_std': 0.009172036790616744, 'r2_std': 0.02224923197848941}


In [10]:
for crop in y.keys():
    svrq_model = SVR(kernel='poly', coef0=9.44, C=0.087, epsilon=0.24, degree=2)
    lrmse, lr2 = run_cv(svrq_model, y[crop], N=N)
    results[crop]['svrq'] = resdict(lrmse, lr2)

print(results['corn']['svrq'])
print(results['beans']['svrq'])

{'rmse': 0.3186079479001509, 'r2': 0.8850571260981057, 'rmse_std': 0.011976997115599112, 'r2_std': 0.016732124862675182}
{'rmse': 0.37658994656746736, 'r2': 0.8350881779884587, 'rmse_std': 0.013560156618885925, 'r2_std': 0.021595195463161913}


In [11]:
for crop in y.keys():
    pca_model = make_pipeline(PCA(n_components=5), LinearRegression())
    lrmse, lr2 = run_cv(pca_model, y[crop], N=N)
    results[crop]['pca'] = resdict(lrmse, lr2)

print(results['corn']['pca'])
print(results['beans']['pca'])

{'rmse': 0.35927326063863163, 'r2': 0.8515493505220965, 'rmse_std': 0.011067724822244036, 'r2_std': 0.021438932074762294}
{'rmse': 0.39094153331905745, 'r2': 0.8241870075704598, 'rmse_std': 0.011262684660171999, 'r2_std': 0.0206656690074072}


In [12]:
for crop in y.keys():
    ridge_model = Ridge(2.7)
    lrmse, lr2 = run_cv(ridge_model, y[crop], N=N)
    results[crop]['ridge'] = resdict(lrmse, lr2)

print(results['corn']['ridge'])
print(results['beans']['ridge'])

{'rmse': 0.3611208218631796, 'r2': 0.8530222666750664, 'rmse_std': 0.01083253464578352, 'r2_std': 0.0189879791513637}
{'rmse': 0.3886888247039051, 'r2': 0.827689922670936, 'rmse_std': 0.010835514680277181, 'r2_std': 0.019218031950744164}


In [13]:
for crop in y.keys():
    rf_model = RandomForestRegressor(min_samples_leaf=3, n_estimators=400)
    lrmse, lr2 = run_cv(ridge_model, y[crop], N=N)
    results[crop]['rf'] = resdict(lrmse, lr2)

print(results['corn']['rf'])
print(results['beans']['rf'])

{'rmse': 0.36066160340749037, 'r2': 0.8515114655100106, 'rmse_std': 0.01114534092170545, 'r2_std': 0.018769565669572566}
{'rmse': 0.3879774068675635, 'r2': 0.8273994227109842, 'rmse_std': 0.010482993653413713, 'r2_std': 0.021944421569151225}


In [14]:
for crop in y.keys():
    df = pd.DataFrame(results[crop])
    print()
    print(crop)
    print(df.T.sort_values('r2', ascending=False))


corn
           rmse        r2  rmse_std    r2_std
svrq   0.318608  0.885057  0.011977  0.016732
svr    0.353021  0.858324  0.009614  0.017649
ridge  0.361121  0.853022  0.010833  0.018988
pca    0.359273  0.851549  0.011068  0.021439
rf     0.360662  0.851511  0.011145  0.018770
ab     0.375912  0.837334  0.026134  0.028541
gb     0.378864  0.835262  0.026374  0.028480

beans
           rmse        r2  rmse_std    r2_std
svrq   0.376590  0.835088  0.013560  0.021595
ridge  0.388689  0.827690  0.010836  0.019218
rf     0.387977  0.827399  0.010483  0.021944
svr    0.388528  0.826049  0.009172  0.022249
pca    0.390942  0.824187  0.011263  0.020666
ab     0.462923  0.755944  0.019671  0.031103
gb     0.491089  0.724600  0.026053  0.038644
