The notebook is meant to help the user experiment with different models and features. This notebook assumes that there is a saved csv called 'filteredAggregateData.csv' somewhere on your local harddrive. The location must be specified below.

The cell imports all of the relevant packages.

In [1]:
############## imports
# general
import statistics
import datetime
from sklearn.externals import joblib # save and load models
import random
# data manipulation and exploration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

## machine learning stuff
# preprocessing
from sklearn import preprocessing
# feature selection
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import f_regression
# pipeline
from sklearn.pipeline import Pipeline
# train/testing
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score  
# error calculations
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# models
from sklearn.linear_model import LinearRegression # linear regression
from sklearn.linear_model import BayesianRidge #bayesisan ridge regression
from sklearn.svm import SVR  # support vector machines regression
from sklearn.gaussian_process import GaussianProcessRegressor # import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor # k-nearest neightbors for regression
from sklearn.neural_network import MLPRegressor # neural network for regression
from sklearn.tree import DecisionTreeRegressor # decision tree regressor
from sklearn.ensemble import RandomForestRegressor  # random forest regression
from sklearn.ensemble import AdaBoostRegressor # adaboost for regression
# saving models
# from sklearn.externals import joblib
import joblib




Imports the API. 'APILoc' is the location of 'API.py' on your local harddrive.

In [2]:
# import the API
APILoc = r"C:\Users\thejo\Documents\school\AI in AG research\API"

import sys
sys.path.insert(0, APILoc)

from API import *

Load the dataset. Note that the location of the dataset must be specified.

In [3]:
# get aggregate data
aggDataLoc = r'C:\Users\thejo\Documents\school\AI in AG research\experiment\aggregateData_KY.csv'
#aggDataLoc = r'C:\Users\thejo\Documents\school\AI in AG research\experiment\aggregateDataWithVariety.csv'
targetDataLoc = r'C:\Users\thejo\Documents\school\AI in AG research\experiment\aggregateData_GAonly_Annual_final.csv'

aggDf = pd.read_csv(aggDataLoc)
#aggDf = aggDf.drop("Unnamed: 0",axis=1)
targetDf = pd.read_csv(targetDataLoc)
#targetDf = targetDf.drop("Unnamed: 0",axis=1)


Test to see if the dataset was loaded properly. A table of the first 5 datapoints should appear.

In [4]:
aggDf.head()
#targetDf.head()

Unnamed: 0,State,City,Date Sown,Variety,Date of Cut,Julian Day,Yield (tons/acre),Time Since Sown (Days),Time Since Last Harvest (Days),Total Radiation (MJ/m^2),Total Rainfall (mm),Avg Air Temp (C),Avg Min Temp (C),Avg Max Temp (C),Avg Soil Moisture (%)
0,Kentucky,Lexington,8/9/2012,55V50,8/12/2013,224,7.01,368,1,3273.52,804.6,17.607333,12.617071,22.598714,0.313481
1,Kentucky,Lexington,8/9/2012,Phoenix,8/12/2013,224,6.97,368,1,3273.52,804.6,17.607333,12.617071,22.598714,0.313481
2,Kentucky,Lexington,8/9/2012,Evermore,8/12/2013,224,6.96,368,1,3273.52,804.6,17.607333,12.617071,22.598714,0.313481
3,Kentucky,Lexington,8/9/2012,4030,8/12/2013,224,6.95,368,1,3273.52,804.6,17.607333,12.617071,22.598714,0.313481
4,Kentucky,Lexington,8/9/2012,Caliber,8/12/2013,224,6.87,368,1,3273.52,804.6,17.607333,12.617071,22.598714,0.313481


Filter out features that will not be made available for feature selection. All of the features in the list 'XColumnsToKeep' will be made available for feature selection. The features to include are: <br>
"Julian Day" <br>
"Time Since Sown (Days)" <br>
"Time Since Last Harvest (Days)" <br>
"Total Radiation (MJ/m^2)" <br>
"Total Rainfall (mm)" <br>
"Avg Air Temp (C)" <br>
"Avg Min Temp (C)" <br>
"Avg Max Temp (C)"<br>
"Avg Soil Moisture (%)"<br>
"Day Length (hrs)"<br>
"Percent Cover (%)"<br>

In [5]:
# filter out the features that will not be used by the machine learning models

# the features to keep:
# xColumnsToKeep = ["Julian Day", "Time Since Sown (Days)", "Time Since Last Harvest (Days)", "Total Radiation (MJ/m^2)",
#                "Total Rainfall (mm)", "Avg Air Temp (C)", "Avg Min Temp (C)", "Avg Max Temp (C)",
#                  "Avg Soil Moisture (%)", "Day Length (hrs)"], "Percent Cover (%)"]

xColumnsToKeep = ["Julian Day", "Time Since Sown (Days)", "Total Radiation (MJ/m^2)",
               "Total Rainfall (mm)", "Avg Air Temp (C)", "Avg Min Temp (C)", "Avg Max Temp (C)",
                 "Avg Soil Moisture (%)"]


#xColumnsToKeep = ["Julian Day", "Time Since Sown (Days)", "Total Radiation (MJ/m^2)", "Total Rainfall (mm)"]

# the target to keep
yColumnsToKeep = ["Yield (tons/acre)"]

# get a dataframe containing the features and the targets
xDf = aggDf[xColumnsToKeep]
test_xDf = targetDf[xColumnsToKeep]
yDf = aggDf[yColumnsToKeep]
test_yDf = targetDf[yColumnsToKeep]

# reset the index
xDf = xDf.reset_index(drop=True)
yDf = yDf.reset_index(drop=True)
test_xDf = xDf.reset_index(drop=True)
test_yDf = yDf.reset_index(drop=True)

pd.set_option('display.max_rows', 2500)
pd.set_option('display.max_columns', 500)

xCols = list(xDf)

Test to see if the features dataframe and the target dataframe were successfully made.

In [6]:
xDf.head()

Unnamed: 0,Julian Day,Time Since Sown (Days),Total Radiation (MJ/m^2),Total Rainfall (mm),Avg Air Temp (C),Avg Min Temp (C),Avg Max Temp (C),Avg Soil Moisture (%)
0,224,368,3273.52,804.6,17.607333,12.617071,22.598714,0.313481
1,224,368,3273.52,804.6,17.607333,12.617071,22.598714,0.313481
2,224,368,3273.52,804.6,17.607333,12.617071,22.598714,0.313481
3,224,368,3273.52,804.6,17.607333,12.617071,22.598714,0.313481
4,224,368,3273.52,804.6,17.607333,12.617071,22.598714,0.313481


In [7]:
yDf.head()

Unnamed: 0,Yield (tons/acre)
0,7.01
1,6.97
2,6.96
3,6.95
4,6.87


Lets now define the parameters that will be used to run the machine learning experiments. Note that parameter grids could be made that will allow sci-kit learn to use a 5-fold gridsearch to find the model's best hyperparameters. The parameter grids that are defined here will specify the possible values for the grid search. <br>
<br>
Once the parameter grids are defined, a list of tuples must also be defined. The tuples must take the form of: <br>
(sci-kit learn model, appropriate parameter grid, name of the file to be saved). <br>
<br>
Then the number of iterations should be made. This is represented by the variable 'N'. Each model will be evaluated N times (via N-fold cross validation), and the average results of the models over those N iterations will be returned. <br>
<br>
'workingDir' is the directory in which all of the results will be saved. <br>
<br>
'numFeatures' is the number of features that will be selected (via feature selection).

In [8]:

# hide the warnings because training the neural network caues lots of warnings.
import warnings
warnings.filterwarnings('ignore')

# make the parameter grids for sklearn's gridsearchcv
rfParamGrid = {
        'model__n_estimators': [5, 10, 25, 50, 100], # Number of estimators
        'model__max_depth': [5, 10, 15, 20], # Maximum depth of the tree
        'model__criterion': ["mae"]
    }
knnParamGrid ={
        'model__n_neighbors':[2,5,10],
        'model__weights': ['uniform', 'distance'],
        'model__leaf_size': [5, 10, 30, 50]    
    }
svrParamGrid = {
        'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'model__C': [0.1, 1.0, 5.0, 10.0],
        'model__gamma': ["scale", "auto"],
        'model__degree': [2,3,4,5]
    }
nnParamGrid = {
        'model__hidden_layer_sizes':[(3), (5), (10), (3,3), (5,5), (7,7)],
        'model__solver': ['sgd', 'adam'],
        'model__learning_rate' : ['constant', 'invscaling', 'adaptive'],
        'model__learning_rate_init': [0.1, 0.01, 0.001]      
    }

linRegParamGrid = {}

bayesParamGrid={
        'model__n_iter':[100,300,500]
    }

dtParamGrid = {
    'model__criterion': ['mae'],
    'model__max_depth': [5,10,25,50,100]
    }

aModelList = [(RandomForestRegressor(), rfParamGrid, "rfTup.pkl"),
              (KNeighborsRegressor(), knnParamGrid, "knnTup.pkl"),
              (SVR(), svrParamGrid, "svrTup.pkl"),
             #(MLPRegressor(), nnParamGrid, "nnTup.pkl")],
             (LinearRegression(), linRegParamGrid, "linRegTup.pkl"),
             (BayesianRidge(), bayesParamGrid, "bayesTup.pkl"),
             (DecisionTreeRegressor(), dtParamGrid, "dtTup.pkl")]

N = 10
workingDir = r"C:\Users\thejo\Documents\school\AI in AG research\experiment"
numFeatures = 8 # 11


This cell will run the tests and save the results.

In [9]:
saveMLResults(test_xDf, test_yDf, N, xDf, yDf, aModelList, workingDir, numFeatures, printResults=True)

model:  rfTup
Avg MAE:  333.706
Avg R squared:  0.977
Best MAE:  231.922
Best R squared:  0.995
Parameters of the best model:  {'model__criterion': 'mae', 'model__max_depth': 15, 'model__n_estimators': 50}
Features selected by best model:  ['Julian Day', 'Time Since Sown (Days)', 'Total Radiation (MJ/m^2)', 'Total Rainfall (mm)', 'Avg Air Temp (C)', 'Avg Min Temp (C)', 'Avg Max Temp (C)', 'Avg Soil Moisture (%)']
 
test results on our test data: 
results:
0.14755217391304343
0.9856952595988276
model:  knnTup
Avg MAE:  353.832
Avg R squared:  0.975
Best MAE:  262.857
Best R squared:  0.994
Parameters of the best model:  {'model__leaf_size': 30, 'model__n_neighbors': 10, 'model__weights': 'distance'}
Features selected by best model:  ['Julian Day', 'Time Since Sown (Days)', 'Total Radiation (MJ/m^2)', 'Total Rainfall (mm)', 'Avg Air Temp (C)', 'Avg Min Temp (C)', 'Avg Max Temp (C)', 'Avg Soil Moisture (%)']
 
test results on our test data: 
results:
0.15640916149068324
0.986041680184709
