In [1]:
############## imports
# general
import statistics
import datetime
import joblib  # save and load models (standalone package; scikit-learn no longer ships sklearn.externals.joblib)
import random
# data manipulation and exploration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

## machine learning stuff
# preprocessing
from sklearn import preprocessing
# feature selection
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import f_regression
# pipeline
from sklearn.pipeline import Pipeline
# train/testing
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score  
# error calculations
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# models
from sklearn.linear_model import LinearRegression # linear regression
from sklearn.linear_model import BayesianRidge #bayesisan ridge regression
from sklearn.svm import SVR  # support vector machines regression
from sklearn.gaussian_process import GaussianProcessRegressor # import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor # k-nearest neightbors for regression
from sklearn.neural_network import MLPRegressor # neural network for regression
from sklearn.tree import DecisionTreeRegressor # decision tree regressor
from sklearn.ensemble import RandomForestRegressor  # random forest regression
from sklearn.ensemble import AdaBoostRegressor # adaboost for regression
# saving models
# from sklearn.externals import joblib
import joblib




In [2]:
# Make the project's local API module importable.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
APILoc = r"C:\Users\thejo\Documents\school\AI in AG research\API"

import sys

# Guard the insert so that re-running this cell does not keep stacking
# duplicate copies of the same directory at the front of sys.path.
if APILoc not in sys.path:
    sys.path.insert(0, APILoc)

# Wildcard import kept on purpose: the notebook relies on names exported by API
# (presumably including saveMLResults, which is called further down).
from API import *

In [3]:
# Path of the aggregate dataset used for this run.
# Other files from the same folder that were tried earlier (kept for reference):
#   aggregateDataWithVariety.csv            - alternative aggregate file
#   aggregateData_GAonly_Annual_final.csv   - was loaded as a separate targetDf
aggDataLoc = r'C:\Users\thejo\Documents\school\AI in AG research\experiment\aggregateData_MS_KY_GA.csv'

# Read the CSV into a DataFrame; the feature and target frames are sliced from it later.
# If a file carries a saved index column ("Unnamed: 0"), it can be removed with
# aggDf.drop("Unnamed: 0", axis=1) (same for targetDf when that file is in use).
aggDf = pd.read_csv(filepath_or_buffer=aggDataLoc)


In [4]:
# Preview the first five rows of the loaded data as a quick sanity check.
aggDf.head(n=5)

Unnamed: 0,State,City,Date Sown,Variety,Date of Cut,Julian Day,Yield (tons/acre),Time Since Sown (Days),Time Since Last Harvest (Days),Total Radiation (MJ/m^2),Total Rainfall (mm),Avg Air Temp (C),Avg Min Temp (C),Avg Max Temp (C),Avg Soil Moisture (%)
0,MS,Holly Springs,9/28/2012,Alfalfagraze 600RR,11/5/2013,309,2.827,402,1,5908.17,1678.9,12.3,5.65,18.95,0.34
1,MS,Holly Springs,9/28/2012,Ameristand 815TRR,11/5/2013,309,2.649,402,1,5908.17,1678.9,12.3,5.65,18.95,0.34
2,MS,Holly Springs,9/28/2012,Bulldog 505,11/5/2013,309,2.612,402,1,5908.17,1678.9,12.3,5.65,18.95,0.34
3,MS,Holly Springs,9/28/2012,DKA41-18RR,11/5/2013,309,2.58,402,1,5908.17,1678.9,12.3,5.65,18.95,0.34
4,MS,Holly Springs,9/28/2012,DKA65-10RR,11/5/2013,309,2.4715,402,1,5908.17,1678.9,12.3,5.65,18.95,0.34


In [5]:
# Reduce the aggregate data to the columns the machine learning models will see.

# Feature columns passed to the models.
# Columns that appeared in earlier variants of this list but are not used here:
#   "Time Since Last Harvest (Days)", "Day Length (hrs)", "Percent Cover (%)"
# A smaller four-feature variant was also tried:
#   "Julian Day", "Time Since Sown (Days)", "Total Radiation (MJ/m^2)", "Total Rainfall (mm)"
xColumnsToKeep = [
    "Julian Day",
    "Time Since Sown (Days)",
    "Total Radiation (MJ/m^2)",
    "Total Rainfall (mm)",
    "Avg Air Temp (C)",
    "Avg Min Temp (C)",
    "Avg Max Temp (C)",
    "Avg Soil Moisture (%)",
]

# Target column.
yColumnsToKeep = ["Yield (tons/acre)"]

# Slice features and target out of aggDf and give each a fresh 0..n-1 index.
# (The target was previously taken from a separate targetDf.)
xDf = aggDf[xColumnsToKeep].reset_index(drop=True)
yDf = aggDf[yColumnsToKeep].reset_index(drop=True)

# Raise pandas' display limits so large frames are not truncated when shown.
pd.set_option('display.max_rows', 2500)
pd.set_option('display.max_columns', 500)

# Names of the feature columns, in column order.
xCols = xDf.columns.tolist()

In [6]:
# Preview the first five rows of the feature frame.
xDf.head(n=5)

Unnamed: 0,Julian Day,Time Since Sown (Days),Total Radiation (MJ/m^2),Total Rainfall (mm),Avg Air Temp (C),Avg Min Temp (C),Avg Max Temp (C),Avg Soil Moisture (%)
0,309,402,5908.17,1678.9,12.3,5.65,18.95,0.34
1,309,402,5908.17,1678.9,12.3,5.65,18.95,0.34
2,309,402,5908.17,1678.9,12.3,5.65,18.95,0.34
3,309,402,5908.17,1678.9,12.3,5.65,18.95,0.34
4,309,402,5908.17,1678.9,12.3,5.65,18.95,0.34


In [7]:
# Preview the first five rows of the target frame.
yDf.head(n=5)

Unnamed: 0,Yield (tons/acre)
0,2.827
1,2.649
2,2.612
3,2.58
4,2.4715


In [8]:

# Hide the warnings, because training the neural network causes lots of warnings.
# NOTE(review): this filter silences every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Parameter grids for sklearn's GridSearchCV, one dict per model type.
# Keys carry a "model__" prefix — presumably the estimator sits in a Pipeline step
# named "model" inside the API module; TODO confirm there.

# Random forest.
# NOTE(review): newer scikit-learn releases spell the "mae" criterion "absolute_error";
# check the installed version before changing it.
rfParamGrid = {
        'model__n_estimators': [5, 10, 25, 50, 100], # Number of estimators
        'model__max_depth': [5, 10, 15, 20], # Maximum depth of the tree
        'model__criterion': ["mae"]
    }

# k-nearest neighbors.
knnParamGrid ={
        'model__n_neighbors':[2,5,10],
        'model__weights': ['uniform', 'distance'],
        'model__leaf_size': [5, 10, 30, 50]
    }

# Support vector regression.
svrParamGrid = {
        'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'model__C': [0.1, 1.0, 5.0, 10.0],
        'model__gamma': ["scale", "auto"],
        'model__degree': [2,3,4,5]
    }

# Neural network (MLP).
# Single-layer sizes are written as one-element tuples: "(3)" is just the int 3 in
# Python, which made this list an inconsistent mix of ints and tuples.
nnParamGrid = {
        'model__hidden_layer_sizes':[(3,), (5,), (10,), (3,3), (5,5), (7,7)],
        'model__solver': ['sgd', 'adam'],
        'model__learning_rate' : ['constant', 'invscaling', 'adaptive'],
        'model__learning_rate_init': [0.1, 0.01, 0.001]
    }

# Linear regression: nothing to tune.
linRegParamGrid = {}

# Bayesian ridge regression.
# NOTE(review): newer scikit-learn releases rename n_iter to max_iter; check the installed version.
bayesParamGrid={
        'model__n_iter':[100,300,500]
    }

# Decision tree (same "mae" caveat as the random forest grid).
dtParamGrid = {
    'model__criterion': ['mae'],
    'model__max_depth': [5,10,25,50,100]
    }

# Run configuration handed to saveMLResults in the next cell.
# NOTE(review): the exact meaning of N and numFeatures is defined by saveMLResults
# in the API module — confirm there.
N = 10
numFeatures = 8 # 11
workingDir = r"C:\Users\thejo\Documents\school\AI in AG research\experiment"

# One tuple per model to evaluate: (estimator instance, its parameter grid, file name).
# Only the random forest is enabled here; the other grids are defined but not used.
rfEntry = (RandomForestRegressor(), rfParamGrid, "rfTup.pkl")
aModelList = [rfEntry]


In [9]:
# Run the experiment through the API helper, passing N, the feature and target frames,
# the model list, the working directory and the feature count; printResults=True is requested.
# NOTE(review): the recorded output of this cell is
#   "IndexError: arrays used as indices must be of integer (or boolean) type".
# The failing indexing happens inside saveMLResults, which is not visible in this notebook —
# check there which input types it expects.
saveMLResults(N, xDf, yDf, aModelList, workingDir, numFeatures, printResults=True)

IndexError: arrays used as indices must be of integer (or boolean) type