In [132]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVR
import math



In [133]:
# Import CVS
cropsDF = pd.read_csv('../data/agri/frontiers/Processed_Iowa+Cerro+Gordo_1960+2009_Annual+Crop.csv')

# Sort by year
cropsDF = cropsDF.sort_values(by='Year')
cropsDF.head()

Unnamed: 0,Program,Year,Period,Geo Level,State,Ag District,County,Commodity,Data Item,Domain Category,Value
960,SURVEY,1960,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,24.5
950,SURVEY,1960,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,CORN - ACRES PLANTED,NOT SPECIFIED,165100.0
951,SURVEY,1960,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - ACRES HARVESTED",NOT SPECIFIED,159300.0
952,SURVEY,1960,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - PRODUCTION, MEASURED IN BU",NOT SPECIFIED,10498000.0
953,SURVEY,1960,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,65.9


In [134]:
# Filter data, look for 'BU/ACRE' and filter CORN & SOYBEANS
cornDF = cropsDF.loc[(cropsDF['Data Item'].str.contains('BU / ACRE')) & (cropsDF['Commodity'] == 'CORN')]
beansDF = cropsDF.loc[(cropsDF['Data Item'].str.contains('BU / ACRE')) & (cropsDF['Commodity'] == 'SOYBEANS')]

cornDF = cornDF[['Year', 'Value']]
beansDF = beansDF[['Year', 'Value']]


In [135]:
# GSP - Growing season precipitation
# GDD - Growing degree days
# GSTmax - Daily max temp avg
# GSTmin - Daily min temp avg
# frost - days < 0 degrees (C)
# summer - days > 25 degrees (C)


weatherDF = pd.read_csv('../data/wx/wx-frontier-agg.csv')
weatherDF.head()

Unnamed: 0,INDEX,YEAR,GSP,GDD,GSTmax,GSTmin,frost,summer,HWI,CWI,dry,wet,PRCP95P
0,0,1941,3.869512,1555.1,25.865244,13.09939,2,94,11,7,13,5,4
1,1,1942,3.379878,1327.1,24.22622,11.957927,6,81,7,12,15,3,5
2,2,1943,3.396341,1303.8,23.991463,11.908537,7,80,6,7,10,4,2
3,3,1944,3.342331,1467.1,24.992073,12.89939,4,91,12,13,15,6,2
4,4,1945,3.471951,1130.0,23.137195,10.643293,6,66,5,13,19,3,5


In [136]:
# These columns will not be averaged into the final dataframe
weatherAvg = weatherDF.drop(['INDEX', 'YEAR', 'frost', 'summer', 'HWI', 'CWI', 'dry', 'wet', 'PRCP95P'], axis=1)

# Taking the mean of GSP, GDD, GSTmax, and GSTmin
weatherAvg['avg'] = weatherAvg.mean(axis=1)
weatherAvg.head()


Unnamed: 0,GSP,GDD,GSTmax,GSTmin,avg
0,3.869512,1555.1,25.865244,13.09939,399.483537
1,3.379878,1327.1,24.22622,11.957927,341.666006
2,3.396341,1303.8,23.991463,11.908537,335.774085
3,3.342331,1467.1,24.992073,12.89939,377.083449
4,3.471951,1130.0,23.137195,10.643293,291.81311


In [137]:
# Getting the 'YEAR' column and sorting the years that we want
weatherDF = weatherDF[['YEAR']]
weatherDF = weatherDF.loc[(weatherDF['YEAR'] >= 1960) & (weatherDF['YEAR'] <= 2009)]

# Creating a new 'avg' column and setting it to the mean calculated in the cell above
weatherDF['avg'] = weatherAvg[['avg']]
weatherDF.head()

Unnamed: 0,YEAR,avg
19,1960,356.002591
20,1961,344.20564
21,1962,342.824543
22,1963,389.793369
23,1964,349.246037


In [138]:
# Creating a new 'avg' column in the crops dataframes
# and setting the to be the 'avg' values in the weather data
# also changing them to numpy arrays

cornDF['avg'] = weatherDF['avg'].to_numpy()
beansDF['avg'] = weatherDF['avg'].to_numpy()

cornDF.head()

Unnamed: 0,Year,Value,avg
953,1960,65.9,356.002591
942,1961,73.5,344.20564
931,1962,73.3,342.824543
924,1963,76.3,389.793369
913,1964,75.1,349.246037


In [139]:
# Changing the 'Value' column from strings to floats
cornDF["Value"] = cornDF["Value"].astype(float)
beansDF["Value"] = beansDF["Value"].astype(float)

## Corn Data with weather data

In [140]:
# Separating the columns to split the data
X = cornDF['avg']
y = cornDF['Value']

# Splitting data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [141]:
# Using a SVR model, values
# were taken from the frontiers paper.
model = SVR(C=0.1, epsilon=0.15)

In [142]:
model.fit(X_train.to_numpy().reshape(-1, 1), y_train.to_numpy())

SVR(C=0.1, epsilon=0.15)

In [143]:
# Predict values using the testing data
predictions = model.predict(X_test.to_numpy().reshape(-1, 1))

In [144]:
# Calculate the RMSE (Every time the notebook runs the value changes)

rmse = cross_val_score(SVR(), y_test.to_numpy().reshape(-1, 1), predictions, scoring="neg_mean_squared_error", cv=5)
rmse = np.absolute(rmse)
rmse = math.sqrt(rmse.mean())
rmse

0.17962383042381877

In [145]:
# This function is just to check the max, min, and avg of the rmse
# doing X iterations

def max_min_rmse(model, iter):
    max = 0
    min = 9999
    avg = 0
    for i in range(iter):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        model.fit(X_train.to_numpy().reshape(-1, 1), y_train.to_numpy())
        n = model.predict(X_test.to_numpy().reshape(-1, 1))
        rmse = cross_val_score(SVR(), y_test.to_numpy().reshape(-1, 1), n, scoring="neg_mean_squared_error", cv=5)
        rmse = np.absolute(rmse)
        rmse = math.sqrt(rmse.mean())
        avg += rmse

        if min > rmse:
            min = rmse
        if max < rmse:
            max = rmse
    return round(min, 2), round(max, 2), round((avg/iter), 2)

In [146]:
# Running the function above
iter = 1000
min, max, avg = max_min_rmse(model, iter)

print('Min RMSE: ', min, ' in ', iter, ' iterations')
print('Max RMSE: ', max, ' in ', iter, ' iterations')
print('Avg RMSE: ', avg, ' in ', iter, ' iterations')

Min RMSE:  0.01  in  1000  iterations
Max RMSE:  0.68  in  1000  iterations
Avg RMSE:  0.24  in  1000  iterations


## SOYBEANS with weather data

In [147]:
# Separating the columns to split the data
X = beansDF['avg']
y = beansDF['Value']

# Splitting data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [148]:
model.fit(X_train.to_numpy().reshape(-1, 1), y_train.to_numpy())

SVR(C=0.1, epsilon=0.15)

In [149]:
predictions = model.predict(X_test.to_numpy().reshape(-1, 1))

In [150]:
# Calculate the RMSE (Every time the notebook runs the value changes)

rmse = cross_val_score(SVR(), y_test.to_numpy().reshape(-1, 1), predictions, scoring="neg_mean_squared_error", cv=5)
rmse = np.absolute(rmse)
rmse = math.sqrt(rmse.mean())
rmse

0.09719359479377898

In [151]:
# This function is just to check the max, min, and avg of the rmse
# doing X iterations

def max_min_rmse(model, iter):
    max = 0
    min = 9999
    avg = 0
    for i in range(iter):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        model.fit(X_train.to_numpy().reshape(-1, 1), y_train.to_numpy())
        n = model.predict(X_test.to_numpy().reshape(-1, 1))
        rmse = cross_val_score(SVR(), y_test.to_numpy().reshape(-1, 1), n, scoring="neg_mean_squared_error", cv=5)
        rmse = np.absolute(rmse)
        rmse = math.sqrt(rmse.mean())
        avg += rmse

        if min > rmse:
            min = rmse
        if max < rmse:
            max = rmse
    return round(min, 2), round(max, 2), round((avg/iter), 2)

In [152]:
# Running the function above
iter = 1000
min, max, avg = max_min_rmse(model, iter)

print('Min RMSE: ', min, ' in ', iter, ' iterations')
print('Max RMSE: ', max, ' in ', iter, ' iterations')
print('Avg RMSE: ', avg, ' in ', iter, ' iterations')

Min RMSE:  0.01  in  1000  iterations
Max RMSE:  0.38  in  1000  iterations
Avg RMSE:  0.13  in  1000  iterations
