In [1166]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import math

In [1167]:
# Import CVS
cropsDF = pd.read_csv('../data/agri/frontiers/Processed_Iowa+Cerro+Gordo_1960+2009_Annual+Crop.csv')

# Sort by year
cropsDF = cropsDF.sort_values(by='Year')
cropsDF.head()

# jimg Change Year to YEAR so it matches wx data
cropsDF['YEAR'] = cropsDF.Year
cropsDF.drop('Year', axis=1, inplace=True)

cropsDF.Value = cropsDF.Value.astype('float')

cropsDF

Unnamed: 0,Program,Period,Geo Level,State,Ag District,County,Commodity,Data Item,Domain Category,Value,YEAR
151,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,19.6,1940
150,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,51.8,1940
148,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,46.5,1941
149,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,14.9,1941
146,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,56.3,1942
...,...,...,...,...,...,...,...,...,...,...,...
4,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,163.1,2013
3,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,50.2,2014
2,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,168.3,2014
1,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,58.9,2015


In [1168]:
cornDF = cropsDF.loc[(cropsDF['Data Item'].str.contains('BU / ACRE')) & (cropsDF['Commodity'] == 'CORN')]
beansDF = cropsDF.loc[(cropsDF['Data Item'].str.contains('BU / ACRE')) & (cropsDF['Commodity'] == 'SOYBEANS')]

cornDF = cornDF[['YEAR', 'Value']]
beansDF = beansDF[['YEAR', 'Value']]

In [1169]:
weatherDF = pd.read_csv('../data/wx/wx-frontier-agg.csv')
weatherDF.head()

Unnamed: 0,INDEX,YEAR,GSP,GDD,GSTmax,GSTmin,frost,summer,HWI,CWI,dry,wet,PRCP95P
0,0,1941,3.869512,1555.1,25.865244,13.09939,2,94,11,7,13,5,4
1,1,1942,3.379878,1327.1,24.22622,11.957927,6,81,7,12,15,3,5
2,2,1943,3.396341,1303.8,23.991463,11.908537,7,80,6,7,10,4,2
3,3,1944,3.342331,1467.1,24.992073,12.89939,4,91,12,13,15,6,2
4,4,1945,3.471951,1130.0,23.137195,10.643293,6,66,5,13,19,3,5


In [1170]:
combined = weatherDF.merge(cornDF, on='YEAR')
combined['corn'] = combined.Value
combined.drop('Value', axis=1, inplace=True)
combined = combined.merge(beansDF, on='YEAR')
combined['beans'] = combined.Value
combined.drop('Value', axis=1, inplace=True)
combined

Unnamed: 0,INDEX,YEAR,GSP,GDD,GSTmax,GSTmin,frost,summer,HWI,CWI,dry,wet,PRCP95P,corn,beans
0,0,1941,3.869512,1555.100,25.865244,13.099390,2,94,11,7,13,5,4,46.5,14.9
1,1,1942,3.379878,1327.100,24.226220,11.957927,6,81,7,12,15,3,5,56.3,17.7
2,2,1943,3.396341,1303.800,23.991463,11.908537,7,80,6,7,10,4,2,56.6,18.8
3,3,1944,3.342331,1467.100,24.992073,12.899390,4,91,12,13,15,6,2,49.8,18.6
4,4,1945,3.471951,1130.000,23.137195,10.643293,6,66,5,13,19,3,5,39.8,16.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,70,2011,2.235671,1428.125,24.988415,12.427744,2,92,9,10,14,4,0,169.3,47.7
71,71,2012,1.259451,1458.000,26.367683,11.412805,9,107,11,11,27,3,0,124.8,42.5
72,72,2013,3.569817,1394.850,24.716463,12.293902,4,82,7,10,21,15,0,163.1,40.3
73,73,2014,3.509146,1235.675,23.610366,11.458841,6,82,6,13,14,7,0,168.3,50.2


In [1171]:
feats = ['YEAR', 'GSP', 'GDD', 'GSTmax', 'GSTmin', 'frost', 'summer' ,'HWI', 'CWI', 'dry' ,'wet', 'PRCP95P']
ys = ['corn', 'beans']

def norm(s):
    return (s - s.mean()) / s.std()

for f in feats + ys:
    combined[f] = norm(combined[f])
    

combined = combined.sample(frac=1)

combined.head()

Unnamed: 0,INDEX,YEAR,GSP,GDD,GSTmax,GSTmin,frost,summer,HWI,CWI,dry,wet,PRCP95P,corn,beans
22,22,-0.688247,0.062407,1.278939,1.021384,1.355812,-1.079558,0.898257,0.643805,0.297969,0.916596,-0.674822,0.986352,-0.717118,-0.549258
40,40,0.137649,1.535339,-0.374068,-0.260272,-0.449251,0.093875,-0.212661,-1.201497,-0.035578,0.062625,1.012233,0.986352,0.601863,0.097032
27,27,-0.458831,0.562682,-0.834311,-1.024401,-0.393845,-0.688414,-1.164877,-0.278846,-0.702673,-1.00484,-0.253058,0.408411,-0.0485,-0.09774
74,74,1.697676,0.391036,0.229469,-0.078438,0.601793,-1.079558,0.263446,-1.509047,-0.035578,1.130089,-0.253058,-0.74747,2.034942,2.283793
35,35,-0.091766,-1.729857,0.369476,1.460766,-1.205353,2.049595,1.374365,0.643805,-0.369125,-0.364361,-1.096586,-1.325411,-0.31549,-0.088887


## Corn Data with weather data

In [1172]:
X = combined[feats]
y = combined['corn']

In [1173]:
pca = make_pipeline(PCA(n_components=8), LinearRegression())

rmse = -cross_val_score(pca, X, y, scoring="neg_mean_squared_error", cv=5).mean()
r2 = cross_val_score(pca, X, y, scoring="r2", cv=5).mean()

print('r2 %f rmse %f' %(r2, math.sqrt(rmse)))


r2 0.860091 rmse 0.357233


## SOYBEANS with weather data

In [1174]:
X = combined[feats]
y = combined['beans']

In [1175]:
pca = make_pipeline(PCA(n_components=8), LinearRegression())

rmse = -cross_val_score(pca, X, y, scoring="neg_mean_squared_error", cv=5).mean()
r2 = cross_val_score(pca, X, y, scoring="r2", cv=5).mean()

print('r2 %f rmse %f' %(r2, math.sqrt(rmse)))

r2 0.809950 rmse 0.402286
