In [749]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import plotly.express as px
import math
from sklearn.model_selection import KFold


In [750]:
# Load the annual crop survey records from CSV and tidy them in one pass.
crops_path = '../data/agri/frontiers/Processed_Iowa+Cerro+Gordo_1960+2009_Annual+Crop.csv'

cropsDF = (
    pd.read_csv(crops_path)
    # Chronological order
    .sort_values(by='Year')
    # The weather table keys on 'YEAR', so expose the year under that name
    # (added as the last column) and retire the original 'Year' column.
    .assign(YEAR=lambda frame: frame['Year'])
    .drop(columns='Year')
    # Yield values must be numeric for the statistics computed later.
    .astype({'Value': 'float'})
)

cropsDF

Unnamed: 0,Program,Period,Geo Level,State,Ag District,County,Commodity,Data Item,Domain Category,Value,YEAR
151,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,19.6,1940
150,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,51.8,1940
148,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,46.5,1941
149,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,14.9,1941
146,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,56.3,1942
...,...,...,...,...,...,...,...,...,...,...,...
4,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,163.1,2013
3,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,50.2,2014
2,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,168.3,2014
1,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,58.9,2015


In [751]:
# Split the crop table into one (YEAR, Value) frame per commodity, keeping
# only the rows whose data item is a yield measured in bushels per acre.
is_yield = cropsDF['Data Item'].str.contains('BU / ACRE')
yield_cols = ['YEAR', 'Value']

cornDF = cropsDF.loc[is_yield & (cropsDF['Commodity'] == 'CORN'), yield_cols]
beansDF = cropsDF.loc[is_yield & (cropsDF['Commodity'] == 'SOYBEANS'), yield_cols]

In [752]:
# Load the aggregated weather table (one row per YEAR).
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns look like index
# columns left over from earlier to_csv() calls - confirm before relying on them.
weather_path = '../data/wx/wx-frontier-agg-3.csv'
weatherDF = pd.read_csv(weather_path)
weatherDF.head()

Unnamed: 0.1,Unnamed: 0,YEAR,GSP,GDD,GSTmin,GSTmax,frost,summer,HWI,CWI,dry,wet,PRCP95P
0,1941,1941,3.869512,1555.1,13.09939,25.865244,2,94,21.0,12.0,13,5,4.0
1,1942,1942,3.379878,1327.1,11.957927,24.22622,6,81,11.0,26.0,15,3,5.0
2,1943,1943,3.396341,1303.8,11.908537,23.991463,7,80,12.0,17.0,10,4,2.0
3,1944,1944,3.342331,1467.1,12.89939,24.992073,4,91,17.0,14.0,15,6,2.0
4,1945,1945,3.471951,1130.0,10.643293,23.137195,6,66,10.0,27.0,19,3,5.0


In [753]:
# Attach each crop's yield to the weather table by year. Renaming 'Value'
# up front gives every crop its own column, so the two merges cannot collide.
# merge() defaults to an inner join: only years present in all three tables survive.
corn_yield = cornDF.rename(columns={'Value': 'corn'})
bean_yield = beansDF.rename(columns={'Value': 'beans'})

combined = weatherDF.merge(corn_yield, on='YEAR').merge(bean_yield, on='YEAR')
combined

Unnamed: 0.1,Unnamed: 0,YEAR,GSP,GDD,GSTmin,GSTmax,frost,summer,HWI,CWI,dry,wet,PRCP95P,corn,beans
0,1941,1941,3.869512,1555.100,13.099390,25.865244,2,94,21.0,12.0,13,5,4.0,46.5,14.9
1,1942,1942,3.379878,1327.100,11.957927,24.226220,6,81,11.0,26.0,15,3,5.0,56.3,17.7
2,1943,1943,3.396341,1303.800,11.908537,23.991463,7,80,12.0,17.0,10,4,2.0,56.6,18.8
3,1944,1944,3.342331,1467.100,12.899390,24.992073,4,91,17.0,14.0,15,6,2.0,49.8,18.6
4,1945,1945,3.471951,1130.000,10.643293,23.137195,6,66,10.0,27.0,19,3,5.0,39.8,16.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,2011,2011,2.235671,1428.125,12.427744,24.988415,2,92,20.0,11.0,14,4,0.0,169.3,47.7
71,2012,2012,1.259451,1458.000,11.412805,26.367683,9,107,27.0,15.0,27,3,0.0,124.8,42.5
72,2013,2013,3.569817,1394.850,12.293902,24.716463,4,82,11.0,11.0,21,15,0.0,163.1,40.3
73,2014,2014,3.509146,1235.675,11.458841,23.610366,6,82,2.0,19.0,14,7,0.0,168.3,50.2


In [754]:
# Predictor columns: the year plus the weather-table columns used as model inputs.
feats = [
    'YEAR',
    'GSP', 'GDD', 'GSTmax', 'GSTmin',
    'frost', 'summer',
    'HWI', 'CWI',
    'dry', 'wet', 'PRCP95P',
]
# Response columns: the per-crop yield columns built during the merge.
ys = ['corn', 'beans']

def norm(s):
    """Standardise a Series to zero mean and unit (sample) standard deviation.

    Parameters
    ----------
    s : pandas.Series
        Numeric column to standardise.

    Returns
    -------
    pandas.Series
        ``(s - s.mean()) / s.std()``. When the spread is zero or undefined
        (a constant or single-value column) the centred values are returned
        unscaled, i.e. all zeros, instead of a column of NaN.
    """
    centered = s - s.mean()
    spread = s.std()
    # A constant column gives 0 / 0 = NaN everywhere, which would make any
    # downstream PCA / regression fit fail. Only guard the scalar (Series)
    # case; other inputs keep the plain element-wise division.
    if np.ndim(spread) == 0 and not spread > 0:
        return centered
    return centered / spread

# Standardise every predictor and both yield targets with norm(), so all
# columns used by the models share a common scale.
for f in feats + ys:
    combined[f] = norm(combined[f])

# Shuffle the rows. The fixed random_state keeps the row order - and the
# preview below - identical on every Restart & Run All.
combined = combined.sample(frac=1, random_state=42)

combined.head()

Unnamed: 0.1,Unnamed: 0,YEAR,GSP,GDD,GSTmin,GSTmax,frost,summer,HWI,CWI,dry,wet,PRCP95P,corn,beans
16,1957,-0.963546,-0.495798,-0.45374,0.368506,-0.982439,0.093875,-0.926823,-0.523721,-1.269346,-0.150868,-0.674822,-1.293909,-1.125591,-0.850269
1,1942,-1.651793,0.074208,-0.192291,0.116055,-0.387069,0.485019,-0.371364,-0.414182,1.459312,-0.150868,-1.096586,1.611575,-1.173512,-1.36376
72,2013,1.60591,0.296989,0.331767,0.575132,0.101876,-0.29727,-0.292013,-0.414182,-0.694892,1.130089,3.964579,-1.293909,1.263635,0.637082
49,1990,0.550598,1.630459,-0.373101,0.205621,-0.73675,-0.29727,-0.450715,-0.78057,-0.630513,0.276118,-0.253058,-0.131715,0.663476,0.964654
11,1952,-1.192962,-1.112649,-1.022082,-1.17161,-0.751954,3.223027,-0.292013,-0.414182,1.459312,2.838033,-0.674822,-0.712812,-1.013774,-0.938802


In [755]:
X = combined[feats]

# Fit PCA with every component retained so the whole variance spectrum is available.
pca = PCA().fit(X)

# Running total of the explained-variance ratio: entry i is the share of
# variance captured by the first i + 1 components.
cum_var = np.cumsum(pca.explained_variance_ratio_)

pcaDF = pd.DataFrame({'y': cum_var})
# Component counts start at 1 (the first cumulative value belongs to ONE
# component) and are derived from the fit rather than hard-coded, so the axis
# stays correct if `feats` changes.
pcaDF['x'] = np.arange(1, len(cum_var) + 1)

fig = px.line(
    pcaDF,
    x='x',
    y='y',
    labels={
        'x': 'No. of Components',
        'y': 'Cumulative Explained Variance',
    },
)

fig.show()

## Corn yield vs. weather data

In [756]:
# Design matrix (all predictor columns) and target (corn yield) for the corn model.
X, y = combined[feats], combined['corn']

In [757]:
# Principal-component regression: project onto 8 components, then fit OLS.
pca = make_pipeline(PCA(n_components=8), LinearRegression())

# A fixed random_state does two things: results are reproducible, and both
# cross_val_score calls below are evaluated on the SAME five folds. With
# random_state=None every call to cv.split() reshuffles, so r2 and the error
# would be measured on different partitions.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# scikit-learn reports the negated MSE; flip the sign and average over folds.
mse = -cross_val_score(pca, X, y, scoring="neg_mean_squared_error", cv=cv).mean()
r2 = cross_val_score(pca, X, y, scoring="r2", cv=cv).mean()
# Root of the fold-averaged MSE. If y was standardised earlier in the
# notebook, this is in standard-deviation units rather than bu/acre.
rmse = math.sqrt(mse)

print('r2 %f rmse %f' %(r2, rmse))


r2 0.845961 rmse 0.351449


## Soybean yield vs. weather data

In [758]:
# Design matrix (all predictor columns) and target (soybean yield) for the soybean model.
X, y = combined[feats], combined['beans']

In [759]:
# Principal-component regression: project onto 8 components, then fit OLS.
pca = make_pipeline(PCA(n_components=8), LinearRegression())

# A fixed random_state does two things: results are reproducible, and both
# cross_val_score calls below are evaluated on the SAME five folds. With
# random_state=None every call to cv.split() reshuffles, so r2 and the error
# would be measured on different partitions.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# scikit-learn reports the negated MSE; flip the sign and average over folds.
mse = -cross_val_score(pca, X, y, scoring="neg_mean_squared_error", cv=cv).mean()
r2 = cross_val_score(pca, X, y, scoring="r2", cv=cv).mean()
# Root of the fold-averaged MSE. If y was standardised earlier in the
# notebook, this is in standard-deviation units rather than bu/acre.
rmse = math.sqrt(mse)

print('r2 %f rmse %f' %(r2, rmse))

r2 0.772680 rmse 0.385786
