In [117]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import plotly.express as px


In [2]:
# Import CVS
cropsDF = pd.read_csv('../data/agri/frontiers/Processed_Iowa+Cerro+Gordo_1960+2009_Annual+Crop.csv')

# Sort by year
cropsDF = cropsDF.sort_values(by='Year')
cropsDF.head()

# jimg Change Year to YEAR so it matches wx data
cropsDF['YEAR'] = cropsDF.Year
cropsDF.drop('Year', axis=1, inplace=True)

cropsDF.Value = cropsDF.Value.astype('float')

cropsDF

Unnamed: 0,Program,Period,Geo Level,State,Ag District,County,Commodity,Data Item,Domain Category,Value,YEAR
151,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,19.6,1940
150,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,51.8,1940
148,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,46.5,1941
149,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,14.9,1941
146,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,56.3,1942
...,...,...,...,...,...,...,...,...,...,...,...
4,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,163.1,2013
3,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,50.2,2014
2,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,168.3,2014
1,SURVEY,YEAR,COUNTY,IOWA,NORTH CENTRAL,CERRO GORDO,SOYBEANS,"SOYBEANS - YIELD, MEASURED IN BU / ACRE",NOT SPECIFIED,58.9,2015


In [3]:
cornDF = cropsDF.loc[(cropsDF['Data Item'].str.contains('BU / ACRE')) & (cropsDF['Commodity'] == 'CORN')]
beansDF = cropsDF.loc[(cropsDF['Data Item'].str.contains('BU / ACRE')) & (cropsDF['Commodity'] == 'SOYBEANS')]

cornDF = cornDF[['YEAR', 'Value']]
beansDF = beansDF[['YEAR', 'Value']]

In [4]:
weatherDF = pd.read_csv('../data/wx/wx-frontier-agg.csv')
weatherDF.head()

Unnamed: 0,INDEX,YEAR,GSP,GDD,GSTmax,GSTmin,frost,summer,HWI,CWI,dry,wet,PRCP95P
0,0,1941,3.869512,1555.1,25.865244,13.09939,2,94,11,7,13,5,4
1,1,1942,3.379878,1327.1,24.22622,11.957927,6,81,7,12,15,3,5
2,2,1943,3.396341,1303.8,23.991463,11.908537,7,80,6,7,10,4,2
3,3,1944,3.342331,1467.1,24.992073,12.89939,4,91,12,13,15,6,2
4,4,1945,3.471951,1130.0,23.137195,10.643293,6,66,5,13,19,3,5


In [5]:
combined = weatherDF.merge(cornDF, on='YEAR')
combined['corn'] = combined.Value
combined.drop('Value', axis=1, inplace=True)
combined = combined.merge(beansDF, on='YEAR')
combined['beans'] = combined.Value
combined.drop('Value', axis=1, inplace=True)
combined

Unnamed: 0,INDEX,YEAR,GSP,GDD,GSTmax,GSTmin,frost,summer,HWI,CWI,dry,wet,PRCP95P,corn,beans
0,0,1941,3.869512,1555.100,25.865244,13.099390,2,94,11,7,13,5,4,46.5,14.9
1,1,1942,3.379878,1327.100,24.226220,11.957927,6,81,7,12,15,3,5,56.3,17.7
2,2,1943,3.396341,1303.800,23.991463,11.908537,7,80,6,7,10,4,2,56.6,18.8
3,3,1944,3.342331,1467.100,24.992073,12.899390,4,91,12,13,15,6,2,49.8,18.6
4,4,1945,3.471951,1130.000,23.137195,10.643293,6,66,5,13,19,3,5,39.8,16.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,70,2011,2.235671,1428.125,24.988415,12.427744,2,92,9,10,14,4,0,169.3,47.7
71,71,2012,1.259451,1458.000,26.367683,11.412805,9,107,11,11,27,3,0,124.8,42.5
72,72,2013,3.569817,1394.850,24.716463,12.293902,4,82,7,10,21,15,0,163.1,40.3
73,73,2014,3.509146,1235.675,23.610366,11.458841,6,82,6,13,14,7,0,168.3,50.2


In [6]:
feats = ['YEAR', 'GSP', 'GDD', 'GSTmax', 'GSTmin', 'frost', 'summer' ,'HWI', 'CWI', 'dry' ,'wet', 'PRCP95P']
ys = ['corn', 'beans']

def norm(s):
    return (s - s.mean()) / s.std()

for f in feats + ys:
    combined[f] = norm(combined[f])
    

combined = combined.sample(frac=1)

combined.head()

Unnamed: 0,INDEX,YEAR,GSP,GDD,GSTmax,GSTmin,frost,summer,HWI,CWI,dry,wet,PRCP95P,corn,beans
34,34,-0.137649,-0.968896,1.363833,1.210212,1.279994,-0.29727,0.580852,0.336255,-0.035578,1.343582,-1.096586,-0.169529,-0.329182,0.008499
13,13,-1.101196,2.681429,0.144189,-0.507785,1.006298,-0.29727,0.025392,-0.586396,0.631516,-0.577854,-0.674822,1.564293,-1.082234,-0.929949
20,20,-0.780013,-0.713931,-0.109525,0.122857,-0.40426,-0.688414,0.104744,-0.278846,1.965704,-0.791347,-0.674822,-0.74747,-0.781013,-0.646644
21,21,-0.73413,1.146634,-0.163865,-0.700262,0.606376,-1.861846,-0.768121,0.028705,-1.03622,-0.577854,-1.096586,0.408411,-0.785577,-0.779443
31,31,-0.275299,0.385672,-0.253593,-0.375515,-0.031833,0.485019,-0.688769,1.566457,0.297969,-0.791347,-0.253058,-0.169529,0.17057,0.167858


In [116]:
X = combined[feats]

pca = PCA().fit(X)

xAXIS = np.cumsum(pca.explained_variance_ratio_)

pcaDF = pd.DataFrame({'y': xAXIS})
pcaDF['x'] = np.arange(0, 12)

fig = px.scatter()

fig = px.line(
    pcaDF,
    x = 'x',
    y = 'y',

    labels = {
        'x' : 'No. of Components',
        'y' : 'Cumulative Explained Variance'
    },
)

fig.show()

## Corn Data with weather data

In [7]:
X = combined[feats]
y = combined['corn']

In [8]:
pca = make_pipeline(PCA(n_components=8), LinearRegression())

rmse = -cross_val_score(pca, X, y, scoring="neg_mean_squared_error", cv=5).mean()
r2 = cross_val_score(pca, X, y, scoring="r2", cv=5).mean()

print('r2 %f rmse %f' %(r2, math.sqrt(rmse)))


r2 0.852529 rmse 0.369048


## SOYBEANS with weather data

In [9]:
X = combined[feats]
y = combined['beans']

In [10]:
pca = make_pipeline(PCA(n_components=8), LinearRegression())

rmse = -cross_val_score(pca, X, y, scoring="neg_mean_squared_error", cv=5).mean()
r2 = cross_val_score(pca, X, y, scoring="r2", cv=5).mean()

print('r2 %f rmse %f' %(r2, math.sqrt(rmse)))

r2 0.838246 rmse 0.393482
