#### Idea 1: Can we improve yield prediction for different crops across different counties in the US? Understanding crop yield data is essential for ensuring food security and for responding to climate change, because changes in weather patterns are affecting crop yields throughout the country. The goal of this project is to predict the yield of crops in the US using publicly available datasets on weather, soil quality, the amount of crop irrigated, and other features.

#### This project predicts the yield of wheat in different counties in the US. The datasets for this project are from https://github.com/aerialintel/data-science-exercise and include data for both 2013 and 2014.


#### The 2013 and 2014 datasets are loaded here.




In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
import datetime as dt
from scipy import stats

%matplotlib inline

In [99]:
#### Reading in dataset

In [100]:
# Load the raw wheat observations for each season; the first row of each
# CSV holds the column names.
df1_2013, df1_2014 = (
    pd.read_csv(csv_name, header=0)
    for csv_name in ('Wheat2013.csv', 'Wheat2014.csv')
)

In [119]:
# Peek at the first ten precipitation-intensity readings of the 2013 data.
df1_2013['precipIntensity'].iloc[:10]

0    0.0000
1    0.0001
2    0.0001
3    0.0002
4    0.0003
5    0.0000
6    0.0000
7    0.0000
8    0.0000
9    0.0000
Name: precipIntensity, dtype: float64

In [102]:
# Show the last two rows of the 2014 data.
df1_2014.iloc[-2:]

Unnamed: 0,CountyName,State,Latitude,Longitude,Date,apparentTemperatureMax,apparentTemperatureMin,cloudCover,dewPoint,humidity,...,precipTypeIsOther,pressure,temperatureMax,temperatureMin,visibility,windBearing,windSpeed,NDVI,DayInSeason,Yield
182547,Miami,Kansas,38.564866,-94.748021,6/3/2015 0:00,75.35,64.44,0.13,62.74,0.84,...,0,1013.01,75.35,64.44,9.28,132,6.26,129.93895,185,50.1
182548,Miami,Kansas,38.392008,-94.85543,6/3/2015 0:00,75.27,64.57,0.13,62.72,0.84,...,0,1013.0,75.27,64.57,9.26,132,6.23,129.784332,185,50.1


In [103]:
## Parse the 'Date' strings of both seasons into pandas datetime values.
for season_df in (df1_2013, df1_2014):
    season_df['Date'] = pd.to_datetime(season_df['Date'])


In [104]:
# Record the calendar year of every observation in a new 'year' column.
# NOTE(review): this is the calendar year of each row's date, not a season
# label -- the 2014 file contains dates in 2015, so one season's frame can
# carry two different year values. Confirm that is what is wanted.
for season_df in (df1_2013, df1_2014):
    season_df['year'] = season_df['Date'].dt.year

In [105]:
### Aggregating the 2013 data to one row per county, state and day in season.
# Rows sharing the same county, state and day in season are collapsed using
# the per-column function below. Insertion order of the spec is kept as-is
# because it determines the column order of the result.
# NOTE(review): 'windBearing' is averaged arithmetically; if it is a compass
# bearing in degrees this ignores the wrap-around at 360 -- confirm.
county_day_keys = ['CountyName', 'State', 'DayInSeason']
daily_aggregations = {
    'apparentTemperatureMax': "mean",
    'apparentTemperatureMin': "mean",
    'cloudCover': "mean",
    'dewPoint': "mean",
    'humidity': "mean",
    "precipIntensity": "mean",
    'precipIntensityMax': "mean",
    'precipProbability': "min",
    'precipAccumulation': "mean",
    'precipTypeIsRain': "median",
    'precipTypeIsSnow': "median",
    'precipTypeIsOther': "median",
    'pressure': "mean",
    'temperatureMax': "mean",
    'temperatureMin': "mean",
    'visibility': "mean",
    'windBearing': "mean",
    'windSpeed': "mean",
    'NDVI': "mean",
    'Yield': "mean",
    'year': "max",
}
df2_2013 = df1_2013.groupby(county_day_keys, as_index=False).agg(daily_aggregations)

In [106]:
# Preview the first five aggregated 2013 rows.
df2_2013.iloc[:5]

Unnamed: 0,CountyName,State,DayInSeason,apparentTemperatureMax,apparentTemperatureMin,cloudCover,dewPoint,humidity,precipIntensity,precipIntensityMax,...,precipTypeIsOther,pressure,temperatureMax,temperatureMin,visibility,windBearing,windSpeed,NDVI,Yield,year
0,Adams,Washington,0,31.896667,24.838333,0.303333,29.483333,0.928333,0.000117,0.00225,...,0,1026.718333,34.068333,27.396667,2.825,166.333333,1.38,131.792425,35.7,2013
1,Adams,Washington,1,41.881667,28.278333,0.178333,37.35,0.873333,0.006667,0.037267,...,0,1012.293333,47.5,33.525,6.58,212.333333,7.953333,131.634992,35.7,2013
2,Adams,Washington,2,39.118333,20.125,0.046667,25.056667,0.628333,0.000233,0.0035,...,0,1008.456667,46.168333,29.596667,9.913333,261.0,8.965,140.91481,35.7,2013
3,Adams,Washington,3,22.208333,6.82,0.051667,9.633333,0.496667,8.3e-05,0.0007,...,0,1022.33,31.915,16.205,9.67,14.666667,10.8,138.661329,35.7,2013
4,Adams,Washington,4,23.566667,1.811667,0.013333,5.556667,0.605,0.0,0.0,...,0,1028.12,26.183333,10.88,9.99,75.666667,2.691667,136.871195,35.7,2013


In [107]:
### Aggregating the 2014 data to one row per county, state and day in season.
# Rows sharing the same county, state and day in season are collapsed using
# the per-column function below. Insertion order of the spec is kept as-is
# because it determines the column order of the result.
# NOTE(review): 'windBearing' is averaged arithmetically; if it is a compass
# bearing in degrees this ignores the wrap-around at 360 -- confirm.
county_day_keys = ['CountyName', 'State', 'DayInSeason']
daily_aggregations = {
    'apparentTemperatureMax': "mean",
    'apparentTemperatureMin': "mean",
    'cloudCover': "mean",
    'dewPoint': "mean",
    'humidity': "mean",
    "precipIntensity": "mean",
    'precipIntensityMax': "mean",
    'precipProbability': "min",
    'precipAccumulation': "mean",
    'precipTypeIsRain': "median",
    'precipTypeIsSnow': "median",
    'precipTypeIsOther': "median",
    'pressure': "mean",
    'temperatureMax': "mean",
    'temperatureMin': "mean",
    'visibility': "mean",
    'windBearing': "mean",
    'windSpeed': "mean",
    'NDVI': "mean",
    'Yield': "mean",
    'year': "max",
}
df2_2014 = df1_2014.groupby(county_day_keys, as_index=False).agg(daily_aggregations)

In [108]:
# Preview the first five aggregated 2014 rows.
df2_2014.iloc[:5]

Unnamed: 0,CountyName,State,DayInSeason,apparentTemperatureMax,apparentTemperatureMin,cloudCover,dewPoint,humidity,precipIntensity,precipIntensityMax,...,precipTypeIsOther,pressure,temperatureMax,temperatureMin,visibility,windBearing,windSpeed,NDVI,Yield,year
0,Adams,Washington,0,19.448333,-2.028333,0.018333,7.061667,0.683333,0.0,0.0,...,0,1027.846667,25.023333,8.055,9.998333,65.833333,4.691667,135.336891,35.6,2014
1,Adams,Washington,1,25.16,5.945,0.17,12.493333,0.748333,0.0,0.0,...,0,1024.78,25.16,12.05,9.921667,150.166667,0.991667,131.745382,35.6,2014
2,Adams,Washington,2,22.111667,6.661667,0.076667,15.65,0.773333,0.0,0.0,...,0,1025.961667,29.646667,15.711667,9.355,180.833333,5.546667,132.001272,35.6,2014
3,Adams,Washington,3,28.166667,8.843333,0.086667,18.106667,0.75,1.7e-05,0.000233,...,0,1018.151667,32.463333,18.628333,9.671667,132.666667,4.383333,131.834915,35.6,2014
4,Adams,Washington,4,30.038333,22.323333,0.5,25.768333,0.841667,0.00255,0.01385,...,0,1016.981667,31.941667,28.215,7.711667,80.833333,2.873333,130.630046,35.6,2014


In [113]:
# List the aggregated 2013 rows whose mean pressure is missing.
df2_2013.loc[df2_2013['pressure'].isnull()]

Unnamed: 0,CountyName,State,DayInSeason,apparentTemperatureMax,apparentTemperatureMin,cloudCover,dewPoint,humidity,precipIntensity,precipIntensityMax,...,precipTypeIsOther,pressure,temperatureMax,temperatureMin,visibility,windBearing,windSpeed,NDVI,Yield,year
24699,Johnston,Oklahoma,37,20.624,0.644,0.0,3.484,0.528,0.0,0.0,...,0,,28.936,10.704,10.0,323.0,4.974,157.6,33.6,2014
24700,Johnston,Oklahoma,53,51.774,19.832,0.0,20.118,0.512,0.0,0.0,...,0,,51.774,25.06,10.0,67.0,5.572,150.8,33.6,2014
24701,Johnston,Oklahoma,101,80.386,53.566,0.0,48.862,0.598,0.0,0.0,...,0,,82.164,53.566,9.742,215.0,5.63,158.8,33.6,2014
24702,Johnston,Oklahoma,133,81.172,63.472,0.0,60.544,0.694,0.0,0.0,...,0,,80.554,63.472,10.0,172.6,13.964,175.2,33.6,2014
24703,Johnston,Oklahoma,149,75.324,55.504,0.0,41.056,0.412,0.0,0.0,...,0,,75.324,55.504,9.292,285.8,5.072,177.6,33.6,2014
26936,Lamar,Texas,46,52.25,24.93,0.0,22.03,0.48,0.0,0.0,...,0,,52.25,31.06,9.97,287.0,4.5,142.0,52.2,2014
26937,Lamar,Texas,110,69.961667,35.908333,0.0,31.045,0.456667,0.0,0.0,...,0,,69.961667,37.068333,9.885,184.166667,1.953333,161.5,52.2,2014
29725,Marshall,Oklahoma,37,21.43,0.1,0.0,2.81,0.495,0.0,0.0,...,0,,30.111667,11.893333,10.0,329.833333,5.981667,150.833333,35.5,2014
29726,Marshall,Oklahoma,53,52.426667,21.145,0.0,19.853333,0.49,0.0,0.0,...,0,,52.426667,26.386667,10.0,65.833333,5.886667,149.666667,35.5,2014
29727,Marshall,Oklahoma,101,80.341667,53.628333,0.0,48.761667,0.6,0.0,0.0,...,0,,81.971667,53.628333,9.643333,205.333333,6.38,153.166667,35.5,2014


In [114]:
## Combining both datasets into one.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. As with append's defaults, each
# frame keeps its own row labels, so index values repeat across the two
# seasons in the combined frame.
df = pd.concat([df2_2013, df2_2014])

In [116]:
# Preview the first twenty rows of the combined dataset.
df.iloc[:20]

Unnamed: 0,CountyName,State,DayInSeason,apparentTemperatureMax,apparentTemperatureMin,cloudCover,dewPoint,humidity,precipIntensity,precipIntensityMax,...,precipTypeIsOther,pressure,temperatureMax,temperatureMin,visibility,windBearing,windSpeed,NDVI,Yield,year
0,Adams,Washington,0,31.896667,24.838333,0.303333,29.483333,0.928333,0.000117,0.00225,...,0,1026.718333,34.068333,27.396667,2.825,166.333333,1.38,131.792425,35.7,2013
1,Adams,Washington,1,41.881667,28.278333,0.178333,37.35,0.873333,0.006667,0.037267,...,0,1012.293333,47.5,33.525,6.58,212.333333,7.953333,131.634992,35.7,2013
2,Adams,Washington,2,39.118333,20.125,0.046667,25.056667,0.628333,0.000233,0.0035,...,0,1008.456667,46.168333,29.596667,9.913333,261.0,8.965,140.91481,35.7,2013
3,Adams,Washington,3,22.208333,6.82,0.051667,9.633333,0.496667,8.3e-05,0.0007,...,0,1022.33,31.915,16.205,9.67,14.666667,10.8,138.661329,35.7,2013
4,Adams,Washington,4,23.566667,1.811667,0.013333,5.556667,0.605,0.0,0.0,...,0,1028.12,26.183333,10.88,9.99,75.666667,2.691667,136.871195,35.7,2013
5,Adams,Washington,5,16.603333,-1.23,0.146667,5.796667,0.738333,0.0,0.0,...,0,1027.923333,20.253333,7.275,9.793333,34.0,2.326667,132.221169,35.7,2013
6,Adams,Washington,6,6.951667,-4.938333,0.05,2.161667,0.596667,0.0,0.0,...,0,1025.996667,19.741667,8.228333,9.818333,27.5,10.621667,135.866127,35.7,2013
7,Adams,Washington,7,12.288333,-10.086667,0.0,-7.82,0.455,0.0,0.0,...,0,1027.831667,17.8,1.606667,9.946667,63.833333,5.205,135.851092,35.7,2013
8,Adams,Washington,8,12.851667,-13.568333,0.05,-2.913333,0.608333,0.0,0.0,...,0,1033.423333,16.555,-0.898333,9.921667,147.833333,2.945,133.03198,35.7,2013
9,Adams,Washington,9,15.7,-1.423333,0.04,4.176667,0.635,0.0,0.0,...,0,1033.105,21.826667,7.406667,9.996667,76.833333,1.813333,134.787722,35.7,2013
