In [22]:
import os
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [23]:
path_to_data = '../../data/00_raw/agriculture_prices.db'
conn = sqlite3.connect(path_to_data)
c = conn.cursor()

In [24]:
produce_df = pd.DataFrame(c.execute('''SELECT * FROM agriculture_prices''').fetchall())

In [25]:
produce_df.columns = ['Farm Price', 'Atlanta Retail', 'Chicago Retail', 'Los Angeles Retail', 'NYC Retail', 'Avg Spread', 'Commodity', 'Date']

In [26]:
produce_df.head()

Unnamed: 0,Farm Price,Atlanta Retail,Chicago Retail,Los Angeles Retail,NYC Retail,Avg Spread,Commodity,Date
0,1.16,2.23,1.7,1.99,2.54,82.33%,Strawberries,2019-05-19
1,0.91,2.67,1.89,2.47,2.66,166.21%,Strawberries,2019-05-12
2,0.58,2.42,1.89,2.12,2.91,302.59%,Strawberries,2019-05-05
3,0.58,2.79,1.95,2.38,2.96,334.48%,Strawberries,2019-04-28
4,0.69,2.92,2.04,2.38,3.16,280.43%,Strawberries,2019-04-21


In [27]:
produce_df.tail()

Unnamed: 0,Farm Price,Atlanta Retail,Chicago Retail,Los Angeles Retail,NYC Retail,Avg Spread,Commodity,Date
213791,0.28,1.69,0.0,1.24,1.19,267.86%,Nectarines,2000-07-23
213792,0.26,1.61,0.0,1.37,1.19,300.96%,Nectarines,2000-07-16
213793,0.3,1.71,0.0,1.42,1.19,260.00%,Nectarines,2000-07-09
213794,0.33,1.96,0.0,0.92,1.04,196.97%,Nectarines,2000-07-02
213795,0.39,1.22,0.0,0.92,1.19,113.46%,Nectarines,2000-06-25


In [28]:
produce_df.index = pd.to_datetime(produce_df['Date'])

In [29]:
produce_df.drop(columns=['Date'], inplace=True)

In [30]:
conn.close()

# Cleaning Data and Adding Features
Want to update all prices to reflect 2019 dollars (adjust for inflation), and will be taking an average for all retail prices. A dataframe with mean annual farm prices, mean annual average retail, standard deviations, and year as an index will be created.

Since this needs to be done by commodity, I am also going to make a dictionary that holds data frames by commodity, produce_dict. So for example, typing ``produce_dict['Strawberries']`` will return a dataframe concerning only strawberries.

In [31]:
produce_list = list(produce_df['Commodity'].unique())
produce_list

['Strawberries',
 'Romaine Lettuce',
 'Red Leaf Lettuce',
 'Potatoes',
 'Oranges',
 'Iceberg Lettuce',
 'Green Leaf Lettuce',
 'Celery',
 'Cauliflower',
 'Carrots',
 'Cantaloupe',
 'Broccoli Crowns',
 'Avocados',
 'Broccoli Bunches',
 'Asparagus',
 'Flame Grapes',
 'Thompson Grapes',
 'Honeydews',
 'Tomatoes',
 'Plums',
 'Peaches',
 'Nectarines']

In [32]:
produce_dict = dict()
for produce in produce_list:
    produce_dict.setdefault(produce, produce_df[produce_df['Commodity'] == produce])

In [33]:
produce_dict['Strawberries'].head()

Unnamed: 0_level_0,Farm Price,Atlanta Retail,Chicago Retail,Los Angeles Retail,NYC Retail,Avg Spread,Commodity
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-05-19,1.16,2.23,1.7,1.99,2.54,82.33%,Strawberries
2019-05-12,0.91,2.67,1.89,2.47,2.66,166.21%,Strawberries
2019-05-05,0.58,2.42,1.89,2.12,2.91,302.59%,Strawberries
2019-04-28,0.58,2.79,1.95,2.38,2.96,334.48%,Strawberries
2019-04-21,0.69,2.92,2.04,2.38,3.16,280.43%,Strawberries


# Getting Variance 
Getting variance rather than standard deviation for the retail averages because I will need to find the standard deviation again when I take the yearly mean. Since the standard deviation then becomes the square root of the yearly mean of the variances, it is better to keep this step.

In [34]:
for item in produce_list:
    avg_retail = [np.mean(x[1:5]) for x in produce_dict[item].values]
    avg_retail_std = [np.var(x[1:5],ddof=1) for x in produce_dict[item].values] #sample standard deviation
    produce_dict.get(item).loc[:,'avg_retail'] = avg_retail
    produce_dict.get(item).loc[:,'avg_retail_var'] = avg_retail_std
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Decided to use sample standard deviation above as the data provided is better represented as a sample rather than a population. Say a sample of retail prices in major American cities. It will give more conservative results anyway.

In [35]:
produce_dict['Strawberries']

Unnamed: 0_level_0,Farm Price,Atlanta Retail,Chicago Retail,Los Angeles Retail,NYC Retail,Avg Spread,Commodity,avg_retail,avg_retail_std
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-05-19,1.16,2.23,1.70,1.99,2.54,82.33%,Strawberries,2.1150,0.356698
2019-05-12,0.91,2.67,1.89,2.47,2.66,166.21%,Strawberries,2.4225,0.366731
2019-05-05,0.58,2.42,1.89,2.12,2.91,302.59%,Strawberries,2.3350,0.440492
2019-04-28,0.58,2.79,1.95,2.38,2.96,334.48%,Strawberries,2.5200,0.451294
2019-04-21,0.69,2.92,2.04,2.38,3.16,280.43%,Strawberries,2.6250,0.508429
...,...,...,...,...,...,...,...,...,...
1999-08-23,0.72,2.62,0.00,3.12,2.59,189.24%,Strawberries,2.0825,1.409453
1999-08-16,0.58,2.44,0.00,3.22,2.77,263.36%,Strawberries,2.1075,1.440911
1999-08-09,0.65,2.49,0.00,2.99,2.87,221.15%,Strawberries,2.0875,1.407891
1999-08-02,0.71,2.37,0.00,2.87,2.19,161.62%,Strawberries,1.8575,1.271308


# Dropping columns
Dropping unnecessary columns: Atlanta Retail, Chicago Retail, Los Angeles Retail, NYC Retail, Avg Spread, and Commodity (commodity is kept track via key in dictionary; produce_dict)

In [36]:
for item in produce_list:
    produce_dict.get(item).drop(columns=['Atlanta Retail', 'Chicago Retail', 'Los Angeles Retail', 'NYC Retail', 'Avg Spread', 'Commodity'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [37]:
produce_dict['Strawberries']

Unnamed: 0_level_0,Farm Price,avg_retail,avg_retail_std
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-05-19,1.16,2.1150,0.356698
2019-05-12,0.91,2.4225,0.366731
2019-05-05,0.58,2.3350,0.440492
2019-04-28,0.58,2.5200,0.451294
2019-04-21,0.69,2.6250,0.508429
...,...,...,...
1999-08-23,0.72,2.0825,1.409453
1999-08-16,0.58,2.1075,1.440911
1999-08-09,0.65,2.0875,1.407891
1999-08-02,0.71,1.8575,1.271308


# Now to take Annual Averages

In [38]:
produce_yearly_mean_dict = {}
produce_yearly_std_dict = {}

In [20]:
for key, value in produce_dict.items():
    produce_yearly_mean_dict[key] = produce_dict[key].resample('Y').mean()
    produce_yearly_mean_dict[key].columns = ['Annual Mean Farm Price', 'Annual Mean Average Retail', 'Avg_retail_stdev' ]
    produce_yearly_mean_dict[key].index = produce_yearly_mean_dict[key].index.year
    produce_yearly_std_dict[key] = produce_dict[key].resample('Y').std(ddof=1)
    produce_yearly_std_dict[key].index = produce_yearly_std_dict[key].index.year

ValueError: Length mismatch: Expected axis has 3 elements, new values have 2 elements

In [85]:
produce_yearly_mean_dict['Oranges'].head()

Unnamed: 0_level_0,Farm Price,avg_retail,avg_retail_std
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999,0.867907,2.171453,1.529163
2000,0.54366,1.484884,1.051904
2001,0.577964,1.914683,0.395447
2002,0.652154,1.941311,0.386581
2003,0.569522,1.96999,0.323986


In [86]:
produce_yearly_std_dict['Oranges'].head()

Unnamed: 0_level_0,Farm Price,avg_retail,avg_retail_std
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999,0.328275,0.244282,0.180796
2000,0.407027,0.729761,0.514033
2001,0.529019,1.057655,0.320102
2002,0.694222,1.364297,0.312343
2003,0.580861,1.329441,0.239301
