In [1]:
import os
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
path_to_data = '../../data/00_raw/agriculture_prices.db'
conn = sqlite3.connect(path_to_data)
c = conn.cursor()

In [3]:
produce_df = pd.DataFrame(c.execute('''SELECT * FROM agriculture_prices''').fetchall())

In [4]:
produce_df.columns = ['Farm Price', 'Atlanta Retail', 'Chicago Retail', 'Los Angeles Retail', 'NYC Retail', 'Avg Spread', 'Commodity', 'Date']

In [5]:
produce_df.head()

Unnamed: 0,Farm Price,Atlanta Retail,Chicago Retail,Los Angeles Retail,NYC Retail,Avg Spread,Commodity,Date
0,1.16,2.23,1.7,1.99,2.54,82.33%,Strawberries,2019-05-19
1,0.91,2.67,1.89,2.47,2.66,166.21%,Strawberries,2019-05-12
2,0.58,2.42,1.89,2.12,2.91,302.59%,Strawberries,2019-05-05
3,0.58,2.79,1.95,2.38,2.96,334.48%,Strawberries,2019-04-28
4,0.69,2.92,2.04,2.38,3.16,280.43%,Strawberries,2019-04-21


In [6]:
produce_df.tail()

Unnamed: 0,Farm Price,Atlanta Retail,Chicago Retail,Los Angeles Retail,NYC Retail,Avg Spread,Commodity,Date
213791,0.28,1.69,0.0,1.24,1.19,267.86%,Nectarines,2000-07-23
213792,0.26,1.61,0.0,1.37,1.19,300.96%,Nectarines,2000-07-16
213793,0.3,1.71,0.0,1.42,1.19,260.00%,Nectarines,2000-07-09
213794,0.33,1.96,0.0,0.92,1.04,196.97%,Nectarines,2000-07-02
213795,0.39,1.22,0.0,0.92,1.19,113.46%,Nectarines,2000-06-25


In [7]:
produce_df.index = pd.to_datetime(produce_df['Date'])

In [8]:
produce_df.drop(columns=['Date'], inplace=True)

In [9]:
conn.close()

# Cleaning Data and Adding Features
Want to update all prices to reflect 2019 dollars (adjust for inflation), and will be taking an average for all retail prices. A dataframe with mean annual farm prices, mean annual average retail, standard deviations, and year as an index will be created.

Since this needs to be done by commodity, I am also going to make a dictionary that holds data frames by commodity, produce_dict. So for example, typing ``produce_dict['Strawberries']`` will return a dataframe concerning only strawberries.

In [10]:
produce_list = list(produce_df['Commodity'].unique())
produce_list

['Strawberries',
 'Romaine Lettuce',
 'Red Leaf Lettuce',
 'Potatoes',
 'Oranges',
 'Iceberg Lettuce',
 'Green Leaf Lettuce',
 'Celery',
 'Cauliflower',
 'Carrots',
 'Cantaloupe',
 'Broccoli Crowns',
 'Avocados',
 'Broccoli Bunches',
 'Asparagus',
 'Flame Grapes',
 'Thompson Grapes',
 'Honeydews',
 'Tomatoes',
 'Plums',
 'Peaches',
 'Nectarines']

In [11]:
# making a quick funciton to use for git
def git(message):
    '''Function adds, commits with message, and pushes to github'''
    !git add .
    !git commit -m f'{message}'
    !git push origin master

In [12]:
git('cleaning notebook')

[master a3f8fd8] fcleaning notebook
 6 files changed, 409 insertions(+), 3177 deletions(-)
 rewrite notebooks/01_exploration/.ipynb_checkpoints/EDA-checkpoint.ipynb (99%)
 rename notebooks/01_exploration/.ipynb_checkpoints/{EDA_LB-checkpoint.ipynb => EDA_LB0-checkpoint.ipynb} (100%)
 copy notebooks/01_exploration/.ipynb_checkpoints/{EDA-checkpoint.ipynb => EDA_LB1-checkpoint.ipynb} (100%)
 rewrite notebooks/01_exploration/EDA.ipynb (99%)
 rename notebooks/01_exploration/{EDA_LB.ipynb => EDA_LB0.ipynb} (100%)
 copy notebooks/01_exploration/{EDA.ipynb => EDA_LB1.ipynb} (99%)
Enumerating objects: 14, done.
Counting objects: 100% (14/14), done.
Delta compression using up to 4 threads
Compressing objects: 100% (8/8), done.
Writing objects: 100% (8/8), 1.38 KiB | 1.38 MiB/s, done.
Total 8 (delta 4), reused 0 (delta 0)
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To github.com:tim-christy/agriculture-price-analysis.git
   06c16cd..a3f8fd8  master -> master


In [13]:
produce_dict = dict()
for produce in produce_list:
    produce_dict.setdefault(produce, produce_df[produce_df['Commodity'] == produce])

In [14]:
for item in produce_list:
    avg_retail = [np.mean(x[1:5]) for x in produce_dict[item].values]
    avg_retail_std = [np.var(x[1:5],ddof=1) for x in produce_dict[item].values] #sample standard deviation
    produce_dict.get(item).loc[:,'avg_retail'] = avg_retail
    produce_dict.get(item).loc[:,'avg_retail_var'] = avg_retail_std
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [15]:
produce_dict['Strawberries']

Unnamed: 0_level_0,Farm Price,Atlanta Retail,Chicago Retail,Los Angeles Retail,NYC Retail,Avg Spread,Commodity,avg_retail,avg_retail_var
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-05-19,1.16,2.23,1.70,1.99,2.54,82.33%,Strawberries,2.1150,0.127233
2019-05-12,0.91,2.67,1.89,2.47,2.66,166.21%,Strawberries,2.4225,0.134492
2019-05-05,0.58,2.42,1.89,2.12,2.91,302.59%,Strawberries,2.3350,0.194033
2019-04-28,0.58,2.79,1.95,2.38,2.96,334.48%,Strawberries,2.5200,0.203667
2019-04-21,0.69,2.92,2.04,2.38,3.16,280.43%,Strawberries,2.6250,0.258500
...,...,...,...,...,...,...,...,...,...
1999-08-23,0.72,2.62,0.00,3.12,2.59,189.24%,Strawberries,2.0825,1.986558
1999-08-16,0.58,2.44,0.00,3.22,2.77,263.36%,Strawberries,2.1075,2.076225
1999-08-09,0.65,2.49,0.00,2.99,2.87,221.15%,Strawberries,2.0875,1.982158
1999-08-02,0.71,2.37,0.00,2.87,2.19,161.62%,Strawberries,1.8575,1.616225


# Adjusting for inflation

Adjusting prices for inflation based on month using the consumer price index data found here [https://www.usinflationcalculator.com/inflation/consumer-price-index-and-annual-percent-changes-from-1913-to-2008/](https://www.usinflationcalculator.com/inflation/consumer-price-index-and-annual-percent-changes-from-1913-to-2008/)

Everything will be changed to correspond to USD in 2020

In [25]:
cpi_df = pd.read_csv('../../data/00_raw/cpi.csv', index_col=0, header=1)

In [26]:
cpi_df

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,June,July,Aug,Sep,Oct,Nov,Dec,Avg,Dec-Dec,Avg-Avg
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1913,9.800,9.800,9.800,9.800,9.700,9.800,9.900,9.900,10.000,10.000,10.100,10.000,9.900,–,–
1914,10.000,9.900,9.900,9.800,9.900,9.900,10.000,10.200,10.200,10.100,10.200,10.100,10.000,1,1
1915,10.100,10.000,9.900,10.000,10.100,10.100,10.100,10.100,10.100,10.200,10.300,10.300,10.100,2,1
1916,10.400,10.400,10.500,10.600,10.700,10.800,10.800,10.900,11.100,11.300,11.500,11.600,10.900,12.6,7.9
1917,11.700,12.000,12.000,12.600,12.800,13.000,12.800,13.000,13.300,13.500,13.500,13.700,12.800,18.1,17.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015,233.707,234.722,236.119,236.599,237.805,238.638,238.654,238.316,237.945,237.838,237.336,236.525,237.017,0.7,0.1
2016,236.916,237.111,238.132,239.261,240.236,241.038,240.647,240.853,241.428,241.729,241.353,241.432,240.007,2.1,1.3
2017,242.839,243.603,243.801,244.524,244.733,244.955,244.786,245.519,246.819,246.663,246.669,246.524,245.120,2.1,2.1
2018,247.867,248.991,249.554,250.546,251.588,251.989,252.006,252.146,252.439,252.885,252.038,251.233,251.107,1.9,2.4


In [30]:
produce_dict['Strawberries'].index

DatetimeIndex(['2019-05-19', '2019-05-12', '2019-05-05', '2019-04-28',
               '2019-04-21', '2019-04-14', '2019-04-07', '2019-03-31',
               '2019-03-24', '2019-03-17',
               ...
               '1999-09-26', '1999-09-20', '1999-09-13', '1999-09-06',
               '1999-08-30', '1999-08-23', '1999-08-16', '1999-08-09',
               '1999-08-02', '1999-07-26'],
              dtype='datetime64[ns]', name='Date', length=985, freq=None)

In [42]:
cpi_df.columns[0:13] = list(np.arange(1,13))

TypeError: Index does not support mutable operations