- determine severity of bee loss with colony loss numbers over the years (or by period, noting that 2019 has missing data for one period)
- look at the top 10 honey-producing or agriculturally significant states to see if any of these states do not follow the same colony loss trends as the US overall
- compare colony growth/loss findings with national honey report on weight/value of honey production overall and state by state
- determine if honey production is affected by colony growth/loss 


Notes: for overall yearly numbers, I will need to skip 2019 and not include 2021, because both years have missing periods.

To fix the issue of (X) in the US max colonies column, which prevents changing the dtype to numeric, I will sub in 0 for that value; 0 is obviously not correct, so it will then be substituted as NA.

For the issue of (Z), which means less than half a unit, I will change the value to zero.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Colony Counts

In [None]:
colony_counts = pd.read_csv('../data/honeybee_colonies/colony_counts.csv')

In [None]:
colony_counts = colony_counts.drop(columns = ['Unnamed: 0'])

In [None]:
colony_counts.shape

In [None]:
# to fix issues with datatypes, need to change all non numerical values ( like (z) and -)
colony_counts.dtypes

In [None]:
# Convert the text count/percent columns of colony_counts to numeric dtypes.
#
# The raw values contain thousands separators plus placeholder codes:
#   '-'    rewritten as 0 (only in the columns listed in dash_columns)
#   '(Z)'  less than half a unit (see the notes at the top) -> 0
#   '(X)'  appears in max_colonies; 0 would be a wrong count, so it becomes NaN
#
# Every replacement passes regex=False so that '(' and ')' are always treated
# as literal characters, independent of the pandas version's default.
count_columns = [
    'start_colonies',
    'max_colonies',
    'lost_colonies',
    'percent_lost',
    'added_colonies',
    'renovated',
    'percent_renovated',
]
dash_columns = [
    'lost_colonies',
    'percent_lost',
    'added_colonies',
    'renovated',
    'percent_renovated',
]

for column in count_columns:
    values = colony_counts[column]

    if column in dash_columns:
        values = values.str.replace('-', '0', regex = False)

    # (Z) and (X) are wrapped in parentheses: strip both sides so that only
    # the bare letter code is left.
    for paren in ('(', ')'):
        values = values.str.replace(paren, '', regex = False)

    if column == 'max_colonies':
        # Blank out the X code (NaN) instead of inventing a zero colony count.
        values = values.mask(values.str.strip() == 'X')

    values = values.str.replace('Z', '0', regex = False)

    # need to remove commas from numbers to convert to int
    values = values.str.replace(',', '', regex = False)

    colony_counts[column] = pd.to_numeric(values)

In [None]:
colony_counts.head()

In [None]:
# csv for colony counts with cleaned numeric values (for use in tableau)
# colony_counts.to_csv('../data/honeybee_colonies/colony_counts_clean.csv')

In [None]:
us_colony_counts = colony_counts.loc[colony_counts['state'] == 'United States']

In [None]:
us_colony_counts

In [None]:
# because 2019 and 2021 do not have full accounts of colonies for the year, I need to drop those from the list
us_colony_counts = us_colony_counts.loc[~us_colony_counts['year'].isin([2019, 2021])]

In [None]:
us_colony_counts.head()

### For the years observed, 2015 - 2020 (excluding 2019), entire colonies tend to die off in the colder months, and new colonies are formed through renovation (a new queen and implanted workers). The numbers are struggling to grow, though the overall US colony count has not dropped in the past 5 years.

### Let's compare this to honey production values to get an idea of colony health (which can then be compared to stressor data)!

In [None]:
#i put the two below facet grids together in tableau
us_counts_grid = sns.FacetGrid(us_colony_counts, col = 'year')
us_counts_grid.map(sns.barplot, 'period', 'start_colonies')

In [None]:
us_counts_grid = sns.FacetGrid(us_colony_counts, col = 'year')
us_counts_grid.map(sns.barplot, 'period', 'lost_colonies')

In [None]:
# eventally find states with over all top number of colonies, first look at colony numbers by state and year
states_colony_counts = colony_counts.loc[~colony_counts['state'].isin(['United States'])]

In [None]:
states_colony_counts = states_colony_counts.loc[~states_colony_counts['year'].isin([2019, 2021])]

In [None]:
states_colony_counts.head()

In [None]:
states_yearly = states_colony_counts.groupby(['state', 'year'])['start_colonies', 'lost_colonies', 'renovated'].agg('mean').reset_index()

In [None]:
states_yearly

In [None]:
# csv for states yearly (for use in tableau)
# states_yearly.to_csv('../data/honeybee_colonies/states_yearly.csv')

### Top 5 states with the most colonies overall: how do these states compare to national trends? I looked at a graph comparison of the top 3 states in Tableau. The US overall is on a slight downtrend in colony size, whereas California and Florida are on an uptrend in overall colonies.

 #### California
 #### North Dakota
 #### Florida
 #### Texas
 #### Georgia

In [None]:
# Average the yearly start_colonies figure per state across all years,
# then show the five states with the largest average.
mean_start_by_state = states_yearly.groupby('state')['start_colonies'].mean()
mean_start_by_state.sort_values(ascending = False).head(5)

### With an overall view of colonies in the past 5 years, what is their honey production like, and how does that compare to honey prices? After this, see if trends in stressors affect honey production.

# Honey production

In [None]:
honey_yield = pd.read_csv('../data/honey_nass/honey_yield.csv')

In [None]:
honey_yield

In [None]:
honey_yield = honey_yield.drop(columns = ['Unnamed: 0'])

In [None]:
honey_yield.shape

In [None]:
honey_yield['state'] = honey_yield['state'].str.replace('United States 6 7', 'United States')

In [None]:
honey_yield['state'] = honey_yield['state'].str.replace('Other States 5 6', 'Other States')

In [None]:
honey_yield.dtypes

In [None]:
honey_yield.head()

In [None]:
honey_yield['honey_producing_colony_thousands'] = honey_yield['honey_producing_colony_thousands'].str.replace(',', '')
honey_yield['production_1000_pounds'] = honey_yield['production_1000_pounds'].str.replace(',', '')
honey_yield['stocks_dec_15_1000_pounds'] = honey_yield['stocks_dec_15_1000_pounds'].str.replace(',', '')
honey_yield['value_1000_dollars'] = honey_yield['value_1000_dollars'].str.replace(',', '')

In [None]:
honey_yield['honey_producing_colony_thousands'] = pd.to_numeric(honey_yield['honey_producing_colony_thousands'])
honey_yield['production_1000_pounds'] = pd.to_numeric(honey_yield['production_1000_pounds'])
honey_yield['stocks_dec_15_1000_pounds'] = pd.to_numeric(honey_yield['stocks_dec_15_1000_pounds'])
honey_yield['value_1000_dollars'] = pd.to_numeric(honey_yield['value_1000_dollars'])

In [None]:
honey_yield.dtypes

In [None]:
honey_yield['honey_producing_colony_thousands'] = honey_yield['honey_producing_colony_thousands'] * 1000
honey_yield['production_1000_pounds'] = honey_yield['production_1000_pounds'] * 1000
honey_yield['stocks_dec_15_1000_pounds'] = honey_yield['stocks_dec_15_1000_pounds'] * 1000
honey_yield['value_1000_dollars'] = honey_yield['value_1000_dollars'] * 1000

In [None]:
honey_yield.head()

In [None]:
honey_yield = honey_yield.rename(columns = {'honey_producing_colony_thousands': 'honey_producing_colonies', 'production_1000_pounds': 'production_pounds', 'stocks_dec_15_1000_pounds': 'stocks_dec_15_pounds', 'value_1000_dollars': 'value_dollars'})

In [None]:
honey_yield

In [None]:
us_honey_yield = honey_yield.loc[honey_yield['state'] == 'United States']

In [None]:
us_honey_yield

In [None]:
# change yticks from scientific notation at some point
plt.figure(figsize = (10,6))
ax = sns.lineplot(data = us_honey_yield, 
                  x = 'year', 
                  y = 'production_pounds',
                  marker = 'o',
                  markersize = 5,
                  color = 'darkgoldenrod');

plt.title('U.S. Honey Production By Year', fontsize = 16)
plt.ylabel('Pounds of Honey', fontsize = 12)
ax.set(xlabel = None);


In [None]:
plt.figure(figsize = (10,6))
ax = sns.lineplot(data = us_honey_yield, 
                  x = 'year', 
                  y = 'value_dollars',
                  marker = 'o',
                  markersize = 5,
                  color = 'darkgreen');

plt.title('U.S. Total Honey Production Value By Year', fontsize = 16)
plt.ylabel('Value in Dollars', fontsize = 12)
ax.set(xlabel = None);

In [None]:
plt.figure(figsize = (10,6))
ax = sns.lineplot(data = us_honey_yield, 
                  x = 'year', 
                  y = 'avg_price_per_pound_dollars',
                  marker = 'o',
                  markersize = 5,
                  color = 'slategrey');

plt.title('U.S. Honey Price per Pound', fontsize = 16)
plt.ylabel('Value in Dollars', fontsize = 12)
ax.set(xlabel = None);

### As shown by the above three graphs, despite steady colony numbers, honey production is declining while the price of honey is increasing. The lower yield is likely driving up the market price.

### Next steps: see how stressors have changed over the years!