# save the bees dataset 
Using the dataset, I've plotted the data points along with calculated trendlines in order to estimate the number of bee colonies in the USA in future years.

In [1]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure 
from bokeh.io import output_file, show
from bokeh.models import HoverTool, ColumnDataSource

In [2]:
# Load the state-level bee colony survey CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="bee_colony_survey_data_by_state.csv")

In [3]:
# Display the loaded frame; pandas truncates the rendering to the first and last rows.
df

Unnamed: 0,year,period,week_ending,state,state_ansi,watershed,data_item,value,cv
0,2017,JAN THRU MAR,,ALABAMA,1,,ADDED & REPLACED,570,
1,2017,JAN THRU MAR,,ARIZONA,4,,ADDED & REPLACED,2900,
2,2017,JAN THRU MAR,,ARKANSAS,5,,ADDED & REPLACED,430,
3,2017,JAN THRU MAR,,CALIFORNIA,6,,ADDED & REPLACED,215000,
4,2017,JAN THRU MAR,,COLORADO,8,,ADDED & REPLACED,100,
...,...,...,...,...,...,...,...,...,...
3391,1987,MARKETING YEAR,,VIRGINIA,51,,INVENTORY,25000,
3392,1987,MARKETING YEAR,,WASHINGTON,53,,INVENTORY,75000,
3393,1987,MARKETING YEAR,,WEST VIRGINIA,54,,INVENTORY,21000,
3394,1987,MARKETING YEAR,,WISCONSIN,55,,INVENTORY,92000,


The data covers different periods of time (some full-year, some quarterly) and different data items. I'm only interested in the full-year inventory, totalled across the whole USA for each year:

In [4]:
# Keep only the full-year ("MARKETING YEAR") rows, dropping the quarterly periods.
marketing_year = df[df['period'] == 'MARKETING YEAR']

# Restrict to the INVENTORY data item explicitly, so that any other data item
# recorded for a marketing year can never be added into the colony totals.
inventory = marketing_year[marketing_year['data_item'] == 'INVENTORY']

# Sum the per-state values into one total per year (a Series indexed by year).
year_and_total = inventory.groupby('year')['value'].sum()

The bokeh plot below opens a new HTML file that shows an interactive scatter plot with a linear trend line

In [5]:
# Render the chart into a standalone HTML file; show() at the end opens it.
output_file("scatter.html")
# `width` is used instead of `plot_width`, which was removed in Bokeh 3.0.
f = figure(width=800)

# x: years; y: yearly totals scaled to millions so the axis stays readable.
x = year_and_total.index
y = year_and_total.values / 1000000

# The unscaled totals ride along in the data source so the hover tooltip can
# show the exact colony count rather than the value in millions.
cds = ColumnDataSource(data=dict(
    x=x,
    y=y,
    total=year_and_total.values,
    year=year_and_total.index
))

# scatter() draws circle markers by default; circle(size=...) is deprecated
# in recent Bokeh releases.
c1 = f.scatter(x='x', y='y', size=8, source=cds)
l1 = f.line(x='x', y='y', source=cds)

f.title.text = "Bee Colonies in the USA from 1987-2017"
f.xaxis.axis_label = "Year"
f.yaxis.axis_label = "Bee Colonies (Millions)"
# Label odd years only to avoid a crowded axis; cast to plain int so the tick
# list holds ordinary Python numbers rather than numpy scalars.
f.xaxis.ticker = [int(year) for year in year_and_total.index if year % 2 == 1]

hover = HoverTool(renderers=[c1, l1], tooltips=[("Year", "@year"), ("Total", "@total")])
f.add_tools(hover)

# Degree-1 least-squares fit; with full=True the coefficients are the first
# element of the returned tuple, ordered highest power first.
par = np.polyfit(x, y, 1, full=True)
slope, intercept = par[0]
y_predicted = slope * np.asarray(x) + intercept

# The '+' format flag always prints the sign, so a negative intercept renders
# as "x-1.23" instead of "x+-1.23".
trend_label = 'y={:.2f}x{:+.2f}'.format(slope, intercept)
# `legend_label` replaces the `legend` keyword, which was removed in Bokeh 3.0.
f.line(x, y_predicted, color='red', legend_label=trend_label)

show(f)

Using matplotlib, I've created another plot showing a third-degree polynomial fit to the data.

In [6]:
import matplotlib.pyplot as plt
import scipy
from scipy.optimize import curve_fit

def func(x, a, b, c, d):
    """Evaluate the cubic polynomial a*x**3 + b*x**2 + c*x + d at x.

    x may be a scalar or an array-like that supports arithmetic; a, b, c and d
    are the coefficients from the highest power down to the constant term.
    """
    cubic_term = a * x**3
    quadratic_term = b * x**2
    linear_term = c * x
    return cubic_term + quadratic_term + linear_term + d

# Least-squares fit of the cubic model `func` to the yearly totals.
# popt holds the fitted (a, b, c, d); pcov is their estimated covariance.
# NOTE(review): x holds raw year values, so the x**3 term is very large and the
# fit is poorly conditioned. Fitting against (x - x.mean()) would be more
# stable, but it changes the meaning of popt -- confirm before switching.
popt, pcov = curve_fit(func, x, y)

# Explicit figure/axes so the plot carries its own title and axis labels
# (the same axis labels the Bokeh chart uses) and stands alone when skimmed.
fig, ax = plt.subplots()
ax.plot(x, func(x, *popt), label="Fitted Curve")
ax.plot(x, y, 'ro', label="Data")
ax.set_xlabel("Year")
ax.set_ylabel("Bee Colonies (Millions)")
ax.set_title("Bee Colonies in the USA: Third-Degree Polynomial Fit")
ax.legend(loc='upper right')
plt.show()

<Figure size 640x480 with 1 Axes>