In [1]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from scipy import stats
from bokeh.io import output_notebook, show, curdoc, save
from bokeh.plotting import figure, reset_output
from bokeh.layouts import gridplot, column, row
from bokeh.models import (
    ColumnDataSource,
    ColorBar,
    LinearColorMapper,
    LogColorMapper, HoverTool, 
    ResetTool, ResizeTool, WheelZoomTool, LabelSet, Div
)
from bokeh.palettes import Plasma256, Category20c
from bokeh.embed import file_html
from bokeh.resources import CDN
from bokeh.embed import components

In [2]:
output_notebook()

# Does A Warm December Mean More Snow in the Winter?

First, we are going to read in some csv files create with data from the NOWData section of the Applied Climate Information System (ACIS) from the National Weather Service.

In [3]:
ls

[0m[01;34mdays_of_snow[0m/  [01;34mhtml_charts[0m/  [01;34msnowfall[0m/  snow.ipynb  [01;34mtemps[0m/


In [4]:
boston_snow = pd.read_csv('snowfall/boston_snowfall_clean.csv', index_col = 0)
hartford_snow = pd.read_csv('snowfall/hartford_snowfall_clean.csv', index_col = 0)
milton_snow = pd.read_csv('snowfall/milton_snowfall_clean.csv', index_col = 0)
pvd_snow = pd.read_csv('snowfall/pvd_snowfall_clean.csv', index_col = 0)
worcester_snow = pd.read_csv('snowfall/worcester_snowfall_clean.csv', index_col = 0)

In [5]:
boston_temp = pd.read_csv('temps/boston_temp.csv')
hartford_temp = pd.read_csv('temps/hartford_temp.csv')
milton_temp = pd.read_csv('temps/milton_temp.csv')
pvd_temp = pd.read_csv('temps/pvd_temp.csv')
worcester_temp = pd.read_csv('temps/worcester_temp.csv')

In [6]:
boston_snow_days = pd.read_csv('days_of_snow/boston_days_of_snow_clean.csv', index_col = 0)
hartford_snow_days = pd.read_csv('days_of_snow/hartford_days_of_snow_clean.csv', index_col = 0)
milton_snow_days = pd.read_csv('days_of_snow/milton_days_of_snow_clean.csv', index_col = 0)
pvd_snow_days = pd.read_csv('days_of_snow/pvd_days_of_snow_clean.csv', index_col = 0)
worcester_snow_days = pd.read_csv('days_of_snow/worcester_days_of_snow_clean.csv', index_col = 0)

The files are little messy with some values such as "T" for trace amounts and "M" for missing data. Columns with these types of data values are also not currently numeric, so we will convert them to make working with the data easier.

The dataframes for the amount of snow and number of snowdays initials ran through a season instead of just through a year. It seemed easier to just work with the snow value as Jan-Dec yearly data, like the tempature data. I'm not sure this was really necessary, but it did make joining the dataframes together pretty straight forward. A python script was run to move the columns into the desired order and changing the year values could have been done in that script, but instead was done in the for loops below.

In [7]:
snow_files = [boston_snow, hartford_snow, milton_snow, pvd_snow, worcester_snow]
temp_files = [boston_temp, hartford_temp, milton_temp, pvd_temp, worcester_temp]
snow_day_files = [boston_snow_days, hartford_snow_days, milton_snow_days, pvd_snow_days, worcester_snow_days]

In [8]:
for file in snow_files:
    file["Year"] = file["Year"].apply(lambda text: text.split("-")[1])
    file.replace(to_replace = ["T", "M"], value=[0,np.nan], inplace = True)
    for column in file.columns:
        file[column] = pd.to_numeric(file[column])

In [9]:
for file in temp_files:
    file.replace(to_replace = ["T", "M"], value=[0,np.nan], inplace = True)
    for column in file.columns:
        file[column] = pd.to_numeric(file[column])

In [10]:
for file in snow_day_files:
    file["Year"] = file["Year"].apply(lambda text: text.split("-")[1])
    file.replace(to_replace = ["T", "M"], value=[0,np.nan], inplace = True)
    for column in file.columns:
        file[column] = pd.to_numeric(file[column])

We want to look at the difference each December's average temperature is from the overall average tempature for a December in that area.

In [11]:
for file in temp_files:
    file["Difference from Avg"] = file["Dec"] - file["Dec"].mean()

Some years had missing data, so if two or more months are missing, we'll simply drop the row.

In [12]:
boston_snow_data = boston_snow[boston_snow.columns[0:6]].dropna(thresh=2)
hartford_snow_data = hartford_snow[hartford_snow.columns[0:6]].dropna(thresh=2)
milton_snow_data = milton_snow[milton_snow.columns[0:6]].dropna(thresh=2)
pvd_snow_data = pvd_snow[pvd_snow.columns[0:6]].dropna(thresh=2)
worcester_snow_data = worcester_snow[worcester_snow.columns[0:6]].dropna(thresh=2)

In [13]:
boston_snowdays_data = boston_snow_days[boston_snow_days.columns[0:6]].dropna(thresh=2)
hartford_snowdays_data = hartford_snow_days[hartford_snow_days.columns[0:6]].dropna(thresh=2)
milton_snowdays_data = milton_snow_days[milton_snow_days.columns[0:6]].dropna(thresh=2)
pvd_snowdays_data = pvd_snow_days[pvd_snow_days.columns[0:6]].dropna(thresh=2)
worcester_snowdays_data = worcester_snow_days[worcester_snow_days.columns[0:6]].dropna(thresh=2)

We don't need all of the average monthly tempatures the tempature dataset comes with, so we'll condense this to just the Year and the Difference from Average columns.

In [14]:
boston_temp_data = boston_temp[["Year", "Difference from Avg"]]
hartford_temp_data = hartford_temp[["Year", "Difference from Avg"]]
milton_temp_data = milton_temp[["Year", "Difference from Avg"]]
pvd_temp_data = pvd_temp[["Year", "Difference from Avg"]]
worcester_temp_data = worcester_temp[["Year", "Difference from Avg"]]

To make sure the December from the previous year is corresponding with the following years January-May data, I added a year to the year column in the tempature data we are using. This means that the year 1986 in this dataframe corresponds to the December tempature from 1985. (There may have been a better way to do this.)

In [15]:
temp_data = [boston_temp_data,hartford_temp_data, milton_temp_data, pvd_temp_data, worcester_temp_data]
for file in temp_data:
    file["Year"] = file["Year"]+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
boston_SD = pd.merge(boston_snowdays_data, boston_temp_data, how='inner', on="Year")
hartford_SD = pd.merge(hartford_snowdays_data, hartford_temp_data, how='inner', on="Year")
milton_SD = pd.merge(milton_snowdays_data, milton_temp_data, how='inner', on="Year")
pvd_SD = pd.merge(pvd_snowdays_data, pvd_temp_data, how='inner', on="Year")
worcester_SD = pd.merge(worcester_snowdays_data, worcester_temp_data, how='inner', on="Year")

In [17]:
boston_data = pd.merge(boston_snow_data, boston_temp_data, how='inner', on="Year")
hartford_data = pd.merge(hartford_snow_data, hartford_temp_data, how='inner', on="Year")
milton_data = pd.merge(milton_snow_data, milton_temp_data, how='inner', on="Year")
pvd_data = pd.merge(pvd_snow_data, pvd_temp_data, how='inner', on="Year")
worcester_data = pd.merge(worcester_snow_data, worcester_temp_data, how='inner', on="Year")

In [18]:
data_files = [boston_data,hartford_data, milton_data, pvd_data, worcester_data]
for file in data_files:
    file["Total Snow"] = file[file.columns[1:6]].sum(axis=1)
    
data_filesSD = [boston_SD,hartford_SD, milton_SD, pvd_SD, worcester_SD]
for file in data_filesSD:
    file["Total Snow Days"] = file[file.columns[1:6]].sum(axis=1)

In [19]:
def r2_correlation(x, y):
    mask = ~np.isnan(x) & ~np.isnan(y)
    slope, _, r_value, _, _ = stats.linregress(x[mask], y[mask])
    return (r_value**2)

I tried using matplotlib and seaborn, but couldn't quite get the result I wanted. This is the first time I've tried Bokeh. It wasn't quite as intuitive as matplotlib, but it can be customized to a further extent. 

In [20]:
def make_plot(mapper, title, tools, minX, maxX, minY, maxY):
    mapper.low_color = 'blue'
    mapper.high_color = 'red'
    p = figure(plot_width=750, plot_height=500, x_range=(minX, maxX), y_range=(minY,maxY), 
               toolbar_location='right',toolbar_sticky=False, tools=tools, title=title)
    color_bar = ColorBar(color_mapper=mapper, location=(0, 0))
    p.circle(
        x='x', y='y', size='z', fill_alpha=0.5,
        fill_color={'field': 'y', 'transform': mapper}, line_color=None,
        source=source
    )
    p.add_layout(color_bar, 'right')
    return p

In [21]:
cities = ["Boston", "Hartford", "Milton", "Providence", "Worcester"]
snowfall_graphs = {'p0':None, 'p1':None,'p2':None,'p3':None,'p4':None} 
snowday_graphs = {'p5':None, 'p6':None,'p7':None,'p8':None,'p9':None} 
r2_values_snowfall = defaultdict(int)
r2_values_snowdays = defaultdict(int)

In [22]:
curdoc().clear()

In [23]:
i = 0

for file in data_files:
    x = file["Total Snow"]
    y = file["Difference from Avg"]
    sizes = 12 * np.log10(file["Total Snow"])
    source = ColumnDataSource(dict(x=x,y=y,z=sizes))
    hover = HoverTool(
    tooltips=[
        ( 'Total Snow',   '@x{00.00}"'),
        ( 'Difference from Avg',  '@y{00.00}\N{DEGREE SIGN}')
    ])
    TOOLS = [hover, ResetTool(), ResizeTool(), WheelZoomTool()]
    graph_title = cities[i]+' December Temp v. Snowfall'
    current_graph = "p" + str(i)
    p = make_plot(LinearColorMapper(palette=Plasma256, low=-15, high=15), title=graph_title, tools = TOOLS,minX=0,maxX=150,minY=-15,maxY=15)
    p.title.align = "center"
    p.title.text_font_size = "12.5px"
    p.line(range(0,141), [0]*141, line_width=2, line_dash='dashed', line_color='black', line_alpha=0.5)
    p.xaxis[0].axis_label = 'Snowfall Jan-May'
    p.yaxis[0].axis_label = 'Difference from Avg Temp (F)'
    snowfall_graphs[current_graph] = p
    r2_values_snowfall[graph_title] = r2_correlation(x, y)
    i += 1

  


In [24]:
i = 5

for file in data_filesSD:
    x = file["Total Snow Days"]
    y = file["Difference from Avg"]
    sizes = 12 * np.log10(file["Total Snow Days"])
    source = ColumnDataSource(dict(x=x,y=y,z=sizes))
    hover = HoverTool(
    tooltips=[
        ( 'Total Snow Days',   '@x{00.00}"'),
        ( 'Difference from Avg',  '@y{00.00}\N{DEGREE SIGN}')
    ])
    TOOLS = [hover, ResetTool(), ResizeTool(), WheelZoomTool()]
    graph_title = cities[i-5]+' December Temp v. Days of Snowfall'
    current_graph = "p" + str(i)
    p = make_plot(LinearColorMapper(palette=Plasma256, low=-15, high=15), title=graph_title, tools = TOOLS, minX=0, maxX=50, minY=-15, maxY=15)
    p.title.align = "center"
    p.title.text_font_size = "12.5px"
    p.line(range(0,141), [0]*141, line_width=2, line_dash='dashed', line_color='black', line_alpha=0.5)
    p.xaxis[0].axis_label = 'Snowdays Jan-May'
    p.yaxis[0].axis_label = 'Difference from Avg Temp (F)'
    snowday_graphs[current_graph] = p
    r2_values_snowdays[graph_title] = r2_correlation(x, y)
    i += 1

  


I couldn't find any continuous color palettes that would represent tempature using a blue to red scale, the purple to yellow isn't ideal, but I think it works well enough. The circles at each data point are also sized using the amount of snow fall for that period. Bokeh's hover and scroll zoom tools are very useful in poking around the visualiztions. 

In [25]:
#show(gridplot([snowfall_graphs['p0'], snowfall_graphs['p1'], snowfall_graphs['p2'], snowfall_graphs['p3'], snowfall_graphs['p4']], ncols=2, plot_width=400, plot_height=300, toolbar_location="left"))
plots = gridplot([snowfall_graphs['p0'], snowfall_graphs['p1'], snowfall_graphs['p2'], snowfall_graphs['p3'], snowfall_graphs['p4']], ncols=2, plot_width=400, plot_height=300, toolbar_location="left")

In [26]:
save(plots, 'html_charts/TempvsSnowFall.html', CDN, 'December Temp vs Snowfall')

'/home/scott/ds/snowfall/html_charts/TempvsSnowFall.html'

In [32]:
from IPython.display import IFrame
IFrame('html_charts/TempvsSnowFall.html', width=900, height=900)

In [33]:
#show(gridplot([snowday_graphs['p5'], snowday_graphs['p6'], snowday_graphs['p7'], snowday_graphs['p8'], snowday_graphs['p9']], ncols=2, plot_width=400, plot_height=300, toolbar_location="left"))
plots2 = gridplot([snowday_graphs['p5'], snowday_graphs['p6'], snowday_graphs['p7'], snowday_graphs['p8'], snowday_graphs['p9']], ncols=2, plot_width=400, plot_height=300, toolbar_location="left")

In [34]:
save(plots2, 'html_charts/TempvsSnowDays.html', CDN, 'December Temp vs Days of Snowdays')

'/home/scott/ds/snowfall/html_charts/TempvsSnowDays.html'

In [35]:
from IPython.display import IFrame
IFrame('html_charts/TempvsSnowDays.html', width=900, height=900)

The charts above seem to show little correlation between December's tempature and the following months snowfall. Initially after not finding much in the total snowfall data, I looked at the days of snow in the season. This did not produce any findings either. The R^2 values are below and you can see that they are small and not of signifigance.

In [36]:
r2_values_snowfall

defaultdict(int,
            {'Boston December Temp v. Snowfall': 0.013958792077908446,
             'Hartford December Temp v. Snowfall': 0.071711042452869883,
             'Milton December Temp v. Snowfall': 0.0092174191061341337,
             'Providence December Temp v. Snowfall': 0.03278116658761554,
             'Worcester December Temp v. Snowfall': 0.0053879797261546999})

In [37]:
r2_values_snowdays

defaultdict(int,
            {'Boston December Temp v. Days of Snowfall': 0.014225972419857496,
             'Hartford December Temp v. Days of Snowfall': 0.068383620216129412,
             'Milton December Temp v. Days of Snowfall': 0.0099339380592175387,
             'Providence December Temp v. Days of Snowfall': 0.048451686786728759,
             'Worcester December Temp v. Days of Snowfall': 0.01411728494082137})

After not finding much in the way of my original question, I decided to look at some fun observational questions. The first being; how much does it normal snow before New Years?

In [38]:
avgsnow_before_newyear = {}
for i, file in enumerate(snow_files):
    avgsnow_before_newyear[cities[i]] = file[["Sep", "Oct", "Nov", "Dec"]].sum().sum()/file[["Sep", "Oct", "Nov", "Dec"]].shape[0]

In [39]:
avgsnow_before_newyear

{'Boston': 8.880158730158731,
 'Hartford': 10.665178571428571,
 'Milton': 13.7,
 'Providence': 7.480530973451325,
 'Worcester': 13.457600000000005}

Worcester and Milton lead the way, and that will end up being a trend through the rest of these results.

In [40]:
curdoc().clear()

In [41]:
x = list(avgsnow_before_newyear.keys())
y = list(avgsnow_before_newyear.values())
p = figure(x_range=x, plot_height=350, y_range=(0,15), title="Average Snowfall before New Year",
           toolbar_location=None, tools="save")
c1 = random.choice(list(Category20c.keys()))
c2 = random.choice(range(0,len(Category20c[c1])))
p.vbar(x=x, top=y, width=0.75, color=Category20c[c1][c2])
p.xgrid.grid_line_color = None
p.xaxis[0].axis_label = 'City'
p.yaxis[0].axis_label = 'Avg Snowfall (in)'
save(p, 'html_charts/SnowBeforeNY.html', CDN, 'Average Snowfall Before New Year')

'/home/scott/ds/snowfall/html_charts/SnowBeforeNY.html'

In [48]:
from IPython.display import IFrame
IFrame('html_charts/SnowBeforeNY.html', width=650, height=400)

Next, let's look at some the average and total snowfall for each month.

In [49]:
total_by_month_BOS = {}
total_by_month_HAR = {}
total_by_month_MIL = {}
total_by_month_PVD = {}
total_by_month_WOO = {}
avg_by_month_BOS = {}
avg_by_month_HAR = {}
avg_by_month_MIL = {}
avg_by_month_PVD = {}
avg_by_month_WOO = {}

In [50]:
monthly_snowfall_total = {"Monthly Snow Totals (Boston)":total_by_month_BOS, "Monthly Snow Totals (Hartford)":total_by_month_HAR, 
                          "Monthly Snow Totals (Milton)":total_by_month_MIL, "Monthly Snow Totals (Providence)": total_by_month_PVD, 
                          "Monthly Snow Totals (Worcester)": total_by_month_WOO}
monthly_snowfall_avg = {"Average Monthly Snowfall (Boston)":avg_by_month_BOS, "Average Monthly Snowfall (Hartford)":avg_by_month_HAR,
                       "Average Monthly Snowfall (Milton)":avg_by_month_MIL, "Average Monthly Snowfall (Providence)":avg_by_month_PVD,
                       "Average Monthly Snowfall (Worcester)":avg_by_month_WOO}

In [51]:
i=0
for name in monthly_snowfall_total:
    file =  snow_files[i]
    for month in file.columns[1:]:
        monthly_snowfall_total[name][month] = round(file[month].sum(),2)
    i += 1

In [52]:
i=0
for name in monthly_snowfall_avg:
    file =  snow_files[i]
    for month in file.columns[1:]:
        monthly_snowfall_avg[name][month] = round((file[month].sum()/len(file[month])),2)
    i += 1

In [53]:
total_snow_plots = {"p10":None, "p11":None, "p12":None, "p13":None, "p14":None}
i = 10

for data in monthly_snowfall_total:
    x = list(monthly_snowfall_total[data].keys())
    y = list(monthly_snowfall_total[data].values())
    x = x[:5] + x[9:]
    y = y[:5] + y[9:]
    p = figure(x_range=x, plot_height=250, y_range=(0,2100), title=data,
           toolbar_location=None, tools="save")
    c1 = random.choice(list(Category20c.keys()))
    c2 = random.choice(range(0,len(Category20c[c1])))
    p.vbar(x=x, top=y, width=0.75, color=Category20c[c1][c2])
    p.xgrid.grid_line_color = None
    p.xaxis[0].axis_label = 'Month'
    p.yaxis[0].axis_label = 'Total Snow (in)'
    total_snow_plots["p"+str(i)] = p
    i += 1

In [54]:
#show(gridplot([total_snow_plots['p10'], total_snow_plots['p11'], total_snow_plots['p12'], total_snow_plots['p13'], total_snow_plots['p14']], ncols=2, plot_width=400, plot_height=300, toolbar_location="left"))
plots3 = gridplot([total_snow_plots['p10'], total_snow_plots['p11'], total_snow_plots['p12'], total_snow_plots['p13'], total_snow_plots['p14']], ncols=2, plot_width=400, plot_height=300, toolbar_location="left")
save(plots3, 'html_charts/TotalSnowfall.html', CDN, 'Total Snowfall')

'/home/scott/ds/snowfall/html_charts/TotalSnowfall.html'

In [59]:
from IPython.display import IFrame
IFrame('html_charts/TotalSnowfall.html', width=900, height=900)

In [60]:
avg_snow_plots = {"p15":None, "p16":None, "17":None, "p18":None, "p19":None}
i = 15

for data in monthly_snowfall_avg:
    x = list(monthly_snowfall_avg[data].keys())
    y = list(monthly_snowfall_avg[data].values())
    x = x[:5] + x[9:]
    y = y[:5] + y[9:]
    p = figure(x_range=x, plot_height=250, y_range=(0,18), title=data,
           toolbar_location=None, tools="save")
    c1 = random.choice(list(Category20c.keys()))
    c2 = random.choice(range(0,len(Category20c[c1])))
    p.vbar(x=x, top=y, width=0.75, color=Category20c[c1][c2])
    p.xgrid.grid_line_color = None
    p.xaxis[0].axis_label = 'Month'
    p.yaxis[0].axis_label = 'Average Snow (in)'
    avg_snow_plots["p"+str(i)] = p
    i += 1

In [61]:
#show(gridplot([avg_snow_plots['p15'], avg_snow_plots['p16'], avg_snow_plots['p17'], avg_snow_plots['p18'], avg_snow_plots['p19']], ncols=2, plot_width=400, plot_height=300, toolbar_location="left"))
plots4 = gridplot([avg_snow_plots['p15'], avg_snow_plots['p16'], avg_snow_plots['p17'], avg_snow_plots['p18'], avg_snow_plots['p19']], ncols=2, plot_width=400, plot_height=300, toolbar_location="left")
save(plots4, 'html_charts/AvgSnowfall.html', CDN, 'Average Snowfall')

'/home/scott/ds/snowfall/html_charts/AvgSnowfall.html'

In [62]:
from IPython.display import IFrame
IFrame('html_charts/AvgSnowfall.html', width=900, height=900)

In the charts it doesn't look like much, but both Milton and Worcester have a larger Octber snowfall total than one might expect looking at their neighbors. A large amount of this total is due to a storm from 2011. At the time city officials even recommended that Trick-or-Treating be done on the Thursday instead of Halloween, the Monday, to give crews a chance to clean up.

May is also a little weird due to a storm in Worcester and Milton in 1977, it dropped 12.7 inches.

The average amount of snowfall also feels lower than I would imagine, but I guess you tend to remember the terrible months and forget the mild ones.

Next I wanted to look at the top 20 snowiest months. Living in Providence and being from Rhode Island, I decided to concentrate on just that data. It wouldn't be too difficult to look at the other data sets, but I already feel like I should move on from this data. 

In [63]:
month_year_snow = {}
#create list of top 20, and if a value in spot is larger, replace [-1] index with it, resort list, and continue?
pvd_snow.max()

Year    2016.0
Jan       36.7
Feb       31.8
Mar       31.6
Apr        9.6
May        7.0
Jun        0.0
Jul        0.0
Aug        0.0
Sep        0.0
Oct        2.3
Nov       10.2
Dec       26.7
dtype: float64

In [64]:
cols = list(pvd_snow.columns[:4]) + list(pvd_snow.columns[11:])
max_snow_pvd_df = pvd_snow.loc[:, cols]
max_snow_pvd_df.fillna(0, inplace=True)

In [65]:
max_snow_pvd_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,103,104,105,106,107,108,109,110,111,112
Year,1904.0,1905.0,1906.0,1907.0,1908.0,1909.0,1910.0,1911.0,1912.0,1913.0,...,2007.0,2008.0,2009.0,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0
Jan,0.0,13.6,8.4,15.3,7.4,10.4,15.7,0.4,22.9,1.6,...,1.2,2.7,14.9,4.3,30.7,10.7,6.5,15.0,25.6,7.3
Feb,0.0,6.1,3.8,26.2,9.8,3.2,10.6,14.0,0.6,5.0,...,7.3,6.2,3.9,4.9,8.5,4.6,24.3,20.0,31.8,11.5
Mar,0.0,1.2,16.9,9.4,6.0,3.5,0.5,4.6,9.5,0.2,...,5.8,1.2,11.6,0.6,3.7,0.2,4.9,1.2,16.3,2.6
Nov,2.5,0.0,1.9,0.0,0.0,0.0,0.8,0.0,2.2,0.4,...,0.0,0.0,0.0,0.1,0.0,1.3,0.1,1.6,0.0,0.0
Dec,17.4,3.6,9.0,10.9,2.1,10.3,8.5,3.2,7.0,0.2,...,14.4,20.6,21.7,10.2,0.0,9.3,7.0,0.9,0.6,4.4


In [66]:
max_snow_pvd_df.set_index('Year', inplace=True)

In [67]:
master = pd.DataFrame(max_snow_pvd_df['Jan'].nlargest(20))
master.columns = ['Snowfall']
master['Month'] = 'Jan'

In [68]:
master.head()

Unnamed: 0_level_0,Snowfall,Month
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2005,36.7,Jan
1948,31.7,Jan
2011,30.7,Jan
1965,28.7,Jan
2015,25.6,Jan


In [69]:
for month in ['Feb', 'Mar', 'Nov', 'Dec']:
    temp = pd.DataFrame(max_snow_pvd_df[month].nlargest(20))
    temp.columns = ['Snowfall']
    temp['Month'] = month
    master = master.append(temp)

In [70]:
snowiest_months = master.nlargest(20, 'Snowfall')

In [71]:
snowiest_months

Unnamed: 0_level_0,Snowfall,Month
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2005,36.7,Jan
2015,31.8,Feb
1948,31.7,Jan
1956,31.6,Mar
1962,30.9,Feb
2011,30.7,Jan
1965,28.7,Jan
1978,28.6,Feb
1969,26.7,Feb
1945,26.7,Dec


In [72]:
snowiest_months.index.value_counts()

2015    2
1945    2
1907    1
1956    1
1994    1
1962    1
1965    1
1967    1
1969    1
2005    1
2013    1
2003    1
1943    1
1935    1
1978    1
2011    1
1948    1
1923    1
Name: Year, dtype: int64

In [73]:
snowiest_months.Month.value_counts()

Feb    9
Jan    8
Mar    2
Dec    1
Name: Month, dtype: int64

Two year appear in the dataframe twice, 1945's December and February and 2015's January and February. Feburary has 9 of the top 20 snowiest months with January right behind, together they make up 17 out of the 20 (85%) snowiest months.

I realized the easiest way to look at the average snowfall in a season would be to use the original csv files that run from July of one year to Jun of the next.

In [74]:
boston_seasonal_snow = pd.read_csv('snowfall/old_snowfall/boston_snowfall.csv', index_col = 0)
hartford_seasonal_snow = pd.read_csv('snowfall/old_snowfall/hartford_snowfall.csv', index_col = 0)
milton_seasonal_snow = pd.read_csv('snowfall/old_snowfall/milton_snowfall.csv', index_col = 0)
pvd_seasonal_snow = pd.read_csv('snowfall/old_snowfall/pvd_snowfall.csv', index_col = 0)
worcester_seasonal_snow = pd.read_csv('snowfall/old_snowfall/worcester_snowfall.csv', index_col = 0)

In [75]:
old_snow_files = [boston_seasonal_snow,hartford_seasonal_snow, milton_seasonal_snow, pvd_seasonal_snow, worcester_seasonal_snow]

In [76]:
boston_seasonal_snow.head()

Unnamed: 0_level_0,Jul,Aug,Sep,Oct,Nov,Dec,Jan,Feb,Mar,Apr,May,Jun,Season
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1890-1891,M,M,M,M,M,M,14.8,11.8,16,0.0,0,0,M
1891-1892,0,0,0,T,0,T,12.5,14.0,20.3,0.0,0,0,46.8
1892-1893,0,0,0,0,1.2,2.5,14.6,35.3,4.5,7.9,0,0,66
1893-1894,0,0,0,0,0.4,18.5,15.0,21.6,T,8.5,0,0,64
1894-1895,0,0,0,0,6.4,13.5,13.9,8.8,3.8,0.5,0,0,46.9


In [77]:
for file in old_snow_files:
    file.replace(to_replace = ["T", "M"], value=[0,np.nan], inplace = True)
    for column in file.columns:
        file[column] = pd.to_numeric(file[column])

In [78]:
avg_season_snowfall = {}
i = 0
for file in old_snow_files:
    avg_season_snowfall[cities[i]] = file["Season"].mean()
    i += 1

In [79]:
x = list(avg_season_snowfall.keys())
y = list(avg_season_snowfall.values())
p = figure(x_range=x, plot_height=350, y_range=(0,70), title="Average Seasonal Snowfall",
           toolbar_location=None, tools="save")
c1 = random.choice(list(Category20c.keys()))
c2 = random.choice(range(0,len(Category20c[c1])))
p.vbar(x=x, top=y, width=0.75, color=Category20c[c1][c2])
p.xgrid.grid_line_color = None
p.xaxis[0].axis_label = 'City'
p.yaxis[0].axis_label = 'Avg Snowfall (in)'
save(p, 'html_charts/SeasonalSnow.html', CDN, 'Average Seasonal Snowfall')

'/home/scott/ds/snowfall/html_charts/SeasonalSnow.html'

In [81]:
from IPython.display import IFrame
IFrame('html_charts/SeasonalSnow.html', width=650, height=400)

No surprise here as Milton and Worcester lead the way for the most snow on average. Below we can see the max snow in a season for Providence was 76.2 inches in the 2014-2015 season, this checks out when we look at our top 20 snowiest months for the city. The lowest total ever was the 1979-1980 season, which happens to come the season after the Blizzard of '78.

In [82]:
pvd_seasonal_snow[pvd_seasonal_snow["Season"] == pvd_seasonal_snow["Season"].max()]

Unnamed: 0_level_0,Jul,Aug,Sep,Oct,Nov,Dec,Jan,Feb,Mar,Apr,May,Jun,Season
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2014-2015,0.0,0.0,0.0,0.0,1.6,0.9,25.6,31.8,16.3,0.0,0.0,0.0,76.2


In [83]:
pvd_seasonal_snow[pvd_seasonal_snow["Season"] == pvd_seasonal_snow["Season"].min()]

Unnamed: 0_level_0,Jul,Aug,Sep,Oct,Nov,Dec,Jan,Feb,Mar,Apr,May,Jun,Season
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1979-1980,0.0,0.0,0.0,0.0,0.0,0.0,0.6,3.8,5.3,0.0,0.0,0.0,9.7
