# Weather Data

In [82]:
#Import libraries for webscraping
import urllib.request
from bs4 import BeautifulSoup
#Import libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#import datetime
from datetime import datetime

In [109]:
#Government of Canada Website
#let's scrape 5 year worth of data. That is from Jan 2017
pagelist=[]
now = datetime.now()

for year in range(2008, 2019):
    for month in range(1, 13):
        if now.year > year or (now.year==year and month<=now.month):
            url='http://climate.weather.gc.ca/climate_data/daily_data_e.html?StationID=51459&timeframe=2&StartYear=1840&EndYear=2018&Day=7&Year='+str(year)+'&Month='+str(month)+'#'
            pagelist.append(url)

In [110]:
def ScrapeWeather(page):
    #Query the website
    f=urllib.request.urlopen(page).read()
    
    #Parse the page using beautiful soup
    soup = BeautifulSoup(f, 'html.parser')
    
    #Find the section that contains the year and the month
    option = soup.find_all('div', class_='col-md-6 col-sm-6 col-xs-6 text-center mrgn-tp-md mrgn-bttm-md')
    option=option[0]
    option=option.find_all('option', selected=True)
    
    #Find the table
    table = soup.find_all('table')
    
    #Find title
    th_all = soup.find_all('th')
    result = []
    for th in th_all:
        result.append(th.find('abbr', text=True))

    title=[]
    for abbr in result:
        if abbr is not None:
            title.append(abbr.string)
            
    #clean up title
    for i in range(len(title)):
        if title[i]=='mm':
            title[i]="Total Rain"
        elif title[i]=='cm':
            title[i]="Total Snow"
    
    #Find data
    table_body = soup.find('tbody')
    tr = table_body.find_all('tr')
    td=[]
    date=[]
    data=[]
    for row in tr:
        d=[]
        for r in row.find_all('td'):
            date.append(r.find('abbr'))
            d.append(r.string)
        data.append(d)
    
    #clean date arrays
    t=[]
    for d in date:
        if d is not None:
            t.append(d.get('title'))
    
    date=[]
    for j in t:
        date.append(datetime.strptime(j, '%B %d, %Y'))

    #clean up data array
    for entry in data:
        if entry == []:
            data.remove(entry)

    for entry in data:
        if len(entry)>11:
            entry.pop(0)

    #There should be as many rows as the number dates
    for rm in range(len(data)-len(date)):
        data.pop(len(data)-1)
        rm -=1
    
    for rm in range(len(title)-len(data[0])):
        title.pop(len(title)-1)
        rm-=1
    
    #Construct dataframe for scraped data
    df = pd.DataFrame(data=data,index=date,columns=title)
    df=df.apply(pd.to_numeric, errors="coerce")
    
    #Aggregate average
    Mmean=df["Mean Temp"].mean()
    Mmax=max(df['Max Temp'])
    Mmin=min(df['Min Temp'])
    Mindex=df.index[0]
    
    md={'Mean Temp': Mmean, 'Delta':Mmax-Mmin}
    mdf=pd.DataFrame(md, index=[Mindex])
    return(mdf)

In [111]:
frames=[]
for page in pagelist:
    frames.append(ScrapeWeather(page))

result = pd.concat(frames)

print(result)

            Delta  Mean Temp
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
2013-06-11    NaN  17.937500
...           ...        ...
2015-08-01   21.7  20.716129
2015-09-01   26.7  19.693333
2015-10-01   2

In [112]:
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [115]:
trace1 = go.Bar(
    x=result.index,
    y=result['Delta'],
    name='Delta',
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(
            color='rgb(8,48,107)',
            width=1.5,
        )
    ),
    opacity=0.6
)

trace2 = go.Scatter(
    x = result.index,
    y = result['Mean Temp'],
    name='Mean Temp',
    line = dict(
        color = ('rgb(255, 175, 102)'),
        width = 4,
        dash = 'dot')
)
data = [trace1, trace2]
layout = go.Layout(
    title = 'Toronto Weather Data'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='MonthlyDeltaWeather')