In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import numpy as np

In [None]:
# Notebook-wide pandas display settings: allow very wide frames, and truncate
# long ones (more than 100 rows) down to a 10-row preview.
pd.options.display.max_columns = 700
pd.options.display.max_rows = 100
pd.options.display.min_rows = 10
pd.options.display.expand_frame_repr = True

In [None]:
# Default size (width, height in inches) for every figure created below.
plt.rcParams.update({'figure.figsize': (16.0, 10.0)})
# Alternative global themes, currently disabled:
# plt.style.use('ggplot')
# sns.set_style("white")

# Load and Clean Data

In [None]:
# Local use only: read the workbook stored under data/.
# Note that the read_csv cell further down reassigns `pdx_19` from the URL.
p = Path('data/Portland_dailyclimatedata1940-2019.xlsx')
pdx_19 = pd.read_excel(
    io=p,
    sheet_name='Portland_dailyclimatedata1940-2',
)

- Data from [NOAA National Weather Service Forecast Office: Portland, OR][1]

  [1]: https://w2.weather.gov/climate/local_data.php?wfo=pqr

In [None]:
# Location of the Portland daily climate CSV on weather.gov (office code `pqr`);
# it is downloaded by the read_csv cell that follows.
url = 'http://www.weather.gov/source/pqr/climate/webdata/Portland_dailyclimatedata.csv'

In [None]:
# Skip the first six lines of the file (indices 0-5) before parsing the table.
pdx_19 = pd.read_csv(url, skiprows=range(6))
pdx_19

In [None]:
# The per-row summary column is not a daily value, so it is removed before reshaping.
pdx_19 = pdx_19.drop(columns='AVG or Total')

In [None]:
# wide_to_long needs a shared stub name: keep the first three (identifier)
# columns as they are and rename every remaining day column to v_<day>.
pdx_19.columns = [
    col if pos < 3 else f'v_{col}'
    for pos, col in enumerate(pdx_19.columns)
]

In [None]:
# Give the unnamed record-type column a real name, then keep only the
# TX (max temp) and TN (min temp) rows.
pdx_19 = pdx_19.rename(columns={'Unnamed: 2': 'TYPE'})
pdx_19 = pdx_19[pdx_19['TYPE'].isin(['TX', 'TN'])]

In [None]:
# Tidy reshape: one row per (YR, MO, TYPE, day), with the reading in column `v`.
pdx = (
    pd.wide_to_long(
        pdx_19,
        stubnames='v',
        i=['YR', 'MO', 'TYPE'],
        j='day',
        sep='_',
    )
    .reset_index()
)
pdx

In [None]:
# Swap the terse record codes for readable labels.
type_labels = {'TX': 'MAX', 'TN': 'MIN'}
pdx['TYPE'] = pdx['TYPE'].map(type_labels)

In [None]:
# A '-' marks a day that does not exist in that month; keep everything else.
# The copy makes `pdx` an independent frame rather than a slice.
pdx = pdx.loc[pdx['v'] != '-'].copy()

In [None]:
# pd.to_datetime assembles dates from columns named year/month/day,
# so rename YR and MO accordingly.
pdx = pdx.rename(columns={'YR': 'year', 'MO': 'month'})

In [None]:
# Assemble a proper datetime column from the year / month / day parts.
pdx['date'] = pd.to_datetime(pdx.loc[:, ['year', 'month', 'day']])

In [None]:
# The separate parts are redundant now that `date` exists.
pdx = pdx.drop(columns=['year', 'month', 'day'])

In [None]:
# 'M' and 'T' are non-numeric codes in the value column (presumably the NWS
# markers for "missing" and "trace" -- confirm against the data source).
# Turn them into NaN so the column can be cast to float afterwards.
# The result is assigned back explicitly: calling replace(..., inplace=True) on
# the `pdx.v` accessor is chained assignment, which newer pandas warns about
# and which leaves `pdx` unchanged under Copy-on-Write.
pdx['v'] = pdx['v'].replace({'M': np.nan, 'T': np.nan})

In [None]:
# Readings arrive as strings; make them numeric.
pdx['v'] = pdx['v'].astype(float)

In [None]:
# Bucket every reading into a labelled temperature range.
# The bins are left-closed ([65, 75), [75, 85), ...) so the edges match the
# labels exactly, including for non-integer readings. The outer edges are
# open-ended so that no reading falls outside the bins: `pdx` still contains
# MIN rows, and the previous (0, 64] first bin turned any value <= 0 (and
# anything above 200) into a NaN range.
pdx['range'] = pd.cut(
    pdx.v,
    bins=[-np.inf, 65, 75, 85, 95, np.inf],
    right=False,
    labels=['< 65', '65 - 74', '75 - 84', '85 - 94', '>= 95'],
)

In [None]:
# Sanity check of the cleaned frame: first and last rows, rendered one after the other.
display(pdx.head(), pdx.tail())

# Create max temperature dataframe

In [None]:
# Keep only the daily-high rows and renumber them from zero.
is_daily_high = pdx['TYPE'] == 'MAX'
pdx_max = pdx.loc[is_daily_high].reset_index(drop=True)
pdx_max

# Create January to May (inclusive) dataframe

In [None]:
# Daily highs whose month is January (1) through May (5).
month_of_reading = pdx_max['date'].dt.month
pdx_max_jan_may = pdx_max[(month_of_reading < 6) & (month_of_reading >= 1)]

## Groupby `year` and `range`

In [None]:
# Number of Jan-May days per (calendar year, temperature range).
# `range` is categorical, so observed=False is passed explicitly: it keeps a
# zero-count row for every range in every year (the plots expect all five
# bars) and does not depend on the pandas default, which is being changed.
# After reset_index(level=0) the frame is indexed by `range`, and the column
# named `date` holds the *year* (it inherits its name from the grouping key).
pdx_max_g = (
    pdx_max_jan_may
    .groupby([pdx_max_jan_may.date.dt.year, 'range'], observed=False)['v']
    .agg('count')
    .reset_index(level=0)
)

In [None]:
# Peek at both ends of the grouped counts.
display(pdx_max_g.head(), pdx_max_g.tail())

## Plot the `groupby` dataframe

In [None]:
# Each entry is the exclusive end year of one decade (1950 -> the 1940s, ...).
years = list(range(1950, 2021, 10))

# One figure per decade: bars grouped by temperature range, one hue per year.
with sns.axes_style("darkgrid"):
    for decade_end in years:
        decade_start = decade_end - 10
        fig, ax = plt.subplots()
        in_decade = (pdx_max_g.date >= decade_start) & (pdx_max_g.date < decade_end)
        data = pdx_max_g[in_decade]
        sns.barplot(x=data.index, y=data.v, hue=data.date, ax=ax)

        # Label every non-empty bar with its day count, just above the bar.
        # The loop variable is `bar` rather than `p` so that it does not
        # overwrite the global `p` (the Excel Path) set in the load cell.
        for bar in ax.patches:
            height = bar.get_height()
            if height > 0:
                ax.annotate(
                    format(height, '.0f'),
                    (bar.get_x() + bar.get_width() / 2., height),
                    ha='center',
                    va='center',
                    xytext=(0, 10),
                    textcoords='offset points',
                )

        ax.set_ylim(0, 150)
        ax.set_ylabel('Days')
        ax.set_xlabel('High Temperatures °F')
        ax.set_title(f"Portland, OR\nJan - May High Temperature Days: {decade_start}'s")

# Create June dataframe

In [None]:
# Daily highs recorded in June. These get their own name so that
# `pdx_max_june` only ever refers to the grouped counts.
pdx_max_june_days = pdx_max[pdx_max.date.dt.month == 6]

# Number of June days per (calendar year, temperature range).
# observed=False keeps a zero-count row for every categorical range and does
# not depend on the pandas default, which is being changed.
# The resulting frame is indexed by `range`; its `date` column holds the year.
pdx_max_june = (
    pdx_max_june_days
    .groupby([pdx_max_june_days.date.dt.year, 'range'], observed=False)['v']
    .agg('count')
    .reset_index(level=0)
)
pdx_max_june

## Plot June

In [None]:
# Each entry is the exclusive end year of one decade (1950 -> the 1940s, ...).
years = list(range(1950, 2021, 10))

# One figure per decade: June bars grouped by temperature range, one hue per year.
with sns.axes_style("darkgrid"):
    for decade_end in years:
        decade_start = decade_end - 10
        fig, ax = plt.subplots()
        in_decade = (pdx_max_june.date >= decade_start) & (pdx_max_june.date < decade_end)
        data = pdx_max_june[in_decade]
        sns.barplot(x=data.index, y=data.v, hue=data.date, ax=ax)

        # Label every non-empty bar with its day count, just above the bar.
        # The loop variable is `bar` rather than `p` so that it does not
        # overwrite the global `p` (the Excel Path) set in the load cell.
        for bar in ax.patches:
            height = bar.get_height()
            if height > 0:
                ax.annotate(
                    format(height, '.0f'),
                    (bar.get_x() + bar.get_width() / 2., height),
                    ha='center',
                    va='center',
                    xytext=(0, 10),
                    textcoords='offset points',
                )

        ax.set_ylim(0, 25)
        ax.set_ylabel('Days')
        ax.set_xlabel('High Temperatures °F')
        ax.set_title(f"Portland, OR\nJune High Temperature Days: {decade_start}'s")