# When do cherry blossoms reach peak bloom?

* [DC cherry blossoms](https://www.epa.gov/climate-indicators/cherry-blossoms)
* [cherry blossom data](https://www.epa.gov/sites/production/files/2021-04/cherry-blossoms_fig-1.csv)

In [1]:
import altair as alt
import datetime
import numpy as np
import pandas as pd

alt.themes.enable("quartz")

ThemeRegistry.enable('quartz')

In [2]:
def clean_data():
    URL = "https://www.epa.gov/sites/production/files/2021-04/cherry-blossoms_fig-1.csv"
    df = pd.read_csv(URL, skiprows=6)
    
    # https://stackoverflow.com/questions/2427555/python-question-year-and-day-of-year-to-date
    df = (df.rename(columns = {
            "Year": "year",
            "Yoshino peak bloom date": "peak_bloom",
            "Cherry blossom festival start date": "festival_start",
            "Cherry blossom festival duration": "festival_duration",
        }).astype({
            "year": "string",
            "festival_duration": "Int64",
        })
    )
    
    df = df.assign(
        peak_bloom_date = df.apply(lambda x: pd.to_datetime(x["peak_bloom"], unit="D", 
                                                            origin=x.year), axis=1),
        festival_start_date = df.apply(lambda x: pd.to_datetime(x["festival_start"], unit="D", 
                                                                origin=x.year), axis=1),
    )
    
    df = df.assign(
        festival_end_date = df.apply(lambda x: 
                                     x.festival_start_date + datetime.timedelta(x.festival_duration) 
                                     if x.notnull().all() else np.nan, axis=1),
    )
    
    return df

In [3]:
df = clean_data()

In [4]:
df.tail(10)

Unnamed: 0,year,peak_bloom,festival_start,festival_duration,peak_bloom_date,festival_start_date,festival_end_date
91,2012,80,80.0,38,2012-03-21,2012-03-21,2012-04-28
92,2013,99,79.0,25,2013-04-10,2013-03-21,2013-04-15
93,2014,100,79.0,24,2014-04-11,2014-03-21,2014-04-14
94,2015,100,79.0,23,2015-04-11,2015-03-21,2015-04-13
95,2016,85,80.0,28,2016-03-26,2016-03-21,2016-04-18
96,2017,84,79.0,27,2017-03-26,2017-03-21,2017-04-17
97,2018,95,79.0,26,2018-04-06,2018-03-21,2018-04-16
98,2019,91,79.0,24,2019-04-02,2019-03-21,2019-04-14
99,2020,80,80.0,23,2020-03-21,2020-03-21,2020-04-13
100,2021,87,79.0,22,2021-03-29,2021-03-21,2021-04-12


In [5]:
(alt.Chart(df)
 .mark_line()
 .encode(
     x=alt.X("year:T"),
     y=alt.Y("monthdate(peak_bloom_date):T", title="Date")
 ).properties(title="Cherry Blossom Peak Bloom",
             width=500, height=250)
)

In [6]:
(alt.Chart(df)
 .mark_area(color="lightblue")
 .encode(
     x="year:T",
     y=alt.Y("monthdate(festival_start_date):T", title="Festival Duration"),
     y2="monthdate(festival_end_date):T"
 ).properties(title="Cherry Blossom Festival Duration", width=700)
)

In [7]:
chart1 = (alt.Chart(df)
 .mark_line()
 .encode(
     x=alt.X("year:T"),
     y=alt.Y("monthdate(peak_bloom_date):T", title="Date")
 ).properties(title="Cherry Blossom Peak Bloom",
             width=500, height=250)
)

chart2 = (alt.Chart(df)
 .mark_area(color="#d3d3d3")
 .encode(
     x="year:T",
     y="monthdate(festival_start_date):T",
     y2="monthdate(festival_end_date):T"
 ).properties(title="Cherry Blossom Festival Duration")
)


(chart2 + chart1).properties(title="Cherry Blossom Peak Bloom & Festival")