Climate analysis: 
    Vacation dates May 25-June 10


In [1]:
# Python SQL toolkit and Object Relational Mapper
import pandas as pd
import matplotlib
matplotlib.use('nbagg')
from matplotlib import style
style.use('seaborn')
import matplotlib.pyplot as plt


import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

In [2]:
# Create engine using the `demographics.sqlite` database file

engine = create_engine("sqlite:///hawaii.sqlite")


In [3]:
# Declare a Base using `automap_base()`

Base = automap_base()


In [4]:
# Use the Base class to reflect the database tables

Base.prepare(engine, reflect=True)


In [5]:
# Print all of the classes mapped to the Base

Base.classes.keys()


['hawaii_measurement', 'hawaii_station']

In [6]:
# Assign the demographics class to a variable called `Demographics`

Measurements = Base.classes.hawaii_measurement
Stations = Base.classes.hawaii_station


In [7]:
# Create a session

session = Session(engine)


In [8]:
# Use the session to query Demographics table and display the first 5 locations

for row in session.query(Measurements).limit(5).all():
    print(row)


<sqlalchemy.ext.automap.hawaii_measurement object at 0x1182a1208>
<sqlalchemy.ext.automap.hawaii_measurement object at 0x1182a1278>
<sqlalchemy.ext.automap.hawaii_measurement object at 0x1182a12e8>
<sqlalchemy.ext.automap.hawaii_measurement object at 0x1182a1358>
<sqlalchemy.ext.automap.hawaii_measurement object at 0x1182a13c8>


In [9]:

results_measurement = session.query(Measurements.station,Measurements.date,Measurements.prcp, Measurements.tobs).all()

results_measurement

[('USC00519397', '2010-01-01', 0.08, 65),
 ('USC00519397', '2010-01-02', 0.0, 63),
 ('USC00519397', '2010-01-03', 0.0, 74),
 ('USC00519397', '2010-01-04', 0.0, 76),
 ('USC00519397', '2010-01-07', 0.06, 70),
 ('USC00519397', '2010-01-08', 0.0, 64),
 ('USC00519397', '2010-01-09', 0.0, 68),
 ('USC00519397', '2010-01-10', 0.0, 73),
 ('USC00519397', '2010-01-11', 0.01, 64),
 ('USC00519397', '2010-01-12', 0.0, 61),
 ('USC00519397', '2010-01-14', 0.0, 66),
 ('USC00519397', '2010-01-15', 0.0, 65),
 ('USC00519397', '2010-01-16', 0.0, 68),
 ('USC00519397', '2010-01-17', 0.0, 64),
 ('USC00519397', '2010-01-18', 0.0, 72),
 ('USC00519397', '2010-01-19', 0.0, 66),
 ('USC00519397', '2010-01-20', 0.0, 66),
 ('USC00519397', '2010-01-21', 0.0, 69),
 ('USC00519397', '2010-01-22', 0.0, 67),
 ('USC00519397', '2010-01-23', 0.0, 67),
 ('USC00519397', '2010-01-24', 0.01, 71),
 ('USC00519397', '2010-01-25', 0.0, 67),
 ('USC00519397', '2010-01-26', 0.04, 76),
 ('USC00519397', '2010-01-27', 0.12, 68),
 ('USC0051

In [10]:

results_station = session.query(Stations.station,Stations.name,Stations.latitude, Stations.longitude,Stations.elevation).all()
results_station

[('USC00519397', 'WAIKIKI 717.2, HI US', 21.2716, -157.8168, 3.0),
 ('USC00513117', 'KANEOHE 838.1, HI US', 21.4234, -157.8015, 14.6),
 ('USC00514830',
  'KUALOA RANCH HEADQUARTERS 886.9, HI US',
  21.5213,
  -157.8374,
  7.0),
 ('USC00517948', 'PEARL CITY, HI US', 21.3934, -157.9751, 11.9),
 ('USC00518838', 'UPPER WAHIAWA 874.3, HI US', 21.4992, -158.0111, 306.6),
 ('USC00519523',
  'WAIMANALO EXPERIMENTAL FARM, HI US',
  21.33556,
  -157.71139,
  19.5),
 ('USC00519281', 'WAIHEE 837.5, HI US', 21.45167, -157.84888999999998, 32.9),
 ('USC00511918', 'HONOLULU OBSERVATORY 702.2, HI US', 21.3152, -157.9992, 0.9),
 ('USC00516128', 'MANOA LYON ARBO 785.2, HI US', 21.3331, -157.8025, 152.4)]

In [11]:

# Load the results into a pandas dataframe. 

hawaii_measurement_df = pd.DataFrame(results_measurement[:], columns=['Station', 'Date', 'Percipitation','Temperature'])
hawaii_measurement_df['Date'] =  pd.to_datetime(hawaii_measurement_df['Date'])
hawaii_measurement_df.set_index('Station', inplace=True)
hawaii_measurement_df


Unnamed: 0_level_0,Date,Percipitation,Temperature
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
USC00519397,2010-01-01,0.08,65
USC00519397,2010-01-02,0.00,63
USC00519397,2010-01-03,0.00,74
USC00519397,2010-01-04,0.00,76
USC00519397,2010-01-07,0.06,70
USC00519397,2010-01-08,0.00,64
USC00519397,2010-01-09,0.00,68
USC00519397,2010-01-10,0.00,73
USC00519397,2010-01-11,0.01,64
USC00519397,2010-01-12,0.00,61


In [12]:
hawaii_station_df = pd.DataFrame(results_station[:], columns=['Station', 'StationName', 'Latitude','Longitute','Elevation'])
hawaii_station_df.set_index('Station', inplace=True, )
hawaii_station_df 

Unnamed: 0_level_0,StationName,Latitude,Longitute,Elevation
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


### Percipitation Analysis

In [13]:
#Filter for Date and percipitation column and select rows from 2017

percipitation_2017 = hawaii_measurement_df[['Date','Percipitation']]


In [14]:
percipitation_2017_df = pd.DataFrame(percipitation_2017[(percipitation_2017['Date'] >= '2016-08-01')\
                                & (percipitation_2017['Date'] <= '2017-08-23')].set_index('Date'))
percipitation_2017_df



Unnamed: 0_level_0,Percipitation
Date,Unnamed: 1_level_1
2016-08-01,0.08
2016-08-02,0.05
2016-08-03,0.00
2016-08-04,0.04
2016-08-05,0.01
2016-08-06,0.00
2016-08-07,0.39
2016-08-08,0.02
2016-08-09,0.00
2016-08-10,0.00


In [15]:
percipitation_2017_df.plot()
plt.show()

<IPython.core.display.Javascript object>

In [16]:
percipitation_2017_df.describe()

Unnamed: 0,Percipitation
count,2148.0
mean,0.182896
std,0.467339
min,0.0
25%,0.0
50%,0.02
75%,0.14
max,6.7


### Station Analysis


In [17]:
#query to calculate the total number of stations.


station = pd.DataFrame(hawaii_measurement_df.groupby('Station').count()).rename(columns={'Date':'Count'})

station_count = station[['Count']]
station_count 


Unnamed: 0_level_0,Count
Station,Unnamed: 1_level_1
USC00511918,1932
USC00513117,2696
USC00514830,1937
USC00516128,2484
USC00517948,683
USC00518838,342
USC00519281,2772
USC00519397,2685
USC00519523,2572


In [18]:
#query to find the most active stations.
station_count.loc[(station_count['Count'] >= 2000)]

Unnamed: 0_level_0,Count
Station,Unnamed: 1_level_1
USC00513117,2696
USC00516128,2484
USC00519281,2772
USC00519397,2685
USC00519523,2572


In [19]:
#List of stations and observation counts in descending order
station_count.sort_values(['Count'],ascending=False, inplace=False, kind='quicksort', na_position='last')

Unnamed: 0_level_0,Count
Station,Unnamed: 1_level_1
USC00519281,2772
USC00513117,2696
USC00519397,2685
USC00519523,2572
USC00516128,2484
USC00514830,1937
USC00511918,1932
USC00517948,683
USC00518838,342


In [20]:
#Which station has the highest number of observations?
station_count.iloc[:,0].idxmax()


'USC00519281'

In [21]:
#query to retrieve the last 12 months of temperature observation data (tobs).
#Filter for Date and temperature column and select rows from 2017
temperature_2017= hawaii_measurement_df[['Date','Temperature']]

temperature_2017_df = pd.DataFrame(temperature_2017[(temperature_2017['Date'] >= '2016-08-01')\
                                & (temperature_2017['Date'] <= '2017-08-23')]).set_index('Date')
temperature_2017_df.head()

Unnamed: 0_level_0,Temperature
Date,Unnamed: 1_level_1
2016-08-01,77
2016-08-02,78
2016-08-03,80
2016-08-04,80
2016-08-05,80


In [22]:
#Filter by station with the highest number of observations.

station = pd.DataFrame(hawaii_measurement_df.groupby('Station').count()).rename(columns={'Date':'Count'})

station_count = station[['Count']]
station_count 

Unnamed: 0_level_0,Count
Station,Unnamed: 1_level_1
USC00511918,1932
USC00513117,2696
USC00514830,1937
USC00516128,2484
USC00517948,683
USC00518838,342
USC00519281,2772
USC00519397,2685
USC00519523,2572


In [23]:
#Plot the results as a histogram with bins=12.
station_count["Count"].hist(bins=12, color="darkblue")
plt.title("Histogram: Observation count by station")
plt.savefig("Histogram Observation count by station")


In [24]:
plt.show()

<IPython.core.display.Javascript object>

### Temperature Analysis

In [None]:

#Write a function called calc_temps that will accept a start date and end date in 
#the format %Y-%m-%d and return the minimum, average, and maximum temperatures for 
#that range of dates.

def calc_temp():
    start=input("Start Date: (YYYY-mm-dd)  ")
    end =input("End Date:(YYYY-mm-d)  ")
    vacation_df = temperature_2017[(temperature_2017['Date'] >= start) & (temperature_2017['Date']<= end)]
    low_temp = vacation_df.min()
    high_temp = vacation_df.max()
    avg_temp = vacation_df.mean()
    print(f"For the dates between {start} and {end}: ")
    print(f"     Low Temperature was {low_temp}")
    print(f"     High Temperature was {high_temp}")
    print(f"     Average Temperature was {avg_temp}")
  

In [None]:
#Use the calc_temps function to calculate the min, avg, and max temperatures for your trip using the matching dates from the previous year 

calc_temp()

In [None]:
#Plot the min, avg, and max temperature from your previous query as a bar chart.
#Use the average temperature as the bar height.
#Use the peak-to-peak (tmax-tmin) value as the y error bar (yerr).
plt.plot(kind='bar')
plt.show()

### Daily Normals

In [None]:
import datetime

#### Rainfall per weather station by month and day

In [None]:
# Reset index to make groupby easier.
#Average rainfall
reset = hawaii_measurement_df.reset_index()

prc_avg_by_station_mon_day = reset[["Percipitation"]] \
.groupby([ \
     reset["Station"],
    reset["Date"].dt.strftime('%m-%d')
]).mean()
prc_avg_by_station_mon_day.head()

In [None]:
#Min rainfall

prc_min_by_station_mon_day = reset[["Percipitation"]] \
.groupby([ \
    reset["Station"],
    reset["Date"].dt.strftime('%m-%d')
]).min()
prc_min_by_station_mon_day.head()

In [None]:
#Max rainfall
reset = hawaii_measurement_df.reset_index()

prc_max_by_station_mon_day = reset[["Percipitation"]] \
.groupby([ \
    reset["Station"],
    reset["Date"].dt.strftime('%m-%d')
         
]).max()
prc_max_by_station_mon_day.head()

#### Total station rainfall grouped by month and day

In [None]:
#Average rainfall
total_avg_prc_mon_day = hawaii_measurement_df[["Percipitation"]] \
.groupby([ \
   hawaii_measurement_df["Date"].dt.strftime('%m-%d')
]).mean()
total_avg_prc_mon_day.head()

In [None]:
#Min rainfall
total_min_prc_mon_day = hawaii_measurement_df[["Percipitation"]] \
.groupby([ \
    hawaii_measurement_df["Date"].dt.strftime('%m-%d')
]).min()
total_min_prc_mon_day.head()

In [None]:
#Max rainfall
total_max_prc_mon_day = hawaii_measurement_df[["Percipitation"]] \
.groupby([ \
    hawaii_measurement_df["Date"].dt.strftime('%m-%d')
]).max()
total_max_prc_mon_day.head()

#### Temperature avg, min, max grouped by month, day

In [None]:
#Average temperature by month/day over years
avg_temp = hawaii_measurement_df[["Temperature"]] \
.groupby([ \
   hawaii_measurement_df["Date"].dt.strftime('%m-%d') \
]).mean()
avg_temp = avg_temp.reset_index()
avg_temp.head()

In [None]:
#Mmin temperature by month/day over years
min_temp = hawaii_measurement_df[["Temperature"]] \
.groupby([ \
   hawaii_measurement_df["Date"].dt.strftime('%m-%d') \
]).min()
min_temp = min_temp.reset_index()
min_temp.head()

In [None]:
#Max temperature by month/day over years
max_temp = hawaii_measurement_df[["Temperature"]] \
.groupby([ \
    hawaii_measurement_df["Date"].dt.strftime('%m-%d') \
]).max()
max_temp = max_temp.reset_index()
max_temp.head()

Create a function called daily_normals that will calculate the daily normals for a specific date. This date string will be in the format %m-%d. Be sure to use all historic tobs that match that date string.

Create a list of dates for your trip in the format %m-%d. Use the daily_normals function to calculate

the normals for each date string and append the results to a list.

Load the list of daily normals into a Pandas DataFrame and set the index equal to the date.

Use Pandas to plot an area plot (stacked=False) for the daily normals.

In [None]:
def daily_normals(dates):
    
    results = pd.DataFrame(columns = ["Date","avg_temp","min_temp","max_temp"])  #Empty Dataframe
                  
    for date in dates:
        result_avg_temp = int(avg_temp[(avg_temp['Date'] == date)]["Temperature"])
        result_min_temp = int(min_temp[(min_temp['Date'] == date)]["Temperature"])
        result_max_temp = int(max_temp[(max_temp['Date'] == date)]["Temperature"])
        results=results.append({"Date":date, "avg_temp":result_avg_temp, "min_temp": result_min_temp,
                                          "max_temp": result_max_temp,}, ignore_index=True)    
    return results.set_index("Date")


In [None]:
daily_normals(['05-25','05-26','05-27','05-28','05-29','05-30','05-31','06-01','06-02','06-03','06-04','06-05','06-06','06-07','06-08'])

In [None]:
results.plot(kind='area', stacked=False)
plt.show()