# Bonus: Temperature Analysis I

In [6]:
import pandas as pd
from datetime import datetime as dt
from collections import namedtuple
import matplotlib

In [7]:
# "tobs" is "temperature observations"
df = pd.read_csv('Resources/hawaii_measurements.csv')
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [8]:
# Convert the date column format from string to datetime
# df['DataFrame Column'] = pd.to_datetime(df['DataFrame Column'], format=specify your format)
df['date'] = pd.to_datetime(df['date'])

In [9]:
# Set the date column as the DataFrame index
date_column_df = df.set_index('date')
date_column_df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [10]:
# Drop the date column
# not sure why this is needed if dates will be needed for the next part

### Compare June and December data across all years 

In [11]:
from scipy import stats

In [12]:
# Filter data for desired months
df['month'] = pd.DatetimeIndex(df['date']).month
df.head()

Unnamed: 0,station,date,prcp,tobs,month
0,USC00519397,2010-01-01,0.08,65,1
1,USC00519397,2010-01-02,0.0,63,1
2,USC00519397,2010-01-03,0.0,74,1
3,USC00519397,2010-01-04,0.0,76,1
4,USC00519397,2010-01-06,,73,1


In [13]:
# Drop NaN 
df_drop_na = df.dropna()
df_drop_na.head()

Unnamed: 0,station,date,prcp,tobs,month
0,USC00519397,2010-01-01,0.08,65,1
1,USC00519397,2010-01-02,0.0,63,1
2,USC00519397,2010-01-03,0.0,74,1
3,USC00519397,2010-01-04,0.0,76,1
5,USC00519397,2010-01-07,0.06,70,1


In [14]:
# Reset index to date
add_month_df = df_drop_na.set_index('date')
add_month_df.head()

Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,0.08,65,1
2010-01-02,USC00519397,0.0,63,1
2010-01-03,USC00519397,0.0,74,1
2010-01-04,USC00519397,0.0,76,1
2010-01-07,USC00519397,0.06,70,1


In [15]:
# Identify the average temperature for June
june_temp_avg = add_month_df[add_month_df["month"] == 6]
june_temp_avg['tobs'].mean()

74.88754764930114

In [16]:
june_temp_avg = pd.DataFrame(add_month_df[add_month_df["month"] == 6])
june_temp_avg['tobs'].head()

date
2010-06-01    78
2010-06-02    76
2010-06-03    78
2010-06-04    76
2010-06-05    77
Name: tobs, dtype: int64

In [17]:
# Identify the average temperature for December
dec_temp_avg = add_month_df[add_month_df["month"] == 12]
dec_temp_avg['tobs'].mean()

70.93024911032029

In [18]:
dec_temp_avg = pd.DataFrame(add_month_df[add_month_df["month"] == 12])
dec_temp_avg['tobs'].head()

date
2010-12-01    76
2010-12-03    74
2010-12-04    74
2010-12-06    64
2010-12-07    64
Name: tobs, dtype: int64

In [19]:
# Create collections of temperature data
collections_df = add_month_df.describe()
collections_df

Unnamed: 0,prcp,tobs,month
count,18103.0,18103.0,18103.0
mean,0.160644,72.994863,6.34602
std,0.468746,4.512107,3.420051
min,0.0,53.0,1.0
25%,0.0,70.0,3.0
50%,0.01,73.0,6.0
75%,0.11,76.0,9.0
max,11.53,87.0,12.0


In [20]:
# Run paired t-test
stats.ttest_ind(june_temp_avg.tobs, dec_temp_avg.tobs)

Ttest_indResult(statistic=30.865349991562194, pvalue=9.8415346259008e-182)

### Analysis

Although the data reflects only a ~4 degree difference in average tempertures between the months of June and December in Hawaii, the outcome of the performed paired t-test for the datasets shows to have a strong statistical significance. This analysis is indicated by a resulting p-value of less than 0.05 (pvalue=9.8415346259008e-182). Therefore, excluding abnormalities in weather patterns, we should expect to see temperature to be ~4 degrees lower in December than in June. 