# Bonus: Temperature Analysis I

In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime as dt


In [2]:
# "tobs" is "temperature observations"
# Load the measurements CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='Resources/hawaii_measurements.csv')
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Parse the string dates into datetime64 values (needed for the .dt accessor
# used in later cells), then confirm the resulting column dtypes.
df['date'] = pd.to_datetime(arg=df['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [4]:
# Preview the frame with the date column as its index.
# set_index returns a new DataFrame and the result is not assigned here, so
# `df` itself keeps `date` as a regular column.
df.set_index(keys='date')

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [5]:
# Preview the frame without the date column.
# drop returns a new DataFrame and the result is not assigned here, so `df`
# is left unchanged.
df.drop(columns=['date'])

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.00,63
2,USC00519397,0.00,74
3,USC00519397,0.00,76
4,USC00519397,,73
...,...,...,...
19545,USC00516128,0.09,71
19546,USC00516128,,78
19547,USC00516128,0.56,76
19548,USC00516128,0.50,76


### Compare June and December data across all years 

In [6]:
from scipy import stats

In [13]:
# Select the rows for the months being compared.
# June: keep every observation whose date falls in month 6, across all years.
june_df = df.loc[df['date'].dt.month.eq(6)]
june_df

Unnamed: 0,station,date,prcp,tobs
133,USC00519397,2010-06-01,0.00,78
134,USC00519397,2010-06-02,0.01,76
135,USC00519397,2010-06-03,0.00,78
136,USC00519397,2010-06-04,0.00,76
137,USC00519397,2010-06-05,0.00,77
...,...,...,...,...
19492,USC00516128,2017-06-26,0.02,79
19493,USC00516128,2017-06-27,0.10,74
19494,USC00516128,2017-06-28,0.02,74
19495,USC00516128,2017-06-29,0.04,76


In [18]:
# December: keep every observation whose date falls in month 12, across all years.
dec_df = df.loc[df['date'].dt.month.eq(12)]
dec_df

Unnamed: 0,station,date,prcp,tobs
305,USC00519397,2010-12-01,0.04,76
306,USC00519397,2010-12-03,0.00,74
307,USC00519397,2010-12-04,0.00,74
308,USC00519397,2010-12-06,0.00,64
309,USC00519397,2010-12-07,0.00,64
...,...,...,...,...
19323,USC00516128,2016-12-27,0.14,71
19324,USC00516128,2016-12-28,0.14,71
19325,USC00516128,2016-12-29,1.03,69
19326,USC00516128,2016-12-30,2.37,65


In [27]:

# Yearly averages of precipitation and temperature for the June observations.
# Group by june_df's own dates; grouping by the full df's dates only worked
# through index alignment.
june_df_yr = june_df.groupby(june_df['date'].dt.year)
# Select the numeric columns explicitly: pandas >= 2.0 no longer silently drops
# non-numeric columns such as `station` in mean() and raises TypeError instead.
june_avg = june_df_yr[['prcp', 'tobs']].mean()
june_avg = june_avg.rename(columns={"prcp":"prcp_jun_avg","tobs":"tobs_jun_avg"})
june_avg

Unnamed: 0_level_0,prcp_jun_avg,tobs_jun_avg
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,0.042241,74.92562
2011,0.240142,73.938326
2012,0.097062,74.0
2013,0.144195,74.599078
2014,0.124372,75.027907
2015,0.12516,74.990148
2016,0.212312,75.175258
2017,0.12,77.219895


In [28]:

# Yearly averages of precipitation and temperature for the December observations.
# Group by dec_df's own dates; grouping by the full df's dates only worked
# through index alignment.
dec_df_yr = dec_df.groupby(dec_df['date'].dt.year)
# Select the numeric columns explicitly: pandas >= 2.0 no longer silently drops
# non-numeric columns such as `station` in mean() and raises TypeError instead.
dec_avg = dec_df_yr[['prcp', 'tobs']].mean()
dec_avg = dec_avg.rename(columns={"prcp":"prcp_dec_avg","tobs":"tobs_dec_avg"})
dec_avg

Unnamed: 0_level_0,prcp_dec_avg,tobs_dec_avg
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,0.459087,70.208511
2011,0.201581,70.820628
2012,0.089604,71.188073
2013,0.169014,71.094017
2014,0.188439,69.896861
2015,0.169506,73.423913
2016,0.199494,71.13


In [43]:
# Mean of all June temperature observations, pooled across years
june_df.loc[:, 'tobs'].mean()

74.94411764705882

In [41]:
# Mean of all December temperature observations, pooled across years
dec_df.loc[:, 'tobs'].mean()

71.04152933421226

In [32]:
# Put the yearly June and December averages side by side, matched on year
# (the "date" index level). The outer join keeps years present in only one month.
test_df = june_avg.merge(dec_avg, how="outer", on="date")
test_df

Unnamed: 0_level_0,prcp_jun_avg,tobs_jun_avg,prcp_dec_avg,tobs_dec_avg
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,0.042241,74.92562,0.459087,70.208511
2011,0.240142,73.938326,0.201581,70.820628
2012,0.097062,74.0,0.089604,71.188073
2013,0.144195,74.599078,0.169014,71.094017
2014,0.124372,75.027907,0.188439,69.896861
2015,0.12516,74.990148,0.169506,73.423913
2016,0.212312,75.175258,0.199494,71.13
2017,0.12,77.219895,,


In [39]:
# A paired test needs both a June and a December value for every year, so drop
# any year missing either average temperature. Filtering on missing values
# (rather than dropping the hard-coded label 2017) avoids a KeyError if that
# year is absent and also catches any other unpaired year.
test_pair_df = test_df.dropna(subset=['tobs_jun_avg', 'tobs_dec_avg'])
test_pair_df

Unnamed: 0_level_0,prcp_jun_avg,tobs_jun_avg,prcp_dec_avg,tobs_dec_avg
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,0.042241,74.92562,0.459087,70.208511
2011,0.240142,73.938326,0.201581,70.820628
2012,0.097062,74.0,0.089604,71.188073
2013,0.144195,74.599078,0.169014,71.094017
2014,0.124372,75.027907,0.188439,69.896861
2015,0.12516,74.990148,0.169506,73.423913
2016,0.212312,75.175258,0.199494,71.13


In [40]:
# Paired (related-samples) t-test comparing each year's average June
# temperature with the same year's average December temperature.
t_test = stats.ttest_rel(
    a=test_pair_df['tobs_jun_avg'],
    b=test_pair_df['tobs_dec_avg'],
)
t_test

Ttest_relResult(statistic=7.780060705002921, pvalue=0.00023742611093245777)

### Analysis

In [None]:
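# The p-value of the paired t-test (about 0.00024) is less than 0.001, so the result is statistically significant.
# We reject the null hypothesis that the yearly average June and December temperatures are the same:
# the difference between the paired averages is very unlikely to have happened by chance.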