# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from scipy import stats
import scipy.stats as stats
from datetime import datetime as dt

In [2]:
# "tobs" is "temperature observations"
# Load the Hawaii measurements CSV into a DataFrame and preview the first rows
measurements_csv = 'Resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_csv)
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Give the columns human-readable labels
column_labels = {
    'station': 'Station Name',
    'date': 'Date',
    'prcp': 'Precipitation (In)',
    'tobs': 'Observed Temperature',
}
df = df.rename(columns=column_labels)
df.head()

Unnamed: 0,Station Name,Date,Precipitation (In),Observed Temperature
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Parse the 'Date' strings into pandas datetime values and store them back
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df.head()

Unnamed: 0,Station Name,Date,Precipitation (In),Observed Temperature
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [5]:
# Print the DataFrame summary (column dtypes and non-null counts) to confirm
# that the 'Date' column is now stored as a datetime dtype rather than as strings
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Station Name          19550 non-null  object        
 1   Date                  19550 non-null  datetime64[ns]
 2   Precipitation (In)    18103 non-null  float64       
 3   Observed Temperature  19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [6]:
# Promote 'Date' to the row index; set_index removes it from the columns by default
df = df.set_index(keys='Date')
df.head()

Unnamed: 0_level_0,Station Name,Precipitation (In),Observed Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


### Compare June and December data across all years 

In [7]:
# Keep only the rows whose index date falls in June (month 6), across all years
is_june = df.index.month == 6
jun_df = df.loc[is_june]
jun_df.head()

Unnamed: 0_level_0,Station Name,Precipitation (In),Observed Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-06-01,USC00519397,0.0,78
2010-06-02,USC00519397,0.01,76
2010-06-03,USC00519397,0.0,78
2010-06-04,USC00519397,0.0,76
2010-06-05,USC00519397,0.0,77


In [8]:
# Keep only the rows whose index date falls in December (month 12), across all years
is_december = df.index.month == 12
dec_df = df.loc[is_december]
dec_df.head()

Unnamed: 0_level_0,Station Name,Precipitation (In),Observed Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-12-01,USC00519397,0.04,76
2010-12-03,USC00519397,0.0,74
2010-12-04,USC00519397,0.0,74
2010-12-06,USC00519397,0.0,64
2010-12-07,USC00519397,0.0,64


In [9]:
# Stack the June rows followed by the December rows into a single DataFrame
month_frames = [jun_df, dec_df]
months_df = pd.concat(objs=month_frames)
months_df

Unnamed: 0_level_0,Station Name,Precipitation (In),Observed Temperature
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-06-01,USC00519397,0.00,78
2010-06-02,USC00519397,0.01,76
2010-06-03,USC00519397,0.00,78
2010-06-04,USC00519397,0.00,76
2010-06-05,USC00519397,0.00,77
...,...,...,...
2016-12-27,USC00516128,0.14,71
2016-12-28,USC00516128,0.14,71
2016-12-29,USC00516128,1.03,69
2016-12-30,USC00516128,2.37,65


In [10]:
# Mean of all June temperature observations, rounded to two decimal places
june_mean = jun_df['Observed Temperature'].mean()
temp_june = round(june_mean, 2)
print(f'The average temperature for June is {temp_june}')

The average temperature for June is 74.94


In [11]:
# Mean of all December temperature observations, rounded to two decimal places
december_mean = dec_df['Observed Temperature'].mean()
temp_dec = round(december_mean, 2)
print(f'The average temperature for December is {temp_dec}')

The average temperature for December is 71.04


In [12]:
# Extract the temperature observations for each month as plain Python lists
jun_temps = jun_df['Observed Temperature'].tolist()
dec_temps = dec_df['Observed Temperature'].tolist()

In [13]:
# Run an unpaired (independent two-sample) t-test comparing June and December temperatures.
# A paired test (stats.ttest_rel) is not used here because it requires both samples to
# have the same length with observations matched one-to-one, which filtering by month
# does not guarantee.
# equal_var=True is SciPy's default (the standard Student's t-test); it is spelled out
# here so the variance assumption is visible to the reader.
stats.ttest_ind(jun_temps, dec_temps, equal_var=True)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis

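The t-statistic has a magnitude of 31.6. The t-statistic measures the difference between the two sample means relative to the variability of the data (the standard error of that difference), so the larger its magnitude, the stronger the evidence that the means differ, and 31.6 is considerably large. Consistent with this, the p-value is practically zero, which is well below the threshold of 0.05, which means it can be concluded that the mean temperatures of the two samples are significantly different. This makes sense, as June and December are at opposite ends of the seasons and their climates differ, although the gap between the average temperatures is only about 4 degrees (74.94 vs. 71.04). Personally, I would use a paired t-test, as these two samples come from the same place but under different circumstances. Note, however, that the test run above is the unpaired (independent) t-test: a paired test needs matched observations of equal length (for example, per-station averages for each month), which the raw June and December observations do not provide.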