In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

file_path = '/content/houston-aqi-2010-2021.csv'
df = pd.read_csv(file_path)

print (df.head())

   Unnamed: 0  day_of_year  year   latitude  longitude  avg_pm10  aqi_pm10  \
0           0            2  2010  29.733726 -95.257593        13        12   
1           1            2  2010  29.733726 -95.257593        13        12   
2           2            2  2010  29.733726 -95.257593        13        12   
3           3            2  2010  29.733726 -95.257593        13        12   
4           4            2  2010  29.733726 -95.257593        13        12   

     avg_co  aqi_co    avg_no2  ...    avg_o3  aqi_o3  avg_pm25  aqi_pm25  \
0  0.297667     NaN  17.258333  ...  0.027294      32      11.6        48   
1  0.297667     NaN  17.258333  ...  0.027294      32      11.6        48   
2  0.297667     NaN  17.258333  ...  0.027294      32       9.7        40   
3  0.297667     NaN  17.258333  ...  0.027294      32       9.7        40   
4  0.325000     6.0  17.258333  ...  0.027294      32      11.6        48   

    avg_so2  aqi_so2  avg_humidity  avg_temperature  avg_wind  avg_p

In [2]:
temperature_compare = df[['avg_humidity', 'avg_pressure']].values
scaler = MinMaxScaler()
temperature_compare = scaler.fit_transform(temperature_compare)
print(f"Mean Humidity: {round(temperature_compare[:, 0].mean(), 2)}")
print(f"Mean Pressure: {round(temperature_compare[:, 1].mean(), 2)}")

Mean Humidity: 0.59
Mean Pressure: 0.43


In [3]:
n = 100

def get_random_sample(variable):
    return np.random.choice(temperature_compare[:, variable], size=n, replace=False)

#Temperature vs Humidity
##Hypothesis test

In [4]:
alpha = 0.05
temperature_samples = get_random_sample(0)
humidity_samples = get_random_sample(1)


temperature_var = np.var(temperature_samples, ddof=1)
humidity_var = np.var(humidity_samples, ddof=1)
A, B = temperature_var / n, humidity_var / n

degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)

t_statistic, p_value = stats.ttest_ind(temperature_samples, humidity_samples)


print(f'The sample means are {round(temperature_samples.mean(), 2)} (for temperature) and {round(humidity_samples.mean(), 2)} (for humidity) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.58 (for temperature) and 0.41 (for humidity) 
The degrees of freedom is 186
The t-statistic is: 7.105344743782793
The p-value is: 2.118180874751526e-11
The critical value is: 1.9728001139921347


###Since the p-value is greater than our significance level of 0.05, we fail to reject the null hypothesis, which states that the sample means are statistically different to a significant extent.

#Confidence Interval

In [5]:
margin_of_error = critical_value * np.sqrt(A + B)
mean_difference = humidity_samples.mean() - temperature_samples.mean()
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(mean_difference - margin_of_error, 2)}, {round(mean_difference + margin_of_error, 2)})')

The difference between sample means is: -0.17
Confidence Interval: (-0.21, -0.12)


#Temperature vs Pressure
##Hypothesis test

In [6]:
alpha = 0.05
temperature_samples = get_random_sample(0)
pressure_samples = get_random_sample(1)

temperature_var = np.var(temperature_samples, ddof=1)
pressure_var = np.var(pressure_samples, ddof=1)
A, B = temperature_var / n, pressure_var / n

degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)

t_statistic, p_value = stats.ttest_ind(temperature_samples, pressure_samples)


print(f'The sample means are {round(temperature_samples.mean(), 2)} (for temperature) and {round(pressure_samples.mean(), 2)} (for pressure) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.58 (for temperature) and 0.43 (for pressure) 
The degrees of freedom is 195
The t-statistic is: 5.779713642731657
The p-value is: 2.872328144187963e-08
The critical value is: 1.9722040512658325


###Since the p-value is greater than our significance level of 0.05, we fail to reject the null hypothesis, which states that the sample means are statistically different to a significant extent.

#Confidence Interval

In [7]:
margin_of_error = critical_value * np.sqrt(A + B)
mean_difference = pressure_samples.mean() - temperature_samples.mean()
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(mean_difference - margin_of_error, 2)}, {round(mean_difference + margin_of_error, 2)})')

The difference between sample means is: -0.15
Confidence Interval: (-0.2, -0.1)
