In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

file_path = '/content/houston-aqi-2010-2021.csv'
df = pd.read_csv(file_path)

print (df.head())

   Unnamed: 0  day_of_year  year   latitude  longitude  avg_pm10  aqi_pm10  \
0           0            2  2010  29.733726 -95.257593        13        12   
1           1            2  2010  29.733726 -95.257593        13        12   
2           2            2  2010  29.733726 -95.257593        13        12   
3           3            2  2010  29.733726 -95.257593        13        12   
4           4            2  2010  29.733726 -95.257593        13        12   

     avg_co  aqi_co    avg_no2  ...    avg_o3  aqi_o3  avg_pm25  aqi_pm25  \
0  0.297667     NaN  17.258333  ...  0.027294      32      11.6        48   
1  0.297667     NaN  17.258333  ...  0.027294      32      11.6        48   
2  0.297667     NaN  17.258333  ...  0.027294      32       9.7        40   
3  0.297667     NaN  17.258333  ...  0.027294      32       9.7        40   
4  0.325000     6.0  17.258333  ...  0.027294      32      11.6        48   

    avg_so2  aqi_so2  avg_humidity  avg_temperature  avg_wind  avg_p

In [15]:
wind_compare = df[['avg_temperature', 'avg_humidity']].values
scaler = MinMaxScaler()
wind_compare = scaler.fit_transform(wind_compare)
print(f"Mean Temperature: {round(wind_compare[:, 0].mean(), 2)}")
print(f"Mean Humidity: {round(wind_compare[:, 1].mean(), 2)}")

Mean Temperature: 0.62
Mean Humidity: 0.59


In [16]:
n = 100

def get_random_sample(variable):
    return np.random.choice(wind_compare[:, variable], size=n, replace=False)

#Wind vs. Temperature
##Requirements Check
###The requirements are all satisified, as we assume the standard deviations of both samples are unknown, they are random and independently distributed random samples, and n is large (n=100).

##Hypothesis Test
###We conduct a hypothesis test that compares the sample means for our wind variable with the next two variables of temperature and humidity. We use a significance level of 0.05 and calculate the degrees of freedom using Welch's statistic i.e, $(A + B)^2 / ((A^2 / (n_1 - 1)) + (B^2 / (n_2 - 1)))$ where $A = s_1^2 / n_1$ and $B = s_2^2 / n_2$)

In [17]:
alpha = 0.05
wind_samples = get_random_sample(0)
temperature_samples = get_random_sample(1)

wind_var = np.var(wind_samples, ddof=1)
temperature_var = np.var(temperature_samples, ddof=1)
A, B = temperature_var / n, wind_var / n

degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)

t_statistic, p_value = stats.ttest_ind(temperature_samples, wind_samples)


print(f'The sample means are {round(wind_samples.mean(), 2)} (for wind) and {round(temperature_samples.mean(), 2)} (for temperature) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.65 (for wind) and 0.61 (for temperature) 
The degrees of freedom is 191
The t-statistic is: -1.6631005729395016
The p-value is: 0.09787436099292669
The critical value is: 1.9724619897643145


###Since the p-value is greater than our significance level of 0.05, we fail to reject the null hypothesis, which states that the sample means are statistically indifferent to a significant extent.

#Confidence Interval

In [18]:
margin_of_error = critical_value * np.sqrt(A + B)
mean_difference = wind_samples.mean() - temperature_samples.mean()
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(mean_difference - margin_of_error, 2)}, {round(mean_difference + margin_of_error, 2)})')

The difference between sample means is: 0.04
Confidence Interval: (-0.01, 0.09)


###Based on the above confidence interval, we can assume with 95% confidence that the actual difference between population means is between 7% and 15% of their normalized values.

#Wind vs. Humidity
##Hypothesis Test


In [19]:
alpha = 0.05
wind_samples = get_random_sample(0)
humidity_samples = get_random_sample(1)

wind_var = np.var(wind_samples, ddof=1)
humidity_var = np.var(humidity_samples, ddof=1)
A, B = humidity_var / n, wind_var / n

degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)

t_statistic, p_value = stats.ttest_ind(humidity_samples, wind_samples)


print(f'The sample means are {round(wind_samples.mean(), 2)} (for wind) and {round(humidity_samples.mean(), 2)} (for humidity) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.66 (for wind) and 0.59 (for humidity) 
The degrees of freedom is 188
The t-statistic is: -2.9489494837754755
The p-value is: 0.003572303582368143
The critical value is: 1.9726626923781652


###Since the p-value is less than our significance level of 0.05, we reject the null hypothesis in favor of the alternative hypothesis, which states that the sample means are statistically different to a significant extent.

#Confidence Interval

In [20]:
margin_of_error = critical_value * np.sqrt(A + B)
mean_difference = wind_samples.mean() - humidity_samples.mean()
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(mean_difference - margin_of_error, 2)}, {round(mean_difference + margin_of_error, 2)})')

The difference between sample means is: 0.07
Confidence Interval: (0.02, 0.12)


###Based on the above confidence interval, we can assume with 95% confidence that the actual difference between population means is between 10% and 18% of their normalized values.