# Inferences about Two Means (Independent Samples)

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

file_path = '~/Documents/UNT/csce5310/houston-aqi-2010-2021.csv'
df = pd.read_csv(file_path)

In [2]:
wind_compare = df[['avg_wind', 'avg_temperature', 'avg_humidity']].values
scaler = MinMaxScaler()
wind_compare = scaler.fit_transform(wind_compare)
print(f"Mean Wind: {round(wind_compare[:, 0].mean(), 2)}")
print(f"Mean Temperature: {round(wind_compare[:, 1].mean(), 2)}")
print(f"Mean Humidity: {round(wind_compare[:, 2].mean(), 2)}")

Mean Wind: 0.29
Mean Temperature: 0.62
Mean Humidity: 0.59


In [3]:
n = 100

def get_random_sample(variable):
    return np.random.choice(wind_compare[:, variable], size=n, replace=False)

## Wind vs. Temperature

### Hypothesis Test

In [4]:
alpha = 0.05
wind_samples = get_random_sample(0)
temperature_samples = get_random_sample(1)
wind_var = np.var(wind_samples, ddof=1)
temperature_var = np.var(temperature_samples, ddof=1)
A, B = wind_var / n, temperature_var / n
degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)
t_statistic, p_value = stats.ttest_ind(wind_samples, temperature_samples)
print(f'The sample means are {round(wind_samples.mean(), 2)} (for wind) and {round(temperature_samples.mean(), 2)} (for temperature) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.27 (for wind) and 0.65 (for temperature) 
The degrees of freedom is 186
The t-statistic is: -14.216956094588925
The p-value is: 4.4881450498330003e-32
The critical value is: 1.9728001139921347


Since the p-value is less than our significance level of 0.05, we reject the null hypothesis that the two means are equal to a statistically significant extent.

### Confidence Interval

In [5]:
margin_of_error = critical_value * np.sqrt(A + B)
mean_difference = wind_samples.mean() - temperature_samples.mean()
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(mean_difference - margin_of_error, 2)}, {round(mean_difference + margin_of_error, 2)})')

The difference between sample means is: -0.38
Confidence Interval: (-0.43, -0.32)


Based on the above confidence interval, we can assume with 95% confidence that the actual difference between the population means of wind and temperature is between 32% and 43% of their normalized values.

## Wind vs. Humidity

### Hypothesis Test

In [6]:
alpha = 0.05
wind_samples = get_random_sample(0)
humidity_samples = get_random_sample(1)
wind_var = np.var(wind_samples, ddof=1)
humidity_var = np.var(humidity_samples, ddof=1)
A, B = wind_var / n, humidity_var / n
degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)
t_statistic, p_value = stats.ttest_ind(wind_samples, humidity_samples)
print(f'The sample means are {round(wind_samples.mean(), 2)} (for wind) and {round(humidity_samples.mean(), 2)} (for humidity) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.28 (for wind) and 0.6 (for humidity) 
The degrees of freedom is 183
The t-statistic is: -12.026817775936658
The p-value is: 2.2790130342689293e-25
The critical value is: 1.973011915132679


Since the p-value is less than our significance level of 0.05, we reject the null hypothesis in favor of the alternative hypothesis, which states that the sample means are statistically different to a significant extent.

### Confidence Interval

In [7]:
margin_of_error = critical_value * np.sqrt(A + B)
mean_difference = wind_samples.mean() - humidity_samples.mean()
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(mean_difference - margin_of_error, 2)}, {round(mean_difference + margin_of_error, 2)})')

The difference between sample means is: -0.32
Confidence Interval: (-0.37, -0.27)


Based on the above confidence interval, we can assume with 95% confidence that the actual difference between population means is between 27% and 18% of their normalized values.