# Inferences about Two Means (Independent Samples)

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

# Read the source CSV into a DataFrame. The path is home-relative and
# specific to the author's machine; adjust it when running elsewhere.
file_path = '~/Documents/UNT/csce5310/houston-aqi-2010-2021.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [2]:
# Min-max scale the three weather columns (default MinMaxScaler settings) so
# that their means can be compared on a common scale.
feature_columns = ['avg_humidity', 'avg_pressure', 'avg_wind']
scaler = MinMaxScaler()
humidity_compare = scaler.fit_transform(df[feature_columns].values)

# Column order of humidity_compare follows feature_columns:
# 0 = humidity, 1 = pressure, 2 = wind.
for column_index, label in enumerate(('Humidity', 'Pressure', 'Wind')):
    column_mean = humidity_compare[:, column_index].mean()
    print(f"Mean {label}: {round(column_mean, 2)}")

Mean Humidity: 0.59
Mean Pressure: 0.43
Mean Wind: 0.29


In [4]:
n = 100


def get_random_sample(variable):
    """Draw ``n`` entries, without replacement, from one column of ``humidity_compare``.

    Parameters
    ----------
    variable : int
        Column index into the module-level ``humidity_compare`` array.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of ``n`` entries picked by NumPy's global
        random generator (no seed is set here, so draws differ between runs).
    """
    column_values = humidity_compare[:, variable]
    return np.random.choice(column_values, n, False)

## Humidity vs Pressure

### Hypothesis Test

In [7]:
# Two-sided test of H0: mean(humidity) == mean(pressure) on independent samples,
# without assuming equal population variances (Welch's t-test).
alpha = 0.05
humidity_samples = get_random_sample(0)
pressure_samples = get_random_sample(1)

# Unbiased (ddof=1) sample variances of the two samples.
humidity_var = np.var(humidity_samples, ddof=1)
pressure_var = np.var(pressure_samples, ddof=1)

# Each sample's contribution to the squared standard error of the mean difference.
A, B = humidity_var / n, pressure_var / n

# Welch-Satterthwaite degrees of freedom, truncated to an integer.
degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)

# equal_var=False selects Welch's t-test so the p-value rests on the same
# unequal-variance assumption as degrees_of_freedom and critical_value.
# The default (equal_var=True) would run the pooled-variance test instead.
t_statistic, p_value = stats.ttest_ind(humidity_samples, pressure_samples, equal_var=False)
print(f'The sample means are {round(humidity_samples.mean(), 2)} (for humidity) and {round(pressure_samples.mean(), 2)} (for pressure) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.61 (for humidity) and 0.46 (for pressure) 
The degrees of freedom is 193
The t-statistic is: 5.506061915877539
The p-value is: 1.1295900759273068e-07
The critical value is: 1.9723316757930007


Since the p-value is less than the significance level of 0.05, we reject the null hypothesis in favor of the alternative and conclude that the population means of humidity and pressure differ significantly.

### Confidence Interval

In [8]:
# A and B are the per-sample variance-over-n terms from the hypothesis-test
# cell, so sqrt(A + B) is the standard error of the difference in means.
standard_error = np.sqrt(A + B)
margin_of_error = critical_value * standard_error
mean_difference = humidity_samples.mean() - pressure_samples.mean()

# Two-sided interval centred on the observed difference.
lower_bound = mean_difference - margin_of_error
upper_bound = mean_difference + margin_of_error
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(lower_bound, 2)}, {round(upper_bound, 2)})')

The difference between sample means is: 0.15
Confidence Interval: (0.09, 0.2)


Based on the confidence interval, we can infer with 95% confidence that the true difference between the population means (humidity minus pressure) lies between 0.09 and 0.20 on the min-max normalized scale.

## Humidity vs Wind

### Hypothesis Test

In [9]:
# Two-sided test of H0: mean(humidity) == mean(wind) on independent samples,
# without assuming equal population variances (Welch's t-test).
alpha = 0.05
humidity_samples = get_random_sample(0)
wind_samples = get_random_sample(2)

# Unbiased (ddof=1) sample variances of the two samples.
humidity_var = np.var(humidity_samples, ddof=1)
wind_var = np.var(wind_samples, ddof=1)

# Each sample's contribution to the squared standard error of the mean difference.
A, B = humidity_var / n, wind_var / n

# Welch-Satterthwaite degrees of freedom, truncated to an integer.
degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)

# equal_var=False selects Welch's t-test so the p-value rests on the same
# unequal-variance assumption as degrees_of_freedom and critical_value.
# The default (equal_var=True) would run the pooled-variance test instead.
t_statistic, p_value = stats.ttest_ind(humidity_samples, wind_samples, equal_var=False)
print(f'The sample means are {round(humidity_samples.mean(), 2)} (for humidity) and {round(wind_samples.mean(), 2)} (for wind) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.59 (for humidity) and 0.33 (for wind) 
The degrees of freedom is 197
The t-statistic is: 9.753991275120304
The p-value is: 1.3254885758367994e-18
The critical value is: 1.9720790337760217


Since the p-value is less than the significance level of 0.05, we reject the null hypothesis in support of the alternative hypothesis that the means are not equal.

### Confidence Interval

In [10]:
# A and B are the per-sample variance-over-n terms from the hypothesis-test
# cell, so sqrt(A + B) is the standard error of the difference in means.
standard_error = np.sqrt(A + B)
margin_of_error = critical_value * standard_error
mean_difference = humidity_samples.mean() - wind_samples.mean()

# Two-sided interval centred on the observed difference.
lower_bound = mean_difference - margin_of_error
upper_bound = mean_difference + margin_of_error
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(lower_bound, 2)}, {round(upper_bound, 2)})')

The difference between sample means is: 0.26
Confidence Interval: (0.21, 0.31)


Based on the confidence interval, we can conclude with 95% confidence that the difference between the population means (humidity minus wind) lies between 0.21 and 0.31 on the min-max normalized scale.