# Inferences about Two Means (Independent Samples)

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

file_path = '~/Documents/UNT/csce5310/houston-aqi-2010-2021.csv'
df = pd.read_csv(file_path)

In [2]:
temperature_compare = df[['avg_temperature', 'avg_humidity', 'avg_pressure']].values
scaler = MinMaxScaler()
temperature_compare = scaler.fit_transform(temperature_compare)
print(f"Mean Temperature: {round(temperature_compare[:, 0].mean(), 2)}")
print(f"Mean Humidity: {round(temperature_compare[:, 1].mean(), 2)}")
print(f"Mean Pressure: {round(temperature_compare[:, 2].mean(), 2)}")

Mean Temperature: 0.62
Mean Humidity: 0.59
Mean Pressure: 0.43


In [3]:
n = 100

def get_random_sample(variable):
    return np.random.choice(temperature_compare[:, variable], size=n, replace=False)

## Temperature vs Humidity

### Hypothesis Test

In [4]:
alpha = 0.05
temperature_samples = get_random_sample(0)
humidity_samples = get_random_sample(1)
temperature_var = np.var(temperature_samples, ddof=1)
humidity_var = np.var(humidity_samples, ddof=1)
A, B = temperature_var / n, humidity_var / n
degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)
t_statistic, p_value = stats.ttest_ind(temperature_samples, humidity_samples)
print(f'The sample means are {round(temperature_samples.mean(), 2)} (for temperature) and {round(humidity_samples.mean(), 2)} (for humidity) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.6 (for temperature) and 0.59 (for humidity) 
The degrees of freedom is 186
The t-statistic is: 0.23753234801749432
The p-value is: 0.8124894311353636
The critical value is: 1.9728001139921347


Since the p-value is greater than our significance level of 0.05, we fail to reject the null hypothesis, which states that the population means are equivalent for temperature and humidity variables.

### Confidence Interval

In [5]:
margin_of_error = critical_value * np.sqrt(A + B)
mean_difference = humidity_samples.mean() - temperature_samples.mean()
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(mean_difference - margin_of_error, 2)}, {round(mean_difference + margin_of_error, 2)})')

The difference between sample means is: -0.01
Confidence Interval: (-0.07, 0.05)


Using the confidence interval, we can assume with 95% confidence that the actual difference between population means in between 5% and 7% of their normalized values.

## Temperature vs Pressure

### Hypothesis Test

In [7]:
alpha = 0.05
temperature_samples = get_random_sample(0)
pressure_samples = get_random_sample(2)
temperature_var = np.var(temperature_samples, ddof=1)
pressure_var = np.var(pressure_samples, ddof=1)
A, B = temperature_var / n, pressure_var / n
degrees_of_freedom = int(((A + B) ** 2) / (((A ** 2) / (n - 1)) + ((B ** 2) / (n - 1))))
critical_value = stats.t.ppf(1 - alpha / 2, degrees_of_freedom)
t_statistic, p_value = stats.ttest_ind(temperature_samples, pressure_samples)
print(f'The sample means are {round(temperature_samples.mean(), 2)} (for temperature) and {round(pressure_samples.mean(), 2)} (for pressure) ')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample means are 0.67 (for temperature) and 0.43 (for pressure) 
The degrees of freedom is 182
The t-statistic is: 8.585975468543129
The p-value is: 2.6464044655816585e-15
The critical value is: 1.9730840773322158


Since the p-value is less than our significance level of 0.05, we reject the null hypothesis in support of the alternate that the population means for temperature and pressure are not equal for this two-tailed test.

### Confidence Interval

In [8]:
margin_of_error = critical_value * np.sqrt(A + B)
mean_difference = temperature_samples.mean() - pressure_samples.mean()
print(f'The difference between sample means is: {round(mean_difference, 2)}')
print(f'Confidence Interval: ({round(mean_difference - margin_of_error, 2)}, {round(mean_difference + margin_of_error, 2)})')

The difference between sample means is: 0.23
Confidence Interval: (0.18, 0.29)


Based on the confidence interval, we can estimate that the actual difference between population means for temperature and pressure is somewhere between 18% and 29% of their normalized values.