# Two Variances or Standard Deviations

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import f

file_path = '~/Documents/UNT/csce5310/houston-aqi-2010-2021.csv'
df = pd.read_csv(file_path)

In [2]:
humidity_compare = df[['avg_humidity', 'avg_pressure', 'avg_wind']].values
print(f"Standard Deviation Humidity: {round(humidity_compare[:, 0].std(), 2)} (variance={round(humidity_compare[:, 0].var(), 2)})")
print(f"Standard Deviation Pressure: {round(humidity_compare[:, 1].std(), 2)} (variance={round(humidity_compare[:, 1].var(), 2)})")
print(f"Standard Deviation Wind: {round(humidity_compare[:, 2].std(), 2)} (variance={round(humidity_compare[:, 2].var(), 2)})")

Standard Deviation Humidity: 12.68 (variance=160.76)
Standard Deviation Pressure: 5.99 (variance=35.92)
Standard Deviation Wind: 2.21 (variance=4.89)


In [4]:
n = 100

def get_random_sample(variable):
    return np.random.choice(humidity_compare[:, variable], size=n, replace=False)

## Humidity vs Pressure

### Hypothesis Test

In [5]:
alpha = 0.05
humidity_samples = get_random_sample(0)
pressure_samples = get_random_sample(1)
humidity_var = np.var(humidity_samples, ddof=1)
pressure_var = np.var(pressure_samples, ddof=1)
s1, s2 = sorted([humidity_var, pressure_var], reverse=True) 
f_statistic = s1 / s2
degrees_of_freedom = (n - 1,) * 2
p_value = f.sf(f_statistic, *degrees_of_freedom)
critical_value = f.ppf(1 - alpha / 2, *degrees_of_freedom)
print(f'The sample variances are {round(humidity_var, 2)} (for humidity) and {round(pressure_var, 2)} (for pressure)')
print(f'The sample standard deviations are {round(np.sqrt(humidity_var), 2)} (for humidity) and {round(np.sqrt(pressure_var), 2)} (for pressure)')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The f-statistic is: {f_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample variances are 156.0 (for humidity) and 34.83 (for pressure)
The sample standard deviations are 12.49 (for humidity) and 5.9 (for pressure)
The degrees of freedom is (99, 99)
The f-statistic is: 4.479186057168877
The p-value is: 4.971865742443967e-13
The critical value is: 1.486233767619293


Since the p-value is less than the significance level of 0.05, we reject the null hypothesis in favor of the alternate, which states that there is a statistically significant difference between the variances or standard deviations for humidity and pressure.

### Confidence Interval

In [6]:
ci_var_ratio = 1 / f.ppf(1 - alpha / 2, *degrees_of_freedom), f.ppf(1 - alpha / 2, *degrees_of_freedom)
var_difference = humidity_var - pressure_var
std_difference = np.sqrt(humidity_var) - np.sqrt(pressure_var)
print(f'The difference between sample variances is: {round(var_difference, 3)}')
print(f'The difference between sample standard deviations is: {round(std_difference, 3)}')
print(f'The confidence interval for variance is: ({round(var_difference - ci_var_ratio[0] * f_statistic, 3)}, {round(var_difference + ci_var_ratio[1] * f_statistic, 3)})')
print(f'The confidence interval for standard deviation is: ({round(std_difference - np.sqrt(ci_var_ratio[0] * f_statistic), 3)}, {round(std_difference + np.sqrt(ci_var_ratio[1] * f_statistic), 3)})')

The difference between sample variances is: 121.171
The difference between sample standard deviations is: 6.588
The confidence interval for variance is: (118.157, 127.828)
The confidence interval for standard deviation is: (4.852, 9.169)


Based on the confidence interval, we can assume that the estimated difference in the population variances of humidity and pressure is somewhere between (118.157, 127.828) with a 95% confidence level. For standard deviation, we simply take the square root of those numbers, and the confidence interval is (4.852, 9.169) for the population standard deviation with confidence level of 95%.

## Humidity vs Wind

### Hypothesis Test

In [7]:
alpha = 0.05
humidity_samples = get_random_sample(0)
wind_samples = get_random_sample(2)
humidity_var = np.var(humidity_samples, ddof=1)
wind_var = np.var(wind_samples, ddof=1)
s1, s2 = sorted([humidity_var, wind_var], reverse=True) 
f_statistic = s1 / s2
degrees_of_freedom = (n - 1,) * 2
p_value = f.sf(f_statistic, *degrees_of_freedom)
critical_value = f.ppf(1 - alpha / 2, *degrees_of_freedom)
print(f'The sample variances are {round(humidity_var, 2)} (for humidity) and {round(wind_var, 2)} (for wind)')
print(f'The sample standard deviations are {round(np.sqrt(humidity_var), 2)} (for humidity) and {round(np.sqrt(wind_var), 2)} (for wind)')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The f-statistic is: {f_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample variances are 141.12 (for humidity) and 4.88 (for wind)
The sample standard deviations are 11.88 (for humidity) and 2.21 (for wind)
The degrees of freedom is (99, 99)
The f-statistic is: 28.94100171160658
The p-value is: 4.245308518490124e-46
The critical value is: 1.486233767619293


Since the p-value is less than the significance level of 0.05, we reject the null hypothesis in favor of the alternate, which states that there is a statistically significant difference between the variances or standard deviations for humidity and wind.

### Confidence Interval

In [8]:
ci_var_ratio = 1 / f.ppf(1 - alpha / 2, *degrees_of_freedom), f.ppf(1 - alpha / 2, *degrees_of_freedom)
var_difference = humidity_var - wind_var
std_difference = np.sqrt(humidity_var) - np.sqrt(wind_var)
print(f'The difference between sample variances is: {round(var_difference, 3)}')
print(f'The difference between sample standard deviations is: {round(std_difference, 3)}')
print(f'The confidence interval for variance is: ({round(var_difference - ci_var_ratio[0] * f_statistic, 3)}, {round(var_difference + ci_var_ratio[1] * f_statistic, 3)})')
print(f'The confidence interval for standard deviation is: ({round(std_difference - np.sqrt(ci_var_ratio[0] * f_statistic), 3)}, {round(std_difference + np.sqrt(ci_var_ratio[1] * f_statistic), 3)})')

The difference between sample variances is: 136.245
The difference between sample standard deviations is: 9.671
The confidence interval for variance is: (116.773, 179.258)
The confidence interval for standard deviation is: (5.258, 16.23)


Based on the confidence interval, we can assume that the estimated difference in the population variances of humidity and wind is somewhere between (116.773, 179.258) with a 95% confidence level. For standard deviation, we simply take the square root of those numbers, and the confidence interval is (5.258, 16.23) for the population standard deviation with confidence level of 95%.