# Two Variances or Standard Deviations

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import f

file_path = '~/Documents/UNT/csce5310/houston-aqi-2010-2021.csv'
df = pd.read_csv(file_path)

In [2]:
wind_compare = df[['avg_wind', 'avg_temperature', 'avg_humidity']].values
print(f"Standard Deviation Wind: {round(wind_compare[:, 0].std(), 2)} (variance={round(wind_compare[:, 0].var(), 2)})")
print(f"Standard Deviation Temperature: {round(wind_compare[:, 1].std(), 2)} (variance={round(wind_compare[:, 1].var(), 2)})")
print(f"Standard Deviation Humidity: {round(wind_compare[:, 2].std(), 2)} (variance={round(wind_compare[:, 2].var(), 2)})")

Standard Deviation Wind: 2.21 (variance=4.89)
Standard Deviation Temperature: 13.81 (variance=190.85)
Standard Deviation Humidity: 12.68 (variance=160.76)


In [3]:
n = 100

def get_random_sample(variable):
    return np.random.choice(wind_compare[:, variable], size=n, replace=False)

## Wind vs Temperature

### Hypothesis Test

In [4]:
alpha = 0.05
wind_samples = get_random_sample(0)
temperature_samples = get_random_sample(1)
wind_var = np.var(wind_samples, ddof=1)
temperature_var = np.var(temperature_samples, ddof=1)
s1, s2 = sorted([wind_var, temperature_var], reverse=True) 
f_statistic = s1 / s2
degrees_of_freedom = (n - 1,) * 2
p_value = f.sf(f_statistic, *degrees_of_freedom)
critical_value = f.ppf(1 - alpha / 2, *degrees_of_freedom)
print(f'The sample variances are {round(wind_var, 2)} (for wind) and {round(temperature_var, 2)} (for temperature)')
print(f'The sample standard deviations are {round(np.sqrt(wind_var), 2)} (for wind) and {round(np.sqrt(temperature_var), 2)} (for temperature)')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The f-statistic is: {f_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample variances are 4.51 (for wind) and 205.49 (for temperature)
The sample standard deviations are 2.12 (for wind) and 14.34 (for temperature)
The degrees of freedom is (99, 99)
The f-statistic is: 45.55191058419134
The p-value is: 2.471187810489526e-55
The critical value is: 1.486233767619293


Since the p-value is less than the significance level of 0.05, we reject the null hypothesis in favor of the alternate, which states that there is a statistically significant difference between the sample variances or standard deviations.

### Confidence Interval

In [5]:
ci_var_ratio = 1 / f.ppf(1 - alpha / 2, *degrees_of_freedom), f.ppf(1 - alpha / 2, *degrees_of_freedom)
var_difference = wind_var - temperature_var
std_difference = np.sqrt(wind_var) - np.sqrt(temperature_var)
print(f'The difference between sample variances is: {round(var_difference, 3)}')
print(f'The difference between sample standard deviations is: {round(std_difference, 3)}')
print(f'The confidence interval for variance is: ({round(var_difference - ci_var_ratio[0] * f_statistic, 3)}, {round(var_difference + ci_var_ratio[1] * f_statistic, 3)})')
print(f'The confidence interval for standard deviation is: ({round(std_difference - np.sqrt(ci_var_ratio[0] * f_statistic), 3)}, {round(std_difference + np.sqrt(ci_var_ratio[1] * f_statistic), 3)})')

The difference between sample variances is: -200.983
The difference between sample standard deviations is: -12.211
The confidence interval for variance is: (-231.632, -133.282)
The confidence interval for standard deviation is: (-17.747, -3.983)


Based on the confidence interval, we can assume that the estimated difference in the population variances of wind and temperature is somewhere between -231.632 and -133.282 with 95% confidence level. For standard deviation, we simply take the square root of those numbers, and the confidence interval becomes -17.747 and -3.983 for the population standard deviation with confidence level of 95%.

## Wind vs Humidity

### Hypothesis Test

In [6]:
alpha = 0.05
wind_samples = get_random_sample(0)
humidity_samples = get_random_sample(2)
wind_var = np.var(wind_samples, ddof=1)
humidity_var = np.var(humidity_samples, ddof=1)
s1, s2 = sorted([wind_var, humidity_var], reverse=True) 
f_statistic = s1 / s2
degrees_of_freedom = (n - 1,) * 2
p_value = f.sf(f_statistic, *degrees_of_freedom)
critical_value = f.ppf(1 - alpha / 2, *degrees_of_freedom)
print(f'The sample variances are {round(wind_var, 2)} (for wind) and {round(humidity_var, 2)} (for humidity)')
print(f'The sample standard deviations are {round(np.sqrt(wind_var), 2)} (for wind) and {round(np.sqrt(humidity_var), 2)} (for humidity)')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The f-statistic is: {f_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample variances are 6.97 (for wind) and 138.68 (for humidity)
The sample standard deviations are 2.64 (for wind) and 11.78 (for humidity)
The degrees of freedom is (99, 99)
The f-statistic is: 19.90344841637803
The p-value is: 1.1017062206064681e-38
The critical value is: 1.486233767619293


Since the p-value is less than the significance level of 0.05, we reject the null hypothesis in favor of the alternate, which states that there is a statistically significant difference between the sample variances or standard deviations of wind and humidity.

### Confidence Interval

In [7]:
ci_var_ratio = 1 / f.ppf(1 - alpha / 2, *degrees_of_freedom), f.ppf(1 - alpha / 2, *degrees_of_freedom)
var_difference = wind_var - humidity_var
std_difference = np.sqrt(wind_var) - np.sqrt(humidity_var)
print(f'The difference between sample variances is: {round(var_difference, 3)}')
print(f'The difference between sample standard deviations is: {round(std_difference, 3)}')
print(f'The confidence interval for variance is: ({round(var_difference - ci_var_ratio[0] * f_statistic, 3)}, {round(var_difference + ci_var_ratio[1] * f_statistic, 3)})')
print(f'The confidence interval for standard deviation is: ({round(std_difference - np.sqrt(ci_var_ratio[0] * f_statistic), 3)}, {round(std_difference + np.sqrt(ci_var_ratio[1] * f_statistic), 3)})')

The difference between sample variances is: -131.712
The difference between sample standard deviations is: -9.137
The confidence interval for variance is: (-145.104, -102.131)
The confidence interval for standard deviation is: (-12.796, -3.698)


Based on the confidence interval, we can assume that the estimated difference in the population variances of wind and humidity is somewhere between (-145.104, -102.131) with 95% confidence level. For standard deviation, we simply take the square root of those numbers, and the confidence interval becomes (-12.796, -3.698) for the population standard deviation with confidence level of 95%.