# Two Variances or Standard Deviations

In [21]:
import numpy as np
import pandas as pd
from scipy.stats import f

file_path = '~/Documents/UNT/csce5310/houston-aqi-2010-2021.csv'
df = pd.read_csv(file_path)

In [9]:
pressure_compare = df[['avg_pressure', 'avg_wind', 'avg_temperature']].values
print(f"Standard Deviation Pressure: {round(pressure_compare[:, 0].std(), 2)} (variance={round(pressure_compare[:, 0].var(), 2)})")
print(f"Standard Deviation Wind: {round(pressure_compare[:, 1].std(), 2)} (variance={round(pressure_compare[:, 1].var(), 2)})")
print(f"Standard Deviation Temperature: {round(pressure_compare[:, 2].std(), 2)} (variance={round(pressure_compare[:, 2].var(), 2)})")

Standard Deviation Pressure: 5.99 (variance=35.92)
Standard Deviation Wind: 2.21 (variance=4.89)
Standard Deviation Temperature: 13.81 (variance=190.85)


In [10]:
n = 100

def get_random_sample(variable):
    return np.random.choice(pressure_compare[:, variable], size=n, replace=False)

## Pressure vs Wind

### Hypothesis Test

In [19]:
alpha = 0.05
pressure_samples = get_random_sample(0)
wind_samples = get_random_sample(1)
pressure_var = np.var(pressure_samples, ddof=1)
wind_var = np.var(wind_samples, ddof=1)
s1, s2 = sorted([pressure_var, wind_var], reverse=True) 
f_statistic = s1 / s2
degrees_of_freedom = (n - 1,) * 2
p_value = f.sf(f_statistic, *degrees_of_freedom)
critical_value = f.ppf(1 - alpha / 2, *degrees_of_freedom)
print(f'The sample variances are {round(pressure_var, 2)} (for pressure) and {round(wind_var, 2)} (for wind)')
print(f'The sample standard deviations are {round(np.sqrt(pressure_var), 2)} (for pressure) and {round(np.sqrt(wind_var), 2)} (for wind)')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The f-statistic is: {f_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample variances are 37.78 (for pressure) and 5.88 (for wind)
The sample standard deviations are 6.15 (for pressure) and 2.42 (for wind)
The degrees of freedom is (99, 99)
The f-statistic is: 6.427827450296863
The p-value is: 2.0911451344265867e-18
The critical value is: 1.486233767619293


Since the p-value is  greater than 0.05, it follows for this two-tailed test that there is not sufficient evidence to warrant rejection about the claim that the two standard deviations are equal.

### Confidence Interval

In [39]:
ci_var_ratio = 1 / f.ppf(1 - alpha / 2, *degrees_of_freedom), f.ppf(1 - alpha / 2, *degrees_of_freedom)
var_difference = pressure_var - wind_var
std_difference = np.sqrt(pressure_var) - np.sqrt(wind_var)
print(f'The difference between sample variances is: {round(var_difference, 3)}')
print(f'The difference between sample standard deviations is: {round(std_difference, 3)}')
print(f'The confidence interval for variance is: ({round(var_difference - ci_var_ratio[0] * f_statistic, 3)}, {round(var_difference + ci_var_ratio[1] * f_statistic, 3)})')
print(f'The confidence interval for standard deviation is: ({round(std_difference - np.sqrt(ci_var_ratio[0] * f_statistic), 3)}, {round(std_difference + np.sqrt(ci_var_ratio[1] * f_statistic), 3)})')

The difference between sample variances is: 31.9
The difference between sample standard deviations is: 3.722
The confidence interval for variance is: (27.575, 41.453)
The confidence interval for standard deviation is: (1.642, 6.813)


Based on the confidence interval, we can assume that the estimated difference in the population variances of pressure and wind is somewhere between 27.575 and 41.453 with 95% confidence level. For standard deviation, we simply take the square root, and the confidence interval becomes 1.642 and 6.813 for the population standard deviation with confidence level of 95%.

## Pressure vs Temperature

### Hypothesis Test

In [42]:
alpha = 0.05
pressure_samples = get_random_sample(0)
temperature_samples = get_random_sample(2)
pressure_var = np.var(pressure_samples, ddof=1)
temperature_var = np.var(temperature_samples, ddof=1)
s1, s2 = sorted([pressure_var, temperature_var], reverse=True) 
f_statistic = s1 / s2
degrees_of_freedom = (n - 1,) * 2
p_value = f.sf(f_statistic, *degrees_of_freedom)
critical_value = f.ppf(1 - alpha / 2, *degrees_of_freedom)
print(f'The sample variances are {round(pressure_var, 2)} (for pressure) and {round(temperature_var, 2)} (for temperature)')
print(f'The sample standard deviations are {round(np.sqrt(pressure_var), 2)} (for pressure) and {round(np.sqrt(temperature_var), 2)} (for temperature)')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The f-statistic is: {f_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample variances are 37.12 (for pressure) and 214.68 (for temperature)
The sample standard deviations are 6.09 (for pressure) and 14.65 (for temperature)
The degrees of freedom is (99, 99)
The f-statistic is: 5.783423063713187
The p-value is: 9.250043788923911e-17
The critical value is: 1.486233767619293


Since the p-value is less than the significance level of 0.05, we reject the null hypothesis that the population standard deviations (or variances) are equal.

### Confidence Interval

In [43]:
ci_var_ratio = 1 / f.ppf(1 - alpha / 2, *degrees_of_freedom), f.ppf(1 - alpha / 2, *degrees_of_freedom)
var_difference = pressure_var - temperature_var
std_difference = np.sqrt(pressure_var) - np.sqrt(temperature_var)
print(f'The difference between sample variances is: {round(var_difference, 3)}')
print(f'The difference between sample standard deviations is: {round(std_difference, 3)}')
print(f'The confidence interval for variance is: ({round(var_difference - ci_var_ratio[0] * f_statistic, 3)}, {round(var_difference + ci_var_ratio[1] * f_statistic, 3)})')
print(f'The confidence interval for standard deviation is: ({round(std_difference - np.sqrt(ci_var_ratio[0] * f_statistic), 3)}, {round(std_difference + np.sqrt(ci_var_ratio[1] * f_statistic), 3)})')

The difference between sample variances is: -177.556
The difference between sample standard deviations is: -8.559
The confidence interval for variance is: (-181.448, -168.961)
The confidence interval for standard deviation is: (-10.532, -5.627)


Based on the above confidence intervals, for both standard deviation and variance, we can conclude that the true estimate for difference between population variances is between -181.448 and -168.961, and between -10.532 and -5.627 for population standard deviation.