# Two Variances or Standard Deviations

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import f

file_path = '~/Documents/UNT/csce5310/houston-aqi-2010-2021.csv'
df = pd.read_csv(file_path)

In [2]:
temperature_compare = df[['avg_temperature', 'avg_humidity', 'avg_pressure']].values
print(f"Standard Deviation Temperature: {round(temperature_compare[:, 0].std(), 2)} (variance={round(temperature_compare[:, 0].var(), 2)})")
print(f"Standard Deviation Humidity: {round(temperature_compare[:, 1].std(), 2)} (variance={round(temperature_compare[:, 1].var(), 2)})")
print(f"Standard Deviation Pressure: {round(temperature_compare[:, 2].std(), 2)} (variance={round(temperature_compare[:, 2].var(), 2)})")

Standard Deviation Temperature: 13.81 (variance=190.85)
Standard Deviation Humidity: 12.68 (variance=160.76)
Standard Deviation Pressure: 5.99 (variance=35.92)


In [3]:
n = 100

def get_random_sample(variable):
    return np.random.choice(temperature_compare[:, variable], size=n, replace=False)

## Temperature vs Humidity

### Hypothesis Test

In [4]:
alpha = 0.05
temperature_samples = get_random_sample(0)
humidity_samples = get_random_sample(1)
temperature_var = np.var(temperature_samples, ddof=1)
humidity_var = np.var(humidity_samples, ddof=1)
s1, s2 = sorted([temperature_var, humidity_var], reverse=True) 
f_statistic = s1 / s2
degrees_of_freedom = (n - 1,) * 2
p_value = f.sf(f_statistic, *degrees_of_freedom)
critical_value = f.ppf(1 - alpha / 2, *degrees_of_freedom)
print(f'The sample variances are {round(temperature_var, 2)} (for temperature) and {round(humidity_var, 2)} (for humidity)')
print(f'The sample standard deviations are {round(np.sqrt(temperature_var), 2)} (for temperature) and {round(np.sqrt(humidity_var), 2)} (for humidity)')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The f-statistic is: {f_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample variances are 191.93 (for temperature) and 130.95 (for humidity)
The sample standard deviations are 13.85 (for temperature) and 11.44 (for humidity)
The degrees of freedom is (99, 99)
The f-statistic is: 1.465670546784038
The p-value is: 0.029280811910925218
The critical value is: 1.486233767619293


Since the p-value is less than the significance level of 0.05, we reject the null hypothesis in favor of the alternate, which states that there is a statistically significant difference between the variances or standard deviations for temperature and humidity.

### Confidence Interval

In [5]:
ci_var_ratio = 1 / f.ppf(1 - alpha / 2, *degrees_of_freedom), f.ppf(1 - alpha / 2, *degrees_of_freedom)
var_difference = temperature_var - humidity_var
std_difference = np.sqrt(temperature_var) - np.sqrt(humidity_var)
print(f'The difference between sample variances is: {round(var_difference, 3)}')
print(f'The difference between sample standard deviations is: {round(std_difference, 3)}')
print(f'The confidence interval for variance is: ({round(var_difference - ci_var_ratio[0] * f_statistic, 3)}, {round(var_difference + ci_var_ratio[1] * f_statistic, 3)})')
print(f'The confidence interval for standard deviation is: ({round(std_difference - np.sqrt(ci_var_ratio[0] * f_statistic), 3)}, {round(std_difference + np.sqrt(ci_var_ratio[1] * f_statistic), 3)})')

The difference between sample variances is: 60.981
The difference between sample standard deviations is: 2.411
The confidence interval for variance is: (59.995, 63.159)
The confidence interval for standard deviation is: (1.417, 3.886)


Based on the confidence interval, we can assume that the estimated difference in the population variances of temperature and humidity is somewhere between 59.995 and 63.159 with a 95% confidence level. For standard deviation, we simply take the square root of those numbers, and the confidence interval becomes 1.417 and 3.886 for the population standard deviation with confidence level of 95%.

## Temperature vs Pressure

### Hypothesis Test

In [6]:
alpha = 0.05
temperature_samples = get_random_sample(0)
pressure_samples = get_random_sample(2)
temperature_var = np.var(temperature_samples, ddof=1)
pressure_var = np.var(pressure_samples, ddof=1)
s1, s2 = sorted([temperature_var, pressure_var], reverse=True) 
f_statistic = s1 / s2
degrees_of_freedom = (n - 1,) * 2
p_value = f.sf(f_statistic, *degrees_of_freedom)
critical_value = f.ppf(1 - alpha / 2, *degrees_of_freedom)
print(f'The sample variances are {round(temperature_var, 2)} (for temperature) and {round(pressure_var, 2)} (for pressure)')
print(f'The sample standard deviations are {round(np.sqrt(temperature_var), 2)} (for temperature) and {round(np.sqrt(pressure_var), 2)} (for pressure)')
print(f'The degrees of freedom is {degrees_of_freedom}')
print(f'The f-statistic is: {f_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')

The sample variances are 186.76 (for temperature) and 35.76 (for pressure)
The sample standard deviations are 13.67 (for temperature) and 5.98 (for pressure)
The degrees of freedom is (99, 99)
The f-statistic is: 5.223241630833932
The p-value is: 3.1464480486571955e-15
The critical value is: 1.486233767619293


Since the p-value is less than the significance level of 0.05, we reject the null hypothesis in favor of the alternate, which states that there is a statistically significant difference between the variances or standard deviations for temperature and pressure.

### Confidence Interval

In [7]:
ci_var_ratio = 1 / f.ppf(1 - alpha / 2, *degrees_of_freedom), f.ppf(1 - alpha / 2, *degrees_of_freedom)
var_difference = temperature_var - pressure_var
std_difference = np.sqrt(temperature_var) - np.sqrt(pressure_var)
print(f'The difference between sample variances is: {round(var_difference, 3)}')
print(f'The difference between sample standard deviations is: {round(std_difference, 3)}')
print(f'The confidence interval for variance is: ({round(var_difference - ci_var_ratio[0] * f_statistic, 3)}, {round(var_difference + ci_var_ratio[1] * f_statistic, 3)})')
print(f'The confidence interval for standard deviation is: ({round(std_difference - np.sqrt(ci_var_ratio[0] * f_statistic), 3)}, {round(std_difference + np.sqrt(ci_var_ratio[1] * f_statistic), 3)})')

The difference between sample variances is: 151.003
The difference between sample standard deviations is: 7.686
The confidence interval for variance is: (147.489, 158.766)
The confidence interval for standard deviation is: (5.812, 10.473)


Based on the confidence interval, we can assume that the estimated difference in the population variances of temperature and pressure is somewhere between (147.489, 158.766) with a 95% confidence level. For standard deviation, we simply take the square root of those numbers, and the confidence interval becomes (5.812, 10.473) for the population standard deviation with confidence level of 95%.