[Link to videos and exercises](https://www.khanacademy.org/math/statistics-probability/inference-categorical-data-chi-square-tests)

In [25]:
# a cell to import modules and define helper functions
import math
import scipy.stats as ss

def calculate_chi_cdf(lower, upper, ddof):
    """Print and return P(lower < X < upper) for a chi-squared distributed X.

    Args:
        lower: Lower bound of the interval (int or float). Any non-numeric
            value, e.g. ``None``, means the interval is unbounded below.
        upper: Upper bound of the interval (int or float). Any non-numeric
            value, e.g. ``None``, means the interval is unbounded above.
        ddof: Degrees of freedom of the chi-squared distribution; forwarded
            to ``scipy.stats.chi2`` as its ``df`` shape parameter.

    Returns:
        The probability of the interval as computed by scipy.
    """

    def _as_bound(value, unbounded):
        # Accept ints as well as floats: checking for ``float`` only would
        # silently turn an integer bound such as 5 into "no bound at all".
        # bool is excluded so that True/False are not mistaken for 1/0.
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        return float(value) if is_number else unbounded

    lower_bound = _as_bound(lower, -math.inf)
    upper_bound = _as_bound(upper, math.inf)

    # chi2 is for continuous variables
    if upper_bound == math.inf:
        # Right tail: the survival function is more accurate than 1 - cdf,
        # which rounds to 0.0 for very large test statistics.
        interval = ss.chi2.sf(lower_bound, ddof)
    else:
        interval = ss.chi2.cdf(upper_bound, ddof) - ss.chi2.cdf(lower_bound, ddof)

    print("Probability of %.2f < X < %.2f is %.3f" % (lower_bound, upper_bound, interval))

    return interval

Conditions for a goodness-of-fit test:
* random sampling
* large counts (at least 5 expected outcomes in each category)
* independent (sample is <10% of the population, or sampling is done with replacement)

# Test statistic and P-value in a goodness-of-fit test
![](img/chi_squared_p1.png)


In [26]:
# Null hypothesis: expected share of each category, in percent (must sum up to 100).
P_EXPECTED = [20, 25, 20, 20, 15]
# Observed outcomes per category, in units (raw counts).
OBSERVED = [16, 11, 16, 18, 19]

# Fail loudly on inconsistent inputs: zip() below would otherwise silently
# drop the extra categories of the longer list.
assert len(P_EXPECTED) == len(OBSERVED), "P_EXPECTED and OBSERVED need one entry per category"
assert math.isclose(sum(P_EXPECTED), 100), "P_EXPECTED must sum up to 100 percent"

# Total number of outcomes.
sample_size = sum(OBSERVED)
# Expected number of outcomes per category under the null hypothesis.
expected = [sample_size * x/100 for x in P_EXPECTED]
# Large counts condition of the goodness-of-fit test.
assert all(e >= 5 for e in expected), "each category needs at least 5 expected outcomes"

# Degrees of freedom is the total number of buckets - 1.
ddof = len(OBSERVED) - 1

# Sum of squared differences between observed and expected outcomes,
# each divided by the expected outcome.
chi_squared = sum((o-e)**2/e for o,e in zip(OBSERVED, expected))

# Right-tail probability P(chi_squared < X < inf).
p = calculate_chi_cdf(chi_squared, None, ddof)

Probability of 8.38 < X < inf is 0.079
